Parent Directory
|
Revision Log
Revision 1.260 - (view) (download) (as text)
1 : | efrank | 1.1 | package FIG; |
2 : | |||
3 : | olson | 1.111 | use strict; |
4 : | |||
5 : | overbeek | 1.135 | use Fcntl qw/:flock/; # import LOCK_* constants |
6 : | |||
7 : | olson | 1.116 | use POSIX; |
8 : | olson | 1.158 | use IPC::Open2; |
9 : | olson | 1.116 | |
10 : | efrank | 1.1 | use DBrtns; |
11 : | use Sim; | ||
12 : | use Blast; | ||
13 : | use FIG_Config; | ||
14 : | overbeek | 1.36 | use tree_utilities; |
15 : | olson | 1.93 | use Subsystem; |
16 : | olson | 1.162 | use SeedDas; |
17 : | olson | 1.183 | use Construct; |
18 : | parrello | 1.200 | use FIGRules; |
19 : | parrello | 1.210 | use Tracer; |
20 : | olson | 1.260 | |
21 : | olson | 1.245 | eval { require FigGFF; }; |
22 : | olson | 1.260 | if ($@ and $ENV{USER} eq "olson") |
23 : | { | ||
24 : | warn $@; | ||
25 : | } | ||
26 : | olson | 1.79 | |
27 : | # | ||
28 : | # Conditionally evaluate this in case its prerequisites are not available. | ||
29 : | # | ||
30 : | |||
31 : | our $ClearinghouseOK = eval { | ||
32 : | require Clearinghouse; | ||
33 : | }; | ||
34 : | efrank | 1.1 | |
35 : | olson | 1.10 | use IO::Socket; |
36 : | |||
37 : | efrank | 1.1 | use FileHandle; |
38 : | |||
39 : | use Carp; | ||
40 : | use Data::Dumper; | ||
41 : | overbeek | 1.25 | use Time::Local; |
42 : | olson | 1.93 | use File::Spec; |
43 : | olson | 1.123 | use File::Copy; |
44 : | olson | 1.112 | # |
45 : | # Try to load the RPC stuff; it might fail on older versions of the software. | ||
46 : | # | ||
47 : | eval { | ||
48 : | require FIGrpc; | ||
49 : | }; | ||
50 : | |||
51 : | my $xmlrpc_available = 1; | ||
52 : | if ($@ ne "") | ||
53 : | { | ||
54 : | $xmlrpc_available = 0; | ||
55 : | } | ||
56 : | |||
57 : | efrank | 1.1 | |
58 : | olson | 1.111 | use FIGAttributes; |
59 : | use base 'FIGAttributes'; | ||
60 : | |||
61 : | use vars qw(%_FunctionAttributes); | ||
62 : | |||
63 : | use Data::Dumper; | ||
64 : | |||
65 : | olson | 1.124 | # |
66 : | # Force all new files to be all-writable. | ||
67 : | # | ||
68 : | |||
69 : | umask 0; | ||
70 : | |||
71 : | parrello | 1.210 | =head1 FIG Genome Annotation System |
72 : | |||
73 : | =head2 Introduction | ||
74 : | |||
75 : | This is the main object for access to the SEED data store. The data store | ||
76 : | itself is a combination of flat files and a database. The flat files can | ||
77 : | be moved easily between systems and the database rebuilt as needed. | ||
78 : | |||
79 : | A reduced set of this object's functions are available via the B<SFXlate> | ||
80 : | object. The SFXlate object uses a single database to represent all its | ||
81 : | genomic information. It provides a much smaller capability for updating | ||
82 : | the data, and eliminates all similarities except for bidirectional best | ||
83 : | hits. | ||
84 : | |||
85 : | The key to making the FIG system work is proper configuration of the | ||
86 : | C<FIG_Config.pm> file. This file contains names and URLs for the key | ||
87 : | directories as well as the type and login information for the database. | ||
88 : | |||
89 : | =cut | ||
90 : | |||
91 : | #: Constructor FIG->new(); | ||
92 : | |||
93 : | =head2 Public Methods | ||
94 : | |||
95 : | =head3 new | ||
96 : | |||
97 : | C<< my $fig = FIG->new(); >> | ||
98 : | |||
99 : | This is the constructor for a FIG object. It uses no parameters. | ||
100 : | |||
101 : | =cut | ||
102 : | |||
103 : | efrank | 1.1 | sub new { |
104 : | my($class) = @_; | ||
105 : | |||
106 : | olson | 1.102 | # |
107 : | # Check to see if we have a FIG_URL environment variable set. | ||
108 : | # If we do, don't actually create a FIG object, but rather | ||
109 : | # create a FIGrpc and return that as the return from this constructor. | ||
110 : | # | ||
111 : | |||
112 : | parrello | 1.210 | if ($ENV{FIG_URL} ne "" && $xmlrpc_available) { |
113 : | Trace("Creating figrpc for '$ENV{FIG_URL}'") if T(0); | ||
114 : | my $figrpc = new FIGrpc($ENV{FIG_URL}); | ||
115 : | return $figrpc; | ||
116 : | olson | 1.102 | } |
117 : | |||
118 : | efrank | 1.1 | my $rdbH = new DBrtns; |
119 : | bless { | ||
120 : | parrello | 1.210 | _dbf => $rdbH, |
121 : | }, $class; | ||
122 : | efrank | 1.1 | } |
123 : | |||
124 : | parrello | 1.210 | |
125 : | =head3 get_system_name | ||
126 : | |||
127 : | C<< my $name = $fig->get_system_name; >> | ||
128 : | |||
129 : | Returns C<seed>, indicating that this is object is using the SEED | ||
130 : | database. The same method on an SFXlate object will return C<sprout>. | ||
131 : | |||
132 : | =cut | ||
133 : | #: Return Type $; | ||
134 : | sub get_system_name { | ||
135 : | olson | 1.207 | return "seed"; |
136 : | olson | 1.205 | } |
137 : | parrello | 1.210 | |
138 : | # Destructor: releases the database handle. | ||
139 : | olson | 1.205 | |
140 : | efrank | 1.1 | sub DESTROY { |
141 : | my($self) = @_; | ||
142 : | my($rdbH); | ||
143 : | |||
144 : | parrello | 1.210 | if ($rdbH = $self->db_handle) { |
145 : | $rdbH->DESTROY; | ||
146 : | efrank | 1.1 | } |
147 : | } | ||
148 : | |||
149 : | parrello | 1.210 | =head3 delete_genomes |
150 : | |||
151 : | C<< $fig->delete_genomes(\@genomes); >> | ||
152 : | |||
153 : | Delete the specified genomes from the data store. This requires making | ||
154 : | system calls to move and delete files. | ||
155 : | |||
156 : | =cut | ||
157 : | #: Return Type ; | ||
158 : | overbeek | 1.7 | sub delete_genomes { |
159 : | my($self,$genomes) = @_; | ||
160 : | my $tmpD = "$FIG_Config::temp/tmp.deleted.$$"; | ||
161 : | my $tmp_Data = "$FIG_Config::temp/Data.$$"; | ||
162 : | |||
163 : | my %to_del = map { $_ => 1 } @$genomes; | ||
164 : | open(TMP,">$tmpD") || die "could not open $tmpD"; | ||
165 : | |||
166 : | my $genome; | ||
167 : | foreach $genome ($self->genomes) | ||
168 : | { | ||
169 : | if (! $to_del{$genome}) | ||
170 : | { | ||
171 : | print TMP "$genome\n"; | ||
172 : | } | ||
173 : | } | ||
174 : | close(TMP); | ||
175 : | |||
176 : | &run("extract_genomes $tmpD $FIG_Config::data $tmp_Data"); | ||
177 : | parrello | 1.200 | |
178 : | overbeek | 1.47 | # &run("mv $FIG_Config::data $FIG_Config::data.deleted; mv $tmp_Data $FIG_Config::data; fig load_all; rm -rf $FIG_Config::data.deleted"); |
179 : | parrello | 1.200 | |
180 : | &run("mv $FIG_Config::data $FIG_Config::data.deleted"); | ||
181 : | overbeek | 1.47 | &run("mv $tmp_Data $FIG_Config::data"); |
182 : | &run("fig load_all"); | ||
183 : | &run("rm -rf $FIG_Config::data.deleted"); | ||
184 : | overbeek | 1.7 | } |
185 : | parrello | 1.200 | |
186 : | parrello | 1.210 | =head3 add_genome |
187 : | |||
188 : | C<< my $ok = $fig->add_genome($genomeF); >> | ||
189 : | |||
190 : | Add a new genome to the data store. A genome's data is kept in a directory | ||
191 : | by itself, underneath the main organism directory. | ||
192 : | |||
193 : | =over 4 | ||
194 : | |||
195 : | =item genomeF | ||
196 : | |||
197 : | Name of the directory containing the genome files. | ||
198 : | |||
199 : | =item RETURN | ||
200 : | |||
201 : | Returns TRUE if successful, else FALSE. | ||
202 : | |||
203 : | =back | ||
204 : | |||
205 : | =cut | ||
206 : | #: Return Type $; | ||
207 : | efrank | 1.1 | sub add_genome { |
208 : | my($self,$genomeF) = @_; | ||
209 : | |||
210 : | my $rc = 0; | ||
211 : | olson | 1.93 | |
212 : | my(undef, $path, $genome) = File::Spec->splitpath($genomeF); | ||
213 : | |||
214 : | if ($genome !~ /^\d+\.\d+$/) | ||
215 : | { | ||
216 : | warn "Invalid genome filename $genomeF\n"; | ||
217 : | return $rc; | ||
218 : | } | ||
219 : | |||
220 : | if (-d $FIG_Config::organisms/$genome) | ||
221 : | { | ||
222 : | warn "Organism already exists for $genome\n"; | ||
223 : | return $rc; | ||
224 : | } | ||
225 : | parrello | 1.200 | |
226 : | olson | 1.93 | |
227 : | # | ||
228 : | # We're okay, it doesn't exist. | ||
229 : | # | ||
230 : | |||
231 : | my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`; | ||
232 : | |||
233 : | if (@errors) | ||
234 : | efrank | 1.1 | { |
235 : | olson | 1.93 | warn "Errors found while verifying genome directory $genomeF:\n"; |
236 : | print join("", @errors); | ||
237 : | return $rc; | ||
238 : | } | ||
239 : | parrello | 1.200 | |
240 : | olson | 1.93 | &run("cp -r $genomeF $FIG_Config::organisms"); |
241 : | &run("chmod -R 777 $FIG_Config::organisms/$genome"); | ||
242 : | |||
243 : | &run("index_contigs $genome"); | ||
244 : | &run("compute_genome_counts $genome"); | ||
245 : | &run("load_features $genome"); | ||
246 : | |||
247 : | $rc = 1; | ||
248 : | if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta") | ||
249 : | { | ||
250 : | &run("index_translations $genome"); | ||
251 : | my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`; | ||
252 : | chomp @tmp; | ||
253 : | &run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr"); | ||
254 : | &enqueue_similarities(\@tmp); | ||
255 : | } | ||
256 : | if ((-s "$FIG_Config::organisms/$genome/assigned_functions") || | ||
257 : | (-d "$FIG_Config::organisms/$genome/UserModels")) | ||
258 : | { | ||
259 : | &run("add_assertions_of_function $genome"); | ||
260 : | efrank | 1.1 | } |
261 : | parrello | 1.200 | |
262 : | efrank | 1.1 | return $rc; |
263 : | } | ||
264 : | |||
265 : | parrello | 1.210 | =head3 enqueue_similarities |
266 : | olson | 1.93 | |
267 : | usage: enqueue_similarities(\@sims) | ||
268 : | |||
269 : | Queue the passed fids (a reference to a list) for similarity | ||
270 : | computation. | ||
271 : | |||
272 : | =cut | ||
273 : | parrello | 1.210 | #: Return Type ; |
274 : | olson | 1.93 | sub enqueue_similarities { |
275 : | efrank | 1.1 | my($fids) = @_; |
276 : | my $fid; | ||
277 : | |||
278 : | olson | 1.93 | my $sim_q = "$FIG_Config::global/queued_similarities"; |
279 : | |||
280 : | open(TMP,">>$sim_q") | ||
281 : | || die "could not open $sim_q"; | ||
282 : | |||
283 : | # | ||
284 : | # We need to lock here so that if a computation is creating a snapshot of the | ||
285 : | # queue, we block until it's done. | ||
286 : | # | ||
287 : | |||
288 : | flock(TMP, LOCK_EX) or die "Cannot lock $sim_q\n"; | ||
289 : | |||
290 : | efrank | 1.1 | foreach $fid (@$fids) |
291 : | { | ||
292 : | print TMP "$fid\n"; | ||
293 : | } | ||
294 : | close(TMP); | ||
295 : | olson | 1.10 | } |
296 : | |||
297 : | parrello | 1.210 | =head3 create_sim_askfor_pool |
298 : | olson | 1.93 | |
299 : | usage: create_sim_askfor_pool() | ||
300 : | |||
301 : | olson | 1.123 | Creates an askfor pool, a snapshot of the current NR and similarity |
302 : | queue. Zeros out the old queue. | ||
303 : | |||
304 : | The askfor pool needs to keep track of which sequences need to be | ||
305 : | calculated, which have been handed out, etc. To simplify this task we | ||
306 : | chunk the sequences into fairly small numbers (10-20 sequences) and | ||
307 : | allocate work on a per-chunk basis. We make use of the relational | ||
308 : | database to keep track of chunk status as well as the seek locations | ||
309 : | into the file of sequence data. The initial creation of the pool | ||
310 : | involves indexing the sequence data with seek offsets and lengths and | ||
311 : | populating the sim_askfor_index table with this information and with | ||
312 : | initial status information. | ||
313 : | olson | 1.93 | |
314 : | parrello | 1.200 | =cut |
315 : | parrello | 1.210 | #: Return Type $; |
316 : | olson | 1.93 | sub create_sim_askfor_pool |
317 : | { | ||
318 : | olson | 1.123 | my($self, $chunk_size) = @_; |
319 : | |||
320 : | $chunk_size = 15 unless $chunk_size =~ /^\d+$/; | ||
321 : | olson | 1.93 | |
322 : | my $pool_dir = "$FIG_Config::global/sim_pools"; | ||
323 : | &verify_dir($pool_dir); | ||
324 : | |||
325 : | # | ||
326 : | # Lock the pool directory. | ||
327 : | # | ||
328 : | open(my $lock, ">$pool_dir/lockfile"); | ||
329 : | |||
330 : | flock($lock, LOCK_EX); | ||
331 : | |||
332 : | my $num = 0; | ||
333 : | if (open(my $toc, "<$pool_dir/TOC")) | ||
334 : | { | ||
335 : | while (<$toc>) | ||
336 : | { | ||
337 : | chomp; | ||
338 : | olson | 1.123 | # print STDERR "Have toc entry $_\n"; |
339 : | olson | 1.93 | my ($idx, $time, $str) = split(/\s+/, $_, 3); |
340 : | |||
341 : | $num = max($num, $idx); | ||
342 : | } | ||
343 : | close($toc); | ||
344 : | } | ||
345 : | $num++; | ||
346 : | open(my $toc, ">>$pool_dir/TOC") or die "Cannot write $pool_dir/TOC: $!\n"; | ||
347 : | |||
348 : | print $toc "$num ", time(), " New toc entry\n"; | ||
349 : | close($toc); | ||
350 : | |||
351 : | olson | 1.123 | my $cpool_id = sprintf "%04d", $num; |
352 : | my $cpool_dir = "$pool_dir/$cpool_id"; | ||
353 : | olson | 1.93 | |
354 : | # | ||
355 : | # All set, create the directory for this pool. | ||
356 : | # | ||
357 : | |||
358 : | &verify_dir($cpool_dir); | ||
359 : | |||
360 : | # | ||
361 : | # Now we can copy the nr and sim queue here. | ||
362 : | # Do this stuff inside an eval so we can clean up | ||
363 : | # the lockfile. | ||
364 : | # | ||
365 : | |||
366 : | eval { | ||
367 : | my $sim_q = "$FIG_Config::global/queued_similarities"; | ||
368 : | |||
369 : | olson | 1.123 | copy("$sim_q", "$cpool_dir/q"); |
370 : | copy("$FIG_Config::data/Global/nr", "$cpool_dir/nr"); | ||
371 : | olson | 1.93 | |
372 : | open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n"; | ||
373 : | close(F); | ||
374 : | }; | ||
375 : | parrello | 1.200 | |
376 : | olson | 1.93 | unlink("$pool_dir/lockfile"); |
377 : | close($lock); | ||
378 : | olson | 1.123 | |
379 : | # | ||
380 : | # We've created our pool; we can now run the formatdb and | ||
381 : | # extract the sequences for the blast run. | ||
382 : | # | ||
383 : | my $child_pid = $self->run_in_background(sub { | ||
384 : | # | ||
385 : | # Need to close db or there's all sorts of trouble. | ||
386 : | # | ||
387 : | |||
388 : | my $cmd = "$FIG_Config::ext_bin/formatdb -i $cpool_dir/nr -p T -l $cpool_dir/formatdb.log"; | ||
389 : | print "Will run '$cmd'\n"; | ||
390 : | &run($cmd); | ||
391 : | print "finished. Logfile:\n"; | ||
392 : | print &FIG::file_read("$cpool_dir/formatdb.log"); | ||
393 : | unlink("$cpool_dir/formatdb.pid"); | ||
394 : | }); | ||
395 : | print "Running formatdb in background job $child_pid\n"; | ||
396 : | open(FPID, ">$cpool_dir/formatdb.pid"); | ||
397 : | print FPID "$child_pid\n"; | ||
398 : | close(FPID); | ||
399 : | |||
400 : | my $db = $self->db_handle(); | ||
401 : | if (!$db->table_exists("sim_queue")) | ||
402 : | { | ||
403 : | $db->create_table(tbl => "sim_queue", | ||
404 : | flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " . | ||
405 : | "assigned BOOL, finished BOOL, output_file varchar(255), " . | ||
406 : | "assignment_expires INTEGER, worker_info varchar(255)" | ||
407 : | ); | ||
408 : | } | ||
409 : | |||
410 : | # | ||
411 : | # Write the fasta input file. Keep track of how many have been written, | ||
412 : | # and write seek info into the database as appropriate. | ||
413 : | # | ||
414 : | |||
415 : | open(my $seq_fh, ">$cpool_dir/fasta.in"); | ||
416 : | |||
417 : | my($chunk_idx, $chunk_begin, $seq_idx); | ||
418 : | |||
419 : | $chunk_idx = 0; | ||
420 : | $chunk_begin = 0; | ||
421 : | $seq_idx = 0; | ||
422 : | |||
423 : | my(@seeks); | ||
424 : | |||
425 : | open(my $q_fh, "<$cpool_dir/q"); | ||
426 : | while (my $id = <$q_fh>) | ||
427 : | { | ||
428 : | chomp $id; | ||
429 : | |||
430 : | my $seq = $self->get_translation($id); | ||
431 : | |||
432 : | # | ||
433 : | # check if we're at the beginning of a chunk | ||
434 : | # | ||
435 : | |||
436 : | print $seq_fh ">$id\n$seq\n"; | ||
437 : | |||
438 : | # | ||
439 : | # Check if we're at the end of a chunk | ||
440 : | # | ||
441 : | |||
442 : | if ((($seq_idx + 1) % $chunk_size) == 0) | ||
443 : | { | ||
444 : | my $chunk_end = tell($seq_fh); | ||
445 : | my $chunk_len = $chunk_end - $chunk_begin; | ||
446 : | |||
447 : | push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]); | ||
448 : | $chunk_idx++; | ||
449 : | $chunk_begin = $chunk_end; | ||
450 : | } | ||
451 : | $seq_idx++; | ||
452 : | } | ||
453 : | |||
454 : | if ((($seq_idx) % $chunk_size) != 0) | ||
455 : | { | ||
456 : | my $chunk_end = tell($seq_fh); | ||
457 : | my $chunk_len = $chunk_end - $chunk_begin; | ||
458 : | |||
459 : | push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]); | ||
460 : | |||
461 : | $chunk_idx++; | ||
462 : | $chunk_begin = $chunk_end; | ||
463 : | } | ||
464 : | |||
465 : | close($q_fh); | ||
466 : | close($seq_fh); | ||
467 : | |||
468 : | print "Write seqs\n"; | ||
469 : | |||
470 : | for my $seek (@seeks) | ||
471 : | { | ||
472 : | my($cpool_id, $chunk_idx, $chunk_begin, $chunk_len) = @$seek; | ||
473 : | parrello | 1.200 | |
474 : | olson | 1.123 | $db->SQL("insert into sim_queue (qid, chunk_id, seek, len, assigned, finished) " . |
475 : | "values('$cpool_id', $chunk_idx, $chunk_begin, $chunk_len, FALSE, FALSE)"); | ||
476 : | } | ||
477 : | parrello | 1.200 | |
478 : | olson | 1.123 | return $cpool_id; |
479 : | } | ||
480 : | |||
481 : | parrello | 1.210 | #=head3 get_sim_queue |
482 : | # | ||
483 : | #usage: get_sim_queue($pool_id, $all_sims) | ||
484 : | # | ||
485 : | #Returns the sims in the given pool. If $all_sims is true, return the entire queue. Otherwise, | ||
486 : | #just return the sims awaiting processing. | ||
487 : | # | ||
488 : | #=cut | ||
489 : | olson | 1.123 | |
490 : | sub get_sim_queue | ||
491 : | { | ||
492 : | my($self, $pool_id, $all_sims) = @_; | ||
493 : | } | ||
494 : | |||
495 : | parrello | 1.210 | =head3 get_active_sim_pools |
496 : | olson | 1.123 | |
497 : | usage: get_active_sim_pools() | ||
498 : | |||
499 : | Return a list of the pool id's for the sim processing queues that have entries awaiting | ||
500 : | computation. | ||
501 : | |||
502 : | =cut | ||
503 : | parrello | 1.210 | #: Return Type @; |
504 : | olson | 1.123 | sub get_active_sim_pools |
505 : | { | ||
506 : | my($self) = @_; | ||
507 : | |||
508 : | my $dbh = $self->db_handle(); | ||
509 : | |||
510 : | my $res = $dbh->SQL("select distinct qid from sim_queue where not finished"); | ||
511 : | return undef unless $res; | ||
512 : | |||
513 : | return map { $_->[0] } @$res; | ||
514 : | } | ||
515 : | |||
516 : | parrello | 1.210 | =head3 get_sim_pool_info |
517 : | olson | 1.123 | |
518 : | usage: get_sim_pool_info($pool_id) | ||
519 : | |||
520 : | Return information about the given sim pool. Return value | ||
521 : | is a list ($total_entries, $n_finished, $n_assigned, $n_unassigned) | ||
522 : | |||
523 : | =cut | ||
524 : | parrello | 1.210 | #: Return Type @; |
525 : | olson | 1.123 | sub get_sim_pool_info |
526 : | { | ||
527 : | my($self, $pool_id) = @_; | ||
528 : | my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned); | ||
529 : | |||
530 : | $dbh = $self->db_handle(); | ||
531 : | |||
532 : | $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id'"); | ||
533 : | parrello | 1.200 | $total_entries = $res->[0]->[0]; |
534 : | olson | 1.123 | |
535 : | $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and finished"); | ||
536 : | $n_finished = $res->[0]->[0]; | ||
537 : | |||
538 : | $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and assigned and not finished"); | ||
539 : | $n_assigned = $res->[0]->[0]; | ||
540 : | |||
541 : | $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and not finished and not assigned"); | ||
542 : | $n_unassigned = $res->[0]->[0]; | ||
543 : | |||
544 : | return ($total_entries, $n_finished, $n_assigned, $n_unassigned); | ||
545 : | olson | 1.93 | } |
546 : | |||
547 : | parrello | 1.210 | #=head3 get_sim_chunk |
548 : | # | ||
549 : | #usage: get_sim_chunk($n_seqs, $worker_id) | ||
550 : | # | ||
551 : | #Returns a chunk of $n_seqs of work. | ||
552 : | # | ||
553 : | #From Ross, about how sims are processed: | ||
554 : | # | ||
555 : | #Here is how I process them: | ||
556 : | # | ||
557 : | # | ||
558 : | # bash$ cd /Volumes/seed/olson/Sims/June22.out | ||
559 : | # bash$ for i in really* | ||
560 : | # > do | ||
561 : | # > cat < $i >> /Volumes/laptop/new.sims | ||
562 : | # > done | ||
563 : | # | ||
564 : | # | ||
565 : | #Then, I need to "reformat" them by adding to columns to each one | ||
566 : | # and split the result into files of about 3M each This I do using | ||
567 : | # | ||
568 : | #reduce_sims /Volumes/laptop/NR/NewNR/peg.synonyms.june21 300 < /Volumes/laptop/new.sims | | ||
569 : | # reformat_sims /Volumes/laptop/NR/NewNR/checked.nr.june21 > /Volumes/laptop/reformated.sims | ||
570 : | #rm /Volumes/laptop/new.sims | ||
571 : | #split_sims /Volumes/laptop/NewSims sims.june24 reformated.sims | ||
572 : | #rm reformatted.sims | ||
573 : | # | ||
574 : | #=cut | ||
575 : | olson | 1.93 | |
576 : | parrello | 1.210 | sub get_sim_chunk |
577 : | { | ||
578 : | my($self, $n_seqs, $worker_id) = @_; | ||
579 : | olson | 1.93 | |
580 : | |||
581 : | parrello | 1.210 | } |
582 : | olson | 1.123 | |
583 : | parrello | 1.210 | =head3 get_local_hostname |
584 : | parrello | 1.200 | |
585 : | parrello | 1.213 | usage: my $result = $fig->get_local_hostname(); |
586 : | olson | 1.123 | |
587 : | olson | 1.93 | =cut |
588 : | parrello | 1.213 | #: Return Type $; |
589 : | olson | 1.10 | sub get_local_hostname { |
590 : | olson | 1.52 | |
591 : | # | ||
592 : | # See if there is a FIGdisk/config/hostname file. If there | ||
593 : | # is, force the hostname to be that. | ||
594 : | # | ||
595 : | |||
596 : | my $hostfile = "$FIG_Config::fig_disk/config/hostname"; | ||
597 : | if (-f $hostfile) | ||
598 : | { | ||
599 : | my $fh; | ||
600 : | if (open($fh, $hostfile)) | ||
601 : | { | ||
602 : | my $hostname = <$fh>; | ||
603 : | chomp($hostname); | ||
604 : | return $hostname; | ||
605 : | } | ||
606 : | } | ||
607 : | parrello | 1.200 | |
608 : | olson | 1.10 | # |
609 : | # First check to see if we our hostname is correct. | ||
610 : | # | ||
611 : | # Map it to an IP address, and try to bind to that ip. | ||
612 : | # | ||
613 : | |||
614 : | my $tcp = getprotobyname('tcp'); | ||
615 : | parrello | 1.200 | |
616 : | olson | 1.10 | my $hostname = `hostname`; |
617 : | golsen | 1.44 | chomp($hostname); |
618 : | olson | 1.10 | |
619 : | my @hostent = gethostbyname($hostname); | ||
620 : | |||
621 : | if (@hostent > 0) | ||
622 : | { | ||
623 : | my $sock; | ||
624 : | my $ip = $hostent[4]; | ||
625 : | parrello | 1.200 | |
626 : | olson | 1.10 | socket($sock, PF_INET, SOCK_STREAM, $tcp); |
627 : | if (bind($sock, sockaddr_in(0, $ip))) | ||
628 : | { | ||
629 : | # | ||
630 : | # It worked. Reverse-map back to a hopefully fqdn. | ||
631 : | # | ||
632 : | |||
633 : | my @rev = gethostbyaddr($ip, AF_INET); | ||
634 : | if (@rev > 0) | ||
635 : | { | ||
636 : | olson | 1.28 | my $host = $rev[0]; |
637 : | # | ||
638 : | # Check to see if we have a FQDN. | ||
639 : | # | ||
640 : | |||
641 : | if ($host =~ /\./) | ||
642 : | { | ||
643 : | # | ||
644 : | # Good. | ||
645 : | # | ||
646 : | return $host; | ||
647 : | } | ||
648 : | else | ||
649 : | { | ||
650 : | # | ||
651 : | # We didn't get a fqdn; bail and return the IP address. | ||
652 : | # | ||
653 : | return get_hostname_by_adapter() | ||
654 : | } | ||
655 : | olson | 1.10 | } |
656 : | else | ||
657 : | { | ||
658 : | return inet_ntoa($ip); | ||
659 : | } | ||
660 : | } | ||
661 : | else | ||
662 : | { | ||
663 : | # | ||
664 : | # Our hostname must be wrong; we can't bind to the IP | ||
665 : | # address it maps to. | ||
666 : | # Return the name associated with the adapter. | ||
667 : | # | ||
668 : | return get_hostname_by_adapter() | ||
669 : | } | ||
670 : | } | ||
671 : | else | ||
672 : | { | ||
673 : | # | ||
674 : | # Our hostname isn't known to DNS. This isn't good. | ||
675 : | # Return the name associated with the adapter. | ||
676 : | # | ||
677 : | return get_hostname_by_adapter() | ||
678 : | } | ||
679 : | } | ||
680 : | |||
681 : | parrello | 1.213 | =head3 get_hostname_by_adapter |
682 : | |||
683 : | usage: my $name = $fig->get_hostname_by_adapter(); | ||
684 : | |||
685 : | =cut | ||
686 : | #: Return Type $; | ||
687 : | olson | 1.10 | sub get_hostname_by_adapter { |
688 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
689 : | olson | 1.10 | # |
690 : | # Attempt to determine our local hostname based on the | ||
691 : | # network environment. | ||
692 : | # | ||
693 : | # This implementation reads the routing table for the default route. | ||
694 : | # We then look at the interface config for the interface that holds the default. | ||
695 : | # | ||
696 : | # | ||
697 : | # Linux routing table: | ||
698 : | # [olson@yips 0.0.0]$ netstat -rn | ||
699 : | # Kernel IP routing table | ||
700 : | # Destination Gateway Genmask Flags MSS Window irtt Iface | ||
701 : | # 140.221.34.32 0.0.0.0 255.255.255.224 U 0 0 0 eth0 | ||
702 : | # 169.254.0.0 0.0.0.0 255.255.0.0 U 0 0 0 eth0 | ||
703 : | # 127.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 lo | ||
704 : | # 0.0.0.0 140.221.34.61 0.0.0.0 UG 0 0 0 eth0 | ||
705 : | parrello | 1.200 | # |
706 : | olson | 1.10 | # Mac routing table: |
707 : | parrello | 1.200 | # |
708 : | olson | 1.10 | # bash-2.05a$ netstat -rn |
709 : | # Routing tables | ||
710 : | parrello | 1.200 | # |
711 : | olson | 1.10 | # Internet: |
712 : | # Destination Gateway Flags Refs Use Netif Expire | ||
713 : | # default 140.221.11.253 UGSc 12 120 en0 | ||
714 : | # 127.0.0.1 127.0.0.1 UH 16 8415486 lo0 | ||
715 : | # 140.221.8/22 link#4 UCS 12 0 en0 | ||
716 : | # 140.221.8.78 0:6:5b:f:51:c4 UHLW 0 183 en0 408 | ||
717 : | # 140.221.8.191 0:3:93:84:ab:e8 UHLW 0 92 en0 622 | ||
718 : | # 140.221.8.198 0:e0:98:8e:36:e2 UHLW 0 5 en0 691 | ||
719 : | # 140.221.9.6 0:6:5b:f:51:d6 UHLW 1 63 en0 1197 | ||
720 : | # 140.221.10.135 0:d0:59:34:26:34 UHLW 2 2134 en0 1199 | ||
721 : | # 140.221.10.152 0:30:1b:b0:ec:dd UHLW 1 137 en0 1122 | ||
722 : | # 140.221.10.153 127.0.0.1 UHS 0 0 lo0 | ||
723 : | # 140.221.11.37 0:9:6b:53:4e:4b UHLW 1 624 en0 1136 | ||
724 : | # 140.221.11.103 0:30:48:22:59:e6 UHLW 3 973 en0 1016 | ||
725 : | # 140.221.11.224 0:a:95:6f:7:10 UHLW 1 1 en0 605 | ||
726 : | # 140.221.11.237 0:1:30:b8:80:c0 UHLW 0 0 en0 1158 | ||
727 : | # 140.221.11.250 0:1:30:3:1:0 UHLW 0 0 en0 1141 | ||
728 : | # 140.221.11.253 0:d0:3:e:70:a UHLW 13 0 en0 1199 | ||
729 : | # 169.254 link#4 UCS 0 0 en0 | ||
730 : | parrello | 1.200 | # |
731 : | olson | 1.10 | # Internet6: |
732 : | # Destination Gateway Flags Netif Expire | ||
733 : | # UH lo0 | ||
734 : | # fe80::%lo0/64 Uc lo0 | ||
735 : | # link#1 UHL lo0 | ||
736 : | # fe80::%en0/64 link#4 UC en0 | ||
737 : | # 0:a:95:a8:26:68 UHL lo0 | ||
738 : | # ff01::/32 U lo0 | ||
739 : | # ff02::%lo0/32 UC lo0 | ||
740 : | # ff02::%en0/32 link#4 UC en0 | ||
741 : | |||
742 : | my($fh); | ||
743 : | |||
744 : | if (!open($fh, "netstat -rn |")) | ||
745 : | { | ||
746 : | warn "Cannot run netstat to determine local IP address\n"; | ||
747 : | return "localhost"; | ||
748 : | } | ||
749 : | |||
750 : | my $interface_name; | ||
751 : | parrello | 1.200 | |
752 : | olson | 1.10 | while (<$fh>) |
753 : | { | ||
754 : | my @cols = split(); | ||
755 : | |||
756 : | if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0") | ||
757 : | { | ||
758 : | $interface_name = $cols[$#cols]; | ||
759 : | } | ||
760 : | } | ||
761 : | close($fh); | ||
762 : | parrello | 1.200 | |
763 : | olson | 1.11 | # print "Default route on $interface_name\n"; |
764 : | olson | 1.10 | |
765 : | # | ||
766 : | # Find ifconfig. | ||
767 : | # | ||
768 : | |||
769 : | my $ifconfig; | ||
770 : | |||
771 : | for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin")) | ||
772 : | { | ||
773 : | if (-x "$dir/ifconfig") | ||
774 : | { | ||
775 : | $ifconfig = "$dir/ifconfig"; | ||
776 : | last; | ||
777 : | } | ||
778 : | } | ||
779 : | |||
780 : | if ($ifconfig eq "") | ||
781 : | { | ||
782 : | warn "Ifconfig not found\n"; | ||
783 : | return "localhost"; | ||
784 : | } | ||
785 : | olson | 1.11 | # print "Foudn $ifconfig\n"; |
786 : | olson | 1.10 | |
787 : | if (!open($fh, "$ifconfig $interface_name |")) | ||
788 : | { | ||
789 : | warn "Could not run $ifconfig: $!\n"; | ||
790 : | return "localhost"; | ||
791 : | } | ||
792 : | |||
793 : | my $ip; | ||
794 : | while (<$fh>) | ||
795 : | { | ||
796 : | # | ||
797 : | # Mac: | ||
798 : | # inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255 | ||
799 : | # Linux: | ||
800 : | # inet addr:140.221.34.37 Bcast:140.221.34.63 Mask:255.255.255.224 | ||
801 : | # | ||
802 : | |||
803 : | chomp; | ||
804 : | s/^\s*//; | ||
805 : | |||
806 : | olson | 1.11 | # print "Have '$_'\n"; |
807 : | olson | 1.10 | if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/) |
808 : | { | ||
809 : | # | ||
810 : | # Linux hit. | ||
811 : | # | ||
812 : | $ip = $1; | ||
813 : | olson | 1.11 | # print "Got linux $ip\n"; |
814 : | olson | 1.10 | last; |
815 : | } | ||
816 : | elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/) | ||
817 : | { | ||
818 : | # | ||
819 : | # Mac hit. | ||
820 : | # | ||
821 : | $ip = $1; | ||
822 : | olson | 1.11 | # print "Got mac $ip\n"; |
823 : | olson | 1.10 | last; |
824 : | } | ||
825 : | } | ||
826 : | close($fh); | ||
827 : | |||
828 : | if ($ip eq "") | ||
829 : | { | ||
830 : | warn "Didn't find an IP\n"; | ||
831 : | return "localhost"; | ||
832 : | } | ||
833 : | |||
834 : | return $ip; | ||
835 : | efrank | 1.1 | } |
836 : | |||
837 : | parrello | 1.213 | =head3 get_seed_id |
838 : | |||
839 : | usage: my $id = $fig->get_seed_id(); | ||
840 : | |||
841 : | =cut | ||
842 : | #: Return type $; | ||
843 : | olson | 1.38 | sub get_seed_id { |
844 : | # | ||
845 : | # Retrieve the seed identifer from FIGdisk/config/seed_id. | ||
846 : | # | ||
847 : | # If it's not there, create one, and make it readonly. | ||
848 : | # | ||
849 : | |||
850 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
851 : | olson | 1.38 | my $id; |
852 : | my $id_file = "$FIG_Config::fig_disk/config/seed_id"; | ||
853 : | if (! -f $id_file) | ||
854 : | { | ||
855 : | my $newid = `uuidgen`; | ||
856 : | if (!$newid) | ||
857 : | { | ||
858 : | die "Cannot run uuidgen: $!"; | ||
859 : | } | ||
860 : | |||
861 : | chomp($newid); | ||
862 : | my $fh = new FileHandle(">$id_file"); | ||
863 : | if (!$fh) | ||
864 : | { | ||
865 : | die "error creating $id_file: $!"; | ||
866 : | } | ||
867 : | print $fh "$newid\n"; | ||
868 : | $fh->close(); | ||
869 : | chmod(0444, $id_file); | ||
870 : | } | ||
871 : | my $fh = new FileHandle("<$id_file"); | ||
872 : | $id = <$fh>; | ||
873 : | chomp($id); | ||
874 : | return $id; | ||
875 : | } | ||
876 : | |||
877 : | olson | 1.155 | =pod |
878 : | |||
879 : | =head1 get_release_info | ||
880 : | |||
881 : | olson | 1.195 | Return the current data release information. It is returned as the list |
882 : | parrello | 1.200 | ($name, $id, $inst, $email, $parent_id, $description). |
883 : | olson | 1.195 | |
884 : | The release info comes from the file FIG/Data/RELEASE. It is formatted as: | ||
885 : | |||
886 : | <release-name> | ||
887 : | <unique id> | ||
888 : | <institution> | ||
889 : | <contact email> | ||
890 : | <unique id of data release this release derived from> | ||
891 : | <description> | ||
892 : | |||
893 : | For instance: | ||
894 : | ----- | ||
895 : | SEED Data Release, 09/15/2004. | ||
896 : | 4148208C-1DF2-11D9-8417-000A95D52EF6 | ||
897 : | ANL/FIG | ||
898 : | olson@mcs.anl.gov | ||
899 : | |||
900 : | Test release. | ||
901 : | ----- | ||
902 : | |||
903 : | If no RELEASE file exists, this routine will create one with a new unique ID. This | ||
904 : | lets a peer optimize the data transfer by being able to cache ID translations | ||
905 : | from this instance. | ||
906 : | olson | 1.155 | |
907 : | =cut | ||
908 : | parrello | 1.213 | #: Return Type @; |
909 : | olson | 1.155 | sub get_release_info |
910 : | { | ||
911 : | olson | 1.196 | my($fig, $no_create) = @_; |
912 : | olson | 1.195 | |
913 : | my $rel_file = "$FIG_Config::data/RELEASE"; | ||
914 : | |||
915 : | olson | 1.196 | if (! -f $rel_file and !$no_create) |
916 : | olson | 1.195 | { |
917 : | # | ||
918 : | # Create a new one. | ||
919 : | # | ||
920 : | |||
921 : | my $newid = `uuidgen`; | ||
922 : | if (!$newid) | ||
923 : | { | ||
924 : | die "Cannot run uuidgen: $!"; | ||
925 : | } | ||
926 : | |||
927 : | chomp($newid); | ||
928 : | |||
929 : | my $relinfo = "Automatically generated release info " . localtime(); | ||
930 : | my $inst = "Unknown"; | ||
931 : | my $contact = "Unknown"; | ||
932 : | my $parent = ""; | ||
933 : | my( $a, $b, $e, $v, $env ) = $fig->genome_counts; | ||
934 : | my $description = "Automatically generated release info\n"; | ||
935 : | $description .= "Contains $a archaeal, $b bacterial, $e eukaryal, $v viral and $env environmental genomes.\n"; | ||
936 : | |||
937 : | my $fh = new FileHandle(">$rel_file"); | ||
938 : | if (!$fh) | ||
939 : | { | ||
940 : | warn "error creating $rel_file: $!"; | ||
941 : | return undef; | ||
942 : | } | ||
943 : | print $fh "$relinfo\n"; | ||
944 : | print $fh "$newid\n"; | ||
945 : | print $fh "$inst\n"; | ||
946 : | print $fh "$contact\n"; | ||
947 : | print $fh "$parent\n"; | ||
948 : | print $fh $description; | ||
949 : | $fh->close(); | ||
950 : | chmod(0444, $rel_file); | ||
951 : | } | ||
952 : | |||
953 : | if (open(my $fh, $rel_file)) | ||
954 : | { | ||
955 : | my(@lines) = <$fh>; | ||
956 : | close($fh); | ||
957 : | parrello | 1.200 | |
958 : | olson | 1.195 | chomp(@lines); |
959 : | parrello | 1.200 | |
960 : | olson | 1.195 | my($info, $id, $inst, $contact, $parent, @desc) = @lines; |
961 : | |||
962 : | return ($info, $id, $inst, $contact, $parent, join("\n", @desc)); | ||
963 : | } | ||
964 : | olson | 1.155 | |
965 : | return undef; | ||
966 : | } | ||
967 : | |||
968 : | =pod | ||
969 : | |||
970 : | =head1 get_peer_last_update | ||
971 : | |||
972 : | parrello | 1.213 | usage: my $date = $fig->get_peer_last_update($peer_id); |
973 : | |||
974 : | olson | 1.155 | Return the timestamp from the last successful peer-to-peer update with |
975 : | parrello | 1.200 | the given peer. |
976 : | olson | 1.155 | |
977 : | We store this information in FIG/Data/Global/Peers/<peer-id>. | ||
978 : | |||
979 : | =cut | ||
980 : | parrello | 1.213 | #: Return Type $; |
981 : | olson | 1.155 | sub get_peer_last_update |
982 : | { | ||
983 : | my($self, $peer_id) = @_; | ||
984 : | |||
985 : | my $dir = "$FIG_Config::data/Global/Peers"; | ||
986 : | &verify_dir($dir); | ||
987 : | $dir .= "/$peer_id"; | ||
988 : | &verify_dir($dir); | ||
989 : | |||
990 : | my $update_file = "$dir/last_update"; | ||
991 : | if (-f $update_file) | ||
992 : | { | ||
993 : | my $time = file_head($update_file, 1); | ||
994 : | chomp $time; | ||
995 : | return $time; | ||
996 : | } | ||
997 : | else | ||
998 : | { | ||
999 : | return undef; | ||
1000 : | } | ||
1001 : | } | ||
1002 : | |||
1003 : | parrello | 1.213 | =pod |
1004 : | |||
1005 : | =head1 set_peer_last_update | ||
1006 : | |||
1007 : | usage: $fig->set_peer_last_update($peer_id, $time); | ||
1008 : | |||
1009 : | =cut | ||
1010 : | #: Return Type ; | ||
1011 : | |||
1012 : | olson | 1.155 | sub set_peer_last_update |
1013 : | { | ||
1014 : | my($self, $peer_id, $time) = @_; | ||
1015 : | |||
1016 : | my $dir = "$FIG_Config::data/Global/Peers"; | ||
1017 : | &verify_dir($dir); | ||
1018 : | $dir .= "/$peer_id"; | ||
1019 : | &verify_dir($dir); | ||
1020 : | |||
1021 : | my $update_file = "$dir/last_update"; | ||
1022 : | open(F, ">$update_file"); | ||
1023 : | print F "$time\n"; | ||
1024 : | close(F); | ||
1025 : | } | ||
1026 : | |||
1027 : | parrello | 1.213 | =head3 cgi_url |
1028 : | |||
1029 : | usage: my $url = $fig->cgi_url(); | ||
1030 : | |||
1031 : | =cut | ||
1032 : | #: Return Type $; | ||
1033 : | efrank | 1.1 | sub cgi_url { |
1034 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1035 : | efrank | 1.1 | return &plug_url($FIG_Config::cgi_url); |
1036 : | } | ||
1037 : | parrello | 1.200 | |
1038 : | parrello | 1.213 | =head3 temp_url |
1039 : | |||
1040 : | usage: my $url = $fig->temp_url(); | ||
1041 : | |||
1042 : | =cut | ||
1043 : | #: Return Type $; | ||
1044 : | efrank | 1.1 | sub temp_url { |
1045 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1046 : | efrank | 1.1 | return &plug_url($FIG_Config::temp_url); |
1047 : | } | ||
1048 : | parrello | 1.200 | |
1049 : | parrello | 1.213 | =head3 plug_url |
1050 : | |||
1051 : | usage: my $url2 = $fig->plug_url($url); | ||
1052 : | |||
1053 : | =cut | ||
1054 : | #: Return Type $; | ||
1055 : | efrank | 1.1 | sub plug_url { |
1056 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1057 : | efrank | 1.1 | my($url) = @_; |
1058 : | |||
1059 : | golsen | 1.44 | my $name; |
1060 : | |||
1061 : | # Revised by GJO | ||
1062 : | # First try to get url from the current http request | ||
1063 : | |||
1064 : | if ( defined( $ENV{ 'HTTP_HOST' } ) # This is where $cgi->url gets its value | ||
1065 : | && ( $name = $ENV{ 'HTTP_HOST' } ) | ||
1066 : | && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter | ||
1067 : | ) {} | ||
1068 : | |||
1069 : | # Otherwise resort to alternative sources | ||
1070 : | |||
1071 : | elsif ( ( $name = &get_local_hostname ) | ||
1072 : | && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter | ||
1073 : | ) {} | ||
1074 : | |||
1075 : | efrank | 1.1 | return $url; |
1076 : | } | ||
1077 : | |||
1078 : | parrello | 1.213 | =head3 file_read |
1079 : | |||
1080 : | usage: my $text = $fig->file_read($fileName); | ||
1081 : | |||
1082 : | usage: my @lines = $fig->file_read($fileName); | ||
1083 : | |||
1084 : | =cut | ||
1085 : | #: Return Type $; | ||
1086 : | #: Return Type @; | ||
1087 : | olson | 1.90 | sub file_read |
1088 : | { | ||
1089 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1090 : | olson | 1.90 | my($file) = @_; |
1091 : | |||
1092 : | if (open(my $fh, "<$file")) | ||
1093 : | { | ||
1094 : | parrello | 1.200 | if (wantarray) |
1095 : | olson | 1.90 | { |
1096 : | my @ret = <$fh>; | ||
1097 : | return @ret; | ||
1098 : | } | ||
1099 : | else | ||
1100 : | { | ||
1101 : | local $/; | ||
1102 : | my $text = <$fh>; | ||
1103 : | close($fh); | ||
1104 : | return $text; | ||
1105 : | } | ||
1106 : | } | ||
1107 : | } | ||
1108 : | |||
1109 : | |||
1110 : | parrello | 1.213 | =head3 file_head |
1111 : | |||
1112 : | usage: my $text = $fig->file_read($fileName, $count); | ||
1113 : | |||
1114 : | usage: my @lines = $fig->file_read($fileName, $count); | ||
1115 : | |||
1116 : | =cut | ||
1117 : | #: Return Type $; | ||
1118 : | #: Return Type @; | ||
1119 : | olson | 1.90 | sub file_head |
1120 : | { | ||
1121 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1122 : | olson | 1.90 | my($file, $n) = @_; |
1123 : | |||
1124 : | if (!$n) | ||
1125 : | { | ||
1126 : | $n = 1; | ||
1127 : | } | ||
1128 : | |||
1129 : | if (open(my $fh, "<$file")) | ||
1130 : | { | ||
1131 : | my(@ret, $i); | ||
1132 : | |||
1133 : | $i = 0; | ||
1134 : | while (<$fh>) | ||
1135 : | { | ||
1136 : | push(@ret, $_); | ||
1137 : | $i++; | ||
1138 : | last if $i >= $n; | ||
1139 : | } | ||
1140 : | olson | 1.93 | close($fh); |
1141 : | olson | 1.155 | |
1142 : | if (wantarray) | ||
1143 : | { | ||
1144 : | return @ret; | ||
1145 : | } | ||
1146 : | else | ||
1147 : | { | ||
1148 : | return join("", @ret); | ||
1149 : | } | ||
1150 : | olson | 1.90 | } |
1151 : | } | ||
1152 : | |||
1153 : | |||
1154 : | efrank | 1.1 | =pod |
1155 : | |||
1156 : | =head1 hiding/caching in a FIG object | ||
1157 : | |||
1158 : | We save the DB handle, cache taxonomies, and put a few other odds and ends in the | ||
1159 : | FIG object. We expect users to invoke these services using the object $fig constructed | ||
1160 : | using: | ||
1161 : | |||
1162 : | use FIG; | ||
1163 : | my $fig = new FIG; | ||
1164 : | |||
1165 : | $fig is then used as the basic mechanism for accessing FIG services. It is, of course, | ||
1166 : | just a hash that is used to retain/cache data. The most commonly accessed item is the | ||
1167 : | DB filehandle, which is accessed via $self->db_handle. | ||
1168 : | |||
1169 : | We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated | ||
1170 : | parrello | 1.200 | between genomes, and a variety of other things. I am not sure that using cached/2 was a |
1171 : | efrank | 1.1 | good idea, but I did it. |
1172 : | |||
1173 : | =cut | ||
1174 : | |||
1175 : | sub db_handle { | ||
1176 : | my($self) = @_; | ||
1177 : | |||
1178 : | return $self->{_dbf}; | ||
1179 : | } | ||
1180 : | |||
1181 : | sub cached { | ||
1182 : | my($self,$what) = @_; | ||
1183 : | |||
1184 : | my $x = $self->{$what}; | ||
1185 : | if (! $x) | ||
1186 : | { | ||
1187 : | $x = $self->{$what} = {}; | ||
1188 : | } | ||
1189 : | return $x; | ||
1190 : | } | ||
1191 : | |||
1192 : | ################ Basic Routines [ existed since WIT ] ########################## | ||
1193 : | |||
1194 : | |||
1195 : | =pod | ||
1196 : | |||
1197 : | =head1 min | ||
1198 : | |||
1199 : | usage: $n = &FIG::min(@x) | ||
1200 : | |||
1201 : | Assumes @x contains numeric values. Returns the minimum of the values. | ||
1202 : | |||
1203 : | =cut | ||
1204 : | parrello | 1.213 | #: Return Type $; |
1205 : | efrank | 1.1 | sub min { |
1206 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1207 : | efrank | 1.1 | my(@x) = @_; |
1208 : | my($min,$i); | ||
1209 : | |||
1210 : | (@x > 0) || return undef; | ||
1211 : | $min = $x[0]; | ||
1212 : | for ($i=1; ($i < @x); $i++) | ||
1213 : | { | ||
1214 : | $min = ($min > $x[$i]) ? $x[$i] : $min; | ||
1215 : | } | ||
1216 : | return $min; | ||
1217 : | } | ||
1218 : | |||
1219 : | =pod | ||
1220 : | |||
1221 : | =head1 max | ||
1222 : | |||
1223 : | usage: $n = &FIG::max(@x) | ||
1224 : | |||
1225 : | Assumes @x contains numeric values. Returns the maximum of the values. | ||
1226 : | |||
1227 : | =cut | ||
1228 : | parrello | 1.213 | #: Return Type $; |
1229 : | efrank | 1.1 | sub max { |
1230 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1231 : | efrank | 1.1 | my(@x) = @_; |
1232 : | my($max,$i); | ||
1233 : | |||
1234 : | (@x > 0) || return undef; | ||
1235 : | $max = $x[0]; | ||
1236 : | for ($i=1; ($i < @x); $i++) | ||
1237 : | { | ||
1238 : | $max = ($max < $x[$i]) ? $x[$i] : $max; | ||
1239 : | } | ||
1240 : | return $max; | ||
1241 : | } | ||
1242 : | |||
1243 : | =pod | ||
1244 : | |||
1245 : | =head1 between | ||
1246 : | |||
1247 : | usage: &FIG::between($x,$y,$z) | ||
1248 : | |||
1249 : | Returns true iff $y is between $x and $z. | ||
1250 : | |||
1251 : | =cut | ||
1252 : | parrello | 1.213 | #: Return Type $; |
1253 : | efrank | 1.1 | sub between { |
1254 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1255 : | efrank | 1.1 | my($x,$y,$z) = @_; |
1256 : | |||
1257 : | if ($x < $z) | ||
1258 : | { | ||
1259 : | return (($x <= $y) && ($y <= $z)); | ||
1260 : | } | ||
1261 : | else | ||
1262 : | { | ||
1263 : | return (($x >= $y) && ($y >= $z)); | ||
1264 : | } | ||
1265 : | } | ||
1266 : | |||
1267 : | =pod | ||
1268 : | |||
1269 : | =head1 standard_genetic_code | ||
1270 : | |||
1271 : | usage: $code = &FIG::standard_genetic_code() | ||
1272 : | |||
1273 : | Routines like "translate" can take a "genetic code" as an argument. I implemented such | ||
1274 : | codes using hashes that assumed uppercase DNA triplets as keys. | ||
1275 : | |||
1276 : | =cut | ||
1277 : | parrello | 1.213 | #: Return Type $; |
1278 : | efrank | 1.1 | sub standard_genetic_code { |
1279 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1280 : | parrello | 1.200 | |
1281 : | efrank | 1.1 | my $code = {}; |
1282 : | |||
1283 : | $code->{"AAA"} = "K"; | ||
1284 : | $code->{"AAC"} = "N"; | ||
1285 : | $code->{"AAG"} = "K"; | ||
1286 : | $code->{"AAT"} = "N"; | ||
1287 : | $code->{"ACA"} = "T"; | ||
1288 : | $code->{"ACC"} = "T"; | ||
1289 : | $code->{"ACG"} = "T"; | ||
1290 : | $code->{"ACT"} = "T"; | ||
1291 : | $code->{"AGA"} = "R"; | ||
1292 : | $code->{"AGC"} = "S"; | ||
1293 : | $code->{"AGG"} = "R"; | ||
1294 : | $code->{"AGT"} = "S"; | ||
1295 : | $code->{"ATA"} = "I"; | ||
1296 : | $code->{"ATC"} = "I"; | ||
1297 : | $code->{"ATG"} = "M"; | ||
1298 : | $code->{"ATT"} = "I"; | ||
1299 : | $code->{"CAA"} = "Q"; | ||
1300 : | $code->{"CAC"} = "H"; | ||
1301 : | $code->{"CAG"} = "Q"; | ||
1302 : | $code->{"CAT"} = "H"; | ||
1303 : | $code->{"CCA"} = "P"; | ||
1304 : | $code->{"CCC"} = "P"; | ||
1305 : | $code->{"CCG"} = "P"; | ||
1306 : | $code->{"CCT"} = "P"; | ||
1307 : | $code->{"CGA"} = "R"; | ||
1308 : | $code->{"CGC"} = "R"; | ||
1309 : | $code->{"CGG"} = "R"; | ||
1310 : | $code->{"CGT"} = "R"; | ||
1311 : | $code->{"CTA"} = "L"; | ||
1312 : | $code->{"CTC"} = "L"; | ||
1313 : | $code->{"CTG"} = "L"; | ||
1314 : | $code->{"CTT"} = "L"; | ||
1315 : | $code->{"GAA"} = "E"; | ||
1316 : | $code->{"GAC"} = "D"; | ||
1317 : | $code->{"GAG"} = "E"; | ||
1318 : | $code->{"GAT"} = "D"; | ||
1319 : | $code->{"GCA"} = "A"; | ||
1320 : | $code->{"GCC"} = "A"; | ||
1321 : | $code->{"GCG"} = "A"; | ||
1322 : | $code->{"GCT"} = "A"; | ||
1323 : | $code->{"GGA"} = "G"; | ||
1324 : | $code->{"GGC"} = "G"; | ||
1325 : | $code->{"GGG"} = "G"; | ||
1326 : | $code->{"GGT"} = "G"; | ||
1327 : | $code->{"GTA"} = "V"; | ||
1328 : | $code->{"GTC"} = "V"; | ||
1329 : | $code->{"GTG"} = "V"; | ||
1330 : | $code->{"GTT"} = "V"; | ||
1331 : | $code->{"TAA"} = "*"; | ||
1332 : | $code->{"TAC"} = "Y"; | ||
1333 : | $code->{"TAG"} = "*"; | ||
1334 : | $code->{"TAT"} = "Y"; | ||
1335 : | $code->{"TCA"} = "S"; | ||
1336 : | $code->{"TCC"} = "S"; | ||
1337 : | $code->{"TCG"} = "S"; | ||
1338 : | $code->{"TCT"} = "S"; | ||
1339 : | $code->{"TGA"} = "*"; | ||
1340 : | $code->{"TGC"} = "C"; | ||
1341 : | $code->{"TGG"} = "W"; | ||
1342 : | $code->{"TGT"} = "C"; | ||
1343 : | $code->{"TTA"} = "L"; | ||
1344 : | $code->{"TTC"} = "F"; | ||
1345 : | $code->{"TTG"} = "L"; | ||
1346 : | $code->{"TTT"} = "F"; | ||
1347 : | parrello | 1.200 | |
1348 : | efrank | 1.1 | return $code; |
1349 : | } | ||
1350 : | |||
1351 : | =pod | ||
1352 : | |||
1353 : | =head1 translate | ||
1354 : | |||
1355 : | usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start); | ||
1356 : | |||
1357 : | If $code is undefined, I use the standard genetic code. If $fix_start is true, I | ||
1358 : | will translate initial TTG or GTG to 'M'. | ||
1359 : | |||
1360 : | =cut | ||
1361 : | parrello | 1.213 | #: Return Type $; |
1362 : | efrank | 1.1 | sub translate { |
1363 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1364 : | efrank | 1.1 | my( $dna,$code,$start) = @_; |
1365 : | my( $i,$j,$ln ); | ||
1366 : | my( $x,$y ); | ||
1367 : | my( $prot ); | ||
1368 : | |||
1369 : | if (! defined($code)) | ||
1370 : | { | ||
1371 : | $code = &FIG::standard_genetic_code; | ||
1372 : | } | ||
1373 : | $ln = length($dna); | ||
1374 : | $prot = "X" x ($ln/3); | ||
1375 : | $dna =~ tr/a-z/A-Z/; | ||
1376 : | |||
1377 : | for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) | ||
1378 : | { | ||
1379 : | $x = substr($dna,$i,3); | ||
1380 : | if ($y = $code->{$x}) | ||
1381 : | { | ||
1382 : | substr($prot,$j,1) = $y; | ||
1383 : | } | ||
1384 : | } | ||
1385 : | parrello | 1.200 | |
1386 : | efrank | 1.1 | if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) |
1387 : | { | ||
1388 : | substr($prot,0,1) = 'M'; | ||
1389 : | } | ||
1390 : | return $prot; | ||
1391 : | } | ||
1392 : | |||
1393 : | =pod | ||
1394 : | |||
1395 : | parrello | 1.213 | =head1 reverse_comp |
1396 : | efrank | 1.1 | |
1397 : | usage: $dnaR = &FIG::reverse_comp($dna) or | ||
1398 : | $dnaRP = &FIG::rev_comp($seqP) | ||
1399 : | |||
1400 : | In WIT, we implemented reverse complement passing a pointer to a sequence and returning | ||
1401 : | a pointer to a sequence. In most cases the pointers are a pain (although in a few they | ||
1402 : | are just what is needed). Hence, I kept both versions of the function to allow you | ||
1403 : | to use whichever you like. Use rev_comp only for long strings where passing pointers is a | ||
1404 : | reasonable effeciency issue. | ||
1405 : | |||
1406 : | =cut | ||
1407 : | parrello | 1.213 | #: Return Type $; |
1408 : | efrank | 1.1 | sub reverse_comp { |
1409 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1410 : | efrank | 1.1 | my($seq) = @_; |
1411 : | |||
1412 : | return ${&rev_comp(\$seq)}; | ||
1413 : | } | ||
1414 : | |||
1415 : | parrello | 1.213 | =head1 rev_comp |
1416 : | |||
1417 : | =cut | ||
1418 : | #: Return Type $; | ||
1419 : | efrank | 1.1 | sub rev_comp { |
1420 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1421 : | efrank | 1.1 | my( $seqP ) = @_; |
1422 : | my( $rev ); | ||
1423 : | |||
1424 : | $rev = reverse( $$seqP ); | ||
1425 : | $rev =~ tr/a-z/A-Z/; | ||
1426 : | $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/; | ||
1427 : | return \$rev; | ||
1428 : | } | ||
1429 : | |||
1430 : | =pod | ||
1431 : | |||
1432 : | =head1 verify_dir | ||
1433 : | |||
1434 : | usage: &FIG::verify_dir($dir) | ||
1435 : | |||
1436 : | Makes sure that $dir exists. If it has to create it, it sets permissions to 0777. | ||
1437 : | |||
1438 : | =cut | ||
1439 : | |||
1440 : | sub verify_dir { | ||
1441 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1442 : | efrank | 1.1 | my($dir) = @_; |
1443 : | |||
1444 : | if (-d $dir) { return } | ||
1445 : | if ($dir =~ /^(.*)\/[^\/]+$/) | ||
1446 : | { | ||
1447 : | &verify_dir($1); | ||
1448 : | } | ||
1449 : | olson | 1.153 | mkdir($dir,0777) || die "could not make $dir: $!"; |
1450 : | olson | 1.184 | # chmod 02777,$dir; |
1451 : | efrank | 1.1 | } |
1452 : | |||
1453 : | =pod | ||
1454 : | |||
1455 : | =head1 run | ||
1456 : | |||
1457 : | usage: &FIG::run($cmd) | ||
1458 : | |||
1459 : | Runs $cmd and fails (with trace) if the command fails. | ||
1460 : | |||
1461 : | =cut | ||
1462 : | |||
1463 : | sub run { | ||
1464 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1465 : | efrank | 1.1 | my($cmd) = @_; |
1466 : | |||
1467 : | golsen | 1.44 | # my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n"; |
1468 : | efrank | 1.1 | (system($cmd) == 0) || confess "FAILED: $cmd"; |
1469 : | } | ||
1470 : | |||
1471 : | gdpusch | 1.45 | |
1472 : | |||
1473 : | =pod | ||
1474 : | |||
1475 : | =head1 read_fasta_record(\*FILEHANDLE) | ||
1476 : | |||
1477 : | gdpusch | 1.109 | Usage: ( $seq_id, $seq_pointer, $comment ) = &read_fasta_record(\*FILEHANDLE); |
1478 : | gdpusch | 1.45 | |
1479 : | Function: Reads a FASTA-formatted sequence file one record at a time. | ||
1480 : | The input filehandle defaults to STDIN if not specified. | ||
1481 : | parrello | 1.200 | Returns a sequence ID, a pointer to the sequence, and an optional |
1482 : | record comment (NOTE: Record comments are deprecated, as some tools | ||
1483 : | such as BLAST do not handle them gracefully). Returns an empty list | ||
1484 : | if attempting to read a record results in an undefined value | ||
1485 : | gdpusch | 1.45 | (e.g., due to reaching the EOF). |
1486 : | |||
1487 : | Author: Gordon D. Pusch | ||
1488 : | |||
1489 : | Date: 2004-Feb-18 | ||
1490 : | |||
1491 : | =cut | ||
1492 : | parrello | 1.213 | #: Return Type @; |
1493 : | gdpusch | 1.45 | sub read_fasta_record |
1494 : | { | ||
1495 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1496 : | gdpusch | 1.45 | my ($file_handle) = @_; |
1497 : | gdpusch | 1.46 | my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record ); |
1498 : | parrello | 1.200 | |
1499 : | gdpusch | 1.45 | if (not defined($file_handle)) { $file_handle = \*STDIN; } |
1500 : | parrello | 1.200 | |
1501 : | gdpusch | 1.45 | $old_end_of_record = $/; |
1502 : | $/ = "\n>"; | ||
1503 : | parrello | 1.200 | |
1504 : | gdpusch | 1.45 | if (defined($fasta_record = <$file_handle>)) |
1505 : | { | ||
1506 : | chomp $fasta_record; | ||
1507 : | @lines = split( /\n/, $fasta_record ); | ||
1508 : | $head = shift @lines; | ||
1509 : | $head =~ s/^>?//; | ||
1510 : | $head =~ m/^(\S+)/; | ||
1511 : | $seq_id = $1; | ||
1512 : | parrello | 1.200 | |
1513 : | gdpusch | 1.45 | if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; } |
1514 : | parrello | 1.200 | |
1515 : | gdpusch | 1.45 | $sequence = join( "", @lines ); |
1516 : | parrello | 1.200 | |
1517 : | gdpusch | 1.45 | @parsed_fasta_record = ( $seq_id, \$sequence, $comment ); |
1518 : | } | ||
1519 : | else | ||
1520 : | { | ||
1521 : | @parsed_fasta_record = (); | ||
1522 : | } | ||
1523 : | parrello | 1.200 | |
1524 : | gdpusch | 1.45 | $/ = $old_end_of_record; |
1525 : | parrello | 1.200 | |
1526 : | gdpusch | 1.45 | return @parsed_fasta_record; |
1527 : | } | ||
1528 : | |||
1529 : | |||
1530 : | efrank | 1.1 | =pod |
1531 : | |||
1532 : | =head1 display_id_and_seq | ||
1533 : | |||
1534 : | usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh) | ||
1535 : | |||
1536 : | This command has always been used to put out fasta sequences. Note that it | ||
1537 : | takes a pointer to the sequence. $fh is optional and defalts to STDOUT. | ||
1538 : | |||
1539 : | =cut | ||
1540 : | |||
1541 : | mkubal | 1.53 | |
1542 : | efrank | 1.1 | sub display_id_and_seq { |
1543 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1544 : | efrank | 1.1 | my( $id, $seq, $fh ) = @_; |
1545 : | parrello | 1.200 | |
1546 : | efrank | 1.1 | if (! defined($fh) ) { $fh = \*STDOUT; } |
1547 : | parrello | 1.200 | |
1548 : | efrank | 1.1 | print $fh ">$id\n"; |
1549 : | &display_seq($seq, $fh); | ||
1550 : | } | ||
1551 : | |||
1552 : | sub display_seq { | ||
1553 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
1554 : | efrank | 1.1 | my ( $seq, $fh ) = @_; |
1555 : | my ( $i, $n, $ln ); | ||
1556 : | parrello | 1.200 | |
1557 : | efrank | 1.1 | if (! defined($fh) ) { $fh = \*STDOUT; } |
1558 : | |||
1559 : | $n = length($$seq); | ||
1560 : | # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) ); | ||
1561 : | for ($i=0; ($i < $n); $i += 60) | ||
1562 : | { | ||
1563 : | if (($i + 60) <= $n) | ||
1564 : | { | ||
1565 : | $ln = substr($$seq,$i,60); | ||
1566 : | } | ||
1567 : | else | ||
1568 : | { | ||
1569 : | $ln = substr($$seq,$i,($n-$i)); | ||
1570 : | } | ||
1571 : | print $fh "$ln\n"; | ||
1572 : | } | ||
1573 : | } | ||
1574 : | |||
1575 : | ########## I commented the pods on the following routines out, since they should not | ||
1576 : | ########## be part of the SOAP/WSTL interface | ||
1577 : | #=pod | ||
1578 : | # | ||
1579 : | #=head1 file2N | ||
1580 : | # | ||
1581 : | #usage: $n = $fig->file2N($file) | ||
1582 : | # | ||
1583 : | #In some of the databases I need to store filenames, which can waste a lot of | ||
1584 : | #space. Hence, I maintain a database for converting filenames to/from integers. | ||
1585 : | # | ||
1586 : | #=cut | ||
1587 : | # | ||
1588 : | olson | 1.111 | sub file2N :scalar { |
1589 : | efrank | 1.1 | my($self,$file) = @_; |
1590 : | my($relational_db_response); | ||
1591 : | |||
1592 : | my $rdbH = $self->db_handle; | ||
1593 : | |||
1594 : | if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) && | ||
1595 : | (@$relational_db_response == 1)) | ||
1596 : | { | ||
1597 : | return $relational_db_response->[0]->[0]; | ||
1598 : | } | ||
1599 : | elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table ")) && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0])) | ||
1600 : | { | ||
1601 : | my $fileno = $relational_db_response->[0]->[0] + 1; | ||
1602 : | if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )")) | ||
1603 : | { | ||
1604 : | return $fileno; | ||
1605 : | } | ||
1606 : | } | ||
1607 : | elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )")) | ||
1608 : | { | ||
1609 : | return 1; | ||
1610 : | } | ||
1611 : | return undef; | ||
1612 : | } | ||
1613 : | |||
1614 : | #=pod | ||
1615 : | # | ||
1616 : | #=head1 N2file | ||
1617 : | # | ||
1618 : | #usage: $filename = $fig->N2file($n) | ||
1619 : | # | ||
1620 : | #In some of the databases I need to store filenames, which can waste a lot of | ||
1621 : | #space. Hence, I maintain a database for converting filenames to/from integers. | ||
1622 : | # | ||
1623 : | #=cut | ||
1624 : | # | ||
1625 : | olson | 1.111 | sub N2file :scalar { |
1626 : | efrank | 1.1 | my($self,$fileno) = @_; |
1627 : | my($relational_db_response); | ||
1628 : | |||
1629 : | my $rdbH = $self->db_handle; | ||
1630 : | |||
1631 : | if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) && | ||
1632 : | (@$relational_db_response == 1)) | ||
1633 : | { | ||
1634 : | return $relational_db_response->[0]->[0]; | ||
1635 : | } | ||
1636 : | return undef; | ||
1637 : | } | ||
1638 : | |||
1639 : | |||
1640 : | #=pod | ||
1641 : | # | ||
1642 : | #=head1 openF | ||
1643 : | # | ||
1644 : | #usage: $fig->openF($filename) | ||
1645 : | # | ||
1646 : | #Parts of the system rely on accessing numerous different files. The most obvious case is | ||
1647 : | #the situation with similarities. It is important that the system be able to run in cases in | ||
1648 : | #which an arbitrary number of files cannot be open simultaneously. This routine (with closeF) is | ||
1649 : | #a hack to handle this. I should probably just pitch them and insist that the OS handle several | ||
1650 : | #hundred open filehandles. | ||
1651 : | # | ||
1652 : | #=cut | ||
1653 : | # | ||
1654 : | sub openF { | ||
1655 : | my($self,$file) = @_; | ||
1656 : | my($fxs,$x,@fxs,$fh); | ||
1657 : | |||
1658 : | $fxs = $self->cached('_openF'); | ||
1659 : | if ($x = $fxs->{$file}) | ||
1660 : | { | ||
1661 : | $x->[1] = time(); | ||
1662 : | return $x->[0]; | ||
1663 : | } | ||
1664 : | parrello | 1.200 | |
1665 : | efrank | 1.1 | @fxs = keys(%$fxs); |
1666 : | if (defined($fh = new FileHandle "<$file")) | ||
1667 : | { | ||
1668 : | overbeek | 1.98 | if (@fxs >= 50) |
1669 : | efrank | 1.1 | { |
1670 : | @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs; | ||
1671 : | $x = $fxs->{$fxs[0]}; | ||
1672 : | undef $x->[0]; | ||
1673 : | delete $fxs->{$fxs[0]}; | ||
1674 : | } | ||
1675 : | $fxs->{$file} = [$fh,time()]; | ||
1676 : | return $fh; | ||
1677 : | } | ||
1678 : | return undef; | ||
1679 : | } | ||
1680 : | |||
1681 : | #=pod | ||
1682 : | # | ||
1683 : | #=head1 closeF | ||
1684 : | # | ||
1685 : | #usage: $fig->closeF($filename) | ||
1686 : | # | ||
1687 : | #Parts of the system rely on accessing numerous different files. The most obvious case is | ||
1688 : | #the situation with similarities. It is important that the system be able to run in cases in | ||
1689 : | #which an arbitrary number of files cannot be open simultaneously. This routine (with openF) is | ||
1690 : | #a hack to handle this. I should probably just pitch them and insist that the OS handle several | ||
1691 : | #hundred open filehandles. | ||
1692 : | # | ||
1693 : | #=cut | ||
1694 : | # | ||
1695 : | sub closeF { | ||
1696 : | my($self,$file) = @_; | ||
1697 : | my($fxs,$x); | ||
1698 : | |||
1699 : | parrello | 1.200 | if (($fxs = $self->{_openF}) && |
1700 : | efrank | 1.1 | ($x = $fxs->{$file})) |
1701 : | { | ||
1702 : | undef $x->[0]; | ||
1703 : | delete $fxs->{$file}; | ||
1704 : | } | ||
1705 : | } | ||
1706 : | |||
1707 : | =pod | ||
1708 : | |||
1709 : | =head1 ec_name | ||
1710 : | |||
1711 : | usage: $enzymatic_function = $fig->ec_name($ec) | ||
1712 : | |||
1713 : | Returns enzymatic name for EC. | ||
1714 : | |||
1715 : | =cut | ||
1716 : | |||
1717 : | sub ec_name { | ||
1718 : | my($self,$ec) = @_; | ||
1719 : | |||
1720 : | ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return ""; | ||
1721 : | my $rdbH = $self->db_handle; | ||
1722 : | my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )"); | ||
1723 : | |||
1724 : | return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : ""; | ||
1725 : | return ""; | ||
1726 : | } | ||
1727 : | |||
1728 : | =pod | ||
1729 : | |||
1730 : | =head1 all_roles | ||
1731 : | |||
1732 : | usage: @roles = $fig->all_roles | ||
1733 : | |||
1734 : | mkubal | 1.54 | Supposed to return all known roles. For now, we get all ECs with "names". |
1735 : | efrank | 1.1 | |
1736 : | =cut | ||
1737 : | |||
1738 : | sub all_roles { | ||
1739 : | my($self) = @_; | ||
1740 : | |||
1741 : | my $rdbH = $self->db_handle; | ||
1742 : | my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names"); | ||
1743 : | |||
1744 : | return @$relational_db_response; | ||
1745 : | } | ||
1746 : | |||
1747 : | =pod | ||
1748 : | |||
1749 : | =head1 expand_ec | ||
1750 : | |||
1751 : | usage: $expanded_ec = $fig->expand_ec($ec) | ||
1752 : | |||
1753 : | Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that. | ||
1754 : | |||
1755 : | =cut | ||
1756 : | |||
1757 : | sub expand_ec { | ||
1758 : | my($self,$ec) = @_; | ||
1759 : | my($name); | ||
1760 : | |||
1761 : | return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec; | ||
1762 : | } | ||
1763 : | |||
1764 : | |||
1765 : | =pod | ||
1766 : | |||
1767 : | =head1 clean_tmp | ||
1768 : | |||
1769 : | usage: &FIG::clean_tmp | ||
1770 : | |||
1771 : | We store temporary files in $FIG_Config::temp. There are specific classes of files | ||
1772 : | that are created and should be saved for at least a few days. This routine can be | ||
1773 : | invoked to clean out those that are over two days old. | ||
1774 : | |||
1775 : | =cut | ||
1776 : | |||
1777 : | sub clean_tmp { | ||
1778 : | |||
1779 : | my($file); | ||
1780 : | if (opendir(TMP,"$FIG_Config::temp")) | ||
1781 : | { | ||
1782 : | # change the pattern to pick up other files that need to be cleaned up | ||
1783 : | my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP); | ||
1784 : | foreach $file (@temp) | ||
1785 : | { | ||
1786 : | if (-M "$FIG_Config::temp/$file" > 2) | ||
1787 : | { | ||
1788 : | unlink("$FIG_Config::temp/$file"); | ||
1789 : | } | ||
1790 : | } | ||
1791 : | } | ||
1792 : | } | ||
1793 : | |||
1794 : | ################ Routines to process genomes and genome IDs ########################## | ||
1795 : | |||
1796 : | |||
1797 : | =pod | ||
1798 : | |||
1799 : | =head1 genomes | ||
1800 : | |||
1801 : | golsen | 1.150 | usage: @genome_ids = $fig->genomes( $complete, $restrictions, $domain ); |
1802 : | efrank | 1.1 | |
1803 : | Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by | ||
1804 : | NCBI for the species (not the specific strain), and Y is a sequence digit assigned to | ||
1805 : | this particular genome (as one of a set with the same genus/species). Genomes also | ||
1806 : | have versions, but that is a separate issue. | ||
1807 : | |||
1808 : | =cut | ||
1809 : | |||
1810 : | olson | 1.111 | sub genomes :remote :list { |
1811 : | golsen | 1.150 | my( $self, $complete, $restrictions, $domain ) = @_; |
1812 : | overbeek | 1.13 | |
1813 : | my $rdbH = $self->db_handle; | ||
1814 : | |||
1815 : | my @where = (); | ||
1816 : | if ($complete) | ||
1817 : | { | ||
1818 : | push(@where,"( complete = \'1\' )") | ||
1819 : | } | ||
1820 : | |||
1821 : | if ($restrictions) | ||
1822 : | { | ||
1823 : | push(@where,"( restrictions = \'1\' )") | ||
1824 : | } | ||
1825 : | golsen | 1.150 | |
1826 : | if ($domain) | ||
1827 : | { | ||
1828 : | push( @where, "( maindomain = '$domain' )" ) | ||
1829 : | } | ||
1830 : | |||
1831 : | overbeek | 1.13 | my $relational_db_response; |
1832 : | if (@where > 0) | ||
1833 : | { | ||
1834 : | my $where = join(" AND ",@where); | ||
1835 : | $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where $where"); | ||
1836 : | } | ||
1837 : | else | ||
1838 : | { | ||
1839 : | $relational_db_response = $rdbH->SQL("SELECT genome FROM genome"); | ||
1840 : | } | ||
1841 : | my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response; | ||
1842 : | efrank | 1.1 | return @genomes; |
1843 : | } | ||
1844 : | |||
1845 : | overbeek | 1.180 | sub is_complete { |
1846 : | my($self,$genome) = @_; | ||
1847 : | |||
1848 : | my $rdbH = $self->db_handle; | ||
1849 : | my $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where (genome = '$genome') AND (complete = '1')"); | ||
1850 : | return (@$relational_db_response == 1) | ||
1851 : | } | ||
1852 : | golsen | 1.150 | |
1853 : | efrank | 1.2 | sub genome_counts { |
1854 : | overbeek | 1.13 | my($self,$complete) = @_; |
1855 : | my($x,$relational_db_response); | ||
1856 : | efrank | 1.2 | |
1857 : | overbeek | 1.13 | my $rdbH = $self->db_handle; |
1858 : | |||
1859 : | parrello | 1.200 | if ($complete) |
1860 : | overbeek | 1.13 | { |
1861 : | gdpusch | 1.107 | $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'"); |
1862 : | overbeek | 1.13 | } |
1863 : | else | ||
1864 : | { | ||
1865 : | gdpusch | 1.107 | $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome"); |
1866 : | overbeek | 1.13 | } |
1867 : | |||
1868 : | gdpusch | 1.107 | my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0); |
1869 : | overbeek | 1.13 | if (@$relational_db_response > 0) |
1870 : | efrank | 1.2 | { |
1871 : | overbeek | 1.13 | foreach $x (@$relational_db_response) |
1872 : | efrank | 1.2 | { |
1873 : | gdpusch | 1.107 | if ($x->[1] =~ /^archaea/i) { ++$arch } |
1874 : | elsif ($x->[1] =~ /^bacter/i) { ++$bact } | ||
1875 : | elsif ($x->[1] =~ /^eukar/i) { ++$euk } | ||
1876 : | elsif ($x->[1] =~ /^vir/i) { ++$vir } | ||
1877 : | elsif ($x->[1] =~ /^env/i) { ++$env } | ||
1878 : | else { ++$unk } | ||
1879 : | efrank | 1.2 | } |
1880 : | } | ||
1881 : | parrello | 1.200 | |
1882 : | gdpusch | 1.107 | return ($arch, $bact, $euk, $vir, $env, $unk); |
1883 : | } | ||
1884 : | |||
1885 : | |||
1886 : | =pod | ||
1887 : | |||
1888 : | =head1 genome_domain | ||
1889 : | |||
1890 : | usage: $domain = $fig->genome_domain($genome_id); | ||
1891 : | |||
1892 : | Returns the domain of a genome ID, and 'undef' if it is not in the database. | ||
1893 : | |||
1894 : | =cut | ||
1895 : | |||
1896 : | sub genome_domain { | ||
1897 : | my($self,$genome) = @_; | ||
1898 : | my $relational_db_response; | ||
1899 : | my $rdbH = $self->db_handle; | ||
1900 : | parrello | 1.200 | |
1901 : | if ($genome) | ||
1902 : | gdpusch | 1.107 | { |
1903 : | if (($relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome WHERE ( genome = \'$genome\' )")) | ||
1904 : | && (@$relational_db_response == 1)) | ||
1905 : | { | ||
1906 : | # die Dumper($relational_db_response); | ||
1907 : | return $relational_db_response->[0]->[1]; | ||
1908 : | } | ||
1909 : | } | ||
1910 : | return undef; | ||
1911 : | efrank | 1.2 | } |
1912 : | |||
1913 : | gdpusch | 1.92 | |
1914 : | =pod | ||
1915 : | |||
1916 : | =head1 genome_pegs | ||
1917 : | |||
1918 : | gdpusch | 1.107 | usage: $num_pegs = $fig->genome_pegs($genome_id); |
1919 : | gdpusch | 1.92 | |
1920 : | gdpusch | 1.107 | Returns the number of protein-encoding genes (PEGs) in $genome_id if |
1921 : | "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. | ||
1922 : | gdpusch | 1.92 | |
1923 : | =cut | ||
1924 : | |||
1925 : | sub genome_pegs { | ||
1926 : | my($self,$genome) = @_; | ||
1927 : | my $relational_db_response; | ||
1928 : | my $rdbH = $self->db_handle; | ||
1929 : | parrello | 1.200 | |
1930 : | if ($genome) | ||
1931 : | gdpusch | 1.92 | { |
1932 : | if (($relational_db_response = $rdbH->SQL("SELECT pegs FROM genome WHERE ( genome = \'$genome\' )")) | ||
1933 : | && (@$relational_db_response == 1)) | ||
1934 : | { | ||
1935 : | return $relational_db_response->[0]->[0]; | ||
1936 : | } | ||
1937 : | } | ||
1938 : | return undef; | ||
1939 : | } | ||
1940 : | |||
1941 : | |||
1942 : | efrank | 1.1 | =pod |
1943 : | |||
1944 : | gdpusch | 1.92 | =head1 genome_rnas |
1945 : | |||
1946 : | gdpusch | 1.107 | usage: $num_rnas = $fig->genome_rnas($genome_id); |
1947 : | gdpusch | 1.92 | |
1948 : | gdpusch | 1.107 | Returns the number of RNA-encoding genes (RNAs) in $genome_id if |
1949 : | "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. | ||
1950 : | gdpusch | 1.92 | |
1951 : | =cut | ||
1952 : | |||
1953 : | sub genome_rnas { | ||
1954 : | my($self,$genome) = @_; | ||
1955 : | my $relational_db_response; | ||
1956 : | my $rdbH = $self->db_handle; | ||
1957 : | parrello | 1.200 | |
1958 : | if ($genome) | ||
1959 : | gdpusch | 1.92 | { |
1960 : | if (($relational_db_response = $rdbH->SQL("SELECT rnas FROM genome WHERE ( genome = \'$genome\' )")) | ||
1961 : | && (@$relational_db_response == 1)) | ||
1962 : | { | ||
1963 : | return $relational_db_response->[0]->[0]; | ||
1964 : | } | ||
1965 : | } | ||
1966 : | return undef; | ||
1967 : | } | ||
1968 : | |||
1969 : | |||
1970 : | =pod | ||
1971 : | |||
1972 : | =head1 genome_szdna | ||
1973 : | efrank | 1.1 | |
1974 : | gdpusch | 1.92 | usage: $szdna = $fig->genome_szdna($genome_id); |
1975 : | gdpusch | 1.91 | |
1976 : | gdpusch | 1.107 | Returns the number of DNA base-pairs in the genome contigs file(s) of $genome_id |
1977 : | "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. | ||
1978 : | gdpusch | 1.91 | |
1979 : | =cut | ||
1980 : | |||
1981 : | gdpusch | 1.92 | sub genome_szdna { |
1982 : | gdpusch | 1.91 | my($self,$genome) = @_; |
1983 : | my $relational_db_response; | ||
1984 : | my $rdbH = $self->db_handle; | ||
1985 : | parrello | 1.200 | |
1986 : | if ($genome) | ||
1987 : | gdpusch | 1.91 | { |
1988 : | if (($relational_db_response = $rdbH->SQL("SELECT szdna FROM genome WHERE ( genome = \'$genome\' )")) | ||
1989 : | && (@$relational_db_response == 1)) | ||
1990 : | { | ||
1991 : | return $relational_db_response->[0]->[0]; | ||
1992 : | } | ||
1993 : | } | ||
1994 : | return undef; | ||
1995 : | } | ||
1996 : | |||
1997 : | |||
1998 : | =pod | ||
1999 : | |||
2000 : | =head1 genome_version | ||
2001 : | |||
2002 : | efrank | 1.1 | usage: $version = $fig->genome_version($genome_id); |
2003 : | |||
2004 : | Versions are incremented for major updates. They are put in as major | ||
2005 : | updates of the form 1.0, 2.0, ... | ||
2006 : | |||
2007 : | Users may do local "editing" of the DNA for a genome, but when they do, | ||
2008 : | they increment the digits to the right of the decimal. Two genomes remain | ||
2009 : | parrello | 1.200 | comparable only if the versions match identically. Hence, minor updating should be |
2010 : | efrank | 1.1 | committed only by the person/group responsible for updating that genome. |
2011 : | |||
2012 : | We can, of course, identify which genes are identical between any two genomes (by matching | ||
2013 : | the DNA or amino acid sequences). However, the basic intent of the system is to | ||
2014 : | support editing by the main group issuing periodic major updates. | ||
2015 : | |||
2016 : | =cut | ||
2017 : | |||
2018 : | olson | 1.113 | sub genome_version :scalar { |
2019 : | efrank | 1.1 | my($self,$genome) = @_; |
2020 : | |||
2021 : | my(@tmp); | ||
2022 : | if ((-s "$FIG_Config::organisms/$genome/VERSION") && | ||
2023 : | (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) && | ||
2024 : | overbeek | 1.84 | ($tmp[0] =~ /^(\S+)$/)) |
2025 : | efrank | 1.1 | { |
2026 : | return $1; | ||
2027 : | } | ||
2028 : | return undef; | ||
2029 : | } | ||
2030 : | |||
2031 : | olson | 1.236 | =head1 genome_md5sum |
2032 : | |||
2033 : | usage: $md5sum = $fig->genome_md5sum($genome_id); | ||
2034 : | |||
2035 : | Returns the MD5 checksum of genome $genome_id. | ||
2036 : | |||
2037 : | The checksum of a genome is defined as the checksum of its signature file. The signature | ||
2038 : | file consists of tab-separated lines, one for each contig, ordered by the contig id. | ||
2039 : | Each line contains the contig ID, the length of the contig in nucleotides, and the | ||
2040 : | MD5 checksum of the nucleotide data, with uppercase letters forced to lower case. | ||
2041 : | |||
2042 : | |||
2043 : | =cut | ||
2044 : | |||
2045 : | olson | 1.237 | sub genome_md5sum :scalar { |
2046 : | olson | 1.236 | my($self,$genome) = @_; |
2047 : | my $relational_db_response; | ||
2048 : | my $rdbH = $self->db_handle; | ||
2049 : | |||
2050 : | if ($genome) | ||
2051 : | { | ||
2052 : | if (($relational_db_response = $rdbH->SQL("SELECT md5sum FROM genome_md5sum WHERE ( genome = \'$genome\' )")) | ||
2053 : | && (@$relational_db_response == 1)) | ||
2054 : | { | ||
2055 : | return $relational_db_response->[0]->[0]; | ||
2056 : | } | ||
2057 : | } | ||
2058 : | return undef; | ||
2059 : | } | ||
2060 : | |||
2061 : | olson | 1.260 | =head1 genome_with_md5sum |
2062 : | |||
2063 : | usage: $genome = $fig->genome_with_md5sum($cksum); | ||
2064 : | |||
2065 : | Find a genome with checksum $cksum. | ||
2066 : | |||
2067 : | =cut | ||
2068 : | |||
2069 : | sub genome_with_md5sum :scalar { | ||
2070 : | my($self,$cksum) = @_; | ||
2071 : | my $relational_db_response; | ||
2072 : | my $rdbH = $self->db_handle; | ||
2073 : | |||
2074 : | if (($relational_db_response = $rdbH->SQL("SELECT genome FROM genome_md5sum WHERE ( md5sum = \'$cksum\' )")) | ||
2075 : | && (@$relational_db_response == 1)) | ||
2076 : | { | ||
2077 : | return $relational_db_response->[0]->[0]; | ||
2078 : | } | ||
2079 : | |||
2080 : | return undef; | ||
2081 : | } | ||
2082 : | |||
2083 : | olson | 1.237 | sub contig_md5sum :scalar { |
2084 : | my($self, $genome, $contig) = @_; | ||
2085 : | my $relational_db_response; | ||
2086 : | my $rdbH = $self->db_handle; | ||
2087 : | |||
2088 : | if ($genome) | ||
2089 : | { | ||
2090 : | if (($relational_db_response = $rdbH->SQL(qq(SELECT md5 FROM contig_md5sums | ||
2091 : | WHERE (genome = ? AND | ||
2092 : | contig = ?)), undef, $genome, $contig)) | ||
2093 : | && (@$relational_db_response == 1)) | ||
2094 : | { | ||
2095 : | return $relational_db_response->[0]->[0]; | ||
2096 : | } | ||
2097 : | } | ||
2098 : | return undef; | ||
2099 : | } | ||
2100 : | |||
2101 : | efrank | 1.1 | =pod |
2102 : | |||
2103 : | =head1 genus_species | ||
2104 : | |||
2105 : | usage: $gs = $fig->genus_species($genome_id) | ||
2106 : | |||
2107 : | parrello | 1.200 | Returns the genus and species (and strain if that has been properly recorded) |
2108 : | efrank | 1.1 | in a printable form. |
2109 : | |||
2110 : | =cut | ||
2111 : | |||
2112 : | olson | 1.111 | sub genus_species :scalar { |
2113 : | efrank | 1.1 | my ($self,$genome) = @_; |
2114 : | overbeek | 1.13 | my $ans; |
2115 : | efrank | 1.1 | |
2116 : | my $genus_species = $self->cached('_genus_species'); | ||
2117 : | if (! ($ans = $genus_species->{$genome})) | ||
2118 : | { | ||
2119 : | overbeek | 1.13 | my $rdbH = $self->db_handle; |
2120 : | my $relational_db_response = $rdbH->SQL("SELECT genome,gname FROM genome"); | ||
2121 : | my $pair; | ||
2122 : | foreach $pair (@$relational_db_response) | ||
2123 : | efrank | 1.1 | { |
2124 : | overbeek | 1.13 | $genus_species->{$pair->[0]} = $pair->[1]; |
2125 : | efrank | 1.1 | } |
2126 : | overbeek | 1.13 | $ans = $genus_species->{$genome}; |
2127 : | efrank | 1.1 | } |
2128 : | return $ans; | ||
2129 : | } | ||
2130 : | |||
2131 : | =pod | ||
2132 : | |||
2133 : | =head1 org_of | ||
2134 : | |||
2135 : | usage: $org = $fig->org_of($prot_id) | ||
2136 : | |||
2137 : | In the case of external proteins, we can usually determine an organism, but not | ||
2138 : | parrello | 1.200 | anything more precise than genus/species (and often not that). This routine takes |
2139 : | efrank | 1.2 | a protein ID (which may be a feature ID) and returns "the organism". |
2140 : | efrank | 1.1 | |
2141 : | =cut | ||
2142 : | |||
2143 : | sub org_of { | ||
2144 : | my($self,$prot_id) = @_; | ||
2145 : | my $relational_db_response; | ||
2146 : | my $rdbH = $self->db_handle; | ||
2147 : | |||
2148 : | if ($prot_id =~ /^fig\|/) | ||
2149 : | { | ||
2150 : | golsen | 1.138 | # |
2151 : | # Trying to guess what Ross wanted (there was a servere bug): | ||
2152 : | # | ||
2153 : | # deleted -> undefined | ||
2154 : | # failed lookup -> "" | ||
2155 : | # | ||
2156 : | return $self->is_deleted_fid( $prot_id) ? undef | ||
2157 : | : $self->genus_species( $self->genome_of( $prot_id ) ) || ""; | ||
2158 : | efrank | 1.1 | } |
2159 : | |||
2160 : | if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && | ||
2161 : | (@$relational_db_response >= 1)) | ||
2162 : | { | ||
2163 : | overbeek | 1.248 | $relational_db_response->[0]->[0] =~ s/^\d+://; |
2164 : | efrank | 1.1 | return $relational_db_response->[0]->[0]; |
2165 : | } | ||
2166 : | return ""; | ||
2167 : | } | ||
2168 : | |||
2169 : | golsen | 1.130 | # |
2170 : | # Support for colorizing organisms by domain | ||
2171 : | # -- GJO | ||
2172 : | # | ||
2173 : | =pod | ||
2174 : | |||
2175 : | =head1 genus_species_domain | ||
2176 : | |||
2177 : | usage: ($gs, $domain) = $fig->genus_species_domain($genome_id) | ||
2178 : | |||
2179 : | parrello | 1.200 | Returns the genus and species (and strain if that has been properly recorded) |
2180 : | golsen | 1.130 | in a printable form, and domain. |
2181 : | |||
2182 : | =cut | ||
2183 : | |||
2184 : | sub genus_species_domain { | ||
2185 : | my ($self, $genome) = @_; | ||
2186 : | |||
2187 : | my $genus_species_domain = $self->cached('_genus_species_domain'); | ||
2188 : | if ( ! $genus_species_domain->{ $genome } ) | ||
2189 : | { | ||
2190 : | my $rdbH = $self->db_handle; | ||
2191 : | my $relational_db_response = $rdbH->SQL("SELECT genome,gname,maindomain FROM genome"); | ||
2192 : | my $triple; | ||
2193 : | foreach $triple ( @$relational_db_response ) | ||
2194 : | { | ||
2195 : | $genus_species_domain->{ $triple->[0] } = [ $triple->[1], $triple->[2] ]; | ||
2196 : | } | ||
2197 : | } | ||
2198 : | my $gsdref = $genus_species_domain->{ $genome }; | ||
2199 : | return $gsdref ? @$gsdref : ( "", "" ); | ||
2200 : | } | ||
2201 : | |||
2202 : | |||
2203 : | my %domain_color = ( AR => "#DDFFFF", BA => "#FFDDFF", EU => "#FFFFDD", | ||
2204 : | VI => "#DDDDDD", EN => "#BBBBBB" ); | ||
2205 : | |||
2206 : | sub domain_color { | ||
2207 : | my ($domain) = @_; | ||
2208 : | defined $domain || return "#FFFFFF"; | ||
2209 : | return $domain_color{ uc substr($domain, 0, 2) } || "#FFFFFF"; | ||
2210 : | } | ||
2211 : | |||
2212 : | |||
2213 : | =pod | ||
2214 : | |||
2215 : | =head1 org_and_color_of | ||
2216 : | |||
2217 : | usage: ($org, $color) = $fig->org_and_domain_of($prot_id) | ||
2218 : | |||
2219 : | Return the best guess organism and domain html color string of an organism. | ||
2220 : | In the case of external proteins, we can usually determine an organism, but not | ||
2221 : | parrello | 1.200 | anything more precise than genus/species (and often not that). This routine takes |
2222 : | golsen | 1.130 | a protein ID (which may be a feature ID) and returns "the organism". |
2223 : | |||
2224 : | =cut | ||
2225 : | |||
2226 : | sub org_and_color_of { | ||
2227 : | my($self,$prot_id) = @_; | ||
2228 : | my $relational_db_response; | ||
2229 : | my $rdbH = $self->db_handle; | ||
2230 : | |||
2231 : | if ($prot_id =~ /^fig\|/) | ||
2232 : | { | ||
2233 : | my( $gs, $domain ) = $self->genus_species_domain($self->genome_of($prot_id)); | ||
2234 : | return ( $gs, domain_color( $domain ) ); | ||
2235 : | } | ||
2236 : | |||
2237 : | if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && | ||
2238 : | (@$relational_db_response >= 1)) | ||
2239 : | { | ||
2240 : | return ($relational_db_response->[0]->[0], "#FFFFFF"); | ||
2241 : | } | ||
2242 : | return ("", "#FFFFFF"); | ||
2243 : | } | ||
2244 : | |||
2245 : | # | ||
2246 : | # End of support for colorizing organisms by domain | ||
2247 : | # -- GJO | ||
2248 : | # | ||
2249 : | |||
2250 : | efrank | 1.1 | =pod |
2251 : | |||
2252 : | =head1 abbrev | ||
2253 : | |||
2254 : | usage: $abbreviated_name = $fig->abbrev($genome_name) | ||
2255 : | |||
2256 : | For alignments and such, it is very useful to be able to produce an abbreviation of genus/species. | ||
2257 : | That's what this does. Note that multiple genus/species might reduce to the same abbreviation, so | ||
2258 : | be careful (disambiguate them, if you must). | ||
2259 : | |||
2260 : | =cut | ||
2261 : | |||
2262 : | olson | 1.111 | sub abbrev :scalar { |
2263 : | shift if UNIVERSAL::isa($_[0],__PACKAGE__); | ||
2264 : | efrank | 1.1 | my($genome_name) = @_; |
2265 : | |||
2266 : | $genome_name =~ s/^(\S{3})\S+/$1./; | ||
2267 : | overbeek | 1.198 | $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./; |
2268 : | overbeek | 1.257 | $genome_name =~ s/ //g; |
2269 : | if (length($genome_name) > 10) | ||
2270 : | efrank | 1.1 | { |
2271 : | overbeek | 1.257 | $genome_name = substr($genome_name,0,10); |
2272 : | efrank | 1.1 | } |
2273 : | return $genome_name; | ||
2274 : | } | ||
2275 : | |||
2276 : | ################ Routines to process Features and Feature IDs ########################## | ||
2277 : | |||
2278 : | =pod | ||
2279 : | |||
2280 : | =head1 ftype | ||
2281 : | |||
2282 : | usage: $type = &FIG::ftype($fid) | ||
2283 : | |||
2284 : | Returns the type of a feature, given the feature ID. This just amounts | ||
2285 : | to lifting it out of the feature ID, since features have IDs of tghe form | ||
2286 : | |||
2287 : | fig|x.y.f.n | ||
2288 : | |||
2289 : | where | ||
2290 : | x.y is the genome ID | ||
2291 : | f is the type pf feature | ||
2292 : | n is an integer that is unique within the genome/type | ||
2293 : | |||
2294 : | =cut | ||
2295 : | |||
2296 : | sub ftype { | ||
2297 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
2298 : | efrank | 1.1 | my($feature_id) = @_; |
2299 : | |||
2300 : | if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/) | ||
2301 : | { | ||
2302 : | return $1; | ||
2303 : | } | ||
2304 : | return undef; | ||
2305 : | } | ||
2306 : | |||
2307 : | =pod | ||
2308 : | |||
2309 : | =head1 genome_of | ||
2310 : | |||
2311 : | usage: $genome_id = $fig->genome_of($fid) | ||
2312 : | |||
2313 : | This just extracts the genome ID from a feature ID. | ||
2314 : | |||
2315 : | =cut | ||
2316 : | |||
2317 : | |||
2318 : | olson | 1.113 | sub genome_of :scalar { |
2319 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
2320 : | parrello | 1.200 | my $prot_id = (@_ == 1) ? $_[0] : $_[1]; |
2321 : | efrank | 1.1 | |
2322 : | if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; } | ||
2323 : | return undef; | ||
2324 : | } | ||
2325 : | |||
2326 : | olson | 1.96 | =head1 genome_and_peg_of |
2327 : | |||
2328 : | usage: ($genome_id, $peg_number = $fig->genome_and_peg_of($fid) | ||
2329 : | |||
2330 : | This just extracts the genome ID and peg number from a feature ID. | ||
2331 : | |||
2332 : | =cut | ||
2333 : | |||
2334 : | |||
2335 : | sub genome_and_peg_of { | ||
2336 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
2337 : | parrello | 1.200 | my $prot_id = (@_ == 1) ? $_[0] : $_[1]; |
2338 : | olson | 1.96 | |
2339 : | if ($prot_id =~ /^fig\|(\d+\.\d+)\.peg\.(\d+)/) | ||
2340 : | { | ||
2341 : | return ($1, $2); | ||
2342 : | } | ||
2343 : | return undef; | ||
2344 : | } | ||
2345 : | |||
2346 : | efrank | 1.1 | =pod |
2347 : | |||
2348 : | =head1 by_fig_id | ||
2349 : | |||
2350 : | usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids | ||
2351 : | |||
2352 : | This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works. | ||
2353 : | |||
2354 : | =cut | ||
2355 : | |||
2356 : | sub by_fig_id { | ||
2357 : | my($a,$b) = @_; | ||
2358 : | my($g1,$g2,$t1,$t2,$n1,$n2); | ||
2359 : | if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) && | ||
2360 : | ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) | ||
2361 : | { | ||
2362 : | ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2); | ||
2363 : | } | ||
2364 : | else | ||
2365 : | { | ||
2366 : | $a cmp $b; | ||
2367 : | } | ||
2368 : | } | ||
2369 : | |||
2370 : | =pod | ||
2371 : | |||
2372 : | =head1 genes_in_region | ||
2373 : | |||
2374 : | usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end) | ||
2375 : | |||
2376 : | It is often important to be able to find the genes that occur in a specific region on | ||
2377 : | a chromosome. This routine is designed to provide this information. It returns all genes | ||
2378 : | that overlap the region ($genome,$contig,$beg,$end). $beg1 is set to the minimum coordinate of | ||
2379 : | the returned genes (which may be before the given region), and $end1 the maximum coordinate. | ||
2380 : | |||
2381 : | The routine assumes that genes are not more than 10000 bases long, which is certainly not true | ||
2382 : | in eukaryotes. Hence, in euks you may well miss genes that overlap the boundaries of the specified | ||
2383 : | region (sorry). | ||
2384 : | |||
2385 : | =cut | ||
2386 : | parrello | 1.213 | #: Return Type @; |
2387 : | efrank | 1.1 | sub genes_in_region { |
2388 : | my($self,$genome,$contig,$beg,$end) = @_; | ||
2389 : | my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u); | ||
2390 : | |||
2391 : | my $pad = 10000; | ||
2392 : | my $rdbH = $self->db_handle; | ||
2393 : | |||
2394 : | parrello | 1.200 | my $minV = $beg - $pad; |
2395 : | efrank | 1.1 | my $maxV = $end + $pad; |
2396 : | parrello | 1.200 | if (($relational_db_response = $rdbH->SQL("SELECT id FROM features |
2397 : | golsen | 1.141 | WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND ( maxloc < $maxV) AND |
2398 : | efrank | 1.1 | ( genome = \'$genome\' ) AND ( contig = \'$contig\' );")) && |
2399 : | (@$relational_db_response >= 1)) | ||
2400 : | { | ||
2401 : | parrello | 1.200 | @tmp = sort { ($a->[1] cmp $b->[1]) or |
2402 : | overbeek | 1.129 | (($a->[2]+$a->[3]) <=> ($b->[2]+$b->[3])) |
2403 : | efrank | 1.1 | } |
2404 : | parrello | 1.200 | map { $feature_id = $_->[0]; |
2405 : | $x = $self->feature_location($feature_id); | ||
2406 : | $x ? [$feature_id,&boundaries_of($x)] : () | ||
2407 : | efrank | 1.1 | } @$relational_db_response; |
2408 : | |||
2409 : | |||
2410 : | ($l,$u) = (10000000000,0); | ||
2411 : | foreach $x (@tmp) | ||
2412 : | { | ||
2413 : | ($feature_id,undef,$b1,$e1) = @$x; | ||
2414 : | if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1))) | ||
2415 : | { | ||
2416 : | overbeek | 1.136 | if (! $self->is_deleted_fid($feature_id)) |
2417 : | { | ||
2418 : | push(@feat,$feature_id); | ||
2419 : | $l = &min($l,&min($b1,$e1)); | ||
2420 : | $u = &max($u,&max($b1,$e1)); | ||
2421 : | } | ||
2422 : | efrank | 1.1 | } |
2423 : | } | ||
2424 : | (@feat <= 0) || return ([@feat],$l,$u); | ||
2425 : | } | ||
2426 : | return ([],$l,$u); | ||
2427 : | } | ||
2428 : | |||
2429 : | golsen | 1.141 | |
2430 : | #============================================================================= | ||
2431 : | # Using the following version is better, but it brings out a very annoying | ||
2432 : | # issue with some genomes. It already exists in the current code (above) | ||
2433 : | # for some genes in some genomes. For example, visit fig|70601.1.peg.1. | ||
2434 : | # This is true for any genome that has a feature that crosses the origin. | ||
2435 : | # The root of the problem lies in boundaries_of. I am working on a fix that | ||
2436 : | # replaces boundaries_of with a more sophisticated function. When it is | ||
2437 : | # all done, genes_in_retion should behave as desired. -- GJO, Aug. 22, 2004 | ||
2438 : | #============================================================================= | ||
2439 : | parrello | 1.200 | # |
2440 : | golsen | 1.141 | # =pod |
2441 : | parrello | 1.200 | # |
2442 : | golsen | 1.141 | # =head1 genes_in_region |
2443 : | parrello | 1.200 | # |
2444 : | golsen | 1.141 | # usage: ( $features_in_region, $min_coord, $max_coord ) |
2445 : | # = $fig->genes_in_region( $genome, $contig, $beg, $end ) | ||
2446 : | parrello | 1.200 | # |
2447 : | golsen | 1.141 | # It is often important to be able to find the genes that occur in a specific |
2448 : | # region on a chromosome. This routine is designed to provide this information. | ||
2449 : | # It returns all genes that overlap the region ( $genome, $contig, $beg, $end ). | ||
2450 : | # $min_coord is set to the minimum coordinate of the returned genes (which may | ||
2451 : | # preceed the given region), and $max_coord the maximum coordinate. Because | ||
2452 : | # the database is indexed by the leftmost and rightmost coordinates of each | ||
2453 : | # feature, the function makes no assumption about the length of the feature, but | ||
2454 : | # it can (and probably will) miss features spanning multiple contigs. | ||
2455 : | parrello | 1.200 | # |
2456 : | golsen | 1.141 | # =cut |
2457 : | parrello | 1.200 | # |
2458 : | # | ||
2459 : | golsen | 1.141 | # sub genes_in_region { |
2460 : | # my ( $self, $genome, $contig, $beg, $end ) = @_; | ||
2461 : | # my ( $x, $db_response, $feature_id, $b1, $e1, @tmp, @bounds ); | ||
2462 : | # my ( $min_coord, $max_coord ); | ||
2463 : | parrello | 1.200 | # |
2464 : | golsen | 1.141 | # my @features = (); |
2465 : | # my $rdbH = $self->db_handle; | ||
2466 : | parrello | 1.200 | # |
2467 : | golsen | 1.141 | # if ( ( $db_response = $rdbH->SQL( "SELECT id |
2468 : | # FROM features | ||
2469 : | # WHERE ( contig = '$contig' ) | ||
2470 : | # AND ( genome = '$genome' ) | ||
2471 : | parrello | 1.200 | # AND ( minloc <= $end ) |
2472 : | golsen | 1.141 | # AND ( maxloc >= $beg );" |
2473 : | # ) | ||
2474 : | # ) | ||
2475 : | # && ( @$db_response > 0 ) | ||
2476 : | # ) | ||
2477 : | # { | ||
2478 : | # # The sort is unnecessary, but provides a consistent ordering | ||
2479 : | parrello | 1.200 | # |
2480 : | golsen | 1.141 | # @tmp = sort { ( $a->[1] cmp $b->[1] ) # contig |
2481 : | # || ( ($a->[2] + $a->[3] ) <=> ( $b->[2] + $b->[3] ) ) # midpoint | ||
2482 : | # } | ||
2483 : | # map { $feature_id = $_->[0]; | ||
2484 : | # ( ( ! $self->is_deleted_fid( $feature_id ) ) # not deleted | ||
2485 : | # && ( $x = $self->feature_location( $feature_id ) ) # and has location | ||
2486 : | # && ( ( @bounds = boundaries_of( $x ) ) == 3 ) # and has bounds | ||
2487 : | parrello | 1.200 | # ) ? [ $feature_id, @bounds ] : () |
2488 : | golsen | 1.141 | # } @$db_response; |
2489 : | parrello | 1.200 | # |
2490 : | golsen | 1.141 | # ( $min_coord, $max_coord ) = ( 10000000000, 0 ); |
2491 : | parrello | 1.200 | # |
2492 : | golsen | 1.141 | # foreach $x ( @tmp ) |
2493 : | # { | ||
2494 : | # ( $feature_id, undef, $b1, $e1 ) = @$x; | ||
2495 : | # push @features, $feature_id; | ||
2496 : | # my ( $min, $max ) = ( $b1 <= $e1 ) ? ( $b1, $e1 ) : ( $e1, $b1 ); | ||
2497 : | # ( $min_coord <= $min ) || ( $min_coord = $min ); | ||
2498 : | # ( $max_coord >= $max ) || ( $max_coord = $max ); | ||
2499 : | # } | ||
2500 : | # } | ||
2501 : | parrello | 1.200 | # |
2502 : | golsen | 1.141 | # return ( @features ) ? ( [ @features ], $min_coord, $max_coord ) |
2503 : | # : ( [], undef, undef ); | ||
2504 : | # } | ||
2505 : | |||
2506 : | # These will be part of the fix to genes_in_region. -- GJO | ||
2507 : | |||
2508 : | =pod | ||
2509 : | |||
2510 : | =head1 regions_spanned | ||
2511 : | |||
2512 : | usage: ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc ) | ||
2513 : | |||
2514 : | The location of a feature in a scalar context is | ||
2515 : | |||
2516 : | contig_b1_e1,contig_b2_e2,... [one contig_b_e for each segment] | ||
2517 : | |||
2518 : | This routine takes as input a fig location and reduces it to one or more | ||
2519 : | regions spanned by the gene. Unlike boundaries_of, regions_spanned handles | ||
2520 : | wrapping through the orgin, features split over contigs and exons that are | ||
2521 : | not ordered nicely along the chromosome (ugly but true). | ||
2522 : | |||
2523 : | =cut | ||
2524 : | |||
2525 : | sub regions_spanned { | ||
2526 : | shift if UNIVERSAL::isa( $_[0], __PACKAGE__ ); | ||
2527 : | my( $location ) = ( @_ == 1 ) ? $_[0] : $_[1]; | ||
2528 : | defined( $location ) || return undef; | ||
2529 : | |||
2530 : | my @regions = (); | ||
2531 : | |||
2532 : | my ( $cur_contig, $cur_beg, $cur_end, $cur_dir ); | ||
2533 : | my ( $contig, $beg, $end, $dir ); | ||
2534 : | my @segs = split( /\s*,\s*/, $location ); # should not have space, but ... | ||
2535 : | @segs || return undef; | ||
2536 : | |||
2537 : | # Process the first segment | ||
2538 : | |||
2539 : | my $seg = shift @segs; | ||
2540 : | ( ( $cur_contig, $cur_beg, $cur_end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) | ||
2541 : | || return undef; | ||
2542 : | $cur_dir = ( $cur_end >= $cur_beg ) ? 1 : -1; | ||
2543 : | |||
2544 : | foreach $seg ( @segs ) { | ||
2545 : | ( ( $contig, $beg, $end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || next; | ||
2546 : | $dir = ( $end >= $beg ) ? 1 : -1; | ||
2547 : | |||
2548 : | # Is this a continuation? Update end | ||
2549 : | |||
2550 : | if ( ( $contig eq $cur_contig ) | ||
2551 : | && ( $dir == $cur_dir ) | ||
2552 : | && ( ( ( $dir > 0 ) && ( $end > $cur_end ) ) | ||
2553 : | || ( ( $dir < 0 ) && ( $end < $cur_end ) ) ) | ||
2554 : | ) | ||
2555 : | { | ||
2556 : | $cur_end = $end; | ||
2557 : | } | ||
2558 : | |||
2559 : | # Not a continuation. Report previous and update current. | ||
2560 : | |||
2561 : | else | ||
2562 : | { | ||
2563 : | push @regions, [ $cur_contig, $cur_beg, $cur_end ]; | ||
2564 : | ( $cur_contig, $cur_beg, $cur_end, $cur_dir ) | ||
2565 : | = ( $contig, $beg, $end, $dir ); | ||
2566 : | } | ||
2567 : | } | ||
2568 : | |||
2569 : | # There should alwasy be a valid, unreported region. | ||
2570 : | |||
2571 : | push @regions, [ $cur_contig, $cur_beg, $cur_end ]; | ||
2572 : | |||
2573 : | return wantarray ? @regions : \@regions; | ||
2574 : | } | ||
2575 : | |||
2576 : | |||
2577 : | =pod | ||
2578 : | |||
2579 : | =head1 filter_regions | ||
2580 : | |||
2581 : | usage: @regions = filter_regions( $contig, $min, $max, @regions ) | ||
2582 : | \@regions = filter_regions( $contig, $min, $max, @regions ) | ||
2583 : | @regions = filter_regions( $contig, $min, $max, \@regions ) | ||
2584 : | \@regions = filter_regions( $contig, $min, $max, \@regions ) | ||
2585 : | |||
2586 : | This function provides a simple filter for extracting a list of genome regions | ||
2587 : | for those that overlap a particular interval. Region definitions correspond | ||
2588 : | to those produced by regions_spanned. That is, [ contig, beg, end ]. In the | ||
2589 : | function call, either $contig or $min and $max can be undefined (permitting | ||
2590 : | anything). | ||
2591 : | |||
2592 : | =cut | ||
2593 : | |||
2594 : | sub filter_regions { | ||
2595 : | my ( $contig, $min, $max, @regions ) = @_; | ||
2596 : | |||
2597 : | @regions || return (); | ||
2598 : | ( ref( $regions[0] ) eq "ARRAY" ) || return undef; | ||
2599 : | |||
2600 : | # Is it a region list, or a reference to a region list? | ||
2601 : | |||
2602 : | if ( ref( $regions[0]->[0] ) eq "ARRAY" ) { @regions = @{ $regions[0] } } | ||
2603 : | |||
2604 : | if ( ! defined( $contig ) ) | ||
2605 : | { | ||
2606 : | ( defined( $min ) && defined( $max ) ) || return undef; | ||
2607 : | } | ||
2608 : | else # with a defined contig name, allow undefined range | ||
2609 : | { | ||
2610 : | defined( $min ) || ( $min = 1 ); | ||
2611 : | defined( $max ) || ( $max = 1000000000 ); | ||
2612 : | } | ||
2613 : | ( $min <= $max ) || return (); | ||
2614 : | |||
2615 : | my ( $c, $b, $e ); | ||
2616 : | my @filtered = grep { ( @$_ >= 3 ) # Allow extra fields? | ||
2617 : | && ( ( $c, $b, $e ) = @$_ ) | ||
2618 : | && ( ( ! defined( $contig ) ) || ( $c eq $contig ) ) | ||
2619 : | && ( ( $e >= $b ) || ( ( $b, $e ) = ( $e, $b ) ) ) | ||
2620 : | && ( ( $b <= $max ) && ( $e >= $min ) ) | ||
2621 : | } @regions; | ||
2622 : | |||
2623 : | return wantarray ? @filtered : \@filtered; | ||
2624 : | } | ||
2625 : | |||
2626 : | |||
2627 : | efrank | 1.1 | sub close_genes { |
2628 : | my($self,$fid,$dist) = @_; | ||
2629 : | parrello | 1.200 | |
2630 : | mkubal | 1.147 | # warn "In close_genes, self=$self, fid=$fid"; |
2631 : | parrello | 1.200 | |
2632 : | efrank | 1.1 | my $loc = $self->feature_location($fid); |
2633 : | if ($loc) | ||
2634 : | { | ||
2635 : | my($contig,$beg,$end) = &FIG::boundaries_of($loc); | ||
2636 : | if ($contig && $beg && $end) | ||
2637 : | { | ||
2638 : | my $min = &min($beg,$end) - $dist; | ||
2639 : | my $max = &max($beg,$end) + $dist; | ||
2640 : | my $feat; | ||
2641 : | ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max); | ||
2642 : | return @$feat; | ||
2643 : | } | ||
2644 : | } | ||
2645 : | return (); | ||
2646 : | } | ||
2647 : | |||
2648 : | mkubal | 1.147 | sub adjacent_genes |
2649 : | { | ||
2650 : | my ($self, $fid, $dist) = @_; | ||
2651 : | my (@close, $strand, $i); | ||
2652 : | parrello | 1.200 | |
2653 : | mkubal | 1.147 | # warn "In adjacent_genes, self=$self, fid=$fid"; |
2654 : | parrello | 1.200 | |
2655 : | |||
2656 : | mkubal | 1.147 | $strand = $self->strand_of($fid); |
2657 : | parrello | 1.200 | |
2658 : | mkubal | 1.147 | $dist = $dist || 2000; |
2659 : | @close = $self->close_genes($fid, $dist); | ||
2660 : | for ($i=0; $i < @close; ++$i) { last if ($close[$i] eq $fid); } | ||
2661 : | parrello | 1.200 | |
2662 : | redwards | 1.157 | # RAE note that if $self->strand_of($close[$i-1]) ne $strand then left/right neighbors |
2663 : | # were never set! oops! | ||
2664 : | parrello | 1.200 | |
2665 : | redwards | 1.157 | # I think the concept of Left and right is confused here. In my mind, left and right |
2666 : | # are independent of strand ?? E.g. take a look at PEG fig|196600.1.peg.1806 | ||
2667 : | # this is something like | ||
2668 : | # | ||
2669 : | # ---> <--1805--- --1806--> <--1807-- <---- | ||
2670 : | # | ||
2671 : | # 1805 is always the left neighbor, no? | ||
2672 : | |||
2673 : | my ($left_neighbor, $right_neighbor) = ($close[$i-1], $close[$i+1]); | ||
2674 : | parrello | 1.200 | |
2675 : | redwards | 1.157 | if (0) # this was if ($i > 0) I just skip this whole section! |
2676 : | mkubal | 1.147 | { |
2677 : | if ($self->strand_of($close[$i-1]) eq $strand) { $left_neighbor = $close[$i-1]; } | ||
2678 : | redwards | 1.157 | # else {$left_neighbor=$close[$i+1]} # RAE: this is the alternative that is needed if you do it by strand |
2679 : | mkubal | 1.147 | } |
2680 : | parrello | 1.200 | |
2681 : | mkubal | 1.147 | if ($i < $#close) |
2682 : | { | ||
2683 : | if ($self->strand_of($close[$i+1]) eq $strand) { $right_neighbor = $close[$i+1]; } | ||
2684 : | redwards | 1.157 | # else {$right_neighbor = $close[$i-1]} # RAE: this is the alternative that is needed if you do it by strand |
2685 : | mkubal | 1.147 | } |
2686 : | parrello | 1.200 | |
2687 : | mkubal | 1.147 | # ...return genes in transcription order... |
2688 : | parrello | 1.200 | if ($strand eq '-') |
2689 : | mkubal | 1.147 | { |
2690 : | ($left_neighbor, $right_neighbor) = ($right_neighbor, $left_neighbor); | ||
2691 : | } | ||
2692 : | parrello | 1.200 | |
2693 : | mkubal | 1.147 | return ($left_neighbor, $right_neighbor) ; |
2694 : | } | ||
2695 : | |||
2696 : | efrank | 1.1 | |
2697 : | =pod | ||
2698 : | |||
2699 : | =head1 feature_location | ||
2700 : | |||
2701 : | usage: $loc = $fig->feature_location($fid) OR | ||
2702 : | @loc = $fig->feature_location($fid) | ||
2703 : | |||
2704 : | The location of a feature in a scalar context is | ||
2705 : | |||
2706 : | contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] | ||
2707 : | |||
2708 : | In a list context it is | ||
2709 : | |||
2710 : | (contig_b1_e1,contig_b2_e2,...) | ||
2711 : | |||
2712 : | =cut | ||
2713 : | |||
2714 : | olson | 1.111 | sub feature_location :scalar :list { |
2715 : | efrank | 1.1 | my($self,$feature_id) = @_; |
2716 : | my($relational_db_response,$locations,$location); | ||
2717 : | parrello | 1.200 | |
2718 : | mkubal | 1.147 | # warn "In feature_location, self=$self, fid=$feature_id"; |
2719 : | parrello | 1.200 | |
2720 : | overbeek | 1.136 | if ($self->is_deleted_fid($feature_id)) { return undef } |
2721 : | |||
2722 : | efrank | 1.1 | $locations = $self->cached('_location'); |
2723 : | if (! ($location = $locations->{$feature_id})) | ||
2724 : | { | ||
2725 : | my $rdbH = $self->db_handle; | ||
2726 : | if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) && | ||
2727 : | (@$relational_db_response == 1)) | ||
2728 : | { | ||
2729 : | $locations->{$feature_id} = $location = $relational_db_response->[0]->[0]; | ||
2730 : | } | ||
2731 : | } | ||
2732 : | |||
2733 : | if ($location) | ||
2734 : | { | ||
2735 : | return wantarray() ? split(/,/,$location) : $location; | ||
2736 : | } | ||
2737 : | return undef; | ||
2738 : | } | ||
2739 : | |||
2740 : | mkubal | 1.147 | sub contig_of |
2741 : | { | ||
2742 : | my ($self, $locus) = @_; | ||
2743 : | parrello | 1.200 | |
2744 : | olson | 1.159 | $locus =~ m/^([^,]+)_\d+_\d+/; |
2745 : | parrello | 1.200 | |
2746 : | mkubal | 1.147 | return $1; |
2747 : | } | ||
2748 : | |||
2749 : | sub beg_of | ||
2750 : | { | ||
2751 : | my ($self, $locus) = @_; | ||
2752 : | parrello | 1.200 | |
2753 : | olson | 1.159 | $locus =~ m/^[^,]+_(\d+)_\d+/; |
2754 : | parrello | 1.200 | |
2755 : | mkubal | 1.147 | return $1; |
2756 : | } | ||
2757 : | |||
2758 : | sub end_of | ||
2759 : | { | ||
2760 : | my ($self, $locus) = @_; | ||
2761 : | parrello | 1.200 | |
2762 : | mkubal | 1.147 | $locus =~ m/\S+_\d+_(\d+)$/; |
2763 : | parrello | 1.200 | |
2764 : | mkubal | 1.147 | return $1; |
2765 : | } | ||
2766 : | |||
2767 : | parrello | 1.200 | sub strand_of |
2768 : | mkubal | 1.147 | { |
2769 : | my ($self, $fid) = @_; | ||
2770 : | my ($beg, $end); | ||
2771 : | parrello | 1.200 | |
2772 : | mkubal | 1.147 | # warn "In strand_of, self=$self, fid=$fid"; |
2773 : | parrello | 1.200 | |
2774 : | mkubal | 1.147 | (undef, $beg, $end) = $self->boundaries_of($self->feature_location($fid)); |
2775 : | parrello | 1.200 | |
2776 : | mkubal | 1.147 | if ($beg < $end) { return '+'; } else { return '-'; } |
2777 : | } | ||
2778 : | |||
2779 : | olson | 1.158 | =pod |
2780 : | |||
2781 : | =head1 find_contig_with_checksum | ||
2782 : | |||
2783 : | Find a contig in the given genome with the given checksum. | ||
2784 : | |||
2785 : | |||
2786 : | =cut | ||
2787 : | |||
2788 : | sub find_contig_with_checksum | ||
2789 : | { | ||
2790 : | my($self, $genome, $checksum) = @_; | ||
2791 : | parrello | 1.200 | |
2792 : | olson | 1.158 | # |
2793 : | # This implementation scans all the contig files for the organism; when | ||
2794 : | # we have contig checksums in the database this will simplify | ||
2795 : | # significantly. | ||
2796 : | # | ||
2797 : | # For some efficiency, we cache the checksums we compute along the way since | ||
2798 : | # it's probably likely we'll poke at other contigs for this organism. | ||
2799 : | # | ||
2800 : | |||
2801 : | my $gdir = "$FIG_Config::organisms/$genome"; | ||
2802 : | |||
2803 : | my $cached_cksum = $self->cached('_contig_checksum'); | ||
2804 : | parrello | 1.200 | |
2805 : | olson | 1.158 | if (opendir(my $dh, $gdir)) |
2806 : | { | ||
2807 : | for my $file (map { "$gdir/$_" } grep { $_ =~ /^contigs\d*$/ } readdir($dh)) | ||
2808 : | { | ||
2809 : | local $/ = "\n>"; | ||
2810 : | parrello | 1.200 | |
2811 : | olson | 1.158 | if (open(my $fh, "<$file")) |
2812 : | { | ||
2813 : | while (<$fh>) | ||
2814 : | { | ||
2815 : | chomp; | ||
2816 : | olson | 1.160 | |
2817 : | # | ||
2818 : | # Pull the contig identifier from the first line. | ||
2819 : | # We need the >? to handle the first line in the file; | ||
2820 : | # the others are removed by the chomp above because | ||
2821 : | # $/ is set to "\n>"; | ||
2822 : | # | ||
2823 : | parrello | 1.200 | |
2824 : | olson | 1.160 | if (s/^>?\s*(\S+)([^\n]*)\n//) |
2825 : | olson | 1.158 | { |
2826 : | my $ident = $1; | ||
2827 : | my $contig_txt = $_; | ||
2828 : | parrello | 1.200 | |
2829 : | olson | 1.158 | $contig_txt =~ s/\s//sg; |
2830 : | $contig_txt = uc($contig_txt); | ||
2831 : | |||
2832 : | # | ||
2833 : | # See if we have a cached value. | ||
2834 : | # | ||
2835 : | |||
2836 : | my $this_checksum; | ||
2837 : | $this_checksum = $cached_cksum->{$genome, $ident}; | ||
2838 : | if (!$this_checksum) | ||
2839 : | { | ||
2840 : | parrello | 1.200 | |
2841 : | olson | 1.158 | my($rd, $wr, $pid); |
2842 : | parrello | 1.200 | |
2843 : | olson | 1.158 | if (!($pid = open2($rd, $wr, "cksum"))) |
2844 : | { | ||
2845 : | die "Cannot run open2 cksum: $!"; | ||
2846 : | } | ||
2847 : | parrello | 1.200 | |
2848 : | olson | 1.158 | $wr->write($contig_txt, length($contig_txt)); |
2849 : | parrello | 1.200 | |
2850 : | olson | 1.158 | close($wr); |
2851 : | parrello | 1.200 | |
2852 : | olson | 1.158 | $_ = <$rd>; |
2853 : | close($rd); | ||
2854 : | waitpid $pid, 0; | ||
2855 : | chomp; | ||
2856 : | parrello | 1.200 | |
2857 : | olson | 1.158 | my @vals = split(/\s+/, $_); |
2858 : | $this_checksum = $vals[0]; | ||
2859 : | $cached_cksum->{$genome, $ident} = $this_checksum; | ||
2860 : | } | ||
2861 : | if ($this_checksum == $checksum) | ||
2862 : | { | ||
2863 : | return $ident; | ||
2864 : | } | ||
2865 : | } | ||
2866 : | } | ||
2867 : | } | ||
2868 : | } | ||
2869 : | } | ||
2870 : | } | ||
2871 : | |||
2872 : | sub contig_checksum | ||
2873 : | { | ||
2874 : | my($self, $genome, $contig) = @_; | ||
2875 : | |||
2876 : | my $contig_txt = $self->read_contig($genome, $contig); | ||
2877 : | |||
2878 : | $contig_txt =~ s/\s//sg; | ||
2879 : | $contig_txt = uc($contig_txt); | ||
2880 : | |||
2881 : | olson | 1.159 | my($rd, $wr, $pid); |
2882 : | olson | 1.158 | |
2883 : | olson | 1.159 | if (!($pid = open2($rd, $wr, "cksum"))) |
2884 : | olson | 1.158 | { |
2885 : | die "Cannot run open2 cksum: $!"; | ||
2886 : | } | ||
2887 : | |||
2888 : | $wr->write($contig_txt, length($contig_txt)); | ||
2889 : | parrello | 1.200 | |
2890 : | olson | 1.158 | close($wr); |
2891 : | |||
2892 : | $_ = <$rd>; | ||
2893 : | olson | 1.159 | close($rd); |
2894 : | waitpid $pid, 0; | ||
2895 : | |||
2896 : | olson | 1.158 | chomp; |
2897 : | my @vals = split(/\s+/, $_); | ||
2898 : | if (wantarray) | ||
2899 : | { | ||
2900 : | return @vals; | ||
2901 : | } | ||
2902 : | else | ||
2903 : | { | ||
2904 : | return $vals[0]; | ||
2905 : | } | ||
2906 : | } | ||
2907 : | |||
2908 : | =pod | ||
2909 : | |||
2910 : | =head1 read_contig | ||
2911 : | |||
2912 : | Read a single contig from the contigs file. | ||
2913 : | |||
2914 : | =cut | ||
2915 : | sub read_contig | ||
2916 : | { | ||
2917 : | my($self, $genome, $contig) = @_; | ||
2918 : | |||
2919 : | # | ||
2920 : | # Read the contig. The database has it in a set of chunks, but we | ||
2921 : | # just use the seek to the starting point, and read up to the next "\n>". | ||
2922 : | # | ||
2923 : | |||
2924 : | my $ret = $self->db_handle->SQL(qq!SELECT fileno, seek FROM contig_seeks | ||
2925 : | WHERE genome = '$genome' and contig = '$contig' and | ||
2926 : | startn = 0!); | ||
2927 : | if (!$ret or @$ret != 1) | ||
2928 : | { | ||
2929 : | return undef; | ||
2930 : | } | ||
2931 : | |||
2932 : | my($fileno, $seek) = @{$ret->[0]}; | ||
2933 : | my $file = $self->N2file($fileno); | ||
2934 : | |||
2935 : | my($fh, $contig_txt); | ||
2936 : | parrello | 1.200 | |
2937 : | olson | 1.158 | if (!open($fh, "<$file")) |
2938 : | { | ||
2939 : | warn "contig_checksum: could not open $file: $!\n"; | ||
2940 : | return undef; | ||
2941 : | } | ||
2942 : | |||
2943 : | seek($fh, $seek, 0); | ||
2944 : | |||
2945 : | { | ||
2946 : | local $/ = "\n>"; | ||
2947 : | |||
2948 : | $contig_txt = <$fh>; | ||
2949 : | chomp($contig_txt); | ||
2950 : | } | ||
2951 : | |||
2952 : | return $contig_txt; | ||
2953 : | } | ||
2954 : | mkubal | 1.147 | |
2955 : | efrank | 1.1 | =pod |
2956 : | |||
2957 : | =head1 boundaries_of | ||
2958 : | |||
2959 : | usage: ($contig,$beg,$end) = $fig->boundaries_of($loc) | ||
2960 : | |||
2961 : | The location of a feature in a scalar context is | ||
2962 : | |||
2963 : | contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] | ||
2964 : | |||
2965 : | This routine takes as input such a location and reduces it to a single | ||
2966 : | description of the entire region containing the gene. | ||
2967 : | |||
2968 : | =cut | ||
2969 : | |||
2970 : | sub boundaries_of { | ||
2971 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
2972 : | efrank | 1.1 | my($location) = (@_ == 1) ? $_[0] : $_[1]; |
2973 : | my($contigQ); | ||
2974 : | |||
2975 : | if (defined($location)) | ||
2976 : | { | ||
2977 : | my @exons = split(/,/,$location); | ||
2978 : | my($contig,$beg,$end); | ||
2979 : | olson | 1.203 | |
2980 : | parrello | 1.200 | if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) && |
2981 : | efrank | 1.1 | (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) && |
2982 : | parrello | 1.200 | ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) && |
2983 : | efrank | 1.1 | ($end = $1)) |
2984 : | { | ||
2985 : | return ($contig,$beg,$end); | ||
2986 : | } | ||
2987 : | } | ||
2988 : | return undef; | ||
2989 : | } | ||
2990 : | |||
2991 : | heiko | 1.175 | =pod |
2992 : | |||
2993 : | =head1 all_features_detailed | ||
2994 : | |||
2995 : | usage: $fig->all_features_detailed($genome) | ||
2996 : | |||
2997 : | Returns a list of all feature IDs, location, aliases, type in the designated genome. | ||
2998 : | Used in gendb import to speed up the process | ||
2999 : | =cut | ||
3000 : | |||
3001 : | sub all_features_detailed { | ||
3002 : | my($self,$genome) = @_; | ||
3003 : | |||
3004 : | my $rdbH = $self->db_handle; | ||
3005 : | my $relational_db_response = $rdbH->SQL("SELECT id, location, aliases, type FROM features WHERE (genome = \'$genome\')"); | ||
3006 : | my @features; | ||
3007 : | foreach my $tuple (@$relational_db_response) | ||
3008 : | { | ||
3009 : | push @features, $tuple unless ($self->is_deleted_fid($tuple->[0])); | ||
3010 : | } | ||
3011 : | return \@features; | ||
3012 : | } | ||
3013 : | |||
3014 : | efrank | 1.1 | |
3015 : | =pod | ||
3016 : | |||
3017 : | =head1 all_features | ||
3018 : | |||
3019 : | usage: $fig->all_features($genome,$type) | ||
3020 : | |||
3021 : | Returns a list of all feature IDs of a specified type in the designated genome. You would | ||
3022 : | parrello | 1.200 | usually use just |
3023 : | efrank | 1.1 | |
3024 : | parrello | 1.200 | $fig->pegs_of($genome) or |
3025 : | efrank | 1.1 | $fig->rnas_of($genome) |
3026 : | |||
3027 : | which simply invoke this routine. | ||
3028 : | |||
3029 : | =cut | ||
3030 : | |||
3031 : | sub all_features { | ||
3032 : | my($self,$genome,$type) = @_; | ||
3033 : | |||
3034 : | my $rdbH = $self->db_handle; | ||
3035 : | my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE (genome = \'$genome\' AND (type = \'$type\'))"); | ||
3036 : | |||
3037 : | if (@$relational_db_response > 0) | ||
3038 : | { | ||
3039 : | overbeek | 1.136 | return grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response; |
3040 : | efrank | 1.1 | } |
3041 : | return (); | ||
3042 : | } | ||
3043 : | |||
3044 : | |||
3045 : | =pod | ||
3046 : | |||
3047 : | overbeek | 1.152 | =head1 pegs_of |
3048 : | efrank | 1.1 | |
3049 : | overbeek | 1.152 | usage: $fig->pegs_of($genome) |
3050 : | efrank | 1.1 | |
3051 : | Returns a list of all PEGs in the specified genome. Note that order is not | ||
3052 : | specified. | ||
3053 : | |||
3054 : | =cut | ||
3055 : | |||
3056 : | sub pegs_of { | ||
3057 : | my($self,$genome) = @_; | ||
3058 : | parrello | 1.200 | |
3059 : | efrank | 1.1 | return $self->all_features($genome,"peg"); |
3060 : | } | ||
3061 : | |||
3062 : | |||
3063 : | =pod | ||
3064 : | |||
3065 : | overbeek | 1.152 | =head1 rnas_of |
3066 : | efrank | 1.1 | |
3067 : | overbeek | 1.152 | usage: $fig->rnas_of($genome) |
3068 : | efrank | 1.1 | |
3069 : | Returns a list of all RNAs for the given genome. | ||
3070 : | |||
3071 : | =cut | ||
3072 : | |||
3073 : | sub rnas_of { | ||
3074 : | my($self,$genome) = @_; | ||
3075 : | parrello | 1.200 | |
3076 : | efrank | 1.1 | return $self->all_features($genome,"rna"); |
3077 : | } | ||
3078 : | |||
3079 : | =pod | ||
3080 : | |||
3081 : | =head1 feature_aliases | ||
3082 : | |||
3083 : | usage: @aliases = $fig->feature_aliases($fid) OR | ||
3084 : | parrello | 1.200 | $aliases = $fig->feature_aliases($fid) |
3085 : | efrank | 1.1 | |
3086 : | Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature. | ||
3087 : | These must come from the tbl files, so add them there if you want to see them here. | ||
3088 : | |||
3089 : | In a scalar context, the aliases come back with commas separating them. | ||
3090 : | |||
3091 : | =cut | ||
3092 : | |||
3093 : | sub feature_aliases { | ||
3094 : | my($self,$feature_id) = @_; | ||
3095 : | overbeek | 1.87 | my($rdbH,$relational_db_response,@aliases,$aliases,%aliases,$x); |
3096 : | efrank | 1.1 | |
3097 : | overbeek | 1.136 | if ($self->is_deleted_fid($feature_id)) { return undef } |
3098 : | |||
3099 : | efrank | 1.1 | $rdbH = $self->db_handle; |
3100 : | overbeek | 1.87 | @aliases = (); |
3101 : | efrank | 1.1 | if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) && |
3102 : | (@$relational_db_response == 1)) | ||
3103 : | { | ||
3104 : | $aliases = $relational_db_response->[0]->[0]; | ||
3105 : | overbeek | 1.87 | %aliases = map { $_ => 1 } split(/,/,$aliases); |
3106 : | overbeek | 1.173 | } |
3107 : | |||
3108 : | if (($relational_db_response = $rdbH->SQL("SELECT alias FROM ext_alias WHERE ( id = \'$feature_id\' )")) && | ||
3109 : | (@$relational_db_response > 0)) | ||
3110 : | { | ||
3111 : | foreach $x (@$relational_db_response) | ||
3112 : | overbeek | 1.87 | { |
3113 : | overbeek | 1.173 | $aliases{$x->[0]} = 1; |
3114 : | overbeek | 1.87 | } |
3115 : | efrank | 1.1 | } |
3116 : | parrello | 1.200 | |
3117 : | overbeek | 1.173 | @aliases = sort keys(%aliases); |
3118 : | overbeek | 1.87 | |
3119 : | overbeek | 1.131 | return wantarray() ? @aliases : join(",",@aliases); |
3120 : | efrank | 1.1 | } |
3121 : | |||
3122 : | =pod | ||
3123 : | |||
3124 : | overbeek | 1.34 | =head1 by_alias |
3125 : | |||
3126 : | parrello | 1.200 | usage: $peg = $fig->by_alias($alias) |
3127 : | overbeek | 1.34 | |
3128 : | Returns a FIG id if the alias can be converted. Right now we convert aliases | ||
3129 : | overbeek | 1.211 | of the form NP_* (RefSeq IDs), gi|* (GenBank IDs), sp|* (Swiss Prot), uni|* (UniProt), |
3130 : | kegg|* (KEGG) and maybe a few more | ||
3131 : | overbeek | 1.34 | |
3132 : | =cut | ||
3133 : | |||
3134 : | sub by_alias { | ||
3135 : | overbeek | 1.148 | my($self,$alias,$genome) = @_; |
3136 : | overbeek | 1.34 | my($rdbH,$relational_db_response,$peg); |
3137 : | parrello | 1.200 | my ($peg, $flag) = FIGRules::NormalizeAlias($alias); |
3138 : | if ($flag) { | ||
3139 : | return $peg; | ||
3140 : | } else { | ||
3141 : | parrello | 1.201 | my $genomeQ = $genome ? quotemeta $genome : ""; |
3142 : | $rdbH = $self->db_handle; | ||
3143 : | parrello | 1.200 | |
3144 : | parrello | 1.201 | if (($relational_db_response = $rdbH->SQL("SELECT id FROM ext_alias WHERE ( alias = ? )", undef, $peg)) && |
3145 : | (@$relational_db_response > 0)) { | ||
3146 : | |||
3147 : | overbeek | 1.209 | if (@$relational_db_response == 1) { |
3148 : | $peg = $relational_db_response->[0]->[0]; | ||
3149 : | return wantarray() ? ($peg) : $peg; | ||
3150 : | } elsif (wantarray()) { | ||
3151 : | return map { $_->[0] } @$relational_db_response; | ||
3152 : | } | ||
3153 : | } | ||
3154 : | return wantarray() ? () : ""; | ||
3155 : | overbeek | 1.148 | } |
3156 : | overbeek | 1.34 | } |
3157 : | |||
3158 : | overbeek | 1.170 | sub to_alias { |
3159 : | my($self,$fid,$type) = @_; | ||
3160 : | parrello | 1.200 | |
3161 : | overbeek | 1.170 | my @aliases = grep { $_ =~ /^$type\|/ } $self->feature_aliases($fid); |
3162 : | |||
3163 : | overbeek | 1.171 | if (wantarray()) |
3164 : | { | ||
3165 : | return @aliases; | ||
3166 : | } | ||
3167 : | elsif (@aliases > 0) | ||
3168 : | overbeek | 1.170 | { |
3169 : | return $aliases[0]; | ||
3170 : | } | ||
3171 : | else | ||
3172 : | { | ||
3173 : | return ""; | ||
3174 : | } | ||
3175 : | } | ||
3176 : | |||
3177 : | overbeek | 1.34 | =pod |
3178 : | |||
3179 : | efrank | 1.1 | =head1 possibly_truncated |
3180 : | |||
3181 : | usage: $fig->possibly_truncated($fid) | ||
3182 : | |||
3183 : | Returns true iff the feature occurs near the end of a contig. | ||
3184 : | |||
3185 : | =cut | ||
3186 : | |||
3187 : | sub possibly_truncated { | ||
3188 : | my($self,$feature_id) = @_; | ||
3189 : | my($loc); | ||
3190 : | |||
3191 : | if ($loc = $self->feature_location($feature_id)) | ||
3192 : | { | ||
3193 : | parrello | 1.200 | my $genome = $self->genome_of($feature_id); |
3194 : | my ($contig,$beg,$end) = $self->boundaries_of($loc); | ||
3195 : | efrank | 1.1 | if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end))) |
3196 : | { | ||
3197 : | return 0; | ||
3198 : | } | ||
3199 : | } | ||
3200 : | return 1; | ||
3201 : | } | ||
3202 : | |||
3203 : | sub near_end { | ||
3204 : | my($self,$genome,$contig,$x) = @_; | ||
3205 : | |||
3206 : | return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300))); | ||
3207 : | } | ||
3208 : | |||
3209 : | overbeek | 1.27 | sub is_real_feature { |
3210 : | my($self,$fid) = @_; | ||
3211 : | my($relational_db_response); | ||
3212 : | |||
3213 : | overbeek | 1.136 | if ($self->is_deleted_fid($fid)) { return 0 } |
3214 : | |||
3215 : | overbeek | 1.27 | my $rdbH = $self->db_handle; |
3216 : | return (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( id = \'$fid\' )")) && | ||
3217 : | mkubal | 1.53 | (@$relational_db_response == 1)) ? 1 : 0; |
3218 : | overbeek | 1.27 | } |
3219 : | |||
3220 : | efrank | 1.1 | ################ Routines to process functional coupling for PEGs ########################## |
3221 : | |||
3222 : | =pod | ||
3223 : | |||
3224 : | =head1 coupling_and_evidence | ||
3225 : | |||
3226 : | usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) | ||
3227 : | |||
3228 : | A computation of couplings and evidence starts with a given peg and produces a list of | ||
3229 : | 3-tuples. Each 3-tuple is of the form | ||
3230 : | |||
3231 : | [Score,CoupledToFID,Evidence] | ||
3232 : | |||
3233 : | Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing | ||
3234 : | a "pair of close homologs" of [$peg,CoupledToFID]). The maximum score for a single | ||
3235 : | PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs. | ||
3236 : | |||
3237 : | parrello | 1.200 | If $keep_record is true, the system records the information, asserting coupling for each |
3238 : | efrank | 1.1 | of the pairs in the set of evidence, and asserting a pin from the given $fd through all |
3239 : | of the PCH entries used in forming the score. | ||
3240 : | |||
3241 : | =cut | ||
3242 : | |||
3243 : | sub coupling_and_evidence { | ||
3244 : | my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_; | ||
3245 : | my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1); | ||
3246 : | |||
3247 : | overbeek | 1.136 | if ($self->is_deleted_fid($feature_id)) { return undef } |
3248 : | |||
3249 : | efrank | 1.1 | if ($feature_id =~ /^fig\|(\d+\.\d+)/) |
3250 : | { | ||
3251 : | $genome1 = $1; | ||
3252 : | } | ||
3253 : | overbeek | 1.136 | else |
3254 : | { | ||
3255 : | return undef; | ||
3256 : | } | ||
3257 : | parrello | 1.200 | my $locations = $self->feature_location($feature_id); |
3258 : | my($contig,$beg,$end) = $self->boundaries_of($locations); | ||
3259 : | efrank | 1.1 | if (! $contig) { return () } |
3260 : | |||
3261 : | parrello | 1.200 | ($neighbors,undef,undef) = $self->genes_in_region($self->genome_of($feature_id), |
3262 : | efrank | 1.1 | $contig, |
3263 : | parrello | 1.200 | &min($beg,$end) - $bound, |
3264 : | efrank | 1.1 | &max($beg,$end) + $bound); |
3265 : | if (@$neighbors == 0) { return () } | ||
3266 : | $similar1 = $self->acceptably_close($feature_id,$sim_cutoff); | ||
3267 : | @hits = (); | ||
3268 : | |||
3269 : | foreach $neigh (grep { $_ =~ /peg/ } @$neighbors) | ||
3270 : | { | ||
3271 : | next if ($neigh eq $feature_id); | ||
3272 : | $similar2 = $self->acceptably_close($neigh,$sim_cutoff); | ||
3273 : | ($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound); | ||
3274 : | if ($sc >= $coupling_cutoff) | ||
3275 : | { | ||
3276 : | push(@hits,[$sc,$neigh,$ev]); | ||
3277 : | } | ||
3278 : | } | ||
3279 : | if ($keep_record) | ||
3280 : | { | ||
3281 : | $self->add_chr_clusters_and_pins($feature_id,\@hits); | ||
3282 : | } | ||
3283 : | return sort { $b->[0] <=> $a->[0] } @hits; | ||
3284 : | } | ||
3285 : | |||
3286 : | overbeek | 1.35 | sub fast_coupling { |
3287 : | my($self,$peg,$bound,$coupling_cutoff) = @_; | ||
3288 : | my($genome,$genome1,$genome2,$peg1,$peg2,$peg3,%maps,$loc,$loc1,$loc2,$loc3); | ||
3289 : | my($pairs,$sc,%ev); | ||
3290 : | |||
3291 : | overbeek | 1.136 | if ($self->is_deleted_fid($peg)) { return undef } |
3292 : | |||
3293 : | overbeek | 1.35 | my @ans = (); |
3294 : | |||
3295 : | $genome = &genome_of($peg); | ||
3296 : | foreach $peg1 ($self->in_pch_pin_with($peg)) | ||
3297 : | { | ||
3298 : | $peg1 =~ s/,.*$//; | ||
3299 : | if ($peg ne $peg1) | ||
3300 : | { | ||
3301 : | $genome1 = &genome_of($peg1); | ||
3302 : | $maps{$peg}->{$genome1} = $peg1; | ||
3303 : | } | ||
3304 : | } | ||
3305 : | |||
3306 : | $loc = [&boundaries_of(scalar $self->feature_location($peg))]; | ||
3307 : | foreach $peg1 ($self->in_cluster_with($peg)) | ||
3308 : | { | ||
3309 : | if ($peg ne $peg1) | ||
3310 : | { | ||
3311 : | # print STDERR "peg1=$peg1\n"; | ||
3312 : | $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))]; | ||
3313 : | if (&close_enough($loc,$loc1,$bound)) | ||
3314 : | { | ||
3315 : | foreach $peg2 ($self->in_pch_pin_with($peg1)) | ||
3316 : | { | ||
3317 : | $genome2 = &genome_of($peg2); | ||
3318 : | if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3)) | ||
3319 : | { | ||
3320 : | $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))]; | ||
3321 : | $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))]; | ||
3322 : | if (&close_enough($loc2,$loc3,$bound)) | ||
3323 : | { | ||
3324 : | push(@{$ev{$peg1}},[$peg3,$peg2]); | ||
3325 : | } | ||
3326 : | } | ||
3327 : | } | ||
3328 : | } | ||
3329 : | } | ||
3330 : | } | ||
3331 : | foreach $peg1 (keys(%ev)) | ||
3332 : | { | ||
3333 : | $pairs = $ev{$peg1}; | ||
3334 : | overbeek | 1.43 | $sc = $self->score([$peg,map { $_->[0] } @$pairs]); |
3335 : | overbeek | 1.35 | if ($sc >= $coupling_cutoff) |
3336 : | { | ||
3337 : | push(@ans,[$sc,$peg1]); | ||
3338 : | } | ||
3339 : | } | ||
3340 : | return sort { $b->[0] <=> $a->[0] } @ans; | ||
3341 : | } | ||
3342 : | |||
3343 : | |||
3344 : | sub score { | ||
3345 : | overbeek | 1.43 | my($self,$pegs) = @_; |
3346 : | overbeek | 1.51 | my(@ids); |
3347 : | overbeek | 1.35 | |
3348 : | overbeek | 1.51 | if ($self->{_no9s_scoring}) |
3349 : | { | ||
3350 : | @ids = map { $self->maps_to_id($_) } grep { $_ !~ /^fig\|999999/ } @$pegs; | ||
3351 : | } | ||
3352 : | else | ||
3353 : | { | ||
3354 : | @ids = map { $self->maps_to_id($_) } @$pegs; | ||
3355 : | } | ||
3356 : | overbeek | 1.43 | return &score1($self,\@ids) - 1; |
3357 : | } | ||
3358 : | |||
3359 : | sub score1 { | ||
3360 : | my($self,$pegs) = @_; | ||
3361 : | my($sim); | ||
3362 : | overbeek | 1.204 | |
3363 : | my($iden_cutoff) = 97; | ||
3364 : | my($iden_cutoff_gap) = 100 - $iden_cutoff; | ||
3365 : | |||
3366 : | overbeek | 1.43 | my($first,@rest) = @$pegs; |
3367 : | my $count = 1; | ||
3368 : | my %hits = map { $_ => 1 } @rest; | ||
3369 : | my @ordered = sort { $b->[0] <=> $a->[0] } | ||
3370 : | map { $sim = $_; [$sim->iden,$sim->id2] } | ||
3371 : | grep { $hits{$_->id2} } | ||
3372 : | $self->sims($first,1000,1,"raw"); | ||
3373 : | overbeek | 1.76 | my %ordered = map { $_->[1] => 1 } @ordered; |
3374 : | foreach $_ (@rest) | ||
3375 : | { | ||
3376 : | if (! $ordered{$_}) | ||
3377 : | { | ||
3378 : | push(@ordered,[0,$_]); | ||
3379 : | } | ||
3380 : | } | ||
3381 : | |||
3382 : | overbeek | 1.204 | while ((@ordered > 0) && ($ordered[0]->[0] >= $iden_cutoff)) |
3383 : | overbeek | 1.35 | { |
3384 : | overbeek | 1.43 | shift @ordered ; |
3385 : | } | ||
3386 : | while (@ordered > 0) | ||
3387 : | { | ||
3388 : | my $start = $ordered[0]->[0]; | ||
3389 : | $_ = shift @ordered; | ||
3390 : | my @sub = ( $_->[1] ); | ||
3391 : | overbeek | 1.204 | while ((@ordered > 0) && ($ordered[0]->[0] > ($start-$iden_cutoff_gap))) |
3392 : | overbeek | 1.35 | { |
3393 : | overbeek | 1.43 | $_ = shift @ordered; |
3394 : | push(@sub, $_->[1]); | ||
3395 : | overbeek | 1.35 | } |
3396 : | |||
3397 : | overbeek | 1.43 | if (@sub == 1) |
3398 : | { | ||
3399 : | $count++; | ||
3400 : | } | ||
3401 : | else | ||
3402 : | { | ||
3403 : | $count += &score1($self,\@sub); | ||
3404 : | } | ||
3405 : | overbeek | 1.35 | } |
3406 : | overbeek | 1.43 | return $count; |
3407 : | overbeek | 1.35 | } |
3408 : | |||
3409 : | efrank | 1.1 | =pod |
3410 : | |||
3411 : | =head1 add_chr_clusters_and_pins | ||
3412 : | |||
3413 : | usage: $fig->add_chr_clusters_and_pins($peg,$hits) | ||
3414 : | |||
3415 : | The system supports retaining data relating to functional coupling. If a user | ||
3416 : | computes evidence once and then saves it with this routine, data relating to | ||
3417 : | both "the pin" and the "clusters" (in all of the organisms supporting the | ||
3418 : | functional coupling) will be saved. | ||
3419 : | |||
3420 : | $hits must be a pointer to a list of 3-tuples of the sort returned by | ||
3421 : | $fig->coupling_and_evidence. | ||
3422 : | |||
3423 : | =cut | ||
3424 : | |||
3425 : | sub add_chr_clusters_and_pins { | ||
3426 : | my($self,$peg,$hits) = @_; | ||
3427 : | my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection); | ||
3428 : | my($genome,$cluster,$pin,$peg2); | ||
3429 : | |||
3430 : | if (@$hits > 0) | ||
3431 : | { | ||
3432 : | @clusters = (); | ||
3433 : | @pins = (); | ||
3434 : | push(@clusters,[$peg,map { $_->[1] } @$hits]); | ||
3435 : | foreach $x (@$hits) | ||
3436 : | { | ||
3437 : | ($sc,$neigh,$pairs) = @$x; | ||
3438 : | push(@pins,[$neigh,map { $_->[1] } @$pairs]); | ||
3439 : | foreach $y (@$pairs) | ||
3440 : | { | ||
3441 : | $peg2 = $y->[0]; | ||
3442 : | if ($peg2 =~ /^fig\|(\d+\.\d+)/) | ||
3443 : | { | ||
3444 : | $projection{$1}->{$peg2} = 1; | ||
3445 : | } | ||
3446 : | } | ||
3447 : | } | ||
3448 : | @corr = (); | ||
3449 : | @orgs = keys(%projection); | ||
3450 : | if (@orgs > 0) | ||
3451 : | { | ||
3452 : | foreach $genome (sort { $a <=> $b } @orgs) | ||
3453 : | { | ||
3454 : | push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}})); | ||
3455 : | } | ||
3456 : | push(@pins,[$peg,@corr]); | ||
3457 : | } | ||
3458 : | |||
3459 : | foreach $cluster (@clusters) | ||
3460 : | { | ||
3461 : | $self->add_chromosomal_cluster($cluster); | ||
3462 : | } | ||
3463 : | |||
3464 : | foreach $pin (@pins) | ||
3465 : | { | ||
3466 : | $self->add_pch_pin($pin); | ||
3467 : | } | ||
3468 : | } | ||
3469 : | } | ||
3470 : | |||
3471 : | sub coupling_ev { | ||
3472 : | my($self,$genome1,$sim1,$sim2,$bound) = @_; | ||
3473 : | my($ev,$sc,$i,$j); | ||
3474 : | |||
3475 : | $ev = []; | ||
3476 : | |||
3477 : | $i = 0; | ||
3478 : | $j = 0; | ||
3479 : | while (($i < @$sim1) && ($j < @$sim2)) | ||
3480 : | { | ||
3481 : | if ($sim1->[$i]->[0] < $sim2->[$j]->[0]) | ||
3482 : | { | ||
3483 : | $i++; | ||
3484 : | } | ||
3485 : | elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0]) | ||
3486 : | { | ||
3487 : | $j++; | ||
3488 : | } | ||
3489 : | else | ||
3490 : | { | ||
3491 : | overbeek | 1.193 | $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev); |
3492 : | efrank | 1.1 | $i++; |
3493 : | $j++; | ||
3494 : | } | ||
3495 : | } | ||
3496 : | overbeek | 1.43 | return ($self->score([map { $_->[0] } @$ev]),$ev); |
3497 : | efrank | 1.1 | } |
3498 : | |||
3499 : | sub accumulate_ev { | ||
3500 : | my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_; | ||
3501 : | overbeek | 1.43 | my($genome2,@locs1,@locs2,$i,$j,$x); |
3502 : | efrank | 1.1 | |
3503 : | if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 } | ||
3504 : | |||
3505 : | $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/; | ||
3506 : | $genome2 = $1; | ||
3507 : | @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1; | ||
3508 : | @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2; | ||
3509 : | |||
3510 : | for ($i=0; ($i < @$feature_ids1); $i++) | ||
3511 : | { | ||
3512 : | for ($j=0; ($j < @$feature_ids2); $j++) | ||
3513 : | { | ||
3514 : | parrello | 1.200 | if (($feature_ids1->[$i] ne $feature_ids2->[$j]) && |
3515 : | efrank | 1.1 | &close_enough($locs1[$i],$locs2[$j],$bound)) |
3516 : | { | ||
3517 : | push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]); | ||
3518 : | } | ||
3519 : | } | ||
3520 : | } | ||
3521 : | } | ||
3522 : | |||
3523 : | sub close_enough { | ||
3524 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
3525 : | efrank | 1.1 | my($locs1,$locs2,$bound) = @_; |
3526 : | |||
3527 : | # print STDERR &Dumper(["close enough",$locs1,$locs2]); | ||
3528 : | return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound)); | ||
3529 : | } | ||
3530 : | |||
3531 : | sub acceptably_close { | ||
3532 : | my($self,$feature_id,$sim_cutoff) = @_; | ||
3533 : | my(%by_org,$id2,$genome,$sim); | ||
3534 : | |||
3535 : | my($ans) = []; | ||
3536 : | |||
3537 : | overbeek | 1.31 | foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig")) |
3538 : | efrank | 1.1 | { |
3539 : | $id2 = $sim->id2; | ||
3540 : | if ($id2 =~ /^fig\|(\d+\.\d+)/) | ||
3541 : | { | ||
3542 : | my $genome = $1; | ||
3543 : | overbeek | 1.51 | if (! $self->is_eukaryotic($genome)) |
3544 : | efrank | 1.1 | { |
3545 : | push(@{$by_org{$genome}},$id2); | ||
3546 : | } | ||
3547 : | } | ||
3548 : | } | ||
3549 : | foreach $genome (sort { $a <=> $b } keys(%by_org)) | ||
3550 : | { | ||
3551 : | push(@$ans,[$genome,$by_org{$genome}]); | ||
3552 : | } | ||
3553 : | return $ans; | ||
3554 : | } | ||
3555 : | |||
3556 : | ################ Translations of PEGsand External Protein Sequences ########################## | ||
3557 : | |||
3558 : | |||
3559 : | =pod | ||
3560 : | |||
3561 : | =head1 translatable | ||
3562 : | |||
3563 : | usage: $fig->translatable($prot_id) | ||
3564 : | |||
3565 : | The system takes any number of sources of protein sequences as input (and builds an nr | ||
3566 : | for the purpose of computing similarities). For each of these input fasta files, it saves | ||
3567 : | (in the DB) a filename, seek address and length so that it can go get the translation if | ||
3568 : | needed. This routine simply returns true iff info on the translation exists. | ||
3569 : | |||
3570 : | =cut | ||
3571 : | |||
3572 : | sub translatable { | ||
3573 : | my($self,$prot) = @_; | ||
3574 : | |||
3575 : | return &translation_length($self,$prot) ? 1 : 0; | ||
3576 : | } | ||
3577 : | parrello | 1.200 | |
3578 : | efrank | 1.1 | |
3579 : | =pod | ||
3580 : | |||
3581 : | =head1 translation_length | ||
3582 : | |||
3583 : | usage: $len = $fig->translation_length($prot_id) | ||
3584 : | |||
3585 : | The system takes any number of sources of protein sequences as input (and builds an nr | ||
3586 : | for the purpose of computing similarities). For each of these input fasta files, it saves | ||
3587 : | (in the DB) a filename, seek address and length so that it can go get the translation if | ||
3588 : | needed. This routine returns the length of a translation. This does not require actually | ||
3589 : | retrieving the translation. | ||
3590 : | |||
3591 : | =cut | ||
3592 : | |||
3593 : | sub translation_length { | ||
3594 : | my($self,$prot) = @_; | ||
3595 : | |||
3596 : | overbeek | 1.136 | if ($self->is_deleted_fid($prot)) { return undef } |
3597 : | |||
3598 : | efrank | 1.1 | $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; |
3599 : | my $rdbH = $self->db_handle; | ||
3600 : | parrello | 1.200 | my $relational_db_response = $rdbH->SQL("SELECT slen,seek FROM protein_sequence_seeks |
3601 : | efrank | 1.1 | WHERE id = \'$prot\' "); |
3602 : | |||
3603 : | overbeek | 1.145 | my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response; |
3604 : | return (@vals > 0) ? $vals[0]->[0] : undef; | ||
3605 : | efrank | 1.1 | } |
3606 : | |||
3607 : | |||
3608 : | =pod | ||
3609 : | |||
3610 : | =head1 get_translation | ||
3611 : | |||
3612 : | usage: $translation = $fig->get_translation($prot_id) | ||
3613 : | |||
3614 : | The system takes any number of sources of protein sequences as input (and builds an nr | ||
3615 : | for the purpose of computing similarities). For each of these input fasta files, it saves | ||
3616 : | (in the DB) a filename, seek address and length so that it can go get the translation if | ||
3617 : | needed. This routine returns a protein sequence. | ||
3618 : | |||
3619 : | =cut | ||
3620 : | |||
3621 : | sub get_translation { | ||
3622 : | my($self,$id) = @_; | ||
3623 : | my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran); | ||
3624 : | |||
3625 : | overbeek | 1.136 | if ($self->is_deleted_fid($id)) { return '' } |
3626 : | |||
3627 : | efrank | 1.1 | $rdbH = $self->db_handle; |
3628 : | $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; | ||
3629 : | |||
3630 : | $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); | ||
3631 : | |||
3632 : | overbeek | 1.256 | if ((! ($relational_db_response && @$relational_db_response > 0)) && |
3633 : | ($id !~ /^fig\|/) && | ||
3634 : | ($id = $self->by_alias($id))) | ||
3635 : | { | ||
3636 : | $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); | ||
3637 : | } | ||
3638 : | |||
3639 : | overbeek | 1.145 | if ($relational_db_response && @$relational_db_response > 0) |
3640 : | efrank | 1.1 | { |
3641 : | overbeek | 1.145 | my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response; |
3642 : | ($fileN,$seek,$ln) = @{$vals[0]}; | ||
3643 : | efrank | 1.1 | if (($fh = $self->openF($self->N2file($fileN))) && |
3644 : | ($ln > 10)) | ||
3645 : | { | ||
3646 : | seek($fh,$seek,0); | ||
3647 : | read($fh,$tran,$ln-1); | ||
3648 : | $tran =~ s/\s//g; | ||
3649 : | return $tran; | ||
3650 : | } | ||
3651 : | } | ||
3652 : | return ''; | ||
3653 : | } | ||
3654 : | |||
3655 : | =pod | ||
3656 : | |||
3657 : | =head1 mapped_prot_ids | ||
3658 : | |||
3659 : | usage: @mapped = $fig->mapped_prot_ids($prot) | ||
3660 : | |||
3661 : | This routine is at the heart of maintaining synonyms for protein sequences. The system | ||
3662 : | determines which protein sequences are "essentially the same". These may differ in length | ||
3663 : | (presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended). | ||
3664 : | Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted | ||
3665 : | parrello | 1.200 | by length. |
3666 : | efrank | 1.1 | |
3667 : | =cut | ||
3668 : | |||
3669 : | sub mapped_prot_ids { | ||
3670 : | my($self,$id) = @_; | ||
3671 : | parrello | 1.200 | |
3672 : | overbeek | 1.136 | if ($self->is_deleted_fid($id)) { return () } |
3673 : | |||
3674 : | efrank | 1.1 | my $rdbH = $self->db_handle; |
3675 : | my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); | ||
3676 : | if ($relational_db_response && (@$relational_db_response == 1)) | ||
3677 : | { | ||
3678 : | $id = $relational_db_response->[0]->[0]; | ||
3679 : | } | ||
3680 : | |||
3681 : | $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' "); | ||
3682 : | if ($relational_db_response && (@$relational_db_response > 0)) | ||
3683 : | { | ||
3684 : | return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response); | ||
3685 : | } | ||
3686 : | else | ||
3687 : | { | ||
3688 : | overbeek | 1.241 | my $len = $self->translation_length($id); |
3689 : | if ($len) | ||
3690 : | { | ||
3691 : | return ([$id,$len]); | ||
3692 : | } | ||
3693 : | else | ||
3694 : | { | ||
3695 : | return (); | ||
3696 : | } | ||
3697 : | efrank | 1.1 | } |
3698 : | overbeek | 1.14 | } |
3699 : | |||
3700 : | sub maps_to_id { | ||
3701 : | my($self,$id) = @_; | ||
3702 : | parrello | 1.200 | |
3703 : | overbeek | 1.14 | my $rdbH = $self->db_handle; |
3704 : | my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); | ||
3705 : | return ($relational_db_response && (@$relational_db_response == 1)) ? $relational_db_response->[0]->[0] : $id; | ||
3706 : | efrank | 1.1 | } |
3707 : | |||
3708 : | olson | 1.244 | ################ GFF3 utilities ########################## |
3709 : | |||
3710 : | sub get_gff_writer | ||
3711 : | { | ||
3712 : | my($self, %options) = @_; | ||
3713 : | |||
3714 : | olson | 1.246 | my $w = GFFWriter->new($self, %options); |
3715 : | olson | 1.244 | |
3716 : | return $w; | ||
3717 : | } | ||
3718 : | |||
3719 : | |||
3720 : | efrank | 1.1 | ################ Assignments of Function to PEGs ########################## |
3721 : | |||
3722 : | overbeek | 1.146 | # set to undef to unset user |
3723 : | # | ||
3724 : | sub set_user { | ||
3725 : | my($self,$user) = @_; | ||
3726 : | |||
3727 : | $self->{_user} = $user; | ||
3728 : | } | ||
3729 : | |||
3730 : | sub get_user { | ||
3731 : | my($self) = @_; | ||
3732 : | |||
3733 : | return $self->{_user}; | ||
3734 : | } | ||
3735 : | |||
3736 : | efrank | 1.1 | =pod |
3737 : | |||
3738 : | =head1 function_of | ||
3739 : | |||
3740 : | usage: @functions = $fig->function_of($peg) OR | ||
3741 : | $function = $fig->function_of($peg,$user) | ||
3742 : | parrello | 1.200 | |
3743 : | efrank | 1.1 | In a list context, you get back a list of 2-tuples. Each 2-tuple is of the |
3744 : | form [MadeBy,Function]. | ||
3745 : | |||
3746 : | In a scalar context, | ||
3747 : | |||
3748 : | 1. user is "master" if not specified | ||
3749 : | 2. function returned is the user's, if one exists; otherwise, master's, if one exists | ||
3750 : | |||
3751 : | In a scalar context, you get just the function. | ||
3752 : | |||
3753 : | =cut | ||
3754 : | |||
3755 : | # Note that we do not return confidence. I propose a separate function to get both | ||
3756 : | # function and confidence | ||
3757 : | # | ||
3758 : | sub function_of { | ||
3759 : | my($self,$id,$user) = @_; | ||
3760 : | my($relational_db_response,@tmp,$entry,$i); | ||
3761 : | my $wantarray = wantarray(); | ||
3762 : | my $rdbH = $self->db_handle; | ||
3763 : | |||
3764 : | overbeek | 1.136 | if ($self->is_deleted_fid($id)) { return $wantarray ? () : "" } |
3765 : | |||
3766 : | efrank | 1.1 | if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user)) |
3767 : | { | ||
3768 : | if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) && | ||
3769 : | (@$relational_db_response >= 1)) | ||
3770 : | { | ||
3771 : | overbeek | 1.191 | @tmp = sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/^\s//; $_->[1] =~ s/(\t\S)?\s*$//; [$_->[0],$_->[1]] } @$relational_db_response; |
3772 : | efrank | 1.1 | for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {} |
3773 : | if ($i < @tmp) | ||
3774 : | { | ||
3775 : | $entry = splice(@tmp,$i,1); | ||
3776 : | unshift @tmp, ($entry); | ||
3777 : | } | ||
3778 : | |||
3779 : | my $val; | ||
3780 : | if ($wantarray) { return @tmp } | ||
3781 : | elsif ($user && ($val = &extract_by_who(\@tmp,$user))) { return $val } | ||
3782 : | elsif ($user && ($val = &extract_by_who(\@tmp,"master"))) { return $val } | ||
3783 : | else { return "" } | ||
3784 : | } | ||
3785 : | } | ||
3786 : | else | ||
3787 : | { | ||
3788 : | if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) && | ||
3789 : | (@$relational_db_response >= 1)) | ||
3790 : | { | ||
3791 : | parrello | 1.200 | $relational_db_response->[0]->[0] =~ s/^\s//; $relational_db_response->[0]->[0] =~ s/(\t\S)?\s*$//; |
3792 : | efrank | 1.1 | return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0]; |
3793 : | } | ||
3794 : | } | ||
3795 : | parrello | 1.200 | |
3796 : | efrank | 1.1 | return $wantarray ? () : ""; |
3797 : | } | ||
3798 : | |||
3799 : | =pod | ||
3800 : | |||
3801 : | =head1 translated_function_of | ||
3802 : | |||
3803 : | usage: $function = $fig->translated_function_of($peg,$user) | ||
3804 : | parrello | 1.200 | |
3805 : | efrank | 1.1 | You get just the translated function. |
3806 : | |||
3807 : | =cut | ||
3808 : | |||
3809 : | sub translated_function_of { | ||
3810 : | my($self,$id,$user) = @_; | ||
3811 : | |||
3812 : | my $func = $self->function_of($id,$user); | ||
3813 : | if ($func) | ||
3814 : | { | ||
3815 : | $func = $self->translate_function($func); | ||
3816 : | } | ||
3817 : | return $func; | ||
3818 : | } | ||
3819 : | parrello | 1.200 | |
3820 : | efrank | 1.1 | |
3821 : | sub extract_by_who { | ||
3822 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
3823 : | efrank | 1.1 | my($xL,$who) = @_; |
3824 : | my($i); | ||
3825 : | |||
3826 : | for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {} | ||
3827 : | return ($i < @$xL) ? $xL->[$i]->[1] : ""; | ||
3828 : | } | ||
3829 : | |||
3830 : | |||
3831 : | =pod | ||
3832 : | |||
3833 : | =head1 translate_function | ||
3834 : | |||
3835 : | usage: $translated_func = $fig->translate_function($func) | ||
3836 : | |||
3837 : | parrello | 1.200 | Translates a function based on the function.synonyms table. |
3838 : | efrank | 1.1 | |
3839 : | =cut | ||
3840 : | |||
3841 : | sub translate_function { | ||
3842 : | my($self,$function) = @_; | ||
3843 : | |||
3844 : | my ($tran,$from,$to,$line); | ||
3845 : | if (! ($tran = $self->{_function_translation})) | ||
3846 : | { | ||
3847 : | $tran = {}; | ||
3848 : | if (open(TMP,"<$FIG_Config::global/function.synonyms")) | ||
3849 : | { | ||
3850 : | while (defined($line = <TMP>)) | ||
3851 : | { | ||
3852 : | golsen | 1.44 | chomp $line; |
3853 : | efrank | 1.1 | ($from,$to) = split(/\t/,$line); |
3854 : | $tran->{$from} = $to; | ||
3855 : | } | ||
3856 : | close(TMP); | ||
3857 : | } | ||
3858 : | overbeek | 1.22 | foreach $from (keys(%$tran)) |
3859 : | { | ||
3860 : | $to = $tran->{$from}; | ||
3861 : | if ($tran->{$to}) | ||
3862 : | { | ||
3863 : | delete $tran->{$from}; | ||
3864 : | } | ||
3865 : | } | ||
3866 : | efrank | 1.1 | $self->{_function_translation} = $tran; |
3867 : | } | ||
3868 : | overbeek | 1.4 | |
3869 : | while ($to = $tran->{$function}) | ||
3870 : | { | ||
3871 : | $function = $to; | ||
3872 : | } | ||
3873 : | return $function; | ||
3874 : | efrank | 1.1 | } |
3875 : | parrello | 1.200 | |
3876 : | efrank | 1.1 | =pod |
3877 : | |||
3878 : | =head1 assign_function | ||
3879 : | |||
3880 : | usage: $fig->assign_function($peg,$user,$function,$confidence) | ||
3881 : | |||
3882 : | Assigns a function. Note that confidence can (and should be if unusual) included. | ||
3883 : | Note that no annotation is written. This should normally be done in a separate | ||
3884 : | call of the form | ||
3885 : | |||
3886 : | parrello | 1.200 | |
3887 : | efrank | 1.1 | |
3888 : | =cut | ||
3889 : | |||
3890 : | sub assign_function { | ||
3891 : | my($self,$peg,$user,$function,$confidence) = @_; | ||
3892 : | overbeek | 1.229 | my($role,$roleQ,$kvs,$kv,$k,$v); |
3893 : | efrank | 1.1 | |
3894 : | overbeek | 1.197 | if (! $self->is_real_feature($peg)) { return 0 } |
3895 : | overbeek | 1.136 | |
3896 : | overbeek | 1.229 | my $genome = $self->genome_of($peg); |
3897 : | |||
3898 : | overbeek | 1.228 | $function =~ s/\s+/ /sg; |
3899 : | $function =~ s/^\s+//; | ||
3900 : | $function =~ s/\s+$//; | ||
3901 : | overbeek | 1.229 | if ($function =~ /^(.*?)\!(.*)/) |
3902 : | { | ||
3903 : | ($function,$kvs) = ($1,$2); | ||
3904 : | if ($kvs) | ||
3905 : | { | ||
3906 : | $kvs =~ s/^\s+//; | ||
3907 : | $kvs =~ s/\s+$//; | ||
3908 : | foreach $kv (split(/\s+\!\s+/,$kvs)) | ||
3909 : | { | ||
3910 : | if ($kv =~ /^([A-Za-z0-9._\-\+\%]+)\s+\^\s+(.*)$/) | ||
3911 : | { | ||
3912 : | ($k,$v) = ($1,$2); | ||
3913 : | if ($v !~ /\S/) | ||
3914 : | { | ||
3915 : | &replace_peg_key_value($self,$peg,$k,""); | ||
3916 : | } | ||
3917 : | else | ||
3918 : | { | ||
3919 : | &replace_peg_key_value($self,$peg,$k,$v); | ||
3920 : | } | ||
3921 : | } | ||
3922 : | elsif ($kv =~ /^([A-Za-z0-9._\-\+\%]+)$/) | ||
3923 : | { | ||
3924 : | &replace_peg_key_value($self,$peg,$1,1); | ||
3925 : | } | ||
3926 : | } | ||
3927 : | } | ||
3928 : | } | ||
3929 : | overbeek | 1.228 | |
3930 : | efrank | 1.1 | my $rdbH = $self->db_handle; |
3931 : | $confidence = $confidence ? $confidence : ""; | ||
3932 : | |||
3933 : | $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); | ||
3934 : | |||
3935 : | my $funcQ = quotemeta $function; | ||
3936 : | $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )"); | ||
3937 : | $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); | ||
3938 : | |||
3939 : | foreach $role (&roles_of_function($function)) | ||
3940 : | { | ||
3941 : | $roleQ = quotemeta $role; | ||
3942 : | $rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\', \'$genome\' )"); | ||
3943 : | } | ||
3944 : | |||
3945 : | &verify_dir("$FIG_Config::organisms/$genome/UserModels"); | ||
3946 : | parrello | 1.200 | if ($user ne "master") |
3947 : | efrank | 1.1 | { |
3948 : | &verify_dir("$FIG_Config::organisms/$genome/UserModels/$user"); | ||
3949 : | } | ||
3950 : | |||
3951 : | overbeek | 1.66 | my $file; |
3952 : | if ((($user eq "master") && ($file = "$FIG_Config::organisms/$genome/assigned_functions") && open(TMP,">>$file")) || | ||
3953 : | (($user ne "master") && ($file = "$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions") && open(TMP,">>$file"))) | ||
3954 : | efrank | 1.1 | { |
3955 : | flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; | ||
3956 : | seek(TMP,0,2) || confess "failed to seek to the end of the file"; | ||
3957 : | print TMP "$peg\t$function\t$confidence\n"; | ||
3958 : | close(TMP); | ||
3959 : | overbeek | 1.66 | chmod(0777,$file); |
3960 : | efrank | 1.1 | return 1; |
3961 : | } | ||
3962 : | overbeek | 1.125 | else |
3963 : | { | ||
3964 : | print STDERR "FAILED ASSIGNMENT: $peg\t$function\t$confidence\n"; | ||
3965 : | } | ||
3966 : | efrank | 1.1 | return 0; |
3967 : | } | ||
3968 : | |||
3969 : | sub hypo { | ||
3970 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
3971 : | efrank | 1.1 | my $x = (@_ == 1) ? $_[0] : $_[1]; |
3972 : | |||
3973 : | overbeek | 1.23 | if (! $x) { return 1 } |
3974 : | if ($x =~ /hypoth/i) { return 1 } | ||
3975 : | if ($x =~ /conserved protein/i) { return 1 } | ||
3976 : | overbeek | 1.63 | if ($x =~ /gene product/i) { return 1 } |
3977 : | if ($x =~ /interpro/i) { return 1 } | ||
3978 : | if ($x =~ /B[sl][lr]\d/i) { return 1 } | ||
3979 : | if ($x =~ /^U\d/) { return 1 } | ||
3980 : | if ($x =~ /^orf/i) { return 1 } | ||
3981 : | if ($x =~ /uncharacterized/i) { return 1 } | ||
3982 : | if ($x =~ /psedogene/i) { return 1 } | ||
3983 : | if ($x =~ /^predicted/i) { return 1 } | ||
3984 : | if ($x =~ /AGR_/) { return 1 } | ||
3985 : | overbeek | 1.51 | if ($x =~ /similar to/i) { return 1 } |
3986 : | overbeek | 1.63 | if ($x =~ /similarity/i) { return 1 } |
3987 : | if ($x =~ /glimmer/i) { return 1 } | ||
3988 : | overbeek | 1.23 | if ($x =~ /unknown/i) { return 1 } |
3989 : | overbeek | 1.204 | if (($x =~ /domain/i) || |
3990 : | ($x =~ /^y[a-z]{2,4}\b/i) || | ||
3991 : | ($x =~ /complete/i) || | ||
3992 : | overbeek | 1.250 | ($x =~ /ensang/i) || |
3993 : | ($x =~ /unnamed/i) || | ||
3994 : | ($x =~ /EG:/) || | ||
3995 : | ($x =~ /[a-zA-Z]{2,3}\|/) || | ||
3996 : | overbeek | 1.204 | ($x =~ /predicted by Psort/) || |
3997 : | ($x =~ /^bh\d+/i) || | ||
3998 : | ($x =~ /cds_/i) || | ||
3999 : | ($x =~ /^[a-z]{2,3}\d+/i) || | ||
4000 : | ($x =~ /similar to/i) || | ||
4001 : | ($x =~ / identi/i) || | ||
4002 : | ($x =~ /structural feature/i)) { return 1 } | ||
4003 : | overbeek | 1.23 | return 0; |
4004 : | efrank | 1.1 | } |
4005 : | |||
4006 : | ############################ Similarities ############################### | ||
4007 : | |||
4008 : | =pod | ||
4009 : | |||
4010 : | =head1 sims | ||
4011 : | |||
4012 : | usage: @sims = $fig->sims($peg,$maxN,$maxP,$select) | ||
4013 : | |||
4014 : | Returns a list of similarities for $peg such that | ||
4015 : | |||
4016 : | there will be at most $maxN similarities, | ||
4017 : | |||
4018 : | each similarity will have a P-score <= $maxP, and | ||
4019 : | |||
4020 : | $select gives processing instructions: | ||
4021 : | |||
4022 : | "raw" means that the similarities will not be expanded (by far fastest option) | ||
4023 : | "fig" means return only similarities to fig genes | ||
4024 : | "all" means that you want all the expanded similarities. | ||
4025 : | |||
4026 : | By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant | ||
4027 : | parrello | 1.200 | protein collection, and converting it to a set of similarities (one for each of the |
4028 : | efrank | 1.1 | proteins that are essentially identical to the representative in the nr). |
4029 : | |||
4030 : | redwards | 1.188 | Each entry in @sims is a refence to an array. These are the values in each array position: |
4031 : | |||
4032 : | golsen | 1.238 | 0. The query peg |
4033 : | 1. The similar peg | ||
4034 : | 2. The percent id | ||
4035 : | 3. Alignment length | ||
4036 : | 4. Mismatches | ||
4037 : | 5. Gap openings | ||
4038 : | 6. The start of the match in the query peg | ||
4039 : | 7. The end of the match in the query peg | ||
4040 : | 8. The start of the match in the similar peg | ||
4041 : | 9. The end of the match in the similar peg | ||
4042 : | redwards | 1.188 | 10. E value |
4043 : | redwards | 1.189 | 11. Bit score |
4044 : | redwards | 1.188 | 12. Length of query peg |
4045 : | 13. Length of similar peg | ||
4046 : | 14. Method | ||
4047 : | |||
4048 : | efrank | 1.1 | =cut |
4049 : | parrello | 1.200 | |
4050 : | efrank | 1.1 | sub sims { |
4051 : | golsen | 1.238 | my ( $self, $id, $maxN, $maxP, $select, $max_expand, $filters ) = @_; |
4052 : | my( $sim ); | ||
4053 : | efrank | 1.1 | |
4054 : | golsen | 1.238 | $max_expand = defined( $max_expand ) ? $max_expand : $maxN; |
4055 : | |||
4056 : | golsen | 1.255 | return () if $self->is_deleted_fid( $id ); |
4057 : | golsen | 1.238 | |
4058 : | my @maps_to = $self->mapped_prot_ids( $id ); | ||
4059 : | ( @maps_to > 0 ) or return (); | ||
4060 : | |||
4061 : | my $rep_id = $maps_to[0]->[0]; | ||
4062 : | if ( ! defined( $maps_to[0]->[1] ) ) | ||
4063 : | { | ||
4064 : | print STDERR &Dumper( \@maps_to ); | ||
4065 : | confess "bad"; | ||
4066 : | } | ||
4067 : | |||
4068 : | my @entry = grep { $_->[0] eq $id } @maps_to; | ||
4069 : | ( @entry == 1 ) and defined( $entry[0]->[1] ) or return (); | ||
4070 : | |||
4071 : | # Get the similarities | ||
4072 : | |||
4073 : | my @raw_sims = get_raw_sims( $self, $rep_id, $maxP, $filters ); | ||
4074 : | |||
4075 : | # If the query is not the representative, make sims look like it is | ||
4076 : | # by replacing id1 and fixing match coordinates if lengths differ. | ||
4077 : | overbeek | 1.136 | |
4078 : | golsen | 1.238 | my $delta = $maps_to[0]->[1] - $entry[0]->[1]; |
4079 : | if ( $id ne $rep_id ) | ||
4080 : | efrank | 1.1 | { |
4081 : | golsen | 1.238 | foreach $sim ( @raw_sims ) |
4082 : | efrank | 1.1 | { |
4083 : | golsen | 1.238 | $sim->[0] = $id; |
4084 : | $sim->[6] -= $delta; | ||
4085 : | $sim->[7] -= $delta; | ||
4086 : | } | ||
4087 : | } | ||
4088 : | |||
4089 : | # The query must be present for expanding matches to identical sequences. | ||
4090 : | efrank | 1.1 | |
4091 : | golsen | 1.238 | if ( ( $max_expand > 0 ) && ( $select ne "raw" ) ) |
4092 : | { | ||
4093 : | unshift( @raw_sims, bless( [ $id, | ||
4094 : | $rep_id, | ||
4095 : | "100.00", | ||
4096 : | $entry[0]->[1], | ||
4097 : | 0, | ||
4098 : | 0, | ||
4099 : | 1, $entry[0]->[1], | ||
4100 : | $delta+1, $maps_to[0]->[1], | ||
4101 : | 0.0, | ||
4102 : | 2 * $entry[0]->[1], | ||
4103 : | $entry[0]->[1], | ||
4104 : | $maps_to[0]->[1], | ||
4105 : | "blastp" | ||
4106 : | ], 'Sim' | ||
4107 : | ) | ||
4108 : | ); | ||
4109 : | $max_expand++; | ||
4110 : | efrank | 1.1 | } |
4111 : | golsen | 1.238 | |
4112 : | # print STDERR "\n\n"; for ( @raw_sims ) { print STDERR join( ", ", @{ $_ } ), "\n" } | ||
4113 : | |||
4114 : | # expand_raw_sims now handles sanity checks on id1 eq id2 and id2 | ||
4115 : | # is not deleted. This lets it keep count of the actual number of | ||
4116 : | # sims reported! | ||
4117 : | |||
4118 : | return expand_raw_sims( $self, \@raw_sims, $maxP, $select, 0, $max_expand, $filters ); | ||
4119 : | efrank | 1.1 | } |
4120 : | |||
4121 : | golsen | 1.238 | |
4122 : | # maxP is not used. It is checked by both functions that call here. | ||
4123 : | |||
4124 : | efrank | 1.1 | sub expand_raw_sims { |
4125 : | golsen | 1.238 | my( $self, $raw_sims, $maxP, $select, $dups, $max_expand, $filters ) = @_; |
4126 : | my( $sim, $id1, $id2, %others, $x ); | ||
4127 : | |||
4128 : | # Set up behavior defaults (pretty wide open): | ||
4129 : | |||
4130 : | my ( $maxN, $show_env ); | ||
4131 : | if ( $filters && ref( $filters ) eq "HASH" ) | ||
4132 : | { | ||
4133 : | defined( $filters->{ maxN } ) and $maxN = $filters->{ maxN }; | ||
4134 : | defined( $filters->{ select } ) and $select = $filters->{ select }; | ||
4135 : | defined( $filters->{ max_expand } ) and $max_expand = $filters->{ max_expand }; | ||
4136 : | defined( $filters->{ show_env } ) and $show_env = $filters->{ show_env }; | ||
4137 : | defined( $filters->{ dups } ) and $dups = $filters->{ dups }; | ||
4138 : | } | ||
4139 : | defined( $maxN ) or $maxN = 1000000; # Unlimited sims | ||
4140 : | defined( $select ) or $select = 'all'; # Show all expansions | ||
4141 : | defined( $max_expand ) or $max_expand = 0; # But none by default | ||
4142 : | defined( $show_env ) or $show_env = 1; # Show environmental by default | ||
4143 : | efrank | 1.1 | |
4144 : | golsen | 1.253 | $max_expand = 1000000000 if ( $select =~ /^figx/ ); # figx forces unlimited expand |
4145 : | |||
4146 : | efrank | 1.1 | my @sims = (); |
4147 : | golsen | 1.238 | foreach $sim ( @$raw_sims ) |
4148 : | efrank | 1.1 | { |
4149 : | $id2 = $sim->id2; | ||
4150 : | golsen | 1.238 | if ( ! $dups ) |
4151 : | { | ||
4152 : | next if $others{ $id2 }; | ||
4153 : | $others{ $id2 } = 1; | ||
4154 : | } | ||
4155 : | overbeek | 1.136 | |
4156 : | golsen | 1.238 | $id1 = $sim->id1; |
4157 : | if ( ( $select eq "raw" ) || ( $max_expand <= 0 ) ) | ||
4158 : | efrank | 1.1 | { |
4159 : | golsen | 1.238 | next if ( ! $show_env && ( $id2 =~ /^fig\|9999999/ ) ); |
4160 : | next if ( $id1 eq $id2 ) || $self->is_deleted_fid( $id2 ); | ||
4161 : | push( @sims, $sim ); | ||
4162 : | golsen | 1.249 | return @sims if ( @sims >= $maxN ); |
4163 : | efrank | 1.1 | } |
4164 : | else | ||
4165 : | { | ||
4166 : | golsen | 1.243 | my @relevant = (); |
4167 : | overbeek | 1.29 | $max_expand--; |
4168 : | |||
4169 : | golsen | 1.238 | my @maps_to = $self->mapped_prot_ids( $id2 ); |
4170 : | golsen | 1.253 | if ( $select =~ /^figx?$/ ) # Only fig |
4171 : | efrank | 1.1 | { |
4172 : | golsen | 1.238 | @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; |
4173 : | efrank | 1.1 | } |
4174 : | golsen | 1.253 | elsif ( $select =~ /^figx?_?pref/ ) # FIG preferred |
4175 : | golsen | 1.238 | { |
4176 : | @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; | ||
4177 : | if ( ! @relevant ) | ||
4178 : | { | ||
4179 : | push @sims, $sim; | ||
4180 : | golsen | 1.249 | return @sims if ( @sims >= $maxN ); |
4181 : | golsen | 1.238 | next; |
4182 : | } | ||
4183 : | } | ||
4184 : | golsen | 1.253 | elsif ( $select =~ /^ext/i ) # Not fig |
4185 : | efrank | 1.1 | { |
4186 : | @relevant = grep { $_->[0] !~ /^fig/ } @maps_to; | ||
4187 : | } | ||
4188 : | golsen | 1.253 | else # All |
4189 : | efrank | 1.1 | { |
4190 : | @relevant = @maps_to; | ||
4191 : | } | ||
4192 : | |||
4193 : | golsen | 1.238 | foreach $x ( @relevant ) |
4194 : | efrank | 1.1 | { |
4195 : | golsen | 1.238 | my ( $x_id, $x_ln ) = @$x; |
4196 : | defined( $x_ln ) || confess "x_ln id2='$id2' x_id='$x_id'"; | ||
4197 : | next if ( ! $show_env && ( $x_id =~ /^fig\|9999999/ ) ); | ||
4198 : | next if ( $id1 eq $x_id ) || $self->is_deleted_fid( $x_id ); | ||
4199 : | |||
4200 : | golsen | 1.243 | defined( $maps_to[0]->[1] ) || confess "maps_to"; |
4201 : | golsen | 1.238 | my $delta2 = $maps_to[0]->[1] - $x_ln; # Coordinate shift |
4202 : | my $sim1 = [ @$sim ]; # Make a copy | ||
4203 : | $sim1->[1] = $x_id; | ||
4204 : | efrank | 1.1 | $sim1->[8] -= $delta2; |
4205 : | $sim1->[9] -= $delta2; | ||
4206 : | golsen | 1.238 | bless( $sim1, "Sim" ); |
4207 : | push( @sims, $sim1 ); | ||
4208 : | golsen | 1.243 | return @sims if ( @sims >= $maxN ); |
4209 : | efrank | 1.1 | } |
4210 : | } | ||
4211 : | } | ||
4212 : | golsen | 1.238 | |
4213 : | efrank | 1.1 | return @sims; |
4214 : | } | ||
4215 : | |||
4216 : | golsen | 1.238 | |
4217 : | efrank | 1.1 | sub get_raw_sims { |
4218 : | golsen | 1.238 | my ( $self, $rep_id, $maxP, $filters ) = @_; |
4219 : | my ( $sim_chunk, $seek, $fileN, $ln, $fh, $file, @lines, $sim ); | ||
4220 : | |||
4221 : | # Set up behavior defaults (pretty wide open): | ||
4222 : | efrank | 1.1 | |
4223 : | golsen | 1.238 | my ( $show_env, $min_sim, $sim_meas, $min_q_cov, $min_s_cov, $sort_by ); |
4224 : | if ( $filters && ref( $filters ) eq "HASH" ) | ||
4225 : | { | ||
4226 : | defined( $filters->{ maxP } ) and $maxP = $filters->{ maxP }; | ||
4227 : | defined( $filters->{ show_env } ) and $show_env = $filters->{ show_env }; | ||
4228 : | defined( $filters->{ min_sim } ) and $min_sim = $filters->{ min_sim }; | ||
4229 : | defined( $filters->{ sim_meas } ) and $sim_meas = $filters->{ sim_meas }; | ||
4230 : | defined( $filters->{ min_q_cov } ) and $min_q_cov = $filters->{ min_q_cov }; | ||
4231 : | defined( $filters->{ min_s_cov } ) and $min_s_cov = $filters->{ min_s_cov }; | ||
4232 : | defined( $filters->{ sort_by } ) and $sort_by = $filters->{ sort_by }; | ||
4233 : | } | ||
4234 : | defined( $maxP ) or $maxP = 10; | ||
4235 : | defined( $show_env ) or $show_env = 1; | ||
4236 : | defined( $min_sim ) or $min_sim = 0; | ||
4237 : | defined( $sim_meas ) or $sim_meas = 'id'; | ||
4238 : | defined( $min_q_cov ) or $min_q_cov = 0; | ||
4239 : | defined( $min_s_cov ) or $min_s_cov = 0; | ||
4240 : | defined( $sort_by ) or $sort_by = 'bits'; | ||
4241 : | efrank | 1.1 | |
4242 : | my $rdbH = $self->db_handle; | ||
4243 : | my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' "); | ||
4244 : | golsen | 1.238 | |
4245 : | # Gather all of the acceptable "lines" from the sim chunks | ||
4246 : | |||
4247 : | foreach $sim_chunk ( @$relational_db_response ) | ||
4248 : | efrank | 1.1 | { |
4249 : | golsen | 1.238 | ( $seek, $fileN, $ln ) = @$sim_chunk; |
4250 : | $file = $self->N2file( $fileN ); | ||
4251 : | $fh = $self->openF( $file ); | ||
4252 : | $fh or confess "could not open sims for $file"; | ||
4253 : | parrello | 1.200 | |
4254 : | golsen | 1.238 | # Read file, parse lines, sanity check values, and filter E-value |
4255 : | # 0. The query peg | ||
4256 : | # 1. The similar peg | ||
4257 : | # 2. The percent id | ||
4258 : | # 3. Alignment length | ||
4259 : | # 4. Mismatches | ||
4260 : | # 5. Gap openings | ||
4261 : | # 6. The start of the match in the query peg | ||
4262 : | # 7. The end of the match in the query peg | ||
4263 : | # 8. The start of the match in the similar peg | ||
4264 : | # 9. The end of the match in the similar peg | ||
4265 : | # 10. E-value | ||
4266 : | # 11. Bit score | ||
4267 : | # 12. Length of query peg | ||
4268 : | # 13. Length of similar peg | ||
4269 : | # 14. Method | ||
4270 : | |||
4271 : | push @lines, grep { ( @$_ >= 15 ) && | ||
4272 : | ( $_->[10] =~ /^[0-9.e-]+$/ ) && # E-value | ||
4273 : | ( $_->[10] <= $maxP ) && # E-value test | ||
4274 : | ( $_->[11] =~ /^[0-9.]+$/ ) && # bit score | ||
4275 : | ( $_->[12] =~ /^\d+$/ ) && # query len | ||
4276 : | ( $_->[13] =~ /^\d+$/ ) && # subj len | ||
4277 : | ( $_->[6] =~ /^\d+$/ ) && # q-match start | ||
4278 : | ( $_->[7] =~ /^\d+$/ ) && # q-match end | ||
4279 : | ( $_->[8] =~ /^\d+$/ ) && # s-match start | ||
4280 : | ( $_->[9] =~ /^\d+$/ ) && # s-match end | ||
4281 : | ( $_->[2] =~ /^[0-9.]+$/ ) # percent id | ||
4282 : | } | ||
4283 : | map { [ split( /\t/, $_ ), "blastp" ] } | ||
4284 : | @{ read_block( $fh, $seek, $ln-1 ) }; | ||
4285 : | } | ||
4286 : | |||
4287 : | # Similarity filter | ||
4288 : | efrank | 1.1 | |
4289 : | golsen | 1.238 | if ( $min_sim > 0 ) |
4290 : | { | ||
4291 : | if ( $sim_meas eq 'id' ) | ||
4292 : | { | ||
4293 : | @lines = grep { $_->[2] >= $min_sim } @lines; | ||
4294 : | } | ||
4295 : | elsif ( $sim_meas eq 'bpp' ) | ||
4296 : | efrank | 1.1 | { |
4297 : | golsen | 1.238 | @lines = grep { $_->[11] >= $min_sim * ( $_->[7] - $_->[6] + 1 ) } @lines; |
4298 : | efrank | 1.1 | } |
4299 : | } | ||
4300 : | golsen | 1.238 | |
4301 : | # Query coverage filter | ||
4302 : | |||
4303 : | if ( $min_q_cov > 0 ) | ||
4304 : | { | ||
4305 : | my $thresh = 0.01 * $min_q_cov; | ||
4306 : | @lines = grep { ( abs( $_->[7] - $_->[6] ) + 1 ) >= ( $thresh * $_->[12] ) } @lines; | ||
4307 : | } | ||
4308 : | |||
4309 : | # Subject coverage filter | ||
4310 : | |||
4311 : | if ( $min_s_cov > 0 ) | ||
4312 : | { | ||
4313 : | my $thresh = 0.01 * $min_s_cov; | ||
4314 : | @lines = grep { ( abs( $_->[9] - $_->[8] ) + 1 ) >= ( $thresh * $_->[13] ) } @lines; | ||
4315 : | } | ||
4316 : | |||
4317 : | golsen | 1.251 | # Order the surviving raw sims by requested criterion: |
4318 : | golsen | 1.238 | |
4319 : | if ( $sort_by eq 'id' ) # Percent identity | ||
4320 : | { | ||
4321 : | @lines = sort { $b->[2] <=> $a->[2] } @lines; | ||
4322 : | } | ||
4323 : | golsen | 1.251 | |
4324 : | elsif ( $sort_by eq 'id2' ) # Percent identity adjusted | ||
4325 : | { | ||
4326 : | # Lower percent identity by 2 standard deviations to prevent random | ||
4327 : | # fluctuation in short sequences from moving them up so often. | ||
4328 : | |||
4329 : | my ( $p, $len, $sigma ); | ||
4330 : | @lines = map { $_->[0] } | ||
4331 : | sort { $b->[1] <=> $a->[1] } | ||
4332 : | map { $p = 0.01 * $_->[2]; # fraction identity | ||
4333 : | $len = abs( $_->[7] - $_->[6] ) + 1; # seq len | ||
4334 : | $sigma = sqrt( $p * ( 1 - $p ) / $len ); # binomial sigma | ||
4335 : | [ $_, $_->[2] - 200 * $sigma ] | ||
4336 : | } | ||
4337 : | @lines; | ||
4338 : | } | ||
4339 : | |||
4340 : | golsen | 1.238 | elsif ( $sort_by eq 'bpp' ) # Bits per position |
4341 : | { | ||
4342 : | @lines = map { $_->[0] } | ||
4343 : | sort { $b->[1] <=> $a->[1] } | ||
4344 : | map { [ $_, $_->[11] / abs( $_->[7] - $_->[6] ) ] } | ||
4345 : | @lines; | ||
4346 : | } | ||
4347 : | golsen | 1.251 | |
4348 : | elsif ( $sort_by eq 'bpp2' ) # Bits per position adjusted | ||
4349 : | { | ||
4350 : | # Lower score by 2 standard deviations to prevent random | ||
4351 : | # fluctuation in short sequences from moving them up so often. | ||
4352 : | |||
4353 : | my ( $bpp, $len, $sigma ); | ||
4354 : | @lines = map { $_->[0] } | ||
4355 : | sort { $b->[1] <=> $a->[1] } | ||
4356 : | map { $len = abs( $_->[7] - $_->[6] ) + 1; # seq len | ||
4357 : | $bpp = $_->[11] / $len; # bit per pos | ||
4358 : | $sigma = 2.5 * sqrt( 1 / $len ); # simple estimate | ||
4359 : | [ $_, $bpp - 2 * $sigma ] | ||
4360 : | } | ||
4361 : | @lines; | ||
4362 : | } | ||
4363 : | |||
4364 : | else # Bit score (bits) | ||
4365 : | golsen | 1.238 | { |
4366 : | @lines = sort { $b->[11] <=> $a->[11] } @lines; | ||
4367 : | } | ||
4368 : | |||
4369 : | # Bless the raw sims: | ||
4370 : | |||
4371 : | return map { bless( $_, 'Sim' ); $_ } @lines; | ||
4372 : | efrank | 1.1 | } |
4373 : | |||
4374 : | golsen | 1.238 | |
4375 : | overbeek | 1.84 | sub read_block { |
4376 : | olson | 1.111 | shift if UNIVERSAL::isa($_[0],__PACKAGE__); |
4377 : | overbeek | 1.84 | my($fh,$seek,$ln) = @_; |
4378 : | my($piece,$readN); | ||
4379 : | |||
4380 : | seek($fh,$seek,0); | ||
4381 : | overbeek | 1.98 | my @lines = (); |
4382 : | my $leftover = ""; | ||
4383 : | overbeek | 1.84 | while ($ln > 0) |
4384 : | { | ||
4385 : | my $ln1 = ($ln <= 10000) ? $ln : 10000; | ||
4386 : | $readN = read($fh,$piece,$ln1); | ||
4387 : | parrello | 1.200 | ($readN == $ln1) |
4388 : | overbeek | 1.84 | || confess "could not read the block of sims at $seek for $ln1 characters; $readN actually read"; |
4389 : | overbeek | 1.98 | my @tmp = split(/\n/,$piece); |
4390 : | if ($leftover) | ||
4391 : | { | ||
4392 : | $tmp[0] = $leftover . $tmp[0]; | ||
4393 : | } | ||
4394 : | |||
4395 : | if (substr($piece,-1) eq "\n") | ||
4396 : | { | ||
4397 : | $leftover = ""; | ||
4398 : | } | ||
4399 : | parrello | 1.200 | else |
4400 : | overbeek | 1.98 | { |
4401 : | $leftover = pop @tmp; | ||
4402 : | } | ||
4403 : | push(@lines,@tmp); | ||
4404 : | overbeek | 1.84 | $ln -= 10000; |
4405 : | } | ||
4406 : | overbeek | 1.98 | if ($leftover) { push(@lines,$leftover) } |
4407 : | return \@lines; | ||
4408 : | overbeek | 1.84 | } |
4409 : | |||
4410 : | |||
4411 : | overbeek | 1.73 | sub bbhs { |
4412 : | overbeek | 1.101 | my($self,$peg,$cutoff,$frac_match) = @_; |
4413 : | overbeek | 1.74 | my($sim,$peg2,$genome2,$i,@sims2,%seen); |
4414 : | overbeek | 1.73 | |
4415 : | overbeek | 1.136 | if ($self->is_deleted_fid($peg)) { return () } |
4416 : | |||
4417 : | overbeek | 1.101 | $frac_match = defined($frac_match) ? $frac_match : 0; |
4418 : | |||
4419 : | overbeek | 1.73 | $cutoff = defined($cutoff) ? $cutoff : 1.0e-10; |
4420 : | my @bbhs = (); | ||
4421 : | overbeek | 1.100 | my @precomputed = (); |
4422 : | my $rdbH = $self->db_handle; | ||
4423 : | my $relational_db_response = $rdbH->SQL("SELECT seek, others FROM bbhs WHERE peg = \'$peg\' "); | ||
4424 : | if (@$relational_db_response == 1) | ||
4425 : | { | ||
4426 : | overbeek | 1.190 | my($seek_set,$others) = @{$relational_db_response->[0]}; |
4427 : | overbeek | 1.100 | if (open(CORES,"<$FIG_Config::global/bbh.cores")) |
4428 : | { | ||
4429 : | overbeek | 1.190 | my $seek; |
4430 : | foreach $seek (split(/,/,$seek_set)) | ||
4431 : | { | ||
4432 : | seek(CORES,$seek,0); | ||
4433 : | $_ = <CORES>; | ||
4434 : | chop; | ||
4435 : | push(@precomputed,split(/,/,$_)); | ||
4436 : | } | ||
4437 : | overbeek | 1.100 | close(CORES); |
4438 : | } | ||
4439 : | push(@precomputed,split(/,/,$others)); | ||
4440 : | } | ||
4441 : | my %bbhs = map { $_ => 1 } @precomputed; | ||
4442 : | overbeek | 1.73 | |
4443 : | foreach $sim ($self->sims($peg,10000,$cutoff,"fig")) | ||
4444 : | { | ||
4445 : | $peg2 = $sim->id2; | ||
4 |