package FIG; use strict; use Fcntl qw/:flock/; # import LOCK_* constants use POSIX; use IPC::Open2; use DBrtns; use Sim; use Blast; use FIG_Config; use tree_utilities; use Subsystem; use SeedDas; use Construct; use FIGRules; # # Conditionally evaluate this in case its prerequisites are not available. # our $ClearinghouseOK = eval { require Clearinghouse; }; use IO::Socket; use FileHandle; use Carp; use Data::Dumper; use Time::Local; use File::Spec; use File::Copy; # # Try to load the RPC stuff; it might fail on older versions of the software. # eval { require FIGrpc; }; my $xmlrpc_available = 1; if ($@ ne "") { $xmlrpc_available = 0; } use FIGAttributes; use base 'FIGAttributes'; use vars qw(%_FunctionAttributes); use Data::Dumper; # # Force all new files to be all-writable. # umask 0; sub new { my($class) = @_; # # Check to see if we have a FIG_URL environment variable set. # If we do, don't actually create a FIG object, but rather # create a FIGrpc and return that as the return from this constructor. # if ($ENV{FIG_URL} ne "" && $xmlrpc_available) { print "Creating figrpc for '$ENV{FIG_URL}'\n"; my $figrpc = new FIGrpc($ENV{FIG_URL}); return $figrpc; } my $rdbH = new DBrtns; bless { _dbf => $rdbH, }, $class; } sub DESTROY { my($self) = @_; my($rdbH); if ($rdbH = $self->db_handle) { $rdbH->DESTROY; } } sub delete_genomes { my($self,$genomes) = @_; my $tmpD = "$FIG_Config::temp/tmp.deleted.$$"; my $tmp_Data = "$FIG_Config::temp/Data.$$"; my %to_del = map { $_ => 1 } @$genomes; open(TMP,">$tmpD") || die "could not open $tmpD"; my $genome; foreach $genome ($self->genomes) { if (! $to_del{$genome}) { print TMP "$genome\n"; } } close(TMP); &run("extract_genomes $tmpD $FIG_Config::data $tmp_Data"); # &run("mv $FIG_Config::data $FIG_Config::data.deleted; mv $tmp_Data $FIG_Config::data; fig load_all; rm -rf $FIG_Config::data.deleted"); &run("mv $FIG_Config::data $FIG_Config::data.deleted"); &run("mv $tmp_Data $FIG_Config::data"); &run("fig load_all"); &run("rm -rf $FIG_Config::data.deleted"); } sub add_genome { my($self,$genomeF) = @_; my $rc = 0; my(undef, $path, $genome) = File::Spec->splitpath($genomeF); if ($genome !~ /^\d+\.\d+$/) { warn "Invalid genome filename $genomeF\n"; return $rc; } if (-d $FIG_Config::organisms/$genome) { warn "Organism already exists for $genome\n"; return $rc; } # # We're okay, it doesn't exist. # my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`; if (@errors) { warn "Errors found while verifying genome directory $genomeF:\n"; print join("", @errors); return $rc; } &run("cp -r $genomeF $FIG_Config::organisms"); &run("chmod -R 777 $FIG_Config::organisms/$genome"); &run("index_contigs $genome"); &run("compute_genome_counts $genome"); &run("load_features $genome"); $rc = 1; if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta") { &run("index_translations $genome"); my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`; chomp @tmp; &run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr"); &enqueue_similarities(\@tmp); } if ((-s "$FIG_Config::organisms/$genome/assigned_functions") || (-d "$FIG_Config::organisms/$genome/UserModels")) { &run("add_assertions_of_function $genome"); } return $rc; } =pod =head1 enqueue_similarities usage: enqueue_similarities(\@sims) Queue the passed fids (a reference to a list) for similarity computation. =cut sub enqueue_similarities { my($fids) = @_; my $fid; my $sim_q = "$FIG_Config::global/queued_similarities"; open(TMP,">>$sim_q") || die "could not open $sim_q"; # # We need to lock here so that if a computation is creating a snapshot of the # queue, we block until it's done. # flock(TMP, LOCK_EX) or die "Cannot lock $sim_q\n"; foreach $fid (@$fids) { print TMP "$fid\n"; } close(TMP); } =pod =head1 create_sim_askfor_pool usage: create_sim_askfor_pool() Creates an askfor pool, a snapshot of the current NR and similarity queue. Zeros out the old queue. The askfor pool needs to keep track of which sequences need to be calculated, which have been handed out, etc. To simplify this task we chunk the sequences into fairly small numbers (10-20 sequences) and allocate work on a per-chunk basis. We make use of the relational database to keep track of chunk status as well as the seek locations into the file of sequence data. The initial creation of the pool involves indexing the sequence data with seek offsets and lengths and populating the sim_askfor_index table with this information and with initial status information. =cut sub create_sim_askfor_pool { my($self, $chunk_size) = @_; $chunk_size = 15 unless $chunk_size =~ /^\d+$/; my $pool_dir = "$FIG_Config::global/sim_pools"; &verify_dir($pool_dir); # # Lock the pool directory. # open(my $lock, ">$pool_dir/lockfile"); flock($lock, LOCK_EX); my $num = 0; if (open(my $toc, "<$pool_dir/TOC")) { while (<$toc>) { chomp; # print STDERR "Have toc entry $_\n"; my ($idx, $time, $str) = split(/\s+/, $_, 3); $num = max($num, $idx); } close($toc); } $num++; open(my $toc, ">>$pool_dir/TOC") or die "Cannot write $pool_dir/TOC: $!\n"; print $toc "$num ", time(), " New toc entry\n"; close($toc); my $cpool_id = sprintf "%04d", $num; my $cpool_dir = "$pool_dir/$cpool_id"; # # All set, create the directory for this pool. # &verify_dir($cpool_dir); # # Now we can copy the nr and sim queue here. # Do this stuff inside an eval so we can clean up # the lockfile. # eval { my $sim_q = "$FIG_Config::global/queued_similarities"; copy("$sim_q", "$cpool_dir/q"); copy("$FIG_Config::data/Global/nr", "$cpool_dir/nr"); open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n"; close(F); }; unlink("$pool_dir/lockfile"); close($lock); # # We've created our pool; we can now run the formatdb and # extract the sequences for the blast run. # my $child_pid = $self->run_in_background(sub { # # Need to close db or there's all sorts of trouble. # my $cmd = "$FIG_Config::ext_bin/formatdb -i $cpool_dir/nr -p T -l $cpool_dir/formatdb.log"; print "Will run '$cmd'\n"; &run($cmd); print "finished. Logfile:\n"; print &FIG::file_read("$cpool_dir/formatdb.log"); unlink("$cpool_dir/formatdb.pid"); }); print "Running formatdb in background job $child_pid\n"; open(FPID, ">$cpool_dir/formatdb.pid"); print FPID "$child_pid\n"; close(FPID); my $db = $self->db_handle(); if (!$db->table_exists("sim_queue")) { $db->create_table(tbl => "sim_queue", flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " . "assigned BOOL, finished BOOL, output_file varchar(255), " . "assignment_expires INTEGER, worker_info varchar(255)" ); } # # Write the fasta input file. Keep track of how many have been written, # and write seek info into the database as appropriate. # open(my $seq_fh, ">$cpool_dir/fasta.in"); my($chunk_idx, $chunk_begin, $seq_idx); $chunk_idx = 0; $chunk_begin = 0; $seq_idx = 0; my(@seeks); open(my $q_fh, "<$cpool_dir/q"); while (my $id = <$q_fh>) { chomp $id; my $seq = $self->get_translation($id); # # check if we're at the beginning of a chunk # print $seq_fh ">$id\n$seq\n"; # # Check if we're at the end of a chunk # if ((($seq_idx + 1) % $chunk_size) == 0) { my $chunk_end = tell($seq_fh); my $chunk_len = $chunk_end - $chunk_begin; push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]); $chunk_idx++; $chunk_begin = $chunk_end; } $seq_idx++; } if ((($seq_idx) % $chunk_size) != 0) { my $chunk_end = tell($seq_fh); my $chunk_len = $chunk_end - $chunk_begin; push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]); $chunk_idx++; $chunk_begin = $chunk_end; } close($q_fh); close($seq_fh); print "Write seqs\n"; for my $seek (@seeks) { my($cpool_id, $chunk_idx, $chunk_begin, $chunk_len) = @$seek; $db->SQL("insert into sim_queue (qid, chunk_id, seek, len, assigned, finished) " . "values('$cpool_id', $chunk_idx, $chunk_begin, $chunk_len, FALSE, FALSE)"); } return $cpool_id; } =pod =head1 get_sim_queue usage: get_sim_queue($pool_id, $all_sims) Returns the sims in the given pool. If $all_sims is true, return the entire queue. Otherwise, just return the sims awaiting processing. =cut sub get_sim_queue { my($self, $pool_id, $all_sims) = @_; } =pod =head1 get_active_sim_pools usage: get_active_sim_pools() Return a list of the pool id's for the sim processing queues that have entries awaiting computation. =cut sub get_active_sim_pools { my($self) = @_; my $dbh = $self->db_handle(); my $res = $dbh->SQL("select distinct qid from sim_queue where not finished"); return undef unless $res; return map { $_->[0] } @$res; } =pod =head1 get_sim_pool_info usage: get_sim_pool_info($pool_id) Return information about the given sim pool. Return value is a list ($total_entries, $n_finished, $n_assigned, $n_unassigned) =cut sub get_sim_pool_info { my($self, $pool_id) = @_; my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned); $dbh = $self->db_handle(); $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id'"); $total_entries = $res->[0]->[0]; $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and finished"); $n_finished = $res->[0]->[0]; $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and assigned and not finished"); $n_assigned = $res->[0]->[0]; $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and not finished and not assigned"); $n_unassigned = $res->[0]->[0]; return ($total_entries, $n_finished, $n_assigned, $n_unassigned); } =pod =head1 get_sim_chunk usage: get_sim_chunk($n_seqs, $worker_id) Returns a chunk of $n_seqs of work. From Ross, about how sims are processed: Here is how I process them: bash$ cd /Volumes/seed/olson/Sims/June22.out bash$ for i in really* > do > cat < $i >> /Volumes/laptop/new.sims > done Then, I need to "reformat" them by adding to columns to each one and split the result into files of about 3M each This I do using reduce_sims /Volumes/laptop/NR/NewNR/peg.synonyms.june21 300 < /Volumes/laptop/new.sims | reformat_sims /Volumes/laptop/NR/NewNR/checked.nr.june21 > /Volumes/laptop/reformated.sims rm /Volumes/laptop/new.sims split_sims /Volumes/laptop/NewSims sims.june24 reformated.sims rm reformatted.sims =cut sub get_sim_chunk { my($self, $n_seqs, $worker_id) = @_; } sub get_local_hostname { # # See if there is a FIGdisk/config/hostname file. If there # is, force the hostname to be that. # my $hostfile = "$FIG_Config::fig_disk/config/hostname"; if (-f $hostfile) { my $fh; if (open($fh, $hostfile)) { my $hostname = <$fh>; chomp($hostname); return $hostname; } } # # First check to see if we our hostname is correct. # # Map it to an IP address, and try to bind to that ip. # my $tcp = getprotobyname('tcp'); my $hostname = `hostname`; chomp($hostname); my @hostent = gethostbyname($hostname); if (@hostent > 0) { my $sock; my $ip = $hostent[4]; socket($sock, PF_INET, SOCK_STREAM, $tcp); if (bind($sock, sockaddr_in(0, $ip))) { # # It worked. Reverse-map back to a hopefully fqdn. # my @rev = gethostbyaddr($ip, AF_INET); if (@rev > 0) { my $host = $rev[0]; # # Check to see if we have a FQDN. # if ($host =~ /\./) { # # Good. # return $host; } else { # # We didn't get a fqdn; bail and return the IP address. # return get_hostname_by_adapter() } } else { return inet_ntoa($ip); } } else { # # Our hostname must be wrong; we can't bind to the IP # address it maps to. # Return the name associated with the adapter. # return get_hostname_by_adapter() } } else { # # Our hostname isn't known to DNS. This isn't good. # Return the name associated with the adapter. # return get_hostname_by_adapter() } } sub get_hostname_by_adapter { shift if UNIVERSAL::isa($_[0],__PACKAGE__); # # Attempt to determine our local hostname based on the # network environment. # # This implementation reads the routing table for the default route. # We then look at the interface config for the interface that holds the default. # # # Linux routing table: # [olson@yips 0.0.0]$ netstat -rn # Kernel IP routing table # Destination Gateway Genmask Flags MSS Window irtt Iface # 140.221.34.32 0.0.0.0 255.255.255.224 U 0 0 0 eth0 # 169.254.0.0 0.0.0.0 255.255.0.0 U 0 0 0 eth0 # 127.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 lo # 0.0.0.0 140.221.34.61 0.0.0.0 UG 0 0 0 eth0 # # Mac routing table: # # bash-2.05a$ netstat -rn # Routing tables # # Internet: # Destination Gateway Flags Refs Use Netif Expire # default 140.221.11.253 UGSc 12 120 en0 # 127.0.0.1 127.0.0.1 UH 16 8415486 lo0 # 140.221.8/22 link#4 UCS 12 0 en0 # 140.221.8.78 0:6:5b:f:51:c4 UHLW 0 183 en0 408 # 140.221.8.191 0:3:93:84:ab:e8 UHLW 0 92 en0 622 # 140.221.8.198 0:e0:98:8e:36:e2 UHLW 0 5 en0 691 # 140.221.9.6 0:6:5b:f:51:d6 UHLW 1 63 en0 1197 # 140.221.10.135 0:d0:59:34:26:34 UHLW 2 2134 en0 1199 # 140.221.10.152 0:30:1b:b0:ec:dd UHLW 1 137 en0 1122 # 140.221.10.153 127.0.0.1 UHS 0 0 lo0 # 140.221.11.37 0:9:6b:53:4e:4b UHLW 1 624 en0 1136 # 140.221.11.103 0:30:48:22:59:e6 UHLW 3 973 en0 1016 # 140.221.11.224 0:a:95:6f:7:10 UHLW 1 1 en0 605 # 140.221.11.237 0:1:30:b8:80:c0 UHLW 0 0 en0 1158 # 140.221.11.250 0:1:30:3:1:0 UHLW 0 0 en0 1141 # 140.221.11.253 0:d0:3:e:70:a UHLW 13 0 en0 1199 # 169.254 link#4 UCS 0 0 en0 # # Internet6: # Destination Gateway Flags Netif Expire # UH lo0 # fe80::%lo0/64 Uc lo0 # link#1 UHL lo0 # fe80::%en0/64 link#4 UC en0 # 0:a:95:a8:26:68 UHL lo0 # ff01::/32 U lo0 # ff02::%lo0/32 UC lo0 # ff02::%en0/32 link#4 UC en0 my($fh); if (!open($fh, "netstat -rn |")) { warn "Cannot run netstat to determine local IP address\n"; return "localhost"; } my $interface_name; while (<$fh>) { my @cols = split(); if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0") { $interface_name = $cols[$#cols]; } } close($fh); # print "Default route on $interface_name\n"; # # Find ifconfig. # my $ifconfig; for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin")) { if (-x "$dir/ifconfig") { $ifconfig = "$dir/ifconfig"; last; } } if ($ifconfig eq "") { warn "Ifconfig not found\n"; return "localhost"; } # print "Foudn $ifconfig\n"; if (!open($fh, "$ifconfig $interface_name |")) { warn "Could not run $ifconfig: $!\n"; return "localhost"; } my $ip; while (<$fh>) { # # Mac: # inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255 # Linux: # inet addr:140.221.34.37 Bcast:140.221.34.63 Mask:255.255.255.224 # chomp; s/^\s*//; # print "Have '$_'\n"; if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/) { # # Linux hit. # $ip = $1; # print "Got linux $ip\n"; last; } elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/) { # # Mac hit. # $ip = $1; # print "Got mac $ip\n"; last; } } close($fh); if ($ip eq "") { warn "Didn't find an IP\n"; return "localhost"; } return $ip; } sub get_seed_id { # # Retrieve the seed identifer from FIGdisk/config/seed_id. # # If it's not there, create one, and make it readonly. # shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $id; my $id_file = "$FIG_Config::fig_disk/config/seed_id"; if (! -f $id_file) { my $newid = `uuidgen`; if (!$newid) { die "Cannot run uuidgen: $!"; } chomp($newid); my $fh = new FileHandle(">$id_file"); if (!$fh) { die "error creating $id_file: $!"; } print $fh "$newid\n"; $fh->close(); chmod(0444, $id_file); } my $fh = new FileHandle("<$id_file"); $id = <$fh>; chomp($id); return $id; } =pod =head1 get_release_info Return the current data release information. It is returned as the list ($name, $id, $inst, $email, $parent_id, $description). The release info comes from the file FIG/Data/RELEASE. It is formatted as: For instance: ----- SEED Data Release, 09/15/2004. 4148208C-1DF2-11D9-8417-000A95D52EF6 ANL/FIG olson@mcs.anl.gov Test release. ----- If no RELEASE file exists, this routine will create one with a new unique ID. This lets a peer optimize the data transfer by being able to cache ID translations from this instance. =cut sub get_release_info { my($fig, $no_create) = @_; my $rel_file = "$FIG_Config::data/RELEASE"; if (! -f $rel_file and !$no_create) { # # Create a new one. # my $newid = `uuidgen`; if (!$newid) { die "Cannot run uuidgen: $!"; } chomp($newid); my $relinfo = "Automatically generated release info " . localtime(); my $inst = "Unknown"; my $contact = "Unknown"; my $parent = ""; my( $a, $b, $e, $v, $env ) = $fig->genome_counts; my $description = "Automatically generated release info\n"; $description .= "Contains $a archaeal, $b bacterial, $e eukaryal, $v viral and $env environmental genomes.\n"; my $fh = new FileHandle(">$rel_file"); if (!$fh) { warn "error creating $rel_file: $!"; return undef; } print $fh "$relinfo\n"; print $fh "$newid\n"; print $fh "$inst\n"; print $fh "$contact\n"; print $fh "$parent\n"; print $fh $description; $fh->close(); chmod(0444, $rel_file); } if (open(my $fh, $rel_file)) { my(@lines) = <$fh>; close($fh); chomp(@lines); my($info, $id, $inst, $contact, $parent, @desc) = @lines; return ($info, $id, $inst, $contact, $parent, join("\n", @desc)); } return undef; } =pod =head1 get_peer_last_update Return the timestamp from the last successful peer-to-peer update with the given peer. We store this information in FIG/Data/Global/Peers/. =cut sub get_peer_last_update { my($self, $peer_id) = @_; my $dir = "$FIG_Config::data/Global/Peers"; &verify_dir($dir); $dir .= "/$peer_id"; &verify_dir($dir); my $update_file = "$dir/last_update"; if (-f $update_file) { my $time = file_head($update_file, 1); chomp $time; return $time; } else { return undef; } } sub set_peer_last_update { my($self, $peer_id, $time) = @_; my $dir = "$FIG_Config::data/Global/Peers"; &verify_dir($dir); $dir .= "/$peer_id"; &verify_dir($dir); my $update_file = "$dir/last_update"; open(F, ">$update_file"); print F "$time\n"; close(F); } sub cgi_url { shift if UNIVERSAL::isa($_[0],__PACKAGE__); return &plug_url($FIG_Config::cgi_url); } sub temp_url { shift if UNIVERSAL::isa($_[0],__PACKAGE__); return &plug_url($FIG_Config::temp_url); } sub plug_url { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($url) = @_; my $name; # Revised by GJO # First try to get url from the current http request if ( defined( $ENV{ 'HTTP_HOST' } ) # This is where $cgi->url gets its value && ( $name = $ENV{ 'HTTP_HOST' } ) && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter ) {} # Otherwise resort to alternative sources elsif ( ( $name = &get_local_hostname ) && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter ) {} return $url; } sub file_read { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($file) = @_; if (open(my $fh, "<$file")) { if (wantarray) { my @ret = <$fh>; return @ret; } else { local $/; my $text = <$fh>; close($fh); return $text; } } } sub file_head { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($file, $n) = @_; if (!$n) { $n = 1; } if (open(my $fh, "<$file")) { my(@ret, $i); $i = 0; while (<$fh>) { push(@ret, $_); $i++; last if $i >= $n; } close($fh); if (wantarray) { return @ret; } else { return join("", @ret); } } } =pod =head1 hiding/caching in a FIG object We save the DB handle, cache taxonomies, and put a few other odds and ends in the FIG object. We expect users to invoke these services using the object $fig constructed using: use FIG; my $fig = new FIG; $fig is then used as the basic mechanism for accessing FIG services. It is, of course, just a hash that is used to retain/cache data. The most commonly accessed item is the DB filehandle, which is accessed via $self->db_handle. We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated between genomes, and a variety of other things. I am not sure that using cached/2 was a good idea, but I did it. =cut sub db_handle { my($self) = @_; return $self->{_dbf}; } sub cached { my($self,$what) = @_; my $x = $self->{$what}; if (! $x) { $x = $self->{$what} = {}; } return $x; } ################ Basic Routines [ existed since WIT ] ########################## =pod =head1 min usage: $n = &FIG::min(@x) Assumes @x contains numeric values. Returns the minimum of the values. =cut sub min { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my(@x) = @_; my($min,$i); (@x > 0) || return undef; $min = $x[0]; for ($i=1; ($i < @x); $i++) { $min = ($min > $x[$i]) ? $x[$i] : $min; } return $min; } =pod =head1 max usage: $n = &FIG::max(@x) Assumes @x contains numeric values. Returns the maximum of the values. =cut sub max { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my(@x) = @_; my($max,$i); (@x > 0) || return undef; $max = $x[0]; for ($i=1; ($i < @x); $i++) { $max = ($max < $x[$i]) ? $x[$i] : $max; } return $max; } =pod =head1 between usage: &FIG::between($x,$y,$z) Returns true iff $y is between $x and $z. =cut sub between { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($x,$y,$z) = @_; if ($x < $z) { return (($x <= $y) && ($y <= $z)); } else { return (($x >= $y) && ($y >= $z)); } } =pod =head1 standard_genetic_code usage: $code = &FIG::standard_genetic_code() Routines like "translate" can take a "genetic code" as an argument. I implemented such codes using hashes that assumed uppercase DNA triplets as keys. =cut sub standard_genetic_code { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $code = {}; $code->{"AAA"} = "K"; $code->{"AAC"} = "N"; $code->{"AAG"} = "K"; $code->{"AAT"} = "N"; $code->{"ACA"} = "T"; $code->{"ACC"} = "T"; $code->{"ACG"} = "T"; $code->{"ACT"} = "T"; $code->{"AGA"} = "R"; $code->{"AGC"} = "S"; $code->{"AGG"} = "R"; $code->{"AGT"} = "S"; $code->{"ATA"} = "I"; $code->{"ATC"} = "I"; $code->{"ATG"} = "M"; $code->{"ATT"} = "I"; $code->{"CAA"} = "Q"; $code->{"CAC"} = "H"; $code->{"CAG"} = "Q"; $code->{"CAT"} = "H"; $code->{"CCA"} = "P"; $code->{"CCC"} = "P"; $code->{"CCG"} = "P"; $code->{"CCT"} = "P"; $code->{"CGA"} = "R"; $code->{"CGC"} = "R"; $code->{"CGG"} = "R"; $code->{"CGT"} = "R"; $code->{"CTA"} = "L"; $code->{"CTC"} = "L"; $code->{"CTG"} = "L"; $code->{"CTT"} = "L"; $code->{"GAA"} = "E"; $code->{"GAC"} = "D"; $code->{"GAG"} = "E"; $code->{"GAT"} = "D"; $code->{"GCA"} = "A"; $code->{"GCC"} = "A"; $code->{"GCG"} = "A"; $code->{"GCT"} = "A"; $code->{"GGA"} = "G"; $code->{"GGC"} = "G"; $code->{"GGG"} = "G"; $code->{"GGT"} = "G"; $code->{"GTA"} = "V"; $code->{"GTC"} = "V"; $code->{"GTG"} = "V"; $code->{"GTT"} = "V"; $code->{"TAA"} = "*"; $code->{"TAC"} = "Y"; $code->{"TAG"} = "*"; $code->{"TAT"} = "Y"; $code->{"TCA"} = "S"; $code->{"TCC"} = "S"; $code->{"TCG"} = "S"; $code->{"TCT"} = "S"; $code->{"TGA"} = "*"; $code->{"TGC"} = "C"; $code->{"TGG"} = "W"; $code->{"TGT"} = "C"; $code->{"TTA"} = "L"; $code->{"TTC"} = "F"; $code->{"TTG"} = "L"; $code->{"TTT"} = "F"; return $code; } =pod =head1 translate usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start); If $code is undefined, I use the standard genetic code. If $fix_start is true, I will translate initial TTG or GTG to 'M'. =cut sub translate { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my( $dna,$code,$start) = @_; my( $i,$j,$ln ); my( $x,$y ); my( $prot ); if (! defined($code)) { $code = &FIG::standard_genetic_code; } $ln = length($dna); $prot = "X" x ($ln/3); $dna =~ tr/a-z/A-Z/; for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) { $x = substr($dna,$i,3); if ($y = $code->{$x}) { substr($prot,$j,1) = $y; } } if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) { substr($prot,0,1) = 'M'; } return $prot; } =pod =head1 reverse_comp and rev_comp usage: $dnaR = &FIG::reverse_comp($dna) or $dnaRP = &FIG::rev_comp($seqP) In WIT, we implemented reverse complement passing a pointer to a sequence and returning a pointer to a sequence. In most cases the pointers are a pain (although in a few they are just what is needed). Hence, I kept both versions of the function to allow you to use whichever you like. Use rev_comp only for long strings where passing pointers is a reasonable effeciency issue. =cut sub reverse_comp { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($seq) = @_; return ${&rev_comp(\$seq)}; } sub rev_comp { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my( $seqP ) = @_; my( $rev ); $rev = reverse( $$seqP ); $rev =~ tr/a-z/A-Z/; $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/; return \$rev; } =pod =head1 verify_dir usage: &FIG::verify_dir($dir) Makes sure that $dir exists. If it has to create it, it sets permissions to 0777. =cut sub verify_dir { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($dir) = @_; if (-d $dir) { return } if ($dir =~ /^(.*)\/[^\/]+$/) { &verify_dir($1); } mkdir($dir,0777) || die "could not make $dir: $!"; # chmod 02777,$dir; } =pod =head1 run usage: &FIG::run($cmd) Runs $cmd and fails (with trace) if the command fails. =cut sub run { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($cmd) = @_; # my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n"; (system($cmd) == 0) || confess "FAILED: $cmd"; } =pod =head1 read_fasta_record(\*FILEHANDLE) Usage: ( $seq_id, $seq_pointer, $comment ) = &read_fasta_record(\*FILEHANDLE); Function: Reads a FASTA-formatted sequence file one record at a time. The input filehandle defaults to STDIN if not specified. Returns a sequence ID, a pointer to the sequence, and an optional record comment (NOTE: Record comments are deprecated, as some tools such as BLAST do not handle them gracefully). Returns an empty list if attempting to read a record results in an undefined value (e.g., due to reaching the EOF). Author: Gordon D. Pusch Date: 2004-Feb-18 =cut sub read_fasta_record { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my ($file_handle) = @_; my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record ); if (not defined($file_handle)) { $file_handle = \*STDIN; } $old_end_of_record = $/; $/ = "\n>"; if (defined($fasta_record = <$file_handle>)) { chomp $fasta_record; @lines = split( /\n/, $fasta_record ); $head = shift @lines; $head =~ s/^>?//; $head =~ m/^(\S+)/; $seq_id = $1; if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; } $sequence = join( "", @lines ); @parsed_fasta_record = ( $seq_id, \$sequence, $comment ); } else { @parsed_fasta_record = (); } $/ = $old_end_of_record; return @parsed_fasta_record; } =pod =head1 display_id_and_seq usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh) This command has always been used to put out fasta sequences. Note that it takes a pointer to the sequence. $fh is optional and defalts to STDOUT. =cut sub display_id_and_seq { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my( $id, $seq, $fh ) = @_; if (! defined($fh) ) { $fh = \*STDOUT; } print $fh ">$id\n"; &display_seq($seq, $fh); } sub display_seq { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my ( $seq, $fh ) = @_; my ( $i, $n, $ln ); if (! defined($fh) ) { $fh = \*STDOUT; } $n = length($$seq); # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) ); for ($i=0; ($i < $n); $i += 60) { if (($i + 60) <= $n) { $ln = substr($$seq,$i,60); } else { $ln = substr($$seq,$i,($n-$i)); } print $fh "$ln\n"; } } ########## I commented the pods on the following routines out, since they should not ########## be part of the SOAP/WSTL interface #=pod # #=head1 file2N # #usage: $n = $fig->file2N($file) # #In some of the databases I need to store filenames, which can waste a lot of #space. Hence, I maintain a database for converting filenames to/from integers. # #=cut # sub file2N :scalar { my($self,$file) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table ")) && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0])) { my $fileno = $relational_db_response->[0]->[0] + 1; if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )")) { return $fileno; } } elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )")) { return 1; } return undef; } #=pod # #=head1 N2file # #usage: $filename = $fig->N2file($n) # #In some of the databases I need to store filenames, which can waste a lot of #space. Hence, I maintain a database for converting filenames to/from integers. # #=cut # sub N2file :scalar { my($self,$fileno) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return undef; } #=pod # #=head1 openF # #usage: $fig->openF($filename) # #Parts of the system rely on accessing numerous different files. The most obvious case is #the situation with similarities. It is important that the system be able to run in cases in #which an arbitrary number of files cannot be open simultaneously. This routine (with closeF) is #a hack to handle this. I should probably just pitch them and insist that the OS handle several #hundred open filehandles. # #=cut # sub openF { my($self,$file) = @_; my($fxs,$x,@fxs,$fh); $fxs = $self->cached('_openF'); if ($x = $fxs->{$file}) { $x->[1] = time(); return $x->[0]; } @fxs = keys(%$fxs); if (defined($fh = new FileHandle "<$file")) { if (@fxs >= 50) { @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs; $x = $fxs->{$fxs[0]}; undef $x->[0]; delete $fxs->{$fxs[0]}; } $fxs->{$file} = [$fh,time()]; return $fh; } return undef; } #=pod # #=head1 closeF # #usage: $fig->closeF($filename) # #Parts of the system rely on accessing numerous different files. The most obvious case is #the situation with similarities. It is important that the system be able to run in cases in #which an arbitrary number of files cannot be open simultaneously. This routine (with openF) is #a hack to handle this. I should probably just pitch them and insist that the OS handle several #hundred open filehandles. # #=cut # sub closeF { my($self,$file) = @_; my($fxs,$x); if (($fxs = $self->{_openF}) && ($x = $fxs->{$file})) { undef $x->[0]; delete $fxs->{$file}; } } =pod =head1 ec_name usage: $enzymatic_function = $fig->ec_name($ec) Returns enzymatic name for EC. =cut sub ec_name { my($self,$ec) = @_; ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return ""; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )"); return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : ""; return ""; } =pod =head1 all_roles usage: @roles = $fig->all_roles Supposed to return all known roles. For now, we get all ECs with "names". =cut sub all_roles { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names"); return @$relational_db_response; } =pod =head1 expand_ec usage: $expanded_ec = $fig->expand_ec($ec) Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that. =cut sub expand_ec { my($self,$ec) = @_; my($name); return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec; } =pod =head1 clean_tmp usage: &FIG::clean_tmp We store temporary files in $FIG_Config::temp. There are specific classes of files that are created and should be saved for at least a few days. This routine can be invoked to clean out those that are over two days old. =cut sub clean_tmp { my($file); if (opendir(TMP,"$FIG_Config::temp")) { # change the pattern to pick up other files that need to be cleaned up my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP); foreach $file (@temp) { if (-M "$FIG_Config::temp/$file" > 2) { unlink("$FIG_Config::temp/$file"); } } } } ################ Routines to process genomes and genome IDs ########################## =pod =head1 genomes usage: @genome_ids = $fig->genomes( $complete, $restrictions, $domain ); Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by NCBI for the species (not the specific strain), and Y is a sequence digit assigned to this particular genome (as one of a set with the same genus/species). Genomes also have versions, but that is a separate issue. =cut sub genomes :remote :list { my( $self, $complete, $restrictions, $domain ) = @_; my $rdbH = $self->db_handle; my @where = (); if ($complete) { push(@where,"( complete = \'1\' )") } if ($restrictions) { push(@where,"( restrictions = \'1\' )") } if ($domain) { push( @where, "( maindomain = '$domain' )" ) } my $relational_db_response; if (@where > 0) { my $where = join(" AND ",@where); $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where $where"); } else { $relational_db_response = $rdbH->SQL("SELECT genome FROM genome"); } my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response; return @genomes; } sub is_complete { my($self,$genome) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where (genome = '$genome') AND (complete = '1')"); return (@$relational_db_response == 1) } sub genome_counts { my($self,$complete) = @_; my($x,$relational_db_response); my $rdbH = $self->db_handle; if ($complete) { $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'"); } else { $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome"); } my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0); if (@$relational_db_response > 0) { foreach $x (@$relational_db_response) { if ($x->[1] =~ /^archaea/i) { ++$arch } elsif ($x->[1] =~ /^bacter/i) { ++$bact } elsif ($x->[1] =~ /^eukar/i) { ++$euk } elsif ($x->[1] =~ /^vir/i) { ++$vir } elsif ($x->[1] =~ /^env/i) { ++$env } else { ++$unk } } } return ($arch, $bact, $euk, $vir, $env, $unk); } =pod =head1 genome_domain usage: $domain = $fig->genome_domain($genome_id); Returns the domain of a genome ID, and 'undef' if it is not in the database. =cut sub genome_domain { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { # die Dumper($relational_db_response); return $relational_db_response->[0]->[1]; } } return undef; } =pod =head1 genome_pegs usage: $num_pegs = $fig->genome_pegs($genome_id); Returns the number of protein-encoding genes (PEGs) in $genome_id if "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. =cut sub genome_pegs { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT pegs FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =pod =head1 genome_rnas usage: $num_rnas = $fig->genome_rnas($genome_id); Returns the number of RNA-encoding genes (RNAs) in $genome_id if "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. =cut sub genome_rnas { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT rnas FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =pod =head1 genome_szdna usage: $szdna = $fig->genome_szdna($genome_id); Returns the number of DNA base-pairs in the genome contigs file(s) of $genome_id "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. =cut sub genome_szdna { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT szdna FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =pod =head1 genome_version usage: $version = $fig->genome_version($genome_id); Versions are incremented for major updates. They are put in as major updates of the form 1.0, 2.0, ... Users may do local "editing" of the DNA for a genome, but when they do, they increment the digits to the right of the decimal. Two genomes remain comparable only if the versions match identically. Hence, minor updating should be committed only by the person/group responsible for updating that genome. We can, of course, identify which genes are identical between any two genomes (by matching the DNA or amino acid sequences). However, the basic intent of the system is to support editing by the main group issuing periodic major updates. =cut sub genome_version :scalar { my($self,$genome) = @_; my(@tmp); if ((-s "$FIG_Config::organisms/$genome/VERSION") && (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) && ($tmp[0] =~ /^(\S+)$/)) { return $1; } return undef; } =pod =head1 genus_species usage: $gs = $fig->genus_species($genome_id) Returns the genus and species (and strain if that has been properly recorded) in a printable form. =cut sub genus_species :scalar { my ($self,$genome) = @_; my $ans; my $genus_species = $self->cached('_genus_species'); if (! ($ans = $genus_species->{$genome})) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,gname FROM genome"); my $pair; foreach $pair (@$relational_db_response) { $genus_species->{$pair->[0]} = $pair->[1]; } $ans = $genus_species->{$genome}; } return $ans; } =pod =head1 org_of usage: $org = $fig->org_of($prot_id) In the case of external proteins, we can usually determine an organism, but not anything more precise than genus/species (and often not that). This routine takes a protein ID (which may be a feature ID) and returns "the organism". =cut sub org_of { my($self,$prot_id) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($prot_id =~ /^fig\|/) { # # Trying to guess what Ross wanted (there was a servere bug): # # deleted -> undefined # failed lookup -> "" # return $self->is_deleted_fid( $prot_id) ? undef : $self->genus_species( $self->genome_of( $prot_id ) ) || ""; } if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && (@$relational_db_response >= 1)) { return $relational_db_response->[0]->[0]; } return ""; } # # Support for colorizing organisms by domain # -- GJO # =pod =head1 genus_species_domain usage: ($gs, $domain) = $fig->genus_species_domain($genome_id) Returns the genus and species (and strain if that has been properly recorded) in a printable form, and domain. =cut sub genus_species_domain { my ($self, $genome) = @_; my $genus_species_domain = $self->cached('_genus_species_domain'); if ( ! $genus_species_domain->{ $genome } ) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,gname,maindomain FROM genome"); my $triple; foreach $triple ( @$relational_db_response ) { $genus_species_domain->{ $triple->[0] } = [ $triple->[1], $triple->[2] ]; } } my $gsdref = $genus_species_domain->{ $genome }; return $gsdref ? @$gsdref : ( "", "" ); } my %domain_color = ( AR => "#DDFFFF", BA => "#FFDDFF", EU => "#FFFFDD", VI => "#DDDDDD", EN => "#BBBBBB" ); sub domain_color { my ($domain) = @_; defined $domain || return "#FFFFFF"; return $domain_color{ uc substr($domain, 0, 2) } || "#FFFFFF"; } =pod =head1 org_and_color_of usage: ($org, $color) = $fig->org_and_domain_of($prot_id) Return the best guess organism and domain html color string of an organism. In the case of external proteins, we can usually determine an organism, but not anything more precise than genus/species (and often not that). This routine takes a protein ID (which may be a feature ID) and returns "the organism". =cut sub org_and_color_of { my($self,$prot_id) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($prot_id =~ /^fig\|/) { my( $gs, $domain ) = $self->genus_species_domain($self->genome_of($prot_id)); return ( $gs, domain_color( $domain ) ); } if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && (@$relational_db_response >= 1)) { return ($relational_db_response->[0]->[0], "#FFFFFF"); } return ("", "#FFFFFF"); } # # End of support for colorizing organisms by domain # -- GJO # =pod =head1 abbrev usage: $abbreviated_name = $fig->abbrev($genome_name) For alignments and such, it is very useful to be able to produce an abbreviation of genus/species. That's what this does. Note that multiple genus/species might reduce to the same abbreviation, so be careful (disambiguate them, if you must). =cut sub abbrev :scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($genome_name) = @_; $genome_name =~ s/^(\S{3})\S+/$1./; $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./; if (length($genome_name) > 13) { $genome_name = substr($genome_name,0,13); } return $genome_name; } ################ Routines to process Features and Feature IDs ########################## =pod =head1 ftype usage: $type = &FIG::ftype($fid) Returns the type of a feature, given the feature ID. This just amounts to lifting it out of the feature ID, since features have IDs of tghe form fig|x.y.f.n where x.y is the genome ID f is the type pf feature n is an integer that is unique within the genome/type =cut sub ftype { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($feature_id) = @_; if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/) { return $1; } return undef; } =pod =head1 genome_of usage: $genome_id = $fig->genome_of($fid) This just extracts the genome ID from a feature ID. =cut sub genome_of :scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $prot_id = (@_ == 1) ? $_[0] : $_[1]; if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; } return undef; } =head1 genome_and_peg_of usage: ($genome_id, $peg_number = $fig->genome_and_peg_of($fid) This just extracts the genome ID and peg number from a feature ID. =cut sub genome_and_peg_of { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $prot_id = (@_ == 1) ? $_[0] : $_[1]; if ($prot_id =~ /^fig\|(\d+\.\d+)\.peg\.(\d+)/) { return ($1, $2); } return undef; } =pod =head1 by_fig_id usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works. =cut sub by_fig_id { my($a,$b) = @_; my($g1,$g2,$t1,$t2,$n1,$n2); if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) && ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) { ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2); } else { $a cmp $b; } } =pod =head1 genes_in_region usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end) It is often important to be able to find the genes that occur in a specific region on a chromosome. This routine is designed to provide this information. It returns all genes that overlap the region ($genome,$contig,$beg,$end). $beg1 is set to the minimum coordinate of the returned genes (which may be before the given region), and $end1 the maximum coordinate. The routine assumes that genes are not more than 10000 bases long, which is certainly not true in eukaryotes. Hence, in euks you may well miss genes that overlap the boundaries of the specified region (sorry). =cut sub genes_in_region { my($self,$genome,$contig,$beg,$end) = @_; my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u); my $pad = 10000; my $rdbH = $self->db_handle; my $minV = $beg - $pad; my $maxV = $end + $pad; if (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND ( maxloc < $maxV) AND ( genome = \'$genome\' ) AND ( contig = \'$contig\' );")) && (@$relational_db_response >= 1)) { @tmp = sort { ($a->[1] cmp $b->[1]) or (($a->[2]+$a->[3]) <=> ($b->[2]+$b->[3])) } map { $feature_id = $_->[0]; $x = $self->feature_location($feature_id); $x ? [$feature_id,&boundaries_of($x)] : () } @$relational_db_response; ($l,$u) = (10000000000,0); foreach $x (@tmp) { ($feature_id,undef,$b1,$e1) = @$x; if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1))) { if (! $self->is_deleted_fid($feature_id)) { push(@feat,$feature_id); $l = &min($l,&min($b1,$e1)); $u = &max($u,&max($b1,$e1)); } } } (@feat <= 0) || return ([@feat],$l,$u); } return ([],$l,$u); } #============================================================================= # Using the following version is better, but it brings out a very annoying # issue with some genomes. It already exists in the current code (above) # for some genes in some genomes. For example, visit fig|70601.1.peg.1. # This is true for any genome that has a feature that crosses the origin. # The root of the problem lies in boundaries_of. I am working on a fix that # replaces boundaries_of with a more sophisticated function. When it is # all done, genes_in_retion should behave as desired. -- GJO, Aug. 22, 2004 #============================================================================= # # =pod # # =head1 genes_in_region # # usage: ( $features_in_region, $min_coord, $max_coord ) # = $fig->genes_in_region( $genome, $contig, $beg, $end ) # # It is often important to be able to find the genes that occur in a specific # region on a chromosome. This routine is designed to provide this information. # It returns all genes that overlap the region ( $genome, $contig, $beg, $end ). # $min_coord is set to the minimum coordinate of the returned genes (which may # preceed the given region), and $max_coord the maximum coordinate. Because # the database is indexed by the leftmost and rightmost coordinates of each # feature, the function makes no assumption about the length of the feature, but # it can (and probably will) miss features spanning multiple contigs. # # =cut # # # sub genes_in_region { # my ( $self, $genome, $contig, $beg, $end ) = @_; # my ( $x, $db_response, $feature_id, $b1, $e1, @tmp, @bounds ); # my ( $min_coord, $max_coord ); # # my @features = (); # my $rdbH = $self->db_handle; # # if ( ( $db_response = $rdbH->SQL( "SELECT id # FROM features # WHERE ( contig = '$contig' ) # AND ( genome = '$genome' ) # AND ( minloc <= $end ) # AND ( maxloc >= $beg );" # ) # ) # && ( @$db_response > 0 ) # ) # { # # The sort is unnecessary, but provides a consistent ordering # # @tmp = sort { ( $a->[1] cmp $b->[1] ) # contig # || ( ($a->[2] + $a->[3] ) <=> ( $b->[2] + $b->[3] ) ) # midpoint # } # map { $feature_id = $_->[0]; # ( ( ! $self->is_deleted_fid( $feature_id ) ) # not deleted # && ( $x = $self->feature_location( $feature_id ) ) # and has location # && ( ( @bounds = boundaries_of( $x ) ) == 3 ) # and has bounds # ) ? [ $feature_id, @bounds ] : () # } @$db_response; # # ( $min_coord, $max_coord ) = ( 10000000000, 0 ); # # foreach $x ( @tmp ) # { # ( $feature_id, undef, $b1, $e1 ) = @$x; # push @features, $feature_id; # my ( $min, $max ) = ( $b1 <= $e1 ) ? ( $b1, $e1 ) : ( $e1, $b1 ); # ( $min_coord <= $min ) || ( $min_coord = $min ); # ( $max_coord >= $max ) || ( $max_coord = $max ); # } # } # # return ( @features ) ? ( [ @features ], $min_coord, $max_coord ) # : ( [], undef, undef ); # } # These will be part of the fix to genes_in_region. -- GJO =pod =head1 regions_spanned usage: ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc ) The location of a feature in a scalar context is contig_b1_e1,contig_b2_e2,... [one contig_b_e for each segment] This routine takes as input a fig location and reduces it to one or more regions spanned by the gene. Unlike boundaries_of, regions_spanned handles wrapping through the orgin, features split over contigs and exons that are not ordered nicely along the chromosome (ugly but true). =cut sub regions_spanned { shift if UNIVERSAL::isa( $_[0], __PACKAGE__ ); my( $location ) = ( @_ == 1 ) ? $_[0] : $_[1]; defined( $location ) || return undef; my @regions = (); my ( $cur_contig, $cur_beg, $cur_end, $cur_dir ); my ( $contig, $beg, $end, $dir ); my @segs = split( /\s*,\s*/, $location ); # should not have space, but ... @segs || return undef; # Process the first segment my $seg = shift @segs; ( ( $cur_contig, $cur_beg, $cur_end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || return undef; $cur_dir = ( $cur_end >= $cur_beg ) ? 1 : -1; foreach $seg ( @segs ) { ( ( $contig, $beg, $end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || next; $dir = ( $end >= $beg ) ? 1 : -1; # Is this a continuation? Update end if ( ( $contig eq $cur_contig ) && ( $dir == $cur_dir ) && ( ( ( $dir > 0 ) && ( $end > $cur_end ) ) || ( ( $dir < 0 ) && ( $end < $cur_end ) ) ) ) { $cur_end = $end; } # Not a continuation. Report previous and update current. else { push @regions, [ $cur_contig, $cur_beg, $cur_end ]; ( $cur_contig, $cur_beg, $cur_end, $cur_dir ) = ( $contig, $beg, $end, $dir ); } } # There should alwasy be a valid, unreported region. push @regions, [ $cur_contig, $cur_beg, $cur_end ]; return wantarray ? @regions : \@regions; } =pod =head1 filter_regions usage: @regions = filter_regions( $contig, $min, $max, @regions ) \@regions = filter_regions( $contig, $min, $max, @regions ) @regions = filter_regions( $contig, $min, $max, \@regions ) \@regions = filter_regions( $contig, $min, $max, \@regions ) This function provides a simple filter for extracting a list of genome regions for those that overlap a particular interval. Region definitions correspond to those produced by regions_spanned. That is, [ contig, beg, end ]. In the function call, either $contig or $min and $max can be undefined (permitting anything). =cut sub filter_regions { my ( $contig, $min, $max, @regions ) = @_; @regions || return (); ( ref( $regions[0] ) eq "ARRAY" ) || return undef; # Is it a region list, or a reference to a region list? if ( ref( $regions[0]->[0] ) eq "ARRAY" ) { @regions = @{ $regions[0] } } if ( ! defined( $contig ) ) { ( defined( $min ) && defined( $max ) ) || return undef; } else # with a defined contig name, allow undefined range { defined( $min ) || ( $min = 1 ); defined( $max ) || ( $max = 1000000000 ); } ( $min <= $max ) || return (); my ( $c, $b, $e ); my @filtered = grep { ( @$_ >= 3 ) # Allow extra fields? && ( ( $c, $b, $e ) = @$_ ) && ( ( ! defined( $contig ) ) || ( $c eq $contig ) ) && ( ( $e >= $b ) || ( ( $b, $e ) = ( $e, $b ) ) ) && ( ( $b <= $max ) && ( $e >= $min ) ) } @regions; return wantarray ? @filtered : \@filtered; } sub close_genes { my($self,$fid,$dist) = @_; # warn "In close_genes, self=$self, fid=$fid"; my $loc = $self->feature_location($fid); if ($loc) { my($contig,$beg,$end) = &FIG::boundaries_of($loc); if ($contig && $beg && $end) { my $min = &min($beg,$end) - $dist; my $max = &max($beg,$end) + $dist; my $feat; ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max); return @$feat; } } return (); } sub adjacent_genes { my ($self, $fid, $dist) = @_; my (@close, $strand, $i); # warn "In adjacent_genes, self=$self, fid=$fid"; $strand = $self->strand_of($fid); $dist = $dist || 2000; @close = $self->close_genes($fid, $dist); for ($i=0; $i < @close; ++$i) { last if ($close[$i] eq $fid); } # RAE note that if $self->strand_of($close[$i-1]) ne $strand then left/right neighbors # were never set! oops! # I think the concept of Left and right is confused here. In my mind, left and right # are independent of strand ?? E.g. take a look at PEG fig|196600.1.peg.1806 # this is something like # # ---> <--1805--- --1806--> <--1807-- <---- # # 1805 is always the left neighbor, no? my ($left_neighbor, $right_neighbor) = ($close[$i-1], $close[$i+1]); if (0) # this was if ($i > 0) I just skip this whole section! { if ($self->strand_of($close[$i-1]) eq $strand) { $left_neighbor = $close[$i-1]; } # else {$left_neighbor=$close[$i+1]} # RAE: this is the alternative that is needed if you do it by strand } if ($i < $#close) { if ($self->strand_of($close[$i+1]) eq $strand) { $right_neighbor = $close[$i+1]; } # else {$right_neighbor = $close[$i-1]} # RAE: this is the alternative that is needed if you do it by strand } # ...return genes in transcription order... if ($strand eq '-') { ($left_neighbor, $right_neighbor) = ($right_neighbor, $left_neighbor); } return ($left_neighbor, $right_neighbor) ; } =pod =head1 feature_location usage: $loc = $fig->feature_location($fid) OR @loc = $fig->feature_location($fid) The location of a feature in a scalar context is contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] In a list context it is (contig_b1_e1,contig_b2_e2,...) =cut sub feature_location :scalar :list { my($self,$feature_id) = @_; my($relational_db_response,$locations,$location); # warn "In feature_location, self=$self, fid=$feature_id"; if ($self->is_deleted_fid($feature_id)) { return undef } $locations = $self->cached('_location'); if (! ($location = $locations->{$feature_id})) { my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response == 1)) { $locations->{$feature_id} = $location = $relational_db_response->[0]->[0]; } } if ($location) { return wantarray() ? split(/,/,$location) : $location; } return undef; } sub contig_of { my ($self, $locus) = @_; $locus =~ m/^([^,]+)_\d+_\d+/; return $1; } sub beg_of { my ($self, $locus) = @_; $locus =~ m/^[^,]+_(\d+)_\d+/; return $1; } sub end_of { my ($self, $locus) = @_; $locus =~ m/\S+_\d+_(\d+)$/; return $1; } sub strand_of { my ($self, $fid) = @_; my ($beg, $end); # warn "In strand_of, self=$self, fid=$fid"; (undef, $beg, $end) = $self->boundaries_of($self->feature_location($fid)); if ($beg < $end) { return '+'; } else { return '-'; } } =pod =head1 find_contig_with_checksum Find a contig in the given genome with the given checksum. =cut sub find_contig_with_checksum { my($self, $genome, $checksum) = @_; # # This implementation scans all the contig files for the organism; when # we have contig checksums in the database this will simplify # significantly. # # For some efficiency, we cache the checksums we compute along the way since # it's probably likely we'll poke at other contigs for this organism. # my $gdir = "$FIG_Config::organisms/$genome"; my $cached_cksum = $self->cached('_contig_checksum'); if (opendir(my $dh, $gdir)) { for my $file (map { "$gdir/$_" } grep { $_ =~ /^contigs\d*$/ } readdir($dh)) { local $/ = "\n>"; if (open(my $fh, "<$file")) { while (<$fh>) { chomp; # # Pull the contig identifier from the first line. # We need the >? to handle the first line in the file; # the others are removed by the chomp above because # $/ is set to "\n>"; # if (s/^>?\s*(\S+)([^\n]*)\n//) { my $ident = $1; my $contig_txt = $_; $contig_txt =~ s/\s//sg; $contig_txt = uc($contig_txt); # # See if we have a cached value. # my $this_checksum; $this_checksum = $cached_cksum->{$genome, $ident}; if (!$this_checksum) { my($rd, $wr, $pid); if (!($pid = open2($rd, $wr, "cksum"))) { die "Cannot run open2 cksum: $!"; } $wr->write($contig_txt, length($contig_txt)); close($wr); $_ = <$rd>; close($rd); waitpid $pid, 0; chomp; my @vals = split(/\s+/, $_); $this_checksum = $vals[0]; $cached_cksum->{$genome, $ident} = $this_checksum; } if ($this_checksum == $checksum) { return $ident; } } } } } } } sub contig_checksum { my($self, $genome, $contig) = @_; my $contig_txt = $self->read_contig($genome, $contig); $contig_txt =~ s/\s//sg; $contig_txt = uc($contig_txt); my($rd, $wr, $pid); if (!($pid = open2($rd, $wr, "cksum"))) { die "Cannot run open2 cksum: $!"; } $wr->write($contig_txt, length($contig_txt)); close($wr); $_ = <$rd>; close($rd); waitpid $pid, 0; chomp; my @vals = split(/\s+/, $_); if (wantarray) { return @vals; } else { return $vals[0]; } } =pod =head1 read_contig Read a single contig from the contigs file. =cut sub read_contig { my($self, $genome, $contig) = @_; # # Read the contig. The database has it in a set of chunks, but we # just use the seek to the starting point, and read up to the next "\n>". # my $ret = $self->db_handle->SQL(qq!SELECT fileno, seek FROM contig_seeks WHERE genome = '$genome' and contig = '$contig' and startn = 0!); if (!$ret or @$ret != 1) { return undef; } my($fileno, $seek) = @{$ret->[0]}; my $file = $self->N2file($fileno); my($fh, $contig_txt); if (!open($fh, "<$file")) { warn "contig_checksum: could not open $file: $!\n"; return undef; } seek($fh, $seek, 0); { local $/ = "\n>"; $contig_txt = <$fh>; chomp($contig_txt); } return $contig_txt; } =pod =head1 boundaries_of usage: ($contig,$beg,$end) = $fig->boundaries_of($loc) The location of a feature in a scalar context is contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] This routine takes as input such a location and reduces it to a single description of the entire region containing the gene. =cut sub boundaries_of { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($location) = (@_ == 1) ? $_[0] : $_[1]; my($contigQ); if (defined($location)) { my @exons = split(/,/,$location); my($contig,$beg,$end); if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) && ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) && ($end = $1)) { return ($contig,$beg,$end); } } return undef; } =pod =head1 all_features_detailed usage: $fig->all_features_detailed($genome) Returns a list of all feature IDs, location, aliases, type in the designated genome. Used in gendb import to speed up the process =cut sub all_features_detailed { my($self,$genome) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT id, location, aliases, type FROM features WHERE (genome = \'$genome\')"); my @features; foreach my $tuple (@$relational_db_response) { push @features, $tuple unless ($self->is_deleted_fid($tuple->[0])); } return \@features; } =pod =head1 all_features usage: $fig->all_features($genome,$type) Returns a list of all feature IDs of a specified type in the designated genome. You would usually use just $fig->pegs_of($genome) or $fig->rnas_of($genome) which simply invoke this routine. =cut sub all_features { my($self,$genome,$type) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE (genome = \'$genome\' AND (type = \'$type\'))"); if (@$relational_db_response > 0) { return grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 pegs_of usage: $fig->pegs_of($genome) Returns a list of all PEGs in the specified genome. Note that order is not specified. =cut sub pegs_of { my($self,$genome) = @_; return $self->all_features($genome,"peg"); } =pod =head1 rnas_of usage: $fig->rnas_of($genome) Returns a list of all RNAs for the given genome. =cut sub rnas_of { my($self,$genome) = @_; return $self->all_features($genome,"rna"); } =pod =head1 feature_aliases usage: @aliases = $fig->feature_aliases($fid) OR $aliases = $fig->feature_aliases($fid) Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature. These must come from the tbl files, so add them there if you want to see them here. In a scalar context, the aliases come back with commas separating them. =cut sub feature_aliases { my($self,$feature_id) = @_; my($rdbH,$relational_db_response,@aliases,$aliases,%aliases,$x); if ($self->is_deleted_fid($feature_id)) { return undef } $rdbH = $self->db_handle; @aliases = (); if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response == 1)) { $aliases = $relational_db_response->[0]->[0]; %aliases = map { $_ => 1 } split(/,/,$aliases); } if (($relational_db_response = $rdbH->SQL("SELECT alias FROM ext_alias WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response > 0)) { foreach $x (@$relational_db_response) { $aliases{$x->[0]} = 1; } } @aliases = sort keys(%aliases); return wantarray() ? @aliases : join(",",@aliases); } =pod =head1 by_alias usage: $peg = $fig->by_alias($alias) Returns a FIG id if the alias can be converted. Right now we convert aliases of the form NP_* (RefSeq IDs) or gi|* (GenBank IDs) =cut sub by_alias { my($self,$alias,$genome) = @_; my($rdbH,$relational_db_response,$peg); my ($peg, $flag) = FIGRules::NormalizeAlias($alias); if ($flag) { return $peg; } else { my $genomeQ = $genome ? quotemeta $genome : ""; $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT id FROM ext_alias WHERE ( alias = ? )", undef, $peg)) && (@$relational_db_response > 0)) { if (@$relational_db_response == 1) { $peg = $relational_db_response->[0]->[0]; return wantarray() ? ($peg) : $peg; } elsif (wantarray()) { return map { $_->[0] } @$relational_db_response; } else { return wantarray() ? () : ""; } } } } sub to_alias { my($self,$fid,$type) = @_; my @aliases = grep { $_ =~ /^$type\|/ } $self->feature_aliases($fid); if (wantarray()) { return @aliases; } elsif (@aliases > 0) { return $aliases[0]; } else { return ""; } } =pod =head1 possibly_truncated usage: $fig->possibly_truncated($fid) Returns true iff the feature occurs near the end of a contig. =cut sub possibly_truncated { my($self,$feature_id) = @_; my($loc); if ($loc = $self->feature_location($feature_id)) { my $genome = $self->genome_of($feature_id); my ($contig,$beg,$end) = $self->boundaries_of($loc); if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end))) { return 0; } } return 1; } sub near_end { my($self,$genome,$contig,$x) = @_; return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300))); } sub is_real_feature { my($self,$fid) = @_; my($relational_db_response); if ($self->is_deleted_fid($fid)) { return 0 } my $rdbH = $self->db_handle; return (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( id = \'$fid\' )")) && (@$relational_db_response == 1)) ? 1 : 0; } ################ Routines to process functional coupling for PEGs ########################## =pod =head1 coupling_and_evidence usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) A computation of couplings and evidence starts with a given peg and produces a list of 3-tuples. Each 3-tuple is of the form [Score,CoupledToFID,Evidence] Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing a "pair of close homologs" of [$peg,CoupledToFID]). The maximum score for a single PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs. If $keep_record is true, the system records the information, asserting coupling for each of the pairs in the set of evidence, and asserting a pin from the given $fd through all of the PCH entries used in forming the score. =cut sub coupling_and_evidence { my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_; my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1); if ($self->is_deleted_fid($feature_id)) { return undef } if ($feature_id =~ /^fig\|(\d+\.\d+)/) { $genome1 = $1; } else { return undef; } my $locations = $self->feature_location($feature_id); my($contig,$beg,$end) = $self->boundaries_of($locations); if (! $contig) { return () } ($neighbors,undef,undef) = $self->genes_in_region($self->genome_of($feature_id), $contig, &min($beg,$end) - $bound, &max($beg,$end) + $bound); if (@$neighbors == 0) { return () } $similar1 = $self->acceptably_close($feature_id,$sim_cutoff); @hits = (); foreach $neigh (grep { $_ =~ /peg/ } @$neighbors) { next if ($neigh eq $feature_id); $similar2 = $self->acceptably_close($neigh,$sim_cutoff); ($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound); if ($sc >= $coupling_cutoff) { push(@hits,[$sc,$neigh,$ev]); } } if ($keep_record) { $self->add_chr_clusters_and_pins($feature_id,\@hits); } return sort { $b->[0] <=> $a->[0] } @hits; } sub fast_coupling { my($self,$peg,$bound,$coupling_cutoff) = @_; my($genome,$genome1,$genome2,$peg1,$peg2,$peg3,%maps,$loc,$loc1,$loc2,$loc3); my($pairs,$sc,%ev); if ($self->is_deleted_fid($peg)) { return undef } my @ans = (); $genome = &genome_of($peg); foreach $peg1 ($self->in_pch_pin_with($peg)) { $peg1 =~ s/,.*$//; if ($peg ne $peg1) { $genome1 = &genome_of($peg1); $maps{$peg}->{$genome1} = $peg1; } } $loc = [&boundaries_of(scalar $self->feature_location($peg))]; foreach $peg1 ($self->in_cluster_with($peg)) { if ($peg ne $peg1) { # print STDERR "peg1=$peg1\n"; $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))]; if (&close_enough($loc,$loc1,$bound)) { foreach $peg2 ($self->in_pch_pin_with($peg1)) { $genome2 = &genome_of($peg2); if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3)) { $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))]; $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))]; if (&close_enough($loc2,$loc3,$bound)) { push(@{$ev{$peg1}},[$peg3,$peg2]); } } } } } } foreach $peg1 (keys(%ev)) { $pairs = $ev{$peg1}; $sc = $self->score([$peg,map { $_->[0] } @$pairs]); if ($sc >= $coupling_cutoff) { push(@ans,[$sc,$peg1]); } } return sort { $b->[0] <=> $a->[0] } @ans; } sub score { my($self,$pegs) = @_; my(@ids); if ($self->{_no9s_scoring}) { @ids = map { $self->maps_to_id($_) } grep { $_ !~ /^fig\|999999/ } @$pegs; } else { @ids = map { $self->maps_to_id($_) } @$pegs; } return &score1($self,\@ids) - 1; } sub score1 { my($self,$pegs) = @_; my($sim); my($first,@rest) = @$pegs; my $count = 1; my %hits = map { $_ => 1 } @rest; my @ordered = sort { $b->[0] <=> $a->[0] } map { $sim = $_; [$sim->iden,$sim->id2] } grep { $hits{$_->id2} } $self->sims($first,1000,1,"raw"); my %ordered = map { $_->[1] => 1 } @ordered; foreach $_ (@rest) { if (! $ordered{$_}) { push(@ordered,[0,$_]); } } while ((@ordered > 0) && ($ordered[0]->[0] >= 97)) { shift @ordered ; } while (@ordered > 0) { my $start = $ordered[0]->[0]; $_ = shift @ordered; my @sub = ( $_->[1] ); while ((@ordered > 0) && ($ordered[0]->[0] > ($start-3))) { $_ = shift @ordered; push(@sub, $_->[1]); } if (@sub == 1) { $count++; } else { $count += &score1($self,\@sub); } } return $count; } =pod =head1 add_chr_clusters_and_pins usage: $fig->add_chr_clusters_and_pins($peg,$hits) The system supports retaining data relating to functional coupling. If a user computes evidence once and then saves it with this routine, data relating to both "the pin" and the "clusters" (in all of the organisms supporting the functional coupling) will be saved. $hits must be a pointer to a list of 3-tuples of the sort returned by $fig->coupling_and_evidence. =cut sub add_chr_clusters_and_pins { my($self,$peg,$hits) = @_; my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection); my($genome,$cluster,$pin,$peg2); if (@$hits > 0) { @clusters = (); @pins = (); push(@clusters,[$peg,map { $_->[1] } @$hits]); foreach $x (@$hits) { ($sc,$neigh,$pairs) = @$x; push(@pins,[$neigh,map { $_->[1] } @$pairs]); foreach $y (@$pairs) { $peg2 = $y->[0]; if ($peg2 =~ /^fig\|(\d+\.\d+)/) { $projection{$1}->{$peg2} = 1; } } } @corr = (); @orgs = keys(%projection); if (@orgs > 0) { foreach $genome (sort { $a <=> $b } @orgs) { push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}})); } push(@pins,[$peg,@corr]); } foreach $cluster (@clusters) { $self->add_chromosomal_cluster($cluster); } foreach $pin (@pins) { $self->add_pch_pin($pin); } } } sub coupling_ev { my($self,$genome1,$sim1,$sim2,$bound) = @_; my($ev,$sc,$i,$j); $ev = []; $i = 0; $j = 0; while (($i < @$sim1) && ($j < @$sim2)) { if ($sim1->[$i]->[0] < $sim2->[$j]->[0]) { $i++; } elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0]) { $j++; } else { $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev); $i++; $j++; } } return ($self->score([map { $_->[0] } @$ev]),$ev); } sub accumulate_ev { my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_; my($genome2,@locs1,@locs2,$i,$j,$x); if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 } $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/; $genome2 = $1; @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1; @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2; for ($i=0; ($i < @$feature_ids1); $i++) { for ($j=0; ($j < @$feature_ids2); $j++) { if (($feature_ids1->[$i] ne $feature_ids2->[$j]) && &close_enough($locs1[$i],$locs2[$j],$bound)) { push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]); } } } } sub close_enough { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($locs1,$locs2,$bound) = @_; # print STDERR &Dumper(["close enough",$locs1,$locs2]); return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound)); } sub acceptably_close { my($self,$feature_id,$sim_cutoff) = @_; my(%by_org,$id2,$genome,$sim); my($ans) = []; foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig")) { $id2 = $sim->id2; if ($id2 =~ /^fig\|(\d+\.\d+)/) { my $genome = $1; if (! $self->is_eukaryotic($genome)) { push(@{$by_org{$genome}},$id2); } } } foreach $genome (sort { $a <=> $b } keys(%by_org)) { push(@$ans,[$genome,$by_org{$genome}]); } return $ans; } ################ Translations of PEGsand External Protein Sequences ########################## =pod =head1 translatable usage: $fig->translatable($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine simply returns true iff info on the translation exists. =cut sub translatable { my($self,$prot) = @_; return &translation_length($self,$prot) ? 1 : 0; } =pod =head1 translation_length usage: $len = $fig->translation_length($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine returns the length of a translation. This does not require actually retrieving the translation. =cut sub translation_length { my($self,$prot) = @_; if ($self->is_deleted_fid($prot)) { return undef } $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT slen,seek FROM protein_sequence_seeks WHERE id = \'$prot\' "); my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response; return (@vals > 0) ? $vals[0]->[0] : undef; } =pod =head1 get_translation usage: $translation = $fig->get_translation($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine returns a protein sequence. =cut sub get_translation { my($self,$id) = @_; my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran); if ($self->is_deleted_fid($id)) { return '' } $rdbH = $self->db_handle; $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); if ($relational_db_response && @$relational_db_response > 0) { my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response; ($fileN,$seek,$ln) = @{$vals[0]}; if (($fh = $self->openF($self->N2file($fileN))) && ($ln > 10)) { seek($fh,$seek,0); read($fh,$tran,$ln-1); $tran =~ s/\s//g; return $tran; } } return ''; } =pod =head1 mapped_prot_ids usage: @mapped = $fig->mapped_prot_ids($prot) This routine is at the heart of maintaining synonyms for protein sequences. The system determines which protein sequences are "essentially the same". These may differ in length (presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended). Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted by length. =cut sub mapped_prot_ids { my($self,$id) = @_; if ($self->is_deleted_fid($id)) { return () } my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); if ($relational_db_response && (@$relational_db_response == 1)) { $id = $relational_db_response->[0]->[0]; } $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' "); if ($relational_db_response && (@$relational_db_response > 0)) { return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response); } else { return ([$id,$self->translation_length($id)]); } } sub maps_to_id { my($self,$id) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); return ($relational_db_response && (@$relational_db_response == 1)) ? $relational_db_response->[0]->[0] : $id; } ################ Assignments of Function to PEGs ########################## # set to undef to unset user # sub set_user { my($self,$user) = @_; $self->{_user} = $user; } sub get_user { my($self) = @_; return $self->{_user}; } =pod =head1 function_of usage: @functions = $fig->function_of($peg) OR $function = $fig->function_of($peg,$user) In a list context, you get back a list of 2-tuples. Each 2-tuple is of the form [MadeBy,Function]. In a scalar context, 1. user is "master" if not specified 2. function returned is the user's, if one exists; otherwise, master's, if one exists In a scalar context, you get just the function. =cut # Note that we do not return confidence. I propose a separate function to get both # function and confidence # sub function_of { my($self,$id,$user) = @_; my($relational_db_response,@tmp,$entry,$i); my $wantarray = wantarray(); my $rdbH = $self->db_handle; if ($self->is_deleted_fid($id)) { return $wantarray ? () : "" } if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user)) { if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) && (@$relational_db_response >= 1)) { @tmp = sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/^\s//; $_->[1] =~ s/(\t\S)?\s*$//; [$_->[0],$_->[1]] } @$relational_db_response; for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {} if ($i < @tmp) { $entry = splice(@tmp,$i,1); unshift @tmp, ($entry); } my $val; if ($wantarray) { return @tmp } elsif ($user && ($val = &extract_by_who(\@tmp,$user))) { return $val } elsif ($user && ($val = &extract_by_who(\@tmp,"master"))) { return $val } else { return "" } } } else { if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) && (@$relational_db_response >= 1)) { $relational_db_response->[0]->[0] =~ s/^\s//; $relational_db_response->[0]->[0] =~ s/(\t\S)?\s*$//; return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0]; } } return $wantarray ? () : ""; } =pod =head1 translated_function_of usage: $function = $fig->translated_function_of($peg,$user) You get just the translated function. =cut sub translated_function_of { my($self,$id,$user) = @_; my $func = $self->function_of($id,$user); if ($func) { $func = $self->translate_function($func); } return $func; } sub extract_by_who { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($xL,$who) = @_; my($i); for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {} return ($i < @$xL) ? $xL->[$i]->[1] : ""; } =pod =head1 translate_function usage: $translated_func = $fig->translate_function($func) Translates a function based on the function.synonyms table. =cut sub translate_function { my($self,$function) = @_; my ($tran,$from,$to,$line); if (! ($tran = $self->{_function_translation})) { $tran = {}; if (open(TMP,"<$FIG_Config::global/function.synonyms")) { while (defined($line = )) { chomp $line; ($from,$to) = split(/\t/,$line); $tran->{$from} = $to; } close(TMP); } foreach $from (keys(%$tran)) { $to = $tran->{$from}; if ($tran->{$to}) { delete $tran->{$from}; } } $self->{_function_translation} = $tran; } while ($to = $tran->{$function}) { $function = $to; } return $function; } =pod =head1 assign_function usage: $fig->assign_function($peg,$user,$function,$confidence) Assigns a function. Note that confidence can (and should be if unusual) included. Note that no annotation is written. This should normally be done in a separate call of the form =cut sub assign_function { my($self,$peg,$user,$function,$confidence) = @_; my($role,$roleQ); if (! $self->is_real_feature($peg)) { return 0 } my $rdbH = $self->db_handle; $confidence = $confidence ? $confidence : ""; my $genome = $self->genome_of($peg); $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); my $funcQ = quotemeta $function; $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )"); $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); foreach $role (&roles_of_function($function)) { $roleQ = quotemeta $role; $rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\', \'$genome\' )"); } &verify_dir("$FIG_Config::organisms/$genome/UserModels"); if ($user ne "master") { &verify_dir("$FIG_Config::organisms/$genome/UserModels/$user"); } my $file; if ((($user eq "master") && ($file = "$FIG_Config::organisms/$genome/assigned_functions") && open(TMP,">>$file")) || (($user ne "master") && ($file = "$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions") && open(TMP,">>$file"))) { flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; print TMP "$peg\t$function\t$confidence\n"; close(TMP); chmod(0777,$file); return 1; } else { print STDERR "FAILED ASSIGNMENT: $peg\t$function\t$confidence\n"; } return 0; } sub hypo { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $x = (@_ == 1) ? $_[0] : $_[1]; if (! $x) { return 1 } if ($x =~ /hypoth/i) { return 1 } if ($x =~ /conserved protein/i) { return 1 } if ($x =~ /gene product/i) { return 1 } if ($x =~ /interpro/i) { return 1 } if ($x =~ /B[sl][lr]\d/i) { return 1 } if ($x =~ /^U\d/) { return 1 } if ($x =~ /^orf/i) { return 1 } if ($x =~ /uncharacterized/i) { return 1 } if ($x =~ /psedogene/i) { return 1 } if ($x =~ /^predicted/i) { return 1 } if ($x =~ /AGR_/) { return 1 } if ($x =~ /similar to/i) { return 1 } if ($x =~ /similarity/i) { return 1 } if ($x =~ /glimmer/i) { return 1 } if ($x =~ /unknown/i) { return 1 } return 0; } ############################ Similarities ############################### =pod =head1 sims usage: @sims = $fig->sims($peg,$maxN,$maxP,$select) Returns a list of similarities for $peg such that there will be at most $maxN similarities, each similarity will have a P-score <= $maxP, and $select gives processing instructions: "raw" means that the similarities will not be expanded (by far fastest option) "fig" means return only similarities to fig genes "all" means that you want all the expanded similarities. By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant protein collection, and converting it to a set of similarities (one for each of the proteins that are essentially identical to the representative in the nr). Each entry in @sims is a refence to an array. These are the values in each array position: 0. The query peg 1. The similar peg 2. The percent id 3. Alignment length 4. Mismatches 5. Gap openings 6. The start of the match in the query peg 7. The end of the match in the query peg 8. The start of the match in the similar peg 9. The end of the match in the similar peg 10. E value 11. Bit score 12. Length of query peg 13. Length of similar peg 14. Method =cut sub sims { my ($self,$id,$maxN,$maxP,$select,$max_expand) = @_; my($sim); $max_expand = defined($max_expand) ? $max_expand : $maxN; if ($self->is_deleted_fid($id)) { return () } my @sims = (); my @maps_to = $self->mapped_prot_ids($id); if (@maps_to > 0) { my $rep_id = $maps_to[0]->[0]; my @entry = grep { $_->[0] eq $id } @maps_to; if ((@entry == 1) && defined($entry[0]->[1])) { if ((! defined($maps_to[0]->[1])) || (! defined($entry[0]->[1]))) { print STDERR &Dumper(\@maps_to,\@entry); confess "bad"; } my $delta = $maps_to[0]->[1] - $entry[0]->[1]; my @raw_sims = &get_raw_sims($self,$rep_id,$maxN,$maxP); if ($id ne $rep_id) { foreach $sim (@raw_sims) { $sim->[0] = $id; $sim->[6] -= $delta; $sim->[7] -= $delta; } } if (($max_expand > 0) && ($select ne "raw")) { unshift(@raw_sims,bless([$id, $rep_id, 100.00, $entry[0]->[1], 0, 0, 1,$entry[0]->[1], $delta+1,$maps_to[0]->[1], 0.0, 2 * $entry[0]->[1], $entry[0]->[1], $maps_to[0]->[1], "blastp", undef, undef ],'Sim')); $max_expand++; } @sims = grep { $_->id1 ne $_->id2 } &expand_raw_sims($self,\@raw_sims,$maxP,$select,0,$max_expand); } } return grep { ! $self->is_deleted_fid($_->id2) } @sims; } sub expand_raw_sims { my($self,$raw_sims,$maxP,$select,$dups,$max_expand) = @_; my($sim,$id2,%others,$x); my @sims = (); foreach $sim (@$raw_sims) { next if ($sim->psc > $maxP); $id2 = $sim->id2; next if ($others{$id2} && (! $dups)); $others{$id2} = 1; if (($select && ($select eq "raw")) || ($max_expand <= 0)) { push(@sims,$sim); } else { my @relevant; $max_expand--; my @maps_to = $self->mapped_prot_ids($id2); if ((! $select) || ($select eq "fig")) { @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; } elsif ($select && ($select =~ /^ext/i)) { @relevant = grep { $_->[0] !~ /^fig/ } @maps_to; } else { @relevant = @maps_to; } foreach $x (@relevant) { my $sim1 = [@$sim]; my($x_id,$x_ln) = @$x; defined($x_ln) || confess "x_ln id2=$id2 x_id=$x_id"; defined($maps_to[0]->[1]) || confess "maps_to"; my $delta2 = $maps_to[0]->[1] - $x_ln; $sim1->[1] = $x_id; $sim1->[8] -= $delta2; $sim1->[9] -= $delta2; bless($sim1,"Sim"); push(@sims,$sim1); } } } return @sims; } sub get_raw_sims { my($self,$rep_id,$maxN,$maxP) = @_; my(@sims,$seek,$fileN,$ln,$fh,$file,$readC,@lines,$i,$sim); my($sim_chunk,$psc,$id2); $maxN = $maxN ? $maxN : 500; @sims = (); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' "); foreach $sim_chunk (@$relational_db_response) { ($seek,$fileN,$ln) = @$sim_chunk; $file = $self->N2file($fileN); $fh = $self->openF($file); if (! $fh) { confess "could not open sims for $file"; } $readC = &read_block($fh,$seek,$ln-1); @lines = grep { (@$_ == 15) && ($_->[12] =~ /^\d+$/) && ($_->[13] =~ /^\d+$/) && ($_->[6] =~ /^\d+$/) && ($_->[7] =~ /^\d+$/) && ($_->[8] =~ /^\d+$/) && ($_->[9] =~ /^\d+$/) && ($_->[2] =~ /^[0-9.]+$/) && ($_->[10] =~ /^[0-9.e-]+$/) } map { [split(/\t/,$_),"blastp"] } @$readC; @lines = sort { $b->[11] <=> $a->[11] } @lines; for ($i=0; ($i < @lines); $i++) { $psc = $lines[$i]->[10]; $id2 = $lines[$i]->[1]; if ($maxP >= $psc) { $sim = $lines[$i]; bless($sim,"Sim"); push(@sims,$sim); if (@sims == $maxN) { return @sims } } } } return @sims; } sub read_block { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($fh,$seek,$ln) = @_; my($piece,$readN); seek($fh,$seek,0); my @lines = (); my $leftover = ""; while ($ln > 0) { my $ln1 = ($ln <= 10000) ? $ln : 10000; $readN = read($fh,$piece,$ln1); ($readN == $ln1) || confess "could not read the block of sims at $seek for $ln1 characters; $readN actually read"; my @tmp = split(/\n/,$piece); if ($leftover) { $tmp[0] = $leftover . $tmp[0]; } if (substr($piece,-1) eq "\n") { $leftover = ""; } else { $leftover = pop @tmp; } push(@lines,@tmp); $ln -= 10000; } if ($leftover) { push(@lines,$leftover) } return \@lines; } sub bbhs { my($self,$peg,$cutoff,$frac_match) = @_; my($sim,$peg2,$genome2,$i,@sims2,%seen); if ($self->is_deleted_fid($peg)) { return () } $frac_match = defined($frac_match) ? $frac_match : 0; $cutoff = defined($cutoff) ? $cutoff : 1.0e-10; my @bbhs = (); my @precomputed = (); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT seek, others FROM bbhs WHERE peg = \'$peg\' "); if (@$relational_db_response == 1) { my($seek_set,$others) = @{$relational_db_response->[0]}; if (open(CORES,"<$FIG_Config::global/bbh.cores")) { my $seek; foreach $seek (split(/,/,$seek_set)) { seek(CORES,$seek,0); $_ = ; chop; push(@precomputed,split(/,/,$_)); } close(CORES); } push(@precomputed,split(/,/,$others)); } my %bbhs = map { $_ => 1 } @precomputed; foreach $sim ($self->sims($peg,10000,$cutoff,"fig")) { $peg2 = $sim->id2; my $frac = &FIG::min(($sim->e1+1 - $sim->b1) / $sim->ln1, ($sim->e2+1 - $sim->b2) / $sim->ln2); if ($bbhs{$peg2} && ($frac >= $frac_match)) { push(@bbhs,[$peg2,$sim->psc]); } } return @bbhs; } # # Modeled after the Sprout call of the same name. # sub bbh_list { my($self, $genome, $features) = @_; my $cutoff = 1.0e-10; my $out = {}; for my $feature (@$features) { my @bbhs = $self->bbhs($feature, $cutoff); $out->{$feature} = [grep { /fig\|$genome\.peg/ } map { $_->[0] } @bbhs]; } return $out; } =pod =head1 dsims usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select) Returns a list of similarities for $peg such that there will be at most $maxN similarities, each similarity will have a P-score <= $maxP, and $select gives processing instructions: "raw" means that the similarities will not be expanded (by far fastest option) "fig" means return only similarities to fig genes "all" means that you want all the expanded similarities. By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant protein collection, and converting it to a set of similarities (one for each of the proteins that are essentially identical to the representative in the nr). The "dsims" or "dynamic sims" are not precomputed. They are computed using a heuristic which is much faster than blast, but misses some similarities. Essentially, you have an "index" or representative sequences, a quick blast is done against it, and if there are any hits these are used to indicate which sub-databases to blast against. =cut sub dsims { my($self,$id,$seq,$maxN,$maxP,$select) = @_; my($sim,$sub_dir,$db,$hit,@hits,%in); my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3); foreach $sim (@index) { if ($sim->id2 =~ /_(\d+)$/) { $in{$1}++; } } @hits = (); foreach $db (keys(%in)) { $sub_dir = $db % 1000; push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP)); } if (@hits == 0) { push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP)); } @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits; if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 } return &expand_raw_sims($self,\@hits,$maxP,$select); } sub blastit { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($id,$seq,$db,$maxP) = @_; if (! $maxP) { $maxP = 1.0e-5 } my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP"); my $tmp1 = $tmp->{$id}; if ($tmp1) { return @$tmp1; } return (); } sub related_by_func_sim { my($self,$peg,$user) = @_; my($func,$sim,$id2,%related); if ($self->is_deleted_fid($peg)) { return () } if (($func = $self->function_of($peg,$user)) && (! &FIG::hypo($func))) { foreach $sim ($self->sims($peg,500,1,"fig",500)) { $id2 = $sim->id2; if ($func eq $self->function_of($id2,$user)) { $related{$id2} = 1; } } } return keys(%related); } ################################# chromosomal clusters #################################### =pod =head1 in_cluster_with usage: @pegs = $fig->in_cluster_with($peg) Returns the set of pegs that are thought to be clustered with $peg (on the chromosome). =cut sub in_cluster_with { my($self,$peg) = @_; my($set,$id,%in); return $self->in_set_with($peg,"chromosomal_clusters","cluster_id"); } =pod =head1 add_chromosomal_clusters usage: $fig->add_chromosomal_clusters($file) The given file is supposed to contain one predicted chromosomal cluster per line (either comma or tab separated pegs). These will be added (to the extent they are new) to those already in $FIG_Config::global/chromosomal_clusters. =cut sub add_chromosomal_clusters { my($self,$file) = @_; my($set,$added); open(TMPCLUST,"<$file") || die "aborted"; while (defined($set = )) { print STDERR "."; chomp $set; $added += $self->add_chromosomal_cluster([split(/[\t,]+/,$set)]); } close(TMPCLUST); if ($added) { my $rdbH = $self->db_handle; $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); return 1; } return 0; } #=pod # #=head1 export_chromosomal_clusters # #usage: $fig->export_chromosomal_clusters # #Invoking this routine writes the set of chromosomal clusters as known in the #relational DB back to $FIG_Config::global/chromosomal_clusters. # #=cut # sub export_chromosomal_clusters { my($self) = @_; $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); } sub add_chromosomal_cluster { my($self,$ids) = @_; my($id,$set,%existing,%in,$new,$existing,$new_id); # print STDERR "adding cluster ",join(",",@$ids),"\n"; foreach $id (@$ids) { foreach $set ($self->in_sets($id,"chromosomal_clusters","cluster_id")) { $existing{$set} = 1; foreach $id ($self->ids_in_set($set,"chromosomal_clusters","cluster_id")) { $in{$id} = 1; } } } # print &Dumper(\%existing,\%in); $new = 0; foreach $id (@$ids) { if (! $in{$id}) { $in{$id} = 1; $new++; } } # print STDERR "$new new ids\n"; if ($new) { foreach $existing (keys(%existing)) { $self->delete_set($existing,"chromosomal_clusters","cluster_id"); } $new_id = $self->next_set("chromosomal_clusters","cluster_id"); # print STDERR "adding new cluster $new_id\n"; $self->insert_set($new_id,[keys(%in)],"chromosomal_clusters","cluster_id"); return 1; } return 0; } ################################# PCH pins #################################### =pod =head1 in_pch_pin_with usage: $fig->in_pch_pin_with($peg) Returns the set of pegs that are believed to be "pinned" to $peg (in the sense that PCHs occur containing these pegs over significant phylogenetic distances). =cut sub in_pch_pin_with { my($self,$peg) = @_; my($set,$id,%in); return $self->in_set_with($peg,"pch_pins","pin"); } =pod =head1 add_pch_pins usage: $fig->add_pch_pins($file) The given file is supposed to contain one set of pinned pegs per line (either comma or tab seprated pegs). These will be added (to the extent they are new) to those already in $FIG_Config::global/pch_pins. =cut sub add_pch_pins { my($self,$file) = @_; my($set,$added); open(TMPCLUST,"<$file") || die "aborted"; while (defined($set = )) { print STDERR "."; chomp $set; my @tmp = split(/[\t,]+/,$set); if (@tmp < 200) { $added += $self->add_pch_pin([@tmp]); } } close(TMPCLUST); if ($added) { my $rdbH = $self->db_handle; $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); return 1; } return 0; } sub export_pch_pins { my($self) = @_; $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); } sub add_pch_pin { my($self,$ids) = @_; my($id,$set,%existing,%in,$new,$existing,$new_id); # print STDERR "adding cluster ",join(",",@$ids),"\n"; foreach $id (@$ids) { foreach $set ($self->in_sets($id,"pch_pins","pin")) { $existing{$set} = 1; foreach $id ($self->ids_in_set($set,"pch_pins","pin")) { $in{$id} = 1; } } } # print &Dumper(\%existing,\%in); $new = 0; foreach $id (@$ids) { if (! $in{$id}) { $in{$id} = 1; $new++; } } if ($new) { if (keys(%in) < 300) { foreach $existing (keys(%existing)) { $self->delete_set($existing,"pch_pins","pin"); } $new_id = $self->next_set("pch_pins","pin"); # print STDERR "adding new pin $new_id\n"; $self->insert_set($new_id,[keys(%in)],"pch_pins","pin"); } else { $new_id = $self->next_set("pch_pins","pin"); # print STDERR "adding new pin $new_id\n"; $self->insert_set($new_id,$ids,"pch_pins","pin"); } return 1; } return 0; } ################################# Annotations #################################### =pod =head1 add_annotation usage: $fig->add_annotation($fid,$user,$annotation) $annotation is added as a time-stamped annotation to $peg showing $user as the individual who added the annotation. =cut sub add_annotation { my($self,$feature_id,$user,$annotation) = @_; my($genome); if ($self->is_deleted_fid($feature_id)) { return 0 } # print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n"; if ($genome = $self->genome_of($feature_id)) { my $file = "$FIG_Config::organisms/$genome/annotations"; my $fileno = $self->file2N($file); my $time_made = time; my $ma = ($annotation =~ /^Set master function to/); if (open(TMP,">>$file")) { flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; my $seek1 = tell TMP; print TMP "$feature_id\n$time_made\n$user\n$annotation", (substr($annotation,-1) eq "\n") ? "" : "\n","//\n"; my $seek2 = tell TMP; close(TMP); chmod 0777, $file; my $ln = ($seek2 - $seek1) - 3; my $rdbH = $self->db_handle; if ($rdbH->SQL("INSERT INTO annotation_seeks ( fid, dateof, who, ma, fileno, seek, len ) VALUES ( \'$feature_id\', $time_made, \'$user\', \'$ma\', $fileno, $seek1, $ln )")) { return 1; } } } return 0; } =pod =head1 merged_related_annotations usage: @annotations = $fig->merged_related_annotations($fids) The set of annotations of a set of PEGs ($fids) is returned as a list of 4-tuples. Each entry in the list is of the form [$fid,$timestamp,$user,$annotation]. =cut sub merged_related_annotations { my($self,$fids) = @_; my($fid); my(@ann) = (); foreach $fid (@$fids) { push(@ann,$self->feature_annotations1($fid)); } return map { $_->[1] = localtime($_->[1]); $_ } sort { $a->[1] <=> $b->[1] } @ann; } =pod =head1 feature_annotations usage: @annotations = $fig->feature_annotations($fid) The set of annotations of $fid is returned as a list of 4-tuples. Each entry in the list is of the form [$fid,$timestamp,$user,$annotation]. =cut sub feature_annotations { my($self,$feature_id,$rawtime) = @_; if ($self->is_deleted_fid($feature_id)) { return () } if ($rawtime) { return $self->feature_annotations1($feature_id); } else { return map { $_->[1] = localtime($_->[1]); $_ } $self->feature_annotations1($feature_id); } } sub feature_annotations1 { my($self,$feature_id) = @_; my($tuple,$fileN,$seek,$ln,$annotation,$feature_idQ); my($file,$fh); if ($self->is_deleted_fid($feature_id)) { return () } my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM annotation_seeks WHERE fid = \'$feature_id\' "); my @annotations = (); foreach $tuple (@$relational_db_response) { ($fileN,$seek,$ln) = @$tuple; $annotation = $self->read_annotation($fileN,$seek,$ln); $feature_idQ = quotemeta $feature_id; if ($annotation =~ /^$feature_idQ\n(\d+)\n([^\n]+)\n(.*)/s) { push(@annotations,[$feature_id,$1,$2,$3]); } else { print STDERR "malformed annotation\n$annotation\n"; } } return sort { $a->[1] <=> $b->[1] } @annotations; } sub read_annotation { my($self,$fileN,$seek,$ln) = @_; my($readN,$readC); my $file = $self->N2file($fileN); my $fh = $self->openF($file); if (! $fh) { confess "could not open annotations for $file"; } seek($fh,$seek,0); $readN = read($fh,$readC,$ln); ($readN == $ln) || confess "could not read the block of annotations at $seek for $ln characters; $readN actually read from $file\n$readC"; return $readC; } sub epoch_to_readable { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($epoch) = @_; my($sec,$min,$hr,$dd,$mm,$yr) = localtime($epoch); $mm++; $yr += 1900; return "$mm-$dd-$yr:$hr:$min:$sec"; } # # This now calls assignments_made_full and remaps the output. # sub assignments_made { my($self,$genomes,$who,$date) = @_; my @a = $self->assignments_made_full($genomes, $who, $date); return map { [ @{$_}[0,1]] } @a; } # # Looks up and returns assignments made; return is a list of # tuples [peg, assignment, date, who] # sub assignments_made_full { my($self,$genomes,$who,$date) = @_; my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); my($epoch_date,$when,%sofar,$x); if (! defined($genomes)) { $genomes = [$self->genomes] } my %genomes = map { $_ => 1 } @$genomes; if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) { my($mm,$dd,$yyyy) = ($1,$2,$3); $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); } elsif ($date =~ /^\d+$/) { $epoch_date = $date; } else { $epoch_date = 0; } $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; my @assignments = (); my $rdbH = $self->db_handle; if ($who eq "master") { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); } else { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); } if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid))) { if ($len < 4) { print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n"; next; } $ann = $self->read_annotation($fileno,$seek,$len); if (($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) && (($who eq $3) || (($4 eq "master ") && ($who eq "master"))) && ($2 >= $epoch_date)) { if ((! $sofar{$1}) || (($x = $sofar{$1}) && ($when > $x->[0]))) { $sofar{$1} = [$when, $5, $3]; } } } } } @assignments = map { $x = $sofar{$_}; [$_,$x->[1], $x->[0], $x->[2]] } keys(%sofar); return @assignments; } sub assignments_made_for_protein { my($self, $fid) = @_; my($relational_db_response,$entry,$fileno,$seek,$len,$ann); my($epoch_date,$when,%sofar,$x); if ($self->is_deleted_fid($fid)) { return () } my @assignments = (); my $rdbH = $self->db_handle; $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (fid = '$fid')"); if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if ($len < 4) { print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n"; next; } $ann = $self->read_annotation($fileno,$seek,$len); if (my ($peg, $when, $who, $what, $func) = $ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) { push(@assignments, [$peg, $when, $who, $what, $func]); } } } return @assignments; } =pod =head1 annotations_made usage: @annotations = $fig->annotations_made($genomes, $who, $date) Return the list of annotations on the genomes in @$genomes made by $who after $date. Each returned annotation is of the form [$fid,$timestamp,$user,$annotation]. =cut sub annotations_made { my($self,$genomes,$who,$date) = @_; my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); my($epoch_date,$when,@annotations); if (! defined($genomes)) { $genomes = [$self->genomes] } my %genomes = map { $_ => 1 } @$genomes; if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) { my($mm,$dd,$yyyy) = ($1,$2,$3); $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); } elsif ($date =~ /^\d+$/) { $epoch_date = $date; } else { $epoch_date = 0; } $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; @annotations = (); my $rdbH = $self->db_handle; if ($who eq "master") { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); } else { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); } if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid))) { $ann = $self->read_annotation($fileno,$seek,$len); if ($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\n(.*\S)/s) { push(@annotations,[$1,$2,$3,$4]); } } } } return @annotations; } sub feature_attributes { my($self,$fid) = @_; my($rdbH,$relational_db_response); $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT tag,val,url FROM attribute WHERE ( fid = \'$fid\' )")) && (@$relational_db_response > 0)) { return @$relational_db_response; } else { return (); } } ################################# Indexing Features and Functional Roles #################################### =pod =head1 search_index usage: ($pegs,$roles) = $fig->search_pattern($pattern) All pegs that "match" $pattern are put into a list, and $pegs will be a pointer to that list. All roles that "match" $pattern are put into a list, and $roles will be a pointer to that list. The notion of "match $pattern" is intentionally left undefined. For now, you will probably get only entries in which each word id $pattern occurs exactly, but that is not a long term commitment. =cut sub search_index { my($self,$pattern) = @_; my($patternQ,@raw,@pegs,@roles); &clean_tmp; $patternQ = $pattern; $patternQ =~ s/\s+/;/g; $patternQ =~ s/\./\\./g; # print STDERR "pattern=$pattern patternQ=$patternQ\n"; @raw = `$FIG_Config::ext_bin/glimpse -y -H $FIG_Config::data/Indexes -i -w \'$patternQ\'`; @pegs = grep { ! $self->is_deleted_fid($_->[0]) } sort { &FIG::by_fig_id($a->[0],$b->[0]) } map { $_ =~ s/^\S+:\s+//; [split(/\t/,$_)] } grep { $_ =~ /^\S+peg.index/ } @raw; my %roles = map { $_ =~ s/^\S+:\s+//; $_ => 1} grep { $_ =~ /^\S+role.index/ } @raw; @roles = sort keys(%roles); return ([@pegs],[@roles]); } ################################# Loading Databases #################################### #=pod # #=head1 load_all # #usage: load_all # #This function is supposed to reload all entries into the database and do #whatever is required to properly support indexing of pegs and roles. # #=cut sub load_all { shift if UNIVERSAL::isa($_[0],__PACKAGE__); print STDERR "\nLoading SEED data\n\n"; my @packages = qw(load_peg_mapping index_contigs compute_genome_counts load_features index_sims index_translations add_assertions_of_function load_protein_families load_external_orgs load_chromosomal_clusters load_pch_pins index_neighborhoods index_annotations load_ec_names init_maps load_kegg load_distances make_indexes format_peg_dbs load_links index_subsystems load_attributes load_bbhs load_literature ); my $pn = @packages; for my $i (0..@packages - 1) { my $i1 = $i + 1; my $pkg = $packages[$i]; print "Running $pkg ($i1 of $pn)\n"; &run($pkg); } print "\n\nLoad complete.\n\n"; } ################################# Automated Assignments #################################### =pod =head1 auto_assign usage: $assignment = &FIG::auto_assign($peg,$seq) This returns an automated assignment for $peg. $seq is optional; if it is not present, then it is assumed that similarities already exist for $peg. $assignment is set to either Function or Function\tW if it is felt that the assertion is pretty weak. =cut sub auto_assign { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($peg,$seq) = @_; my $cmd = $seq ? "echo \"$peg\t$seq\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls" : "echo \"$peg\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls"; # print STDERR $cmd; my(@tmp) = `$cmd`; if ((@tmp == 1) && ($tmp[0] =~ /^\S+\t(\S.*\S)/)) { return $1; } else { return "hypothetical protein"; } } ################################# Protein Families #################################### =pod =head1 all_protein_families usage: @all = $fig->all_protein_families Returns a list of the ids of all of the protein families currently defined. =cut sub all_protein_families { my($self) = @_; return $self->all_sets("protein_families","family"); } =pod =head1 ids_in_family usage: @pegs = $fig->ids_in_family($family) Returns a list of the pegs in $family. =cut sub ids_in_family { my($self,$family) = @_; return $self->ids_in_set($family,"protein_families","family"); } =pod =head1 family_function usage: $func = $fig->family_function($family) Returns the putative function of all of the pegs in $family. Remember, we are defining "protein family" as a set of homologous proteins that have the same function. =cut sub family_function { my($self,$family) = @_; my($relational_db_response); my $rdbH = $self->db_handle; defined($family) || confess "family is missing"; if (($relational_db_response = $rdbH->SQL("SELECT function FROM family_function WHERE ( family = $family)")) && (@$relational_db_response >= 1)) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 sz_family usage: $n = $fig->sz_family($family) Returns the number of pegs in $family. =cut sub sz_family { my($self,$family) = @_; return $self->sz_set($family,"protein_families","family"); } =pod =head1 in_family usage: @pegs = $fig->in_family($family) Returns the pegs in $family. =cut sub in_family { my($self,$id) = @_; my @in = $self->in_sets($id,"protein_families","family"); return (@in > 0) ? $in[0] : ""; } ################################# Abstract Set Routines #################################### sub all_sets { my($self,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT $set_name FROM $relation")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } sub next_set { my($self,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT MAX($set_name) FROM $relation")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0] + 1; } } sub ids_in_set { my($self,$which,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (defined($which) && ($which =~ /^\d+$/)) { if (($relational_db_response = $rdbH->SQL("SELECT id FROM $relation WHERE ( $set_name = $which)")) && (@$relational_db_response >= 1)) { return grep { ! $self->is_deleted_fid($_) } sort { by_fig_id($a,$b) } map { $_->[0] } @$relational_db_response; } } return (); } sub in_sets { my($self,$id,$relation,$set_name) = @_; my($relational_db_response); if ($self->is_deleted_fid($id)) { return () } my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT $set_name FROM $relation WHERE ( id = \'$id\' )")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } sub sz_set { my($self,$which,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT COUNT(*) FROM $relation WHERE ( $set_name = $which)")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return 0; } sub delete_set { my($self,$set,$relation,$set_name) = @_; # print STDERR "deleting set $set\n"; my $rdbH = $self->db_handle; return $rdbH->SQL("DELETE FROM $relation WHERE ( $set_name = $set )"); } sub insert_set { my($self,$set,$ids,$relation,$set_name) = @_; my($id); # print STDERR "inserting set $set containing ",join(",",@$ids),"\n"; my $rdbH = $self->db_handle; my @ids = grep { length($_) < 255 } @$ids; if (@ids < 2) { return 0 } my $rc = 1; foreach $id (@ids) { next if ($self->is_deleted_fid($id)); if (! $rdbH->SQL("INSERT INTO $relation ( $set_name,id ) VALUES ( $set,\'$id\' )")) { $rc = 0; } } # print STDERR " rc=$rc\n"; return $rc; } sub in_set_with { my($self,$peg,$relation,$set_name) = @_; my($set,$id,%in); foreach $set ($self->in_sets($peg,$relation,$set_name)) { foreach $id ($self->ids_in_set($set,$relation,$set_name)) { $in{$id} = 1; } } return sort { &by_fig_id($a,$b) } keys(%in); } sub export_set { my($self,$relation,$set_name,$file) = @_; my($pair); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT $set_name, id FROM $relation"); open(TMPSET,">$file") || die "could not open $file"; flock(TMPSET,LOCK_EX) || confess "cannot lock $file"; seek(TMPSET,0,2) || confess "failed to seek to the end of the file"; foreach $pair (sort { ($a->[0] <=> $b->[0]) or &by_fig_id($a->[1],$b->[1]) } @$relational_db_response) { if (! $self->is_deleted_fid($pair->[1])) { print TMPSET join("\t",@$pair),"\n"; } } close(TMPSET); return 1; } ################################# KEGG Stuff #################################### =pod =head1 all_compounds usage: @compounds = $fig->all_compounds Returns a list containing all of the KEGG compounds. =cut sub all_compounds { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM comp_name"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 names_of_compound usage: @names = $fig->names_of_compound Returns a list containing all of the names assigned to the KEGG compounds. The list will be ordered as given by KEGG. =cut sub names_of_compound { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT pos,name FROM comp_name where cid = \'$cid\'"); if (@$relational_db_response > 0) { return map { $_->[1] } sort { $a->[0] <=> $b->[0] } @$relational_db_response; } return (); } =pod =head1 comp2react usage: @rids = $fig->comp2react($cid) Returns a list containing all of the reaction IDs for reactions that take $cid as either a substrate or a product. =cut sub comp2react { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound where cid = \'$cid\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 cas usage: $cas = $fig->cas($cid) Returns the CAS ID for the compound, if known. =cut sub cas { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cas FROM comp_cas where cid = \'$cid\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 cas_to_cid usage: $cid = $fig->cas_to_cid($cas) Returns the compound id (cid), given the CAS ID. =cut sub cas_to_cid { my($self,$cas) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cid FROM comp_cas where cas = \'$cas\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } =pod =head1 all_reactions usage: @rids = $fig->all_reactions Returns a list containing all of the KEGG reaction IDs. =cut sub all_reactions { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT rid FROM reaction_to_compound"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 reversible usage: $rev = $fig->reversible($rid) Returns true iff the reactions had a "main direction" designated as "<=>"; =cut sub reversible { my($self,$rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT reversible FROM reversible where rid = \'$rid\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return 1; } =pod =head1 reaction2comp usage: @tuples = $fig->reaction2comp($rid,$which) Returns the "substrates" iff $which == 0. In any event (i.e., whether you ask for substrates or products), you get back a list of 3-tuples. Each 3-tuple will contain [$cid,$stoich,$main] Stoichiometry is normally numeric, but can be things like "n" or "(n+1)". $main is 1 iff the compound is considered "main" or "connectable". =cut sub reaction2comp { my($self,$rid,$which) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cid,stoich,main FROM reaction_to_compound where rid = \'$rid\' and setn = \'$which\'"); if (@$relational_db_response > 0) { return sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/\s+//g; $_ } @$relational_db_response; } return (); } =pod =head1 catalyzed_by usage: @ecs = $fig->catalyzed_by($rid) Returns the ECs that are reputed to catalyze the reaction. Note that we are currently just returning the ECs that KEGG gives. We need to handle the incompletely specified forms (e.g., 1.1.1.-), but we do not do it yet. =cut sub catalyzed_by { my($self,$rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT role FROM reaction_to_enzyme where rid = \'$rid\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 catalyzes usage: @ecs = $fig->catalyzes($role) Returns the rids of the reactions catalyzed by the "role" (normally an EC). =cut sub catalyzes { my($self,$role) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_enzyme where role = \'$role\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 displayable_reaction usage: $display_format = $fig->displayable_reaction($rid) Returns a string giving the displayable version of a reaction. =cut sub displayable_reaction { my($self,$rid) = @_; my @tmp = `grep $rid $FIG_Config::data/KEGG/reaction_name.lst`; if (@tmp > 0) { chomp $tmp[0]; return $tmp[0]; } return $rid; } =pod =head1 all_maps usage: @maps = $fig->all_maps Returns a list containing all of the KEGG maps that the system knows about (the maps need to be periodically updated). =cut sub all_maps { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT map FROM ec_map "); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 ec_to_maps usage: @maps = $fig->ec_to_maps($ec) Returns the set of maps that contain $ec as a functional role. $ec is usually an EC number, but in the more general case, it can be a functional role. =cut sub ec_to_maps { my($self,$ec) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT map FROM ec_map WHERE ( ec = \'$ec\' )"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } sub role_to_maps { my($self, $role) = @_; return $self->ec_to_maps($role); } =pod =head1 map_to_ecs usage: @ecs = $fig->map_to_ecs($map) Returns the set of functional roles (usually ECs) that are contained in the functionality depicted by $map. =cut sub map_to_ecs { my($self,$map) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT ec FROM ec_map WHERE ( map = \'$map\' )"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =pod =head1 map_name usage: $name = $fig->map_name($map) Returns the descriptive name covering the functionality depicted by $map. =cut sub map_name { my($self,$map) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT mapname FROM map_name WHERE ( map = \'$map\' )"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } ################################# Functional Roles #################################### =pod =head1 neighborhood_of_role usage: @roles = $fig->neighborhood_of_role($role) Returns a list of functional roles that we consider to be "the neighborhood" of $role. =cut sub neighborhood_of_role { my($self,$role) = @_; my($readC); my $file = "$FIG_Config::global/role.neighborhoods"; my $rdbH = $self->db_handle; my $roleQ = quotemeta $role; my $relational_db_response = $rdbH->SQL("SELECT seek, len FROM neigh_seeks WHERE role = \'$roleQ\' "); if (@$relational_db_response == 1) { my($seek,$ln) = @{$relational_db_response->[0]}; my $fh = $self->openF($file); seek($fh,$seek,0); my $readN = read($fh,$readC,$ln-1); ($readN == ($ln-1)) || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC"; return grep { $_ && ($_ !~ /^\/\//) } split(/\n/,$readC); } return (); } =pod =head1 roles_of_function usage: @roles = $fig->roles_of_function($func) Returns a list of the functional roles implemented by $func. =cut sub roles_of_function { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $func = (@_ == 1) ? $_[0] : $_[1]; return (split(/\s*[\/;]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g)); } =pod =head1 seqs_with_role usage: @pegs = $fig->seqs_with_role($role,$who) Returns a list of the pegs that implement $role. If $who is not given, it defaults to "master". The system returns all pegs with an assignment made by either "master" or $who (if it is different than the master) that implement $role. Note that this includes pegs for which the "master" annotation disagrees with that of $who, the master's implements $role, and $who's does not. =cut sub seqs_with_role { my($self,$role,$who,$genome) = @_; my($relational_db_response,$query); my $roleQ = quotemeta $role; $who = $who ? $who : "master"; my $rdbH = $self->db_handle; my $who_cond; if ($who eq "master") { $who_cond = "( made_by = \'master\' OR made_by = \'unknown\' )"; } else { $who_cond = "( made_by = \'master\' OR made_by = \'$who\' OR made_by = \'unknown\')"; } if (! $genome) { $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond )"; } else { $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond AND (org = \'$genome\'))"; } return (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) ? grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response : (); } =pod =head1 seqs_with_roles_in_genomes usage: $result = $fig->seqs_with_roles_in_genomes($genomes,$roles,$made_by) This routine takes a pointer to a list of genomes ($genomes) and a pointer to a list of roles ($roles) and looks up all of the sequences that connect to those roles according to either the master assignments or those made by $made_by. Again, you will get assignments for which the "master" assignment connects, but the $made_by does not. A hash is returned. The keys to the hash are genome IDs for which at least one sequence was found. $result->{$genome} will itself be a hash, assuming that at least one sequence was found for $genome. $result->{$genome}->{$role} will be set to a pointer to a list of 2-tuples. Each 2-tuple will contain [$peg,$function], where $function is the one for $made_by (which may not be the one that connected). =cut sub seqs_with_roles_in_genomes { my($self,$genomes,$roles,$made_by) = @_; my($genome,$role,$roleQ,$role_cond,$made_by_cond,$query,$relational_db_response,$peg,$genome_cond,$hit); my $rdbH = $self->db_handle; my $result = {}; # foreach $genome ($self->genomes) { $result->{$genome} = {} } if (! $made_by) { $made_by = 'master' } if ((@$genomes > 0) && (@$roles > 0)) { $genome_cond = "(" . join(" OR ",map { "( org = \'$_\' )" } @$genomes) . ")"; $role_cond = "(" . join(" OR ",map { $roleQ = quotemeta $_; "( role = \'$roleQ\' )" } @$roles) . ")"; $made_by_cond = ($made_by eq 'master') ? "(made_by = 'master')" : "(made_by = 'master' OR made_by = '$made_by')"; $query = "SELECT distinct prot, role FROM roles WHERE ( $made_by_cond AND $genome_cond AND $role_cond )"; if (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) { foreach $hit (@$relational_db_response) { ($peg,$role) = @$hit; if (! $self->is_deleted_fid($peg)) { $genome = $self->genome_of($peg); push(@{ $result->{$genome}->{$role} },[$peg,scalar $self->function_of($peg,$made_by)]); } } } } return $result; } =pod =head1 largest_clusters usage: @clusters = $fig->largest_clusters($roles,$user) This routine can be used to find the largest clusters containing some of the designated set of roles. A list of clusters is returned. Each cluster is a pointer to a list of pegs. =cut sub largest_clusters { my($self,$roles,$user,$sort_by_unique_functions) = @_; my($genome,$x,$role,$y,$peg,$loc,$contig,$beg,$end,%pegs,@pegs,$i,$j); my $ss = $self->seqs_with_roles_in_genomes([$self->genomes],$roles,$user); my @clusters = (); foreach $genome (keys(%$ss)) { my %pegs; $x = $ss->{$genome}; foreach $role (keys(%$x)) { $y = $x->{$role}; foreach $peg (map { $_->[0] } @$y) { if ($loc = $self->feature_location($peg)) { ($contig,$beg,$end) = &FIG::boundaries_of($loc); $pegs{$peg} = [$peg,$contig,int(($beg + $end) / 2)]; } } } @pegs = sort { ($pegs{$a}->[1] cmp $pegs{$b}->[1]) or ($pegs{$a}->[2] <=> $pegs{$b}->[2]) } keys(%pegs); $i = 0; while ($i < $#pegs) { for ($j=$i+1; ($j < @pegs) && &close_enough_locs($pegs{$pegs[$j-1]},$pegs{$pegs[$j]}); $j++) {} if ($j > ($i+1)) { push(@clusters,[@pegs[$i..$j-1]]); } $i = $j; } } if ($sort_by_unique_functions) { @clusters = sort { $self->unique_functions($b,$user) <=> $self->unique_functions($a,$user) } @clusters; } else { @clusters = sort { @$b <=> @$a } @clusters; } return @clusters; } sub unique_functions { my($self,$pegs,$user) = @_; my($peg,$func,%seen); foreach $peg (@$pegs) { if ($func = $self->function_of($peg,$user)) { $seen{$func} = 1; } } return scalar keys(%seen); } sub close_enough_locs { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($x,$y) = @_; return (($x->[1] eq $y->[1]) && (abs($x->[2] - $y->[2]) < 5000)); } sub candidates_for_role { my($self,$role,$genome,$cutoff,$user) = @_; my($peg); $user = $user ? $user : "master"; my @cand = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { $peg = $_; [$peg,$self->crude_estimate_of_distance($genome,&FIG::genome_of($peg))] } $self->seqs_with_role($role,$user); return $self->candidates_for_role_from_known($genome,$cutoff,\@cand); } sub candidates_for_role_from_known { my($self,$genome,$cutoff,$known) = @_; my($peg); my @cand = @$known; my $hits = {}; my $seen = {}; my $how_many = (@cand > 10) ? 9 : scalar @cand; &try_to_locate($self,$genome,$hits,[@cand[0..$how_many]],$seen,$cutoff); if (keys(%$hits) == 0) { splice(@cand,0,$how_many+1); &try_to_locate($self,$genome,$hits,\@cand,$seen,$cutoff); } return sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits); } sub try_to_locate { my($self,$genome,$hits,$to_try,$seen,$cutoff) = @_; my($prot,$id2,$psc,$id2a,$x,$sim); if (! $cutoff) { $cutoff = 1.0e-5 } foreach $prot (@$to_try) { if (! $seen->{$prot}) { if (($prot =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome)) { $hits->{$prot} = [0,$prot]; } else { foreach $sim ($self->sims($prot,1000,$cutoff,"raw",0)) { $id2 = $sim->id2; $psc = $sim->psc; foreach $id2a (map { $_->[0] } $self->mapped_prot_ids($id2)) { if (($id2a =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome)) { $x = $hits->{$id2a}; if ((! $x) || ($x->[0] > $psc)) { $hits->{$id2a} = [$sim->psc,$prot]; } } elsif ($psc < 1.0e-20) { { $seen->{$id2a} = 1; } } } } } } } } =pod =head1 best_bbh_candidates usage: @candidates = $fig->best_bbh_candidates($genome,$cutoff,$requested,$known) This routine returns a list of up to $requested candidates from $genome. A candidate is a BBH against one of the PEGs in genomes from the list given by@$known. Each entry in the list is a 3-tuple: [CandidatePEG,KnownBBH,Pscore] =cut sub best_bbh_candidates { my($self,$genome,$cutoff,$requested,$known,$frac_match) = @_; my($i,$j,$k,$sim,@sims,$peg,$id2,$genome2,$sim_back); my($bbh,%seen,%computed_sims,$genome1); $frac_match = defined($frac_match) ? $frac_match : 0.7; my @got = (); my @cand = $self->candidates_for_role_from_known($genome,$cutoff,$known); if (@cand > 0) { my %genomes = map { $genome1 = &FIG::genome_of($_); $genome1 => 1 } @$known; my %pegs = map { $_ => 1 } @$known; for ($i=0; (@got < $requested) && ($i < @cand); $i++) { $peg = $cand[$i]; undef %seen; @sims = grep { $genomes{&FIG::genome_of($_->id2)} } $self->sims($peg,1000,$cutoff,"fig"); $bbh = 0; for ($j=0; (! $bbh) && ($j < @sims); $j++) { $sim = $sims[$j]; $id2 = $sim->id2; $genome2 = &FIG::genome_of($id2); if (! $seen{$genome2}) { if ($pegs{$id2}) { if (! defined($sim_back = $computed_sims{$id2})) { my @sims_back = $self->sims($id2,1000,$cutoff,"fig"); for ($k=0; ($k < @sims_back) && (&FIG::genome_of($sims_back[$k]->id2) ne $genome); $k++) {} if ($k < @sims_back) { $sim_back = $computed_sims{$id2} = $sims_back[$k]; } else { $sim_back = $computed_sims{$id2} = 0; } } if ($sim_back) { if ($self->ok_match($sim_back,$frac_match)) { $bbh = [$id2,$sim_back->psc]; } } } $seen{$genome2} = 1; } } if ($bbh) { push(@got,[$peg,@$bbh]); } } } return @got; } sub ok_match { my($self,$sim,$frac_match) = @_; my $ln1 = $sim->ln1; my $ln2 = $sim->ln2; my $b1 = $sim->b1; my $e1 = $sim->e1; my $b2 = $sim->b2; my $e2 = $sim->e2; return (((($e1 - $b1) / $ln1) >= $frac_match) && ((($e2 - $b2) / $ln2) >= $frac_match)) } sub external_calls { my($self,$pegs) = @_; my($peg,$func); open(TMP,">/tmp/pegs.$$") || die "could not open /tmp/pegs.$$"; foreach $peg (@$pegs) { print TMP "$peg\n"; } close(TMP); open(TMP,">/tmp/parms.$$") || die "could not open /tmp/parms.$$"; print TMP "no_fig\t1\n"; close(TMP); my %call = map { chop; ($peg,$func) = split(/\t/,$_) } `$FIG_Config::bin/auto_assign /tmp/parms.$$ < /tmp/pegs.$$ 2> /dev/null | $FIG_Config::bin/make_calls`; unlink("/tmp/pegs.$$","/tmp/parms.$$"); return map { $call{$_} ? [$_,$call{$_}] : [$_,"hypothetical protein"] } @$pegs; } use SameFunc; sub same_func { my($self,$f1,$f2) = @_; return &SameFunc::same_func($f1,$f2); } ################################# DNA sequence Stuff #################################### =pod =head1 extract_seq usage: $seq = &FIG::extract_seq($contigs,$loc) This is just a little utility routine that I have found convenient. It assumes that $contigs is a hash that contains IDs as keys and sequences as values. $loc must be of the form Contig_Beg_End where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought subsequence. If Beg > End, it is assumed that you want the reverse complement of the subsequence. This routine plucks out the subsequence for you. =cut sub extract_seq { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($contigs,$loc) = @_; my($contig,$beg,$end,$contig_seq); my($plus,$minus); $plus = $minus = 0; my $strand = ""; my @loc = split(/,/,$loc); my @seq = (); foreach $loc (@loc) { if ($loc =~ /^\S+_(\d+)_(\d+)$/) { if ($1 < $2) { $plus++; } elsif ($2 < $1) { $minus++; } } } if ($plus > $minus) { $strand = "+"; } elsif ($plus < $minus) { $strand = "-"; } foreach $loc (@loc) { if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) { ($contig,$beg,$end) = ($1,$2,$3); my $len = length($contigs->{$contig}); if (!$len) { carp "Undefined or zero-length contig $contig"; return ""; } if (($beg > $len) || ($end > $len)) { carp "Region $loc out of bounds (contig len=$len)"; } else { if (($beg < $end) || (($beg == $end) && ($strand eq "+"))) { push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg))); } else { $strand = "-"; push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end)))); } } } } return join("",@seq); } =pod =head1 all_contigs usage: @contig_ids = $fig->all_contigs($genome) Returns a list of all of the contigs occurring in the designated genome. =cut sub all_contigs { my($self,$genome) = @_; my($rdbH,$relational_db_response); $rdbH = $self->db_handle; if (defined($genome)) { if ($relational_db_response = $rdbH->SQL("SELECT DISTINCT contig FROM contig_lengths WHERE ( genome = \'$genome\' )")) { return map { $_->[0] } @$relational_db_response; } } return undef; } =pod =head1 contig_ln usage: $n = $fig->contig_ln($genome,$contig) Returns the length of $contig from $genome. =cut sub contig_ln { my($self,$genome,$contig) = @_; my($rdbH,$relational_db_response); $rdbH = $self->db_handle; if (defined($genome) && defined($contig)) { if (($relational_db_response = $rdbH->SQL("SELECT len FROM contig_lengths WHERE ( genome = \'$genome\' ) and ( contig = \'$contig\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =pod =head1 dna_seq usage: $seq = dna_seq($genome,@locations) Returns the concatenated subsequences described by the list of locations. Each location must be of the form Contig_Beg_End where Contig must be the ID of a contig for genome $genome. If Beg > End the location describes a stretch of the complementary strand. =cut sub dna_seq { my($self,$genome,@locations) = @_; my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH); @locations = map { split(/,/,$_) } @locations; @pieces = (); foreach $loc (@locations) { if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) { ($contig,$beg,$end) = ($1,$2,$3); $ln = $self->contig_ln($genome,$contig); if (! $ln) { print STDERR "$genome/$contig: could not get length\n"; return ""; } if (&between(1,$beg,$ln) && &between(1,$end,$ln)) { if ($beg < $end) { push(@pieces, $self->get_dna($genome,$contig,$beg,$end)); } else { push(@pieces, &reverse_comp($self->get_dna($genome,$contig,$end,$beg))); } } } } return join("",@pieces); } sub get_dna { my($self,$genome,$contig,$beg,$end) = @_; my $relational_db_response; my $rdbH = $self->db_handle; my $indexpt = int(($beg-1)/10000) * 10000; if (($relational_db_response = $rdbH->SQL("SELECT startN,fileno,seek FROM contig_seeks WHERE ( genome = \'$genome\' ) AND ( contig = \'$contig\' ) AND ( indexpt = $indexpt )")) && (@$relational_db_response == 1)) { my($startN,$fileN,$seek) = @{$relational_db_response->[0]}; my $fh = $self->openF($self->N2file($fileN)); if (seek($fh,$seek,0)) { my $chunk = ""; read($fh,$chunk,int(($end + 1 - $startN) * 1.03)); # print STDERR "genome=$genome contig=$contig beg=$beg end=$end startN=$startN chunk=$chunk\n"; $chunk =~ s/\s//g; my $ln = ($end - $beg) + 1; if (length($chunk) >= $ln) { return substr($chunk,(($beg-1)-$startN),$ln); } } } return undef; } ################################# Taxonomy #################################### =pod =head1 taxonomy_of usage: $gs = $fig->taxonomy_of($genome_id) Returns the taxonomy of the specified genome. Gives the taxonomy down to genus and species. =cut sub taxonomy_of :scalar { my($self,$genome) = @_; my($ans); my $taxonomy = $self->cached('_taxonomy'); if (! ($ans = $taxonomy->{$genome})) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy FROM genome"); my $pair; foreach $pair (@$relational_db_response) { $taxonomy->{$pair->[0]} = $pair->[1]; } $ans = $taxonomy->{$genome}; } return $ans; } =pod =head1 is_bacterial usage: $fig->is_bacterial($genome) Returns true iff the genome is bacterial. =cut sub is_bacterial :scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Bacteria/) ? 1 : 0; } =pod =head1 is_archaeal usage: $fig->is_archaeal($genome) Returns true iff the genome is archaeal. =cut sub is_archaeal :scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Archaea/) ? 1 : 0; } =pod =head1 is_prokaryotic usage: $fig->is_prokaryotic($genome) Returns true iff the genome is prokaryotic =cut sub is_prokaryotic :scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/) ? 1 : 0; } =pod =head1 is_eukaryotic usage: $fig->is_eukaryotic($genome) Returns true iff the genome is eukaryotic =cut sub is_eukaryotic :scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Eukaryota/) ? 1 : 0; } =pod =head1 sort_genomes_by_taxonomy usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes) This routine is used to sort a list of genome IDs to put them into taxonomic order. =cut sub sort_genomes_by_taxonomy { my($self,@fids) = @_; return map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [$_,$self->taxonomy_of($_)] } @fids; } =pod =head1 crude_estimate_of_distance usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2) There are a number of places where we need estimates of the distance between two genomes. This routine will return a value between 0 and 1, where a value of 0 means "the genomes are essentially identical" and a value of 1 means "the genomes are in different major groupings" (the groupings are archaea, bacteria, euks, and viruses). The measure is extremely crude. =cut sub crude_estimate_of_distance :scalar { my($self,$genome1,$genome2) = @_; my($i,$v,$d,$dist); if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return $self->crude_estimate_of_distance1($genome1,$genome2); } sub crude_estimate_of_distance1 :scalar { my($self,$genome1,$genome2) = @_; my($i,$v,$d,$dist); if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } $dist = $self->cached('_dist'); if (! $dist->{"$genome1,$genome2"}) { my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1)); my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2)); $d = 1; for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2) { $d -= $v; } $dist->{"$genome1,$genome2"} = $d; } return $dist->{"$genome1,$genome2"}; } =pod =head1 sort_fids_by_taxonomy usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids) Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features. =cut sub sort_fids_by_taxonomy { my($self,@fids) = @_; return map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [$_,$self->taxonomy_of(&genome_of($_))] } @fids; } sub build_tree_of_complete { my($self,$min_for_label) = @_; my(@last,@tax,$i,$prefix,$lev,$genome,$tax); $min_for_label = $min_for_label ? $min_for_label : 10; open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$"; print TMP "1. root\n"; @last = (); foreach $genome (grep { $_ !~ /^99999/ } $self->sort_genomes_by_taxonomy($self->genomes("complete"))) { $tax = $self->taxonomy_of($genome); @tax = split(/\s*;\s*/,$tax); push(@tax,$genome); for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {} while ($i < @tax) { $lev = $i+2; $prefix = " " x (4 * ($lev-1)); print TMP "$prefix$lev\. $tax[$i]\n"; $i++; } @last = @tax; } close(TMP); my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$"); $tree->[0] = 'All'; &limit_labels($tree,$min_for_label); unlink("/tmp/tree$$"); return ($tree,&tips_of_tree($tree)); } sub limit_labels { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($tree,$min_for_label) = @_; my($children) = &tree_utilities::node_pointers($tree); if (@$children == 1) { return 1; } else { my $n = 0; my $i; for ($i=1; ($i < @$children); $i++) { $n += &limit_labels($children->[$i],$min_for_label); } if ($n < $min_for_label) { $tree->[0] = ""; } return $n; } } sub taxonomic_groups_of_complete { my($self,$min_for_labels) = @_; my($tree,undef) = $self->build_tree_of_complete($min_for_labels); return &taxonomic_groups($tree); } sub taxonomic_groups { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($tree) = @_; my($groups,undef) = &taxonomic_groups_and_children($tree); return $groups; } sub taxonomic_groups_and_children { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($tree) = @_; my($ids1,$i,$groupsC,$idsC); my $ptrs = &tree_utilities::node_pointers($tree); my $ids = []; my $groups = []; if (@$ptrs > 1) { $ids1 = []; for ($i=1; ($i < @$ptrs); $i++) { ($groupsC,$idsC) = &taxonomic_groups_and_children($ptrs->[$i]); if (@$groupsC > 0) { push(@$groups,@$groupsC); } push(@$ids1,@$idsC); } if ($tree->[0]) { push(@$groups,[$tree->[0],$ids1]); } push(@$ids,@$ids1); } elsif ($tree->[0]) { push(@$ids,$tree->[0]); } return ($groups,$ids); } ################################# Literature Stuff #################################### sub get_titles_by_gi { my($self,$gi) = @_; &verify_existence_of_literature; $gi =~ s/^gi\|//; my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT pmid,title FROM literature_titles WHERE ( gi = '$gi' ) ")) && (@$relational_db_response > 0)) { return sort { $a->[1] cmp $b->[1] } @$relational_db_response; } else { return (); } } sub get_titles_by_peg { my($self,$peg) = @_; my $gi; &verify_existence_of_literature; my @gis = grep { $_ =~ /^gi\|/ } $self->feature_aliases($peg); if (@gis > 0) { my $relational_db_response; my $rdbH = $self->db_handle; my $constraint = join(" OR ", map { $gi = ($_ =~ /gi\|(\S+)/) ? $1 : $_; "( gi = '$gi' )" } @gis); if (($relational_db_response = $rdbH->SQL("SELECT pmid,title FROM literature_titles WHERE ( $constraint ) ")) && (@$relational_db_response > 0)) { return sort { $a->[1] cmp $b->[1] } @$relational_db_response; } else { return (); } } return (); } sub get_title_by_pmid { my($self,$pmid) = @_; &verify_existence_of_literature; $pmid =~ s/^.*\|//; my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT title FROM literature_titles WHERE ( pmid = '$pmid' ) ")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } else { return ""; } } sub verify_existence_of_literature { if (! -d "$FIG_Config::global/Literature") { mkdir("$FIG_Config::global/Literature",0777); system "touch $FIG_Config::global/Literature/gi_pmid_title"; system "$FIG_Config::bin/load_literature"; } } ################################# Subsystems #################################### sub exportable_subsystem { my($self,$ssa) = @_; my(%seqs,@genomes); my $spreadsheet = []; my $notes = []; $ssa =~ s/[ \/]/_/g; if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet")) { my $version = $self->subsystem_version($ssa); my $exchangable = $self->is_exchangable_subsystem($ssa); push(@$spreadsheet,"$ssa\n$version\n$exchangable\n"); my @curation; if (-s "$FIG_Config::data/Subsystems/$ssa/curation.log") { @curation = `head -1 \"$FIG_Config::data/Subsystems/$ssa/curation.log\"`; } else { @curation = ("0000000000\tmaster:unknown\tstarted\n"); } push(@$spreadsheet,$curation[0],"//\n"); while (defined($_ = ) && ($_ !~ /^\/\//)) { push(@$spreadsheet,$_); } push(@$spreadsheet,"//\n"); while (defined($_ = ) && ($_ !~ /^\/\//)) { push(@$spreadsheet,$_); } push(@$spreadsheet,"//\n"); while (defined($_ = )) { push(@$spreadsheet,$_); chomp; my @flds = split(/\t/,$_); my $genome = $flds[0]; push(@genomes,$genome); my($i,$id); for ($i=2; ($i < @flds); $i++) { if ($flds[$i]) { my @entries = split(/,/,$flds[$i]); foreach $id (@entries) { $seqs{"fig\|$genome\.peg.$id"} = 1; } } } } push(@$spreadsheet,"//\n"); my $peg; foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs)) { my @aliases = grep { $_ =~ /^(sp\||gi\||pirnr\||kegg\||N[PGZ]_)/ } $self->feature_aliases($peg); my $alias_txt = join(",",@aliases); my $gs_txt = $self->genus_species($self->genome_of($peg)); my $func_txt = scalar $self->function_of($peg); push(@$spreadsheet, join("\t", ($peg, $alias_txt, $gs_txt, $func_txt)) . "\n"); } push(@$spreadsheet,"//\n"); foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs)) { my $aliases = $self->feature_aliases($peg); my $seq = $self->get_translation($peg); push(@$spreadsheet,">$peg $aliases\n"); my($i,$ln); $ln = length($seq); for ($i=0; ($i < $ln); $i += 60) { if (($ln - $i) > 60) { push(@$spreadsheet,substr($seq,$i,60) . "\n"); } else { push(@$spreadsheet,substr($seq,$i) . "\n"); } } } close(SSA); push(@$spreadsheet,"//\n"); if (open(NOTES,"<$FIG_Config::data/Subsystems/$ssa/notes")) { while (defined($_ = )) { push(@$notes,$_); } close(NOTES); } if ($notes->[$#{$notes}] ne "\n") { push(@$notes,"\n") } push(@$notes,"//\n"); } return ($spreadsheet,$notes); } sub is_exchangable_subsystem :scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $ssa = (@_ == 1) ? $_[0] : $_[1]; $ssa =~ s/[ \/]/_/g; if (open(TMP,"<$FIG_Config::data/Subsystems/$ssa/EXCHANGABLE")) { my $line; $line = ; if ($line && ($line =~ /^(\S+)/) && $1) { return 1; } close(TMP); } return 0; } sub all_exchangable_subsystems { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my @exchangable = (); if (opendir(SUB,"$FIG_Config::data/Subsystems")) { push(@exchangable,grep { ($_ !~ /^\./) && &is_exchangable_subsystem($_) } readdir(SUB)); closedir(SUB); } return @exchangable; } sub all_subsystems { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my @subsystems = (); if (opendir(SUB,"$FIG_Config::data/Subsystems")) { push(@subsystems,grep { ($_ !~ /^\./) } readdir(SUB)); closedir(SUB); } return @subsystems; } sub all_constructs { my($self) = @_; my @subsystems = (); if (opendir(SUB,"$FIG_Config::data/Subsystems")) { push(@subsystems,grep { ($_ !~ /^\./) } readdir(SUB)); closedir(SUB); } my @c; for my $subname (@subsystems) { $subname =~ s/[ \/]/_/g; my $cfile = "$FIG_Config::data/Subsystems/$subname/constructs"; if (-f $cfile) { my $sub = $self->get_subsystem($subname); my @a = Construct::parse_constructs_file($cfile, $sub); my $l = []; for my $con (@a) { my($cname, $list) = @$con; my $nreqs = []; for my $req (@$list) { if ($req->[0] eq 'R') { push(@$nreqs, ['R', $req->[2]]); } else { push(@$nreqs, $req); } } push(@$l, [$cname, $nreqs]); } push(@c, [$subname, $l]); } } return @c; } sub subsystem_version :scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $ssa = (@_ == 1) ? $_[0] : $_[1]; $ssa =~ s/[ \/]/_/g; if (open(VER,"<$FIG_Config::data/Subsystems/$ssa/VERSION")) { my $ver = ; close(VER); if ($ver =~ /^(\S+)/) { return $1; } } return 0; } =pod =head1 subsystem_curator usage: $curator = $fig->subsystem_curator($subsystem_name) Return the curator of a subsystem. =cut sub subsystem_curator :scalar { my($self, $ssa) = @_; my($who) = ""; $ssa =~ s/[ \/]/_/g; if (open(DATA,"<$FIG_Config::data/Subsystems/$ssa/curation.log")) { $_ = ; if ($_ =~ /^\d+\t(\S+)\s+started/) { $who = $1; } close(DATA); } return $who; } =pod =head1 subsystem_info usage: ($version, $curator, $pedigree, $roles) = $fig->subsystem_info($subsystem_name) Return information about the given subsystem. $roles is a list of tuples (abbrev, name). =cut sub subsystem_info { my($self,$ssa) = @_; my($version, $curator, $pedigree, $roles);; $ssa =~ s/[ \/]/_/g; $roles = []; $version = $self->subsystem_version($ssa); $curator = $self->subsystem_curator($ssa); if (open(CUR, "<$FIG_Config::data/Subsystems/$ssa/curation.log")) { local($/); $pedigree = ; } if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet")) { # # The spreadsheet appears to be of the form # # role-abbr role name # ... # // # Something about subsets # // # genome-id spreadsheet-info # local $/ = "//"; my($chunk); if (defined($chunk = )) { for $_ (split(/\n/, $chunk)) { chomp; if (/^(\S+)\s+(.*)\s*$/) { push(@$roles, [$1, $2]); } } } close(SSA); } return ($version, $curator, $pedigree, $roles); } =pod =head1 subsystem_genomes usage: $genomes) = $fig->subsystem_genomes($subsystem_name) Return the list of genomes in the subsystem. $genomes is a list of tuples (genome_id, name) =cut sub subsystem_genomes :scalar { my($self,$ssa,$all) = @_; my($genomes); $ssa =~ s/[ \/]/_/g; $genomes = []; if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet")) { # # The spreadsheet appears to be of the form # # role-abbr role name # ... # // # Something about subsets # // # genome-id spreadsheet-info # local $/ = "//"; my($chunk); if (defined($chunk = )) { } if (defined($chunk = )) { } local $/ = "\n"; while () { chomp; s/^\s*//; s/\s*$//; next if $_ eq ""; if (($_ =~ /^(\d+\.\d+)\s+(\S+)/) && ($all || $2)) { my $genome = $1; my $name = $self->genus_species($genome); push(@$genomes, [$genome, $name]); } } close(SSA); } return $genomes; } # # @pegs = $fig->pegs_in_subsystem_cell($genome,$role) # @roles = $fig->subsystem_to_roles($subsystem) # @maps = $fig->role_to_maps($role) # @subsystems = $fig->peg_to_subsystems($peg); sub get_subsystem :scalar { my($self, $subsystem, $force_load) = @_; my $sub; $subsystem =~ s/[ \/]/_/g; my $cache = $self->cached('_Subsystems'); if ($force_load || !($sub = $cache->{$subsystem})) { $sub = new Subsystem($subsystem, $self); $cache->{$subsystem} = $sub if $sub; } return $sub; } sub subsystem_to_roles { my($self, $subsystem) = @_; $subsystem =~ s/[ \/]/_/g; my $sub = $self->get_subsystem($subsystem); return () unless $sub; return $sub->get_roles(); } sub pegs_in_subsystem_cell { my($self, $subsystem, $genome, $role) = @_; $subsystem =~ s/[ \/]/_/g; my $sub = $self->get_subsystem($subsystem); return undef unless $sub; return grep { ! $self->is_deleted_fid($_) } $sub->get_pegs_from_cell($genome, $role); } sub get_clearinghouse :scalar { my($self, $url) = @_; if (defined($self->{_clearinghouse})) { return $self->{_clearinghouse}; } if (!$ClearinghouseOK) { warn "Error: Clearinghouse code not available.\n"; return undef; } if ($url eq "") { $url = "http://www-unix.mcs.anl.gov/~olson/SEED/api.cgi"; } my $ch = new Clearinghouse($url); $self->{_clearinghouse} = $ch; return $ch; } sub publish_subsystem_to_clearinghouse { my ($self, $ssa, $url) = @_; my ($id, $token); $ssa =~ s/[ \/]/_/g; my $ch = $self->get_clearinghouse($url); if (!defined($ch)) { warn "Cannot publish: clearinghouse not available\n"; return undef; } my($version, $curator, $pedigree, $roles) = $self->subsystem_info($ssa); my $genomes = $self->subsystem_genomes($ssa); my @genome_names = (); for my $g (@$genomes) { push(@genome_names, $g->[1]); } my $seed_id = $self->get_seed_id(); my $time = int(time()); print "publishing: ss=$ssa version=$version time=$time curator=$curator seed_id=$seed_id\n"; my $ret = $ch->publish_subsystem($ssa, $version, $time, $curator, $pedigree, $seed_id, $roles, \@genome_names); ($id, $token, $url) = @$ret; print "Got id $id token $token url $url\n"; # # Retrieve the package # print "Packaging...\n"; my($spreadsheet, $notes) = $self->exportable_subsystem($ssa); my $package = join("", @$spreadsheet, @$notes); print "Sending...\n"; $ch->upload_subsystem_package($url, $package); return 1; } # # Return the list of subsystems this peg appears in. # Each entry is a pair [subsystem, role]. # =head1 subsystems_for_peg Return the list of subsystems and roles that this peg appears in. Returns an array. Each item in the array is a reference to a tuple of subsystem and role. =cut sub subsystems_for_peg { my($self, $peg) = @_; if ($self->is_deleted_fid($peg)) { return () } ($peg =~ /^fig\|\d+\.\d+\.peg\.\d+$/) or return; my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'"; if (my $relational_db_response = $rdbH->SQL($q)) { my %seen; my @in; my $pair; foreach $pair (@$relational_db_response) { my $key = join("\t",@$pair); if (! $seen{$key}) { push(@in,$pair); $seen{$key} = 1; } } return @in; } else { return (); } } =head1 subsystem_roles Return a list of all roles present in locally-installed subsystems. The return is a hash keyed on role name with each value a list of subsystem names. =cut sub subsystem_roles { my($self) = @_; my $rdbH = $self->db_handle; my $q = "SELECT distinct subsystem, role FROM subsystem_index"; my $ret = {}; if (my $relational_db_response = $rdbH->SQL($q)) { foreach my $pair (@$relational_db_response) { my($subname, $role) = @$pair; push(@{$ret->{$role}}, $subname); } } return $ret; } # # Return just the list of subsystems the peg appears in. # sub peg_to_subsystems { my($self, $peg) = @_; if ($self->is_deleted_fid($peg)) { return () } my %in = map { $_->[0] => 1 } $self->subsystems_for_peg($peg); return sort keys(%in); } sub write_subsystem_spreadsheet { my($self,$ssa,$roles,$genomes,$pegs_in_cells) = @_; my(@genomes,$genome,$role,@pegs,$pair,$gs); $ssa =~ s/[ \/]/_/g; &verify_dir("$FIG_Config::data/Subsystems/$ssa"); open(SSA,">$FIG_Config::data/Subsystems/$ssa/spreadsheet") || die "Cannot open $FIG_Config::data/Subsystems/$ssa/spreadsheet"; foreach $pair (@$roles) { print SSA join("\t",@$pair),"\n"; } print SSA "//\n"; print SSA "All\n\nAll\n//\n"; @genomes = map { $_->[1] } sort { ($a->[0] cmp $b->[0]) or ($a->[1] <=> $b->[1]) } map {$genome = $_; $gs = $self->genus_species($genome); [$gs,$genome] } @$genomes; foreach $genome (@genomes) { print SSA "$genome\t0"; foreach $role (@$roles) { $_ = $pegs_in_cells->{"$genome\t$role->[1]"}; @pegs = $_ ? sort { &by_fig_id($a,$b) } @{$_} : (); print SSA "\t",join(",",map { $_ =~ /^fig\|\d+\.\d+\.peg\.(\d+)/; $1 } @pegs); } print SSA "\n"; } close(SSA); chmod(0777,"$FIG_Config::data/Subsystems/$ssa"); } ################################# PEG Translation #################################### sub translate_pegs { my($self,$pegs,$seq_of, $cb) = @_; my($seq,$aliases,$pegT,%to,%sought,@keys,$peg,$alias); $cb = sub {} unless ref($cb) eq "CODE"; my $tran_peg = {}; my $n = scalar keys (%$pegs); my $idx = 0; foreach $peg (keys(%$pegs)) { $idx++; &$cb("$idx of $n") if $idx % 100 == 0; # # First, see if the peg of the same name locally has the same # last 10 chars. # if (($seq = $self->get_translation($peg)) && (length($seq) > 10) && (length($seq_of->{$peg}) > 10) && (uc substr($seq,-10) eq substr($seq_of->{$peg},-10))) { $tran_peg->{$peg} = $peg; } else { # # Otherwise, search for a local peg that has the same alias # as this peg. (Canonicalize based on the original source) # ($aliases,undef,undef) = @{$pegs->{$peg}}; undef %to; foreach $alias (split(/,/,$aliases)) { if ($pegT = $self->by_alias($alias)) { $to{$pegT}++; } } # # If we have a unique answer, we are done. # Otherwise mark this one as needing more search. # if ((@keys = keys(%to)) == 1) { $tran_peg->{$peg} = $keys[0]; } else { $sought{$peg} = 1; } } } if ((scalar keys(%sought)) > 0) { &tough_search($self,$pegs,$seq_of,$tran_peg,\%sought); } return $tran_peg; } =pod =head1 tough_search($pegs, $seq_of, $tran_peg, $sought) $pegs - not used $seq_of - hash from peg to peg sequence $tran_peg - hash into which translated pegs are placed $sought - hash keyed on the list of pegs we're looking for. =cut sub tough_search { my($self,$pegs,$seq_of,$tran_peg,$sought) = @_; my($peg,$seq,%needH,%needT,%poss,$id,$sub,$to,$x,$genome); # # Construct %needT, key is 50-bases from tail of sequence, values are pegs from # the list of pegs we're seeking. # # %needH is the same, but keyed on 50 bases from the head of the sequence. # foreach $peg (keys(%$sought)) { if (($seq = $seq_of->{$peg}) && (length($seq) > 50)) { my $sub = substr($seq,-50); push(@{$needT{$sub}},$peg); $sub = substr($seq,0,50); push(@{$needH{$sub}},$peg); } } # print STDERR &Dumper(\%needT,\%needH); open(NR,"<$FIG_Config::global/nr") || die "could not open $FIG_Config::global/nr"; $/ = "\n>"; while (defined($_ = )) { chomp; if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s) { $id = $1; $seq = $2; if (length($seq) >= 50) { $sub = uc substr($seq,-50); if ((($x = $needT{uc substr($seq,-50)}) && (@$x == 1)) || (($x = $needH{uc substr($seq,0,50)}) && (@$x == 1))) { $peg = $x->[0]; my @same = grep { $_ =~ /^fig/ } map { $_->[0] } $self->mapped_prot_ids($id); if (@same > 0) { push(@{$poss{$peg}},@same); } } } } } close(NR); $/ = "\n"; # print STDERR &Dumper(\%poss); foreach $peg (keys(%poss)) { # print STDERR "processing $peg\n"; $x = $poss{$peg}; if (@$x == 1) { $tran_peg->{$peg} = $x->[0]; delete $sought->{$peg}; } elsif ($genome = $self->probable_genome($self->genome_of($peg),$tran_peg)) { # print STDERR " mapped to genome $genome\n"; my $genomeQ = quotemeta $genome; my @tmp = grep { $_ =~ /^fig\|$genomeQ\./ } @$x; if (@tmp == 1) { $tran_peg->{$peg} = $tmp[0]; delete $sought->{$peg}; } else { # print STDERR &Dumper(["failed",$peg,$genome,\@tmp]); } } else { # print STDERR "could not place genome for $peg\n"; } } foreach $peg (keys(%$sought)) { print STDERR "failed to map $peg\n"; } } sub probable_genome { my($self,$genome,$tran_peg) = @_; my($peg,%poss,@poss); my $genomeQ = quotemeta $genome; foreach $peg (grep { $_ =~ /^fig\|$genomeQ\./ } keys(%$tran_peg)) { $poss{$self->genome_of($tran_peg->{$peg})}++; } @poss = sort { $poss{$b} <=> $poss{$a} } keys(%poss); if ((@poss == 1) || ((@poss > 1) && ($poss{$poss[0]} > $poss{$poss[1]}))) { return $poss[0]; } else { # print STDERR &Dumper(["could not resolve",\%poss,$genome]); return undef; } } =pod =head1 find_genome_by_content Find a genome given the number of contigs, number of nucleotides, and checksum. We pass in a potential name for the genome as a quick starting check. =cut sub find_genome_by_content { my($self, $genome, $n_contigs, $n_nucs, $cksum) = @_; my(@genomes); my $gbase = (split(/\./, $genome))[0]; # # Construct the list of genomes so that we first check ones with the same # base-part as the one passed in. # for ($self->genomes()) { if (/^$gbase\./) { unshift(@genomes, $_); } else { push(@genomes, $_); } } for my $genome (@genomes) { if (open(my $cfh, "<$FIG_Config::organisms/$genome/COUNTS")) { if ($_ = <$cfh>) { my($cgenome, $cn_contigs, $cn_nucs, $ccksum) = split(/\t/); if ($cgenome eq $genome and $cn_contigs == $n_contigs and $cn_nucs == $n_nucs and $ccksum == $cksum) { return $genome; } } close($cfh); } } return undef; } ################################# Support for PEG Links #################################### sub peg_links { my($self,$fid) = @_; return $self->fid_links($fid); } sub fid_links { my($self,$fid) = @_; my($i,$got,$genome,$fidN); if ($self->is_deleted_fid($fid)) { return () } my @links = (); my @aliases = $self->feature_aliases($fid); my $link_file; for $link_file (("$FIG_Config::global/fid.links","$FIG_Config::global/peg.links")) { if (open(GLOBAL,"<$link_file")) { while (defined($_ = )) { chop; my($pat,$link) = split(/\t/,$_); for ($i=0,$got=0; (! $got) && ($i < @aliases); $i++) { if ($aliases[$i] =~ /$pat/) { push(@links,eval "\"$link\""); $got = 1; } } } close(GLOBAL); } } my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT link FROM fid_links WHERE ( fid = \'$fid\' )")) && (@$relational_db_response > 0)) { push(@links, map { $_->[0] } @$relational_db_response); } return sort { $a =~ /\>([^\<]+)\<\/a\>/; my $l1 = $1; $b =~ /\>([^\<]+)\<\/a\>/; my $l2 = $1; $a cmp $b } @links; } # Each link is a 2-tuple [fid,link] sub add_peg_links { my($self,@links) = @_; return $self->add_fid_links(@links); } sub add_fid_links { my($self,@links) = @_; my($fid,$link,$pair,$i); my $relational_db_response; my $rdbH = $self->db_handle; foreach $pair (@links) { ($fid,$link) = @$pair; if (($relational_db_response = $rdbH->SQL("SELECT link FROM fid_links WHERE ( fid = \'$fid\' )")) && (@$relational_db_response > 0)) { for ($i=0; ($i < @$relational_db_response) && ($relational_db_response->[$i]->[0] ne $link); $i++) {} if ($i == @$relational_db_response) { &add_fid_link($self,$fid,$link); } } else { &add_fid_link($self,$fid,$link); } } } sub add_fid_link { my($self,$fid,$link) = @_; if ($self->is_deleted_fid($fid)) { return } my $rdbH = $self->db_handle; ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid"; my $type = $1; $rdbH->SQL("INSERT INTO fid_links ( fid,link ) VALUES ( \'$fid\',\'$link\' )"); if (($fid =~ /^fig\|(\d+\.\d+)\.fid\.\d+$/) && open(TMP,">>$FIG_Config::organisms/$1/Features/$type/links")) { print TMP "$fid\t$link\n"; close(TMP); chmod 0777,"$FIG_Config::organisms/$1/Features/$type/links"; } } sub delete_peg_link { my($self,$peg,$link) = @_; $self->delete_fid_link($peg,$link); } sub delete_fid_link { my($self,$fid,$link) = @_; my($i); if ($self->is_deleted_fid($fid)) { return } my $genome = $self->genome_of($fid); ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid"; my $type = $1; my $rdbH = $self->db_handle; $rdbH->SQL("DELETE FROM fid_links WHERE ( fid = \'$fid\' AND link = \'$link\' )"); my $file; foreach $file (("$FIG_Config::organisms/$genome/Features/$type/$type.links","$FIG_Config::organisms/$genome/Features/$type/links")) { if (-s $file) { my @links = `cat $file`; for ($i=0; ($i < @links) && (! (($links[$i] =~ /^(\S+)\t(\S.*\S)/) && ($1 eq $fid) && ($2 eq $link))); $i++) {} if (($i < @links) && open(TMP,">$file")) { splice(@links,$i,1); print TMP join("",@links); close(TMP); } } } } sub delete_all_peg_links { my($self,$peg) = @_; $self->delete_all_fid_links($peg); } sub delete_all_fid_links { my($self,$fid) = @_; my($i); if ($self->is_deleted_fid($fid)) { return } my $genome = $self->genome_of($fid); my $rdbH = $self->db_handle; $rdbH->SQL("DELETE FROM fid_links WHERE ( fid = \'$fid\' )"); ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid"; my $type = $1; my $file; foreach $file (("$FIG_Config::organisms/$genome/Features/$type/$type.links","$FIG_Config::organisms/$genome/Features/$type/links")) { if (-s $file) { my @links = `cat $file`; my @links1 = grep { ! (($_ =~ /^(\S+)/) && ($1 eq $fid)) } @links; if ((@links1 < @links) && open(TMP,">$file")) { print TMP join("",@links1); close(TMP); } } } } ########### # # Some routines for dealing with peg search and similarities. # # This is code lifted from pom.cgi and reformatted for more general use. # # # Find the given role in the given (via CGI params) organism. # # We do this by finding a list of pegs that are annotated to have # this role in other organisms that are "close enough" to our organism # # We then find pegs in this organism that are similar to # these pegs. # sub find_role_in_org { my($self,$role, $org, $user, $sims_cutoff) = @_; my($id2,$psc,$col_hdrs,$tab,$peg,$curr_func,$id2_func); my($seen,$peg); if (!$org) { return undef; } # # Create a list of candidates. # # These are the list of sequences that contain the given role, # sorted by the crude_estimate_of_distance from the given peg. # my @cand = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { $peg = $_; [$peg,$self->crude_estimate_of_distance($org,&FIG::genome_of($peg))] } $self->seqs_with_role($role,$user); my $hits = {}; $seen = {}; # # Pick the top 10 hits if there are more than 10. # my $how_many0 = ((@cand > 10) ? 10 : scalar @cand) - 1; $self->try_to_locate($org,$hits,[@cand[0..$how_many0]],$seen, $sims_cutoff); if (keys(%$hits) == 0) { splice(@cand,0,$how_many0+1); &try_to_locate($self,$org,$hits,\@cand,$seen, $sims_cutoff); } # # At this point %$hits contains the pegs in our organism that # may have the given role. The key is the peg, the value # is a pair [score, similar-peg] # # # We reformat this into a list of entries # [ $psc, $peg-in-this-org, $length, $current-fn, $matched-protein, $matched-len, $matched-fun] # $col_hdrs = ["P-Sc","PEG","Ln1","Current Function", "Protein Hit","Ln2","Function"]; my @ret; foreach $peg ( sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits)) { ($psc,$id2) = @{$hits->{$peg}}; $curr_func = $self->function_of($peg,$user); $id2_func = $self->function_of($id2,$user); push(@ret, [$psc, $peg, $self->translation_length($peg), $curr_func, $id2, $self->translation_length($id2),$id2_func]); } return @ret; } # # Mark in $hits the pegs in $org that are similar to # pegs in other organisms that have the given role. # sub try_to_locate { my($self,$org,$hits,$to_try,$seen, $cutoff) = @_; my($prot,$id2,$psc,$id2a,$x,$sim); if (! $cutoff) { $cutoff = 1.0e-5 } # # @$to_try is a list of pegs # foreach $prot (@$to_try) { # # If we've not looked at it before ... # if (! $seen->{$prot}) { # # Retrieve the top 1000 sims for this peg. raw # means don't expand. # foreach $sim ($self->sims($prot,1000,$cutoff,"raw",0)) { $id2 = $sim->id2; $psc = $sim->psc; # # Retrieve the proteins that the sims map to. # foreach $id2a (map { $_->[0] } $self->mapped_prot_ids($id2)) { # # If it's a protein in the organism we're looking in, # and if it's a better hit than the hit we had before, # mark it in $hits->{id} with the score and the # protein id. # if (($id2a =~ /^fig\|(\d+\.\d+)/) && ($1 eq $org)) { $x = $hits->{$id2a}; if ((! $x) || ($x->[0] > $psc)) { $hits->{$id2a} = [$sim->psc,$prot]; } } # # Otherwise, mark it as having been seen if the score is good enough. # elsif ($psc < 1.0e-20) { { $seen->{$id2a} = 1; } } } } } } } # # Background job support. # # If one wants to turn a script into a background, invoke # $fig->run_in_background($coderef). This will cause $coderef to be invoked # as a background job. This means its output will be written to # $FIG_Config::data/Global/background_jobs/, and that # it shows up and is killable via the seed control panel. # sub run_in_background { my($self, $coderef, $close_fds) = @_; if (ref($coderef) ne "CODE") { warn "FIG::run_in_background() invoked without a code reference\n"; return; } my $job_dir = "$FIG_Config::data/Global/background_jobs"; verify_dir($job_dir); my $child = fork; if (!defined($child)) { die "run_in_background: fork failed: $!\n"; } if ($child == 0) { # # In the child process. # POSIX::setsid(); my $d = $self->db_handle(); if ($d) { my $dbh = $d->{_dbh}; $dbh->{InactiveDestroy} = 1; } if ($close_fds) { for (my $fd = 3; $fd < 32; $fd++) { POSIX::close($fd); } } my $my_job_dir = "$job_dir/$$"; verify_dir($my_job_dir); open(my $info, ">$my_job_dir/INFO"); my $now = asctime(localtime(time())); chomp($now); print $info "Background job $$ created from run_in_background by $> on $now\n"; close($info); # # Redirect stdout/stderr to the OUTPUT file. # close(STDOUT); close(STDERR); open STDOUT, ">$my_job_dir/OUTPUT"; open STDERR, ">&STDOUT"; select(STDERR); $| = 1; select(STDOUT); $| = 1; # # Make stdin be from nowhere. # open STDIN, "$my_job_dir/STATUS"); print $stat "Job started at $now\n"; close($stat); eval { &$coderef; }; open(my $stat, ">$my_job_dir/STATUS"); if ($@ eq "") { print $stat "Finished successfully\n"; } else { print $stat "Code had an error:\n$@\n"; } close($stat); # # We need to undef this, otherwise the DBrtns destructor # will do an explicit dbh->disconnect, which will undo any # effect of the InactiveDestroy set above. # my $d = $self->db_handle(); if ($d) { delete $d->{_dbh}; } exit; } return $child; } sub get_all_jobs :list { my($self) = @_; my $job_dir = "$FIG_Config::data/Global/background_jobs"; opendir(my $jfh, $job_dir); my @jobs = grep { $_ =~ /^\d+$/ } readdir($jfh); closedir($jfh); return @jobs; } sub get_job :scalar { my($self, $job_id) = @_; my $job_dir = "$FIG_Config::data/Global/background_jobs/$job_id"; if (-d $job_dir) { return FIG::Job->new($job_id, $job_dir); } else { return undef; } } sub get_current_arch :scalar { my $arch; open(FH, "<$FIG_Config::fig_disk/config/fig-user-env.sh"); while () { chomp; if (/^RTARCH=\"(.*)\"/) { $arch = $1; last; } } return $arch; } ############################### Interfaces to Other Systems ###################################### # This section contains the functionality introduced by the interface with GenDB. The initial # two functions simply register when GenDB has a version of the genome (so we can set links # to it when displaying PEGS: # =pod =head1 has_genome(System,Genome) usage: has_genome("GenDB",$genome) Invoking this routine just records that GenDB has a copy of the genome designated by $genome. =cut sub has_genome { my($system,$genome) = @_; print STDERR "$system has $genome\n"; } =pod =head1 dropped_genome(System,Genome) usage: dropped_genome("GenDB",$genome) Invoking this routine just records that GenDB should no longer be viewed as having a copy of the genome designated by $genome. =cut sub dropped_genome { my($system,$genome) = @_; print STDERR "$system dropped $genome\n"; } =pod =head1 link_to_system(System,FID) usage: $url = link_to_system("GenDB",$fid) # usually $fid is a peg, but it can be other types of features, as well This routine is used to get a URL that can be used to "flip" from one system to the other. If the feature is unknown to the system, undef should be returned. =cut sub link_to_system { my($system,$fid) = @_; return undef; } ############################### Adding, Deleting, Altering Features #################### # The following routines support alteration of features =pod =head1 delete_feature(FeatureID) usage: $fig->delete_feature($fid) Invoking this routine deletes the feature designated by $fid. =cut sub delete_feature { my($self,$fid) = @_; open(TMP,">>$FIG_Config::global/deleted.features") || die "could not open $FIG_Config::global/deleted.features"; flock(TMP,LOCK_EX) || confess "cannot lock deleted.features"; print TMP "$fid\n"; close(TMP); chmod 0777, "$FIG_Config::global/deleted.features"; $self->{_deleted_fids} = undef; } =pod =head1 add_feature(Genome,Type,Location,Aliases,Translation) usage: $fid = $fig->add_feature($genome,$type,$location,$aliases,$translation) Invoking this routine adds the feature, returning the new (generated) $fid. $translation is optionally and only applies to PEGs. $genome is the genome ID $type is the feature type. For now, "peg" or "rna" $location is of the form "Loc1,Loc2,...LocN" where each Loci (often just 1) if of the form Contig_Beg_End $aliases is of the form "Alias1,Alias2,...AliasN" $translation is just a string of amino acids Upon success the new feature ID is returned. On failure, undef is returned. =cut sub add_feature { my($self,$genome,$type,$location,$aliases,$translation) = @_; my $dbh = $self->db_handle(); $aliases = $aliases ? $aliases : ""; my $aliasesT = $aliases; $aliasesT =~ s/,/\t/g; my @aliases = split(/\t/,$aliasesT); my $fid = $self->next_fid($genome,$type); &add_tbl_entry($fid,$location,$aliasesT); if (($type eq "peg") and $translation) { $self->add_translation($fid,$translation); } my @loc = split(/,/,$location); my($contig,$beg,$end); if (($loc[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($location =~ /(\d+)$/)) { $end = $1; if ($beg > $end) { ($beg,$end) = ($end,$beg) } $fid =~ /(\d+)$/; my $fidN = $1; if ((length($location) < 5000) && (length($contig) < 96) && (length($fid) < 32) && ($fid =~ /(\d+)$/)) { $dbh->SQL("INSERT INTO features (id,idN,type,genome,location,contig,minloc,maxloc,aliases) VALUES ('$fid',$fidN,'$type','$genome','$location','$contig',$beg,$end,'$aliases')"); if (@aliases > 0) { my $alias; foreach $alias (@aliases) { if ($alias =~ /^(NP_|gi\||sp\|\tr\||kegg\||uni\|)/) { $dbh->SQL("INSERT INTO ext_alias (id,alias,genome) VALUES ('$fid','$alias','$genome')"); } } } return $fid; } } return undef; } sub next_fid { my($self,$genome,$type) = @_; my $dbh = $self->db_handle(); my $res = $dbh->SQL("select max(idN) from features where (genome = '$genome' and type = '$type')"); return undef unless $res; my $fidN = $res->[0]->[0] + 1; while ($self->is_deleted_fid("fig\|$genome\.$type\.$fidN")) { $fidN++; } return "fig\|$genome\.$type\.$fidN"; } sub is_deleted_fid { my($self,$fid) = @_; my($x,$y); if (! ($x = $self->{_deleted_fids})) { $self->{_deleted_fids} = {}; if (open(TMP,"<$FIG_Config::global/deleted.features")) { while ($y = ) { if ($y =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/) { $self->{_deleted_fids}->{$1} = 1; } } close(TMP); } $x = $self->{_deleted_fids}; } return $x->{$fid}; } sub fid_with_changed_location { my($self,$fid) = @_; my($x); if (! ($x = $self->{_changed_location_fids})) { $self->{_changed_location_fids} = {}; if (open(TMP,"<$FIG_Config::global/changed.location.features")) { while ($_ = ) { if ($_ =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/) { $self->{_changed_location_fids}->{$1}++; } } close(TMP); } $x = $self->{_changed_location_fids}; } return $x->{$fid}; } =pod =head1 change_location_of_feature(FeatureID,Location,Translation) usage: $fig->change_location_of_feature($fid,$location,$translation) Invoking this routine changes the location of the feature. The $translation argument is optional (and applies only to PEGs). The routine returns 1 on success and 0 on failure. =cut sub change_location_of_feature { my($self,$fid,$location,$translation) = @_; my($x); if ($self->is_deleted_fid($fid)) { return 0 } my $dbh = $self->db_handle(); my $genome = &genome_of($fid); my $type = &ftype($fid); my($got) = 0; my @loc = split(/,/,$location); my($contig,$beg,$end); if (($loc[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($location =~ /(\d+)$/)) { $end = $1; if ($beg > $end) { ($beg,$end) = ($end,$beg) } } else { return 0; } my @tmp = grep { ($_ =~ /^(\S+)/) && ($1 eq $fid) } `grep '$fid' $FIG_Config::organisms/$genome/Features/$type/tbl`; if (@tmp > 0) { $x = $tmp[$#tmp]; chop $x; my @flds = split(/\t/,$x); shift @flds; shift @flds; my $aliasesT = (@flds > 0) ? join("\t",@flds) : ""; &add_tbl_entry($fid,$location,$aliasesT); $dbh->SQL("UPDATE features SET location = '$location', contig = '$contig', minloc = $beg, maxloc = $end WHERE id = '$fid'"); if (my $locations = $self->cached('_location')) { $locations->{$fid} = $location; } open(TMP,">>$FIG_Config::global/changed.location.features") || die "could not open $FIG_Config::global/changed.location.features"; flock(TMP,LOCK_EX) || confess "cannot lock changed.location.features"; print TMP "$fid\n"; close(TMP); chmod 0777, "$FIG_Config::global/changed.location.features"; $self->{_changed_location_fids} = undef; if (($type eq "peg") && defined($translation)) { $self->add_translation($fid,$translation); } $got = 1 } return $got; } sub add_tbl_entry { my($fid,$location,$aliasesT) = @_; my $genome = &genome_of($fid); my $file = "$FIG_Config::organisms/$genome/Features/peg/tbl"; open(TMP,">>$file") || die "could not open $file"; flock(TMP,LOCK_EX) || confess "cannot lock $file"; print TMP "$fid\t$location\t$aliasesT\n"; close(TMP); chmod 0777, "$file"; } sub add_translation { my($self,$fid,$translation) = @_; my $genome = &genome_of($fid); my $file = "$FIG_Config::organisms/$genome/Features/peg/fasta"; if (open(TMP,">>$file")) { flock(TMP,LOCK_EX) || confess "cannot lock $file"; print TMP ">$fid\n"; my $seek = tell TMP; my $ln = length($translation); print TMP "$translation\n"; close(TMP); chmod 0777, $file; my $fileno = $self->file2N($file); my $dbh = $self->db_handle(); $dbh->SQL("DELETE FROM protein_sequence_seeks where id = '$fid'"); $dbh->SQL("INSERT INTO protein_sequence_seeks (id,fileno,seek,len,slen) VALUES ('$fid',$fileno,$seek,$ln+1,$ln)"); } } ### Begin FIG::Job module package FIG::Job; use FIGAttributes; use base 'FIGAttributes'; sub new { my($class, $job_id, $job_dir) = @_; my $self = { id => $job_id, dir => $job_dir, }; return bless $self, $class; } sub status :scalar { my($self) = @_; return &FIG::file_read("$self->{dir}/STATUS"); } sub running :scalar { my($self) = @_; my $rc; warn "running test on $self->{id}\n"; if (kill(0, $self->{id}) > 0) { $rc = 1; } else { $rc = 0; } warn "running returns $rc\n"; return $rc; } sub info :scalar :list { my($self) = @_; return &FIG::file_read("$self->{dir}/INFO"); } sub output :scalar :list { my($self) = @_; return &FIG::file_read("$self->{dir}/OUTPUT"); } ######### End FIG::Job ## package FIG; # # DAS support. # =pod =head1 init_das Initialize a DAS data query object. =cut sub init_das { my($self, $url, $dsn) = @_; my $das_data_dir = "$FIG_Config::global/DAS"; if (-d $das_data_dir) { return new SeedDas($self,$das_data_dir, $url, $dsn); } else { return undef; } } 1;