Parent Directory
|
Revision Log
protein families again
package FIG; use strict; use Fcntl qw/:flock/; # import LOCK_* constants use POSIX; use IPC::Open2; use MIME::Base64; use File::Basename; use FileHandle; use DBrtns; use Sim; use Annotation; use Blast; use FIG_Config; use FullLocation; use tree_utilities; use Subsystem; use SeedDas; use Construct; use FIGRules; use Tracer; use GenomeIDMap; our $haveDateParse; eval { require Date::Parse; import Date::Parse; $haveDateParse = 1; }; eval { require FigGFF; }; if ($@ and $ENV{USER} eq "olson") { warn $@; } # # Conditionally evaluate this in case its prerequisites are not available. # our $ClearinghouseOK; eval { require Clearinghouse; $ClearinghouseOK = 1; }; use IO::Socket; use FileHandle; use Carp; use Data::Dumper; use Time::Local; use File::Spec; use File::Copy; # # Try to load the RPC stuff; it might fail on older versions of the software. # eval { require FIGrpc; }; my $xmlrpc_available = 1; if ($@ ne "") { $xmlrpc_available = 0; } use FIGAttributes; use base 'FIGAttributes'; use vars qw(%_FunctionAttributes); use Data::Dumper; # # Force all new files to be all-writable. # umask 0; =head1 FIG Genome Annotation System =head2 Introduction This is the main object for access to the SEED data store. The data store itself is a combination of flat files and a database. The flat files can be moved easily between systems and the database rebuilt as needed. A reduced set of this object's functions are available via the B<SFXlate> object. The SFXlate object uses a single database to represent all its genomic information. It provides a much smaller capability for updating the data, and eliminates all similarities except for bidirectional best hits. The key to making the FIG system work is proper configuration of the C<FIG_Config.pm> file. This file contains names and URLs for the key directories as well as the type and login information for the database. FIG was designed to operate as a series of peer instances. Each instance is updated independently by its owner, and the instances can be synchronized using a process called a I<peer-to-peer update>. The terms I<SEED instance> and I<peer> are used more-or-less interchangeably. The POD documentation for this module is still in progress, and is provided on an AS IS basis without warranty. If you have a correction and you're not a developer, EMAIL the details to B<bruce@gigabarb.com> and I'll fold it in. B<NOTE>: The usage example for each method specifies whether it is static FIG::something or dynamic $fig->something If the method is static and has no parameters (C<FIG::something()>) it can also be invoked dynamically. This is a general artifact of the way PERL implements object-oriented programming. =head2 Tracing The FIG object supports tracing using the B<Tracer> module. If tracing is inactive when the FIG object is constructed, it will call B<TSetup> using parameters specified either in the environment variables or in the C<FIG_Config> module. Most command-line tools should call B<TSetup> before constructing a FIG object so that the tracing configuration can be specified as command-line options. If the prior call to B<TSetup> has not occurred, then the environment variables C<Trace> and C<TraceType> will be examined. If those do not exist, the global variables I<$FIG_Config::trace_levels> and I<$FIG_Config::trace_type> will be used. C<Trace> and I<$FIG_Config::trace_type> specify the tracing level and categories. Only tracing calls for the specified categories with a level less than or equal to the trace level will be displayed. The higher the trace level or the more the categories, the more messages will be displayed. For example, the following Unix command will set up for tracing at level 3 for the categories C<SQL> and C<Sprout>. env Trace="3 SQL Sprout" In most cases, the category names is the same as the name of the Perl package from which the trace call was made. An asterisk (C<*>) can be used to turn on tracing for all categories. env Trace="2 *" turns on tracing at level 2 for everything. C<TraceType> and C<$FIG_Config::trace_type> determine where the tracing is going to show up. A full treatment of all the options can be found in the documentation for the B<Tracer> module. The most common options, however, are C<WARN>, which converts all trace messages to warnings, and C<TEXT>, which writes them to the standard output. The default is C<WARN>, the theory being that this is the best option during web page construction. =head2 Hiding/Caching in a FIG object We save the DB handle, cache taxonomies, and put a few other odds and ends in the FIG object. We expect users to invoke these services using the object $fig constructed using: use FIG; my $fig = new FIG; $fig is then used as the basic mechanism for accessing FIG services. It is, of course, just a hash that is used to retain/cache data. The most commonly accessed item is the DB filehandle, which is accessed via $self->db_handle. We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated between genomes, and a variety of other things. =cut #: Constructor FIG->new(); =head2 Public Methods =head3 new C<< my $fig = FIG->new(); >> This is the constructor for a FIG object. It uses no parameters. If tracing has not yet been turned on, it will be turned on here. The tracing type and level are specified by the configuration variables C<$FIG_Config::trace_levels> and C<$FIG_Config::trace_type>. These defaults can be overridden using the environment variables C<Trace> and C<TraceType>, respectively. =cut sub new { my($class) = @_; # # Check to see if we have a FIG_URL environment variable set. # If we do, don't actually create a FIG object, but rather # create a FIGrpc and return that as the return from this constructor. # if ($ENV{FIG_URL} ne "" && $xmlrpc_available) { my $figrpc = new FIGrpc($ENV{FIG_URL}); return $figrpc; } # Here we have the normal case. Check for default tracing. We only do this if # the proper parameters are present and nobody else has set up tracing yet. if (Tracer::Setups() == 0 && (defined $FIG_Config::trace_levels || exists $ENV{Trace})) { # Tracing is not active and the user has specified tracing levels, so it's safe for # us to set it up using our own rules. First, the trace type: the default is WARN. my $trace_type; if (exists($ENV{TraceType})) { $trace_type = $ENV{TraceType}; } elsif (defined($FIG_Config::trace_type)) { $trace_type = $FIG_Config::trace_type; } else { $trace_type = "WARN"; } # Now the trace levels. The environment variable wins over the FIG_Config value. my $trace_levels = (exists($ENV{Trace}) ? $ENV{Trace} : $FIG_Config::trace_levels); TSetup($trace_levels, $trace_type); } Trace("Connecting to the database.") if T(2); # Connect to the database, then return ourselves. my $rdbH = new DBrtns; bless { _dbf => $rdbH, }, $class; } =head3 db_handle C<< my $dbh = $fig->db_handle; >> Return the handle to the internal B<DBrtns> object. This allows direct access to the database methods. =cut sub db_handle { my($self) = @_; return $self->{_dbf}; } sub table_exists { my($self,$table) = @_; my $rdbH = $self->db_handle; return $rdbH->table_exists($table); } =head3 cached C<< my $x = $fig->cached($name); >> Return a reference to a hash containing transient data. If no hash exists with the specified name, create an empty one under that name and return it. The idea behind this method is to allow clients to cache data in the FIG object for later use. (For example, a method might cache feature data so that it can be retrieved later without using the database.) This facility should be used sparingly, since different clients may destroy each other's data if they use the same name. =over 4 =item name Name assigned to the cached data. =item RETURN Returns a reference to a hash that is permanently associated with the specified name. If no such hash exists, an empty one will be created for the purpose. =back =cut sub cached { my($self,$what) = @_; my $x = $self->{$what}; if (! $x) { $x = $self->{$what} = {}; } return $x; } =head3 get_system_name C<< my $name = $fig->get_system_name; >> Returns C<seed>, indicating that this is object is using the SEED database. The same method on an SFXlate object will return C<sprout>. =cut #: Return Type $; sub get_system_name { return "seed"; } =head3 DESTROY The destructor releases the database handle. =cut sub DESTROY { my($self) = @_; my($rdbH); if ($rdbH = $self->db_handle) { $rdbH->DESTROY; } } =head3 same_seqs C<< my $sameFlag = FIG::same_seqs($s1, $s2); >> Return TRUE if the specified protein sequences are considered equivalent and FALSE otherwise. The sequences should be presented in I<nr-analysis> form, which is in reverse order and upper case with the stop codon omitted. The sequences will be considered equivalent if the shorter matches the initial portion of the long one and is no more than 30% smaller. Since the sequences are in nr-analysis form, the equivalent start potions means that the sequences have the same tail. The importance of the tail is that the stop point of a PEG is easier to find than the start point, so a same tail means that the two sequences are equivalent except for the choice of start point. =over 4 =item s1 First protein sequence, reversed and with the stop codon removed. =item s2 Second protein sequence, reversed and with the stop codon removed. =item RETURN Returns TRUE if the two protein sequences are equivalent, else FALSE. =back =cut sub same_seqs { my ($s1,$s2) = @_; my $ln1 = length($s1); my $ln2 = length($s2); return ((abs($ln1-$ln2) < (0.3 * (($ln1 < $ln2) ? $ln1 : $ln2))) && ((($ln1 <= $ln2) && (index($s2,$s1) == 0)) || (($ln1 > $ln2) && (index($s1,$s2) == 0)))); } =head3 delete_genomes C<< $fig->delete_genomes(\@genomes); >> Delete the specified genomes from the data store. This requires making system calls to move and delete files. =cut #: Return Type ; sub delete_genomes { my($self,$genomes) = @_; my $tmpD = "$FIG_Config::temp/tmp.deleted.$$"; my $tmp_Data = "$FIG_Config::temp/Data.$$"; my %to_del = map { $_ => 1 } @$genomes; open(TMP,">$tmpD") || die "could not open $tmpD"; my $genome; foreach $genome ($self->genomes) { if (! $to_del{$genome}) { print TMP "$genome\n"; } } close(TMP); &run("extract_genomes $tmpD $FIG_Config::data $tmp_Data"); # &run("mv $FIG_Config::data $FIG_Config::data.deleted; mv $tmp_Data $FIG_Config::data; fig load_all; rm -rf $FIG_Config::data.deleted"); &run("mv $FIG_Config::data $FIG_Config::data.deleted"); &run("mv $tmp_Data $FIG_Config::data"); &run("fig load_all"); &run("rm -rf $FIG_Config::data.deleted"); } =head3 add_genome C<< my $ok = $fig->add_genome($genomeF, $force, $skipnr); >> Add a new genome to the data store. A genome's data is kept in a directory by itself, underneath the main organism directory. This method essentially moves genome data from an external directory to the main directory and performs some indexing tasks to integrate it. =over 4 =item genomeF Name of the directory containing the genome files. This should be a fully-qualified directory name. The last segment of the directory name should be the genome ID. =item force This will ignore errors thrown by verify_genome_directory. This is bad, and you should never do it, but I am in the situation where I need to move a genome from one machine to another, and although I trust the genome I can't. =item skipnr We don't always want to add the pooteins into the nr database. For example wih a metagnome that has been called by blastx. This will just skip appending the proteins into the NR file. =item RETURN Returns TRUE if successful, else FALSE. =back =cut #: Return Type $; sub add_genome { my($self,$genomeF, $force, $skipnr) = @_; my $rc = 0; my(undef, $path, $genome) = File::Spec->splitpath($genomeF); if ($genome !~ /^\d+\.\d+$/) { warn "Invalid genome filename $genomeF\n"; return $rc; } if (-d $FIG_Config::organisms/$genome) { warn "Organism already exists for $genome\n"; return $rc; } # # We're okay, it doesn't exist. # my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`; if (@errors) { warn "Errors found while verifying genome directory $genomeF:\n"; print join("", @errors); if (!$force) {return $rc} else {warn "Skipped these errors and continued. You should not do this"} } &run("cp -r $genomeF $FIG_Config::organisms"); &run("chmod -R 777 $FIG_Config::organisms/$genome"); if (-s "$FIG_Config::organisms/$genome/COMPLETE") { print STDERR "$genome was marked as \"complete\"\n"; } else { &run("assess_completeness $genome"); if (-s "$FIG_Config::organisms/$genome/PROBABLY_COMPLETE") { print STDERR "Assessed $genome to be probably complete\n"; &run("cp -p $FIG_Config::organisms/$genome/PROBABLY_COMPLETE $FIG_Config::organisms/$genome/COMPLETE"); } else { print STDERR "Assessed $genome to not be probably complete\n"; } } &run("index_contigs $genome"); &run("compute_genome_counts $genome"); &run("load_features $genome"); $rc = 1; if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta") { &run("index_translations $genome"); my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`; chomp @tmp; &run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr") if (!$skipnr); # &run("formatdb -i $FIG_Config::data/Global/nr -p T") if (!$skipnr); &enqueue_similarities(\@tmp); } if ((-s "$FIG_Config::organisms/$genome/assigned_functions") || (-d "$FIG_Config::organisms/$genome/UserModels")) { &run("add_assertions_of_function $genome"); } return $rc; } =head3 parse_genome_args C<< my ($mode, @genomes) = FIG::parse_genome_args(@args); >> Extract a list of genome IDs from an argument list. If the argument list is empty, return all the genomes in the data store. This is a function that is performed by many of the FIG command-line utilities. The user has the option of specifying a list of specific genome IDs or specifying none in order to get all of them. If your command requires additional arguments in the command line, you can still use this method if you shift them out of the argument list before calling. The $mode return value will be C<all> if the user asked for all of the genomes or C<some> if he specified a list of IDs. This is useful to know if, for example, we are loading a table. If we're loading everything, we can delete the entire table; if we're only loading some genomes, we must delete them individually. This method uses the genome directory rather than the database because it may be used before the database is ready. =over 4 =item args1, args2, ... argsN List of genome IDs. If all genome IDs are to be processed, then this list should be empty. =item RETURN Returns a list. The first element of the list is C<all> if the user is asking for all the genome IDs and C<some> otherwise. The remaining elements of the list are the desired genome IDs. =back =cut sub parse_genome_args { # Get the parameters. my @args = @_; # Check the mode. my $mode = (@args > 0 ? 'some' : 'all'); # Build the return list. my @retVal = ($mode); # Process according to the mode. if ($mode eq 'all') { # We want all the genomes, so we get them from the organism directory. my $orgdir = "$FIG_Config::organisms"; opendir( GENOMES, $orgdir ) || Confess("Could not open directory $orgdir"); push @retVal, grep { $_ =~ /^\d/ } readdir( GENOMES ); closedir( GENOMES ); } else { # We want only the genomes specified by the user. push @retVal, @args; } # Return the result. return @retVal; } =head3 reload_table C<< $fig->reload_table($mode, $table, $flds, $xflds, $fileName, $keyList, $keyName); >> Reload a database table from a sequential file. If I<$mode> is C<all>, the table will be dropped and re-created. If I<$mode> is C<some>, the data for the individual items in I<$keyList> will be deleted before the table is loaded. Thus, the load process is optimized for the type of reload. =over 4 =item mode C<all> if we are reloading the entire table, C<some> if we are only reloading specific entries. =item table Name of the table to reload. =item flds String defining the table columns, in SQL format. In general, this is a comma-delimited set of field specifiers, each specifier consisting of the field name followed by the field type and any optional qualifiers (such as C<NOT NULL> or C<DEFAULT>); however, it can be anything that would appear between the parentheses in a C<CREATE TABLE> statement. The order in which the fields are specified is important, since it is presumed that is the order in which they are appearing in the load file. =item xflds Reference to a hash that describes the indexes. The hash is keyed by index name. The value is the index's field list. This is a comma-delimited list of field names in order from most significant to least significant. If a field is to be indexed in descending order, its name should be followed by the qualifier C<DESC>. For example, the following I<$xflds> value will create two indexes, one for name followed by creation date in reverse chronological order, and one for ID. { name_index => "name, createDate DESC", id_index => "id" } =item fileName Fully-qualified name of the file containing the data to load. Each line of the file must correspond to a record, and the fields must be arranged in order and tab-delimited. If the file name is omitted, the table is dropped and re-created but not loaded. =item keyList Reference to a list of the IDs for the objects being reloaded. This parameter is only used if I<$mode> is C<some>. =item keyName (optional) Name of the key field containing the IDs in the keylist. If omitted, C<genome> is assumed. =back =cut sub reload_table { # Get the parameters. my ($self, $mode, $table, $flds, $xflds, $fileName, $keyList, $keyName) = @_; if (!defined $keyName) { $keyName = 'genome'; } # Get the database handler. my $dbf = $self->{_dbf}; # Call the DBKernel method. $dbf->reload_table($mode, $table, $flds, $xflds, $fileName, $keyList, $keyName); } =head3 enqueue_similarities C<< FIG::enqueue_similarities(\@fids); >> Queue the passed Feature IDs for similarity computation. The actual computation is performed by L</create_sim_askfor_pool>. The queue is a persistent text file in the global data directory, and this method essentially writes new IDs on the end of it. =over 4 =item fids Reference to a list of feature IDs. =back =cut #: Return Type ; sub enqueue_similarities { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($fids) = @_; my $fid; my $sim_q = "$FIG_Config::global/queued_similarities"; open(TMP,">>$sim_q") || die "could not open $sim_q"; # # We need to lock here so that if a computation is creating a snapshot of the # queue, we block until it's done. # flock(TMP, LOCK_EX) or die "Cannot lock $sim_q\n"; foreach $fid (@$fids) { print TMP "$fid\n"; } close(TMP); } =head3 export_similarity_request Creates a similarity computation request from the queued similarities and the current NR. We keep track of the exported requests in case one gets lost. =cut sub export_similarity_request { my($self, $nr_file, $fasta_file) = @_; my $req_dir = "$FIG_Config::fig/var/sim_requests"; &verify_dir("$FIG_Config::fig/var"); &verify_dir($req_dir); $req_dir = "$req_dir/" . time; &verify_dir($req_dir); # # Open all of our output files before zeroing out the sim queue, in case # there is a problem. # open(my $user_fasta_fh, ">$fasta_file") or confess "Cannot open $fasta_file for writing: $!"; open(my $fasta_fh, ">$req_dir/fasta.in"); open(my $user_nr_fh, ">$nr_file") or confess "Cannot open $nr_file for writing: $!"; open(my $nr_fh, ">$req_dir/nr") or confess "Cannot open $req_dir/nr for writing: $!"; open(my $nr_read_fh, "<$FIG_Config::data/Global/nr") or die "Cannot open $FIG_Config::data/Global/nr for reading: $!"; my $sim_q = "$FIG_Config::global/queued_similarities"; # # We need to lock here so that if a computation is creating a snapshot of the # queue, we block until it's done. # open(my $sim_q_lock, ">>$sim_q") or confess "could not open $sim_q"; flock($sim_q_lock, LOCK_EX) or confess "Cannot lock $sim_q\n"; # # Everything open & locked, start copying. # copy("$sim_q", "$req_dir/q") or confess "Copy $sim_q $req_dir/q failed: $!"; my($buf); while (1) { my $n = read($nr_read_fh, $buf, 4096); defined($n) or confess "Error reading nr: $!"; last unless $n; syswrite($user_nr_fh, $buf) or confess "Error writing $nr_file: $!"; syswrite($nr_fh, $buf) or confess "Error writing $req_dir/nr: $!"; } close($nr_read_fh); close($nr_fh); close($user_nr_fh); # # We can zero out the queue and unlock now. # open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n"; close(F); close($sim_q_lock); # # Generate the fasta input from the queued ids. # open(my $q_fh, "<$req_dir/q"); while (my $id = <$q_fh>) { chomp $id; my $seq = $self->get_translation($id); display_id_and_seq($id, \$seq, $user_fasta_fh); display_id_and_seq($id, \$seq, $fasta_fh); } close($q_fh); close($user_fasta_fh); close($fasta_fh); } =head3 create_sim_askfor_pool C<< $fig->create_sim_askfor_pool($chunk_size); >> Creates an askfor pool, which a snapshot of the current NR and similarity queue. This process clears the old queue. The askfor pool needs to keep track of which sequences need to be calculated, which have been handed out, etc. To simplify this task we chunk the sequences into fairly small numbers (20k characters) and allocate work on a per-chunk basis. We make use of the relational database to keep track of chunk status as well as the seek locations into the file of sequence data. The initial creation of the pool involves indexing the sequence data with seek offsets and lengths and populating the sim_askfor_index table with this information and with initial status information. =over 4 =item chunk_size Number of features to put into a processing chunk. The default is 15. =back =cut #: Return Type $; sub create_sim_askfor_pool { my($self, $chunk_size) = @_; $chunk_size = 20000 unless $chunk_size =~ /^\d+$/; my $pool_dir = "$FIG_Config::fig/var/sim_pools"; &verify_dir($pool_dir); # # Lock the pool directory. # open(my $lock, ">$pool_dir/lockfile"); flock($lock, LOCK_EX); my $num = 0; if (open(my $toc, "<$pool_dir/TOC")) { while (<$toc>) { chomp; # print STDERR "Have toc entry $_\n"; my ($idx, $time, $str) = split(/\s+/, $_, 3); $num = max($num, $idx); } close($toc); } $num++; open(my $toc, ">>$pool_dir/TOC") or die "Cannot write $pool_dir/TOC: $!\n"; print $toc "$num ", time(), " New toc entry\n"; close($toc); my $cpool_id = sprintf "%04d", $num; my $cpool_dir = "$pool_dir/$cpool_id"; # # All set, create the directory for this pool. # &verify_dir($cpool_dir); # # Now we can copy the nr and sim queue here. # Do this stuff inside an eval so we can clean up # the lockfile. # eval { my $sim_q = "$FIG_Config::global/queued_similarities"; copy("$sim_q", "$cpool_dir/q"); copy("$FIG_Config::data/Global/nr", "$cpool_dir/nr"); open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n"; close(F); }; unlink("$pool_dir/lockfile"); close($lock); # # We've created our pool; we can now run the formatdb and # extract the sequences for the blast run. # my $child_pid = $self->run_in_background( sub { # # Need to close db or there's all sorts of trouble. # my $cmd = "$FIG_Config::ext_bin/formatdb -i $cpool_dir/nr -p T -l $cpool_dir/formatdb.log"; print "Will run '$cmd'\n"; &run($cmd); print "finished. Logfile:\n"; print &FIG::file_read("$cpool_dir/formatdb.log"); unlink("$cpool_dir/formatdb.pid"); }); warn "Running formatdb in background job $child_pid\n"; open(FPID, ">$cpool_dir/formatdb.pid"); print FPID "$child_pid\n"; close(FPID); my $db = $self->db_handle(); if (!$db->table_exists("sim_queue")) { $db->create_table(tbl => "sim_queue", flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " . "assigned BOOL, finished BOOL, output_file varchar(255), " . "assignment_expires INTEGER, worker_info varchar(255)" ); } # # Write the fasta input file. Keep track of how many have been written, # and write seek info into the database as appropriate. # open(my $seq_fh, ">$cpool_dir/fasta.in"); my($chunk_idx, $chunk_begin, $seq_idx); my $cur_size = 0; $chunk_idx = 0; $chunk_begin = 0; $seq_idx = 0; my(@seeks); my $tmpfile = "$FIG_Config::temp/simseek.$$"; open(my $tmpfh, ">$tmpfile") or confess "Cannot open tmpfile $tmpfile: $!"; open(my $q_fh, "<$cpool_dir/q"); while (my $id = <$q_fh>) { chomp $id; my $seq = $self->get_translation($id); # # check if we're at the beginning of a chunk # print $seq_fh ">$id\n$seq\n"; # # Check if we're at the end of a chunk # $cur_size += length($seq); if ($cur_size >= $chunk_size) { my $chunk_end = tell($seq_fh); my $chunk_len = $chunk_end - $chunk_begin; push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]); print $tmpfh join("\t", $cpool_id, $chunk_idx, $chunk_begin, $chunk_len, 'FALSE', 'FALSE'), "\n"; $chunk_idx++; $chunk_begin = $chunk_end; $cur_size = 0; } $seq_idx++; } if ($cur_size > 0) { my $chunk_end = tell($seq_fh); my $chunk_len = $chunk_end - $chunk_begin; print $tmpfh join("\t", $cpool_id, $chunk_idx, $chunk_begin, $chunk_len, 'FALSE', 'FALSE'), "\n"; push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]); } close($q_fh); close($seq_fh); close($tmpfh); warn "Write seqs from $tmpfile\n"; $self->db_handle->load_table(tbl => 'sim_queue', file => $tmpfile); unlink($tmpfile); # for my $seek (@seeks) # { # my($cpool_id, $chunk_idx, $chunk_begin, $chunk_len) = @$seek; # $db->SQL("insert into sim_queue (qid, chunk_id, seek, len, assigned, finished) " . # "values('$cpool_id', $chunk_idx, $chunk_begin, $chunk_len, FALSE, FALSE)"); # } return $cpool_id; } #=head3 get_sim_queue # #usage: get_sim_queue($pool_id, $all_sims) # #Returns the sims in the given pool. If $all_sims is true, return the entire queue. Otherwise, #just return the sims awaiting processing. # #=cut sub get_sim_queue { my($self, $pool_id, $all_sims) = @_; } =head3 get_sim_work C<< my ($nrPath, $fasta) = $fig->get_sim_work(); >> Get the next piece of sim computation work to be performed. Returned are the path to the NR and a string containing the fasta data. =cut sub get_sim_work { my ($self) = @_; # # For now, just don't care about order of data that we get back. # my $db = $self->db_handle(); my $lock = FIG::SimLock->new; my $work = $db->SQL(qq(SELECT qid, chunk_id, seek, len FROM sim_queue WHERE not finished LIMIT 1)); print "Got work ", Dumper($work), "\n"; if (not $work or @$work == 0) { return undef; } my($cpool_id, $chunk_id, $seek, $len) = @{$work->[0]}; my $pool_dir = "$FIG_Config::fig/var/sim_pools"; my $cpool_dir = "$pool_dir/$cpool_id"; my $nr = "$cpool_dir/nr"; open(my $fh, "<$cpool_dir/fasta.in"); seek($fh, $seek, 0); my $fasta; read($fh, $fasta, $len); return($cpool_id, $chunk_id, $nr, $fasta, "$cpool_dir/out.$chunk_id"); } =head3 sim_work_done C<< $fig->sim_work_done($pool_id, $chunk_id, $out_file); >> Declare that the work in pool_id/chunk_id has been completed, and output written to the pool directory (get_sim_work gave it the path). =over 4 =item pool_id The ID number of the pool containing the work that just completed. =item chunk_id The ID number of the chunk completed. =item out_file The file into which the work was placed. =back =cut sub sim_work_done { my ($self, $pool_id, $chunk_id, $out_file) = @_; if (! -f $out_file) { Confess("sim_work_done: output file $out_file does not exist"); } my $db = $self->db_handle(); my $lock = FIG::SimLock->new; my $dbh = $db->{_dbh}; my $rows = $dbh->do(qq(UPDATE sim_queue SET finished = TRUE, output_file = ? WHERE qid = ? and chunk_id = ?), undef, $out_file, $pool_id, $chunk_id); if ($rows != 1) { if ($dbh->errstr) { Confess("Update not able to set finished=TRUE: ", $dbh->errstr); } else { Confess("Update not able to set finished=TRUE"); } } # # Determine if this was the last piece of work for this pool. If so, we can # schedule the postprocessing work. # # Note we're still holding the lock. # my $out = $db->SQL(qq(SELECT chunk_id FROM sim_queue WHERE qid = ? AND not finished), undef, $pool_id); if (@$out == 0) { # # Pool is done. # $self->schedule_sim_pool_postprocessing($pool_id); } } =head3 schedule_sim_pool_postprocessing C<< $fig->schedule_sim_pool_postprocessing($pool_id); >> Schedule a job to do the similarity postprocessing for the specified pool. =over 4 =item pool_id ID of the pool whose similarity postprocessing needs to be scheduled. =back =cut sub schedule_sim_pool_postprocessing { my($self, $pool_id) = @_; my $pool_dir = "$FIG_Config::fig/var/sim_pools"; my $cpool_dir = "$pool_dir/$pool_id"; my $js = JobScheduler->new(); my $job = $js->job_create(); my $spath = $job->get_script_path(); open(my $sfh, ">$spath"); print $sfh <<END; #!/bin/sh . $FIG_Config::fig_disk/config/fig-user-env.sh $FIG_Config::bin/postprocess_computed_sims $pool_id END close($sfh); chmod(0775, $spath); # # Write the job ID to the subsystem queue dir. # open(J, ">$cpool_dir/postprocess_jobid"); print J $job->get_id(), "\n"; close(J); $job->enqueue(); } =head3 postprocess_computed_sims C<< $fig->postprocess_computed_sims($pool_id); >> Set up to reduce, reformat, and split the similarities in a given pool. We build a pipe to this pipeline: reduce_sims peg.synonyms 300 | reformat_sims nr | split_sims dest prefix Then we put the new sims in the pool directory, and then copy to NewSims. =over 4 =item pool_id ID of the pool whose similarities are to be post-processed. =back =cut sub postprocess_computed_sims { my($self, $pool_id) = @_; # # We don't lock here because the job is already done, and we # shouldn't (ha, ha) ever postprocess twice. # my $pool_dir = "$FIG_Config::fig/var/sim_pools"; my $cpool_dir = "$pool_dir/$pool_id"; my $sim_dir = "$cpool_dir/NewSims"; &verify_dir($sim_dir); # # Open the processing pipeline. # my $reduce = "$FIG_Config::bin/reduce_sims $FIG_Config::global/peg.synonyms 300"; my $reformat = "$FIG_Config::bin/reformat_sims $cpool_dir/nr"; my $split = "$FIG_Config::bin/split_sims $sim_dir sims.$pool_id"; open(my $process, "| $reduce | $reformat | $split"); # # Iterate over all the sims files, taken from the database. # my $dbh = $self->db_handle()->{_dbh}; my $files = $dbh->selectcol_arrayref(qq(SELECT output_file FROM sim_queue WHERE qid = ? and output_file IS NOT NULL ORDER BY chunk_id), undef, $pool_id); for my $file (@$files) { my $buf; open(my $fh, "<$file") or confess "Cannot sim input file $file: $!"; while (read($fh, $buf, 4096)) { print $process $buf; } close($fh); } my $res = close($process); if (!$res) { if ($!) { confess "Error closing process pipeline: $!"; } else { confess "Process pipeline exited with status $?"; } } # # If we got here, it worked. Copy the new sims files over to NewSims. # opendir(my $simdh, $sim_dir) or confess "Cannot open $sim_dir: $!"; my @new_sims = grep { $_ !~ /^\./ } readdir($simdh); closedir($simdh); &verify_dir("$FIG_Config::data/NewSims"); for my $sim_file (@new_sims) { my $target = "$FIG_Config::data/NewSims/$sim_file"; if (-s $target) { Confess("$target already exists"); } print "copying sim file $sim_file\n"; &FIG::run("cp $sim_dir/$sim_file $target"); &FIG::run("$FIG_Config::bin/index_sims $target"); } } =head3 get_active_sim_pools C<< @pools = $fig->get_active_sim_pools(); >> Return a list of the pool IDs for the sim processing queues that have entries awaiting computation. =cut #: Return Type @; sub get_active_sim_pools { my($self) = @_; my $dbh = $self->db_handle(); my $res = $dbh->SQL("select distinct qid from sim_queue where not finished"); return undef unless $res; return map { $_->[0] } @$res; } =head3 compute_clusters C<< my @clusterList = $fig->compute_clusters(\@pegList, $subsystem, $distance); >> Partition a list of PEGs into sections that are clustered close together on the genome. The basic algorithm used builds a graph connecting PEGs to other PEGs close by them on the genome. Each connected subsection of the graph is then separated into a cluster. Singleton clusters are thrown away, and the remaining ones are sorted by length. All PEGs in the incoming list should belong to the same genome, but this is not a requirement. PEGs on different genomes will simply find themselves in different clusters. =over 4 =item pegList Reference to a list of PEG IDs. =item subsystem Subsystem object for the relevant subsystem. This parameter is not used, but is required for compatability with Sprout. =item distance (optional) The maximum distance between PEGs that makes them considered close. If omitted, the distance is 5000 bases. =item RETURN Returns a list of lists. Each sub-list is a cluster of PEGs. =back =cut sub compute_clusters { # Get the parameters. my ($self, $pegList, $subsystem, $distance) = @_; if (! defined $distance) { $distance = 5000; } # Create a hash of the PEG IDs we're considering for cluster membership. my %myPeg = map { $_ => 1 } @{$pegList}; # This next hash serves as our connection graph. We map each PEG to a list of # the PEGs that are near it. The GREP filter insures that a PEG is not # connected to itself and that only PEGs from the caller's list are # included. my %conn = (); for my $peg (keys %myPeg) { $conn{$peg} = [grep { $myPeg{$_} && ($_ ne $peg) } $self->close_genes($peg, $distance)]; } # Our third and final hash tracks the PEGs we've already processed. This prevents # a PEG from being put in more than one cluster or in the same cluster twice. my %seen = (); # Now we create the list of clusters. my @clusters = (); # Loop through the pegs. for my $peg (keys %myPeg) { # Only proceed if this PEG has not been put into a cluster. if (! $seen{$peg}) { # Create a new cluster for this PEG. my $subList = [$peg]; # Denote we've seen it. $seen{$peg} = 1; # Now we recursively build this cluster. The "$subList" acts as a # queue. We run through it from the beginning, adding connected # pegs to the list. The process stops when we run out of new PEGs to # add. for (my $i=0; $i < @$subList; $i++) { # Get a list of the PEGs connected to the current cluster PEG. # Only PEGs we haven't clustered yet will be processed. my $subPeg = $subList->[$i]; my @tmp = grep { ! $seen{$_} } @{$conn{$subPeg}}; # Only proceed if we found at least one new PEG. if (@tmp > 0) { # For each new PEG, denote we've seen it and # stuff it into the queue. for my $peg1 (@tmp) { $seen{$peg1} = 1 } push @$subList, @tmp; } } # If the queue we've built is not a singleton, we push it on # the master cluster list. if (@$subList > 1) { push @clusters, $subList; } } } # Sort the clusters by length. The shortest clusters will be bubbled to # the front. my @retVal = sort { @$a <=> @$b } @clusters; # Return the sorted and pruned cluster list. return @retVal; } =head3 get_sim_pool_info C<< my ($total_entries, $n_finished, $n_assigned, $n_unassigned) = $fig->get_sim_pool_info($pool_id); >> Return information about the given sim pool. =over 4 =item pool_id Pool ID of the similarity processing queue whose information is desired. =item RETURN Returns a four-element list. The first is the number of features in the queue; the second is the number of features that have been processed; the third is the number of features that have been assigned to a processor, and the fourth is the number of features left over. =back =cut #: Return Type @; sub get_sim_pool_info { my($self, $pool_id) = @_; my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned); $dbh = $self->db_handle(); $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id'"); $total_entries = $res->[0]->[0]; $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and finished"); $n_finished = $res->[0]->[0]; $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and assigned and not finished"); $n_assigned = $res->[0]->[0]; $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and not finished and not assigned"); $n_unassigned = $res->[0]->[0]; return ($total_entries, $n_finished, $n_assigned, $n_unassigned); } #=head3 get_sim_chunk # #usage: get_sim_chunk($n_seqs, $worker_id) # #Returns a chunk of $n_seqs of work. # #From Ross, about how sims are processed: # #Here is how I process them: # # # bash$ cd /Volumes/seed/olson/Sims/June22.out # bash$ for i in really* # > do # > cat < $i >> /Volumes/laptop/new.sims # > done # # #Then, I need to "reformat" them by adding to columns to each one # and split the result into files of about 3M each This I do using # #reduce_sims /Volumes/laptop/NR/NewNR/peg.synonyms.june21 300 < /Volumes/laptop/new.sims | # reformat_sims /Volumes/laptop/NR/NewNR/checked.nr.june21 > /Volumes/laptop/reformated.sims #rm /Volumes/laptop/new.sims #split_sims /Volumes/laptop/NewSims sims.june24 reformated.sims #rm reformatted.sims # #=cut sub get_sim_chunk { my($self, $n_seqs, $worker_id) = @_; } =head3 get_local_hostname C<< my $result = FIG::get_local_hostname(); >> Return the local host name for the current processor. The name may be stored in a configuration file, or we may have to get it from the operating system. =cut #: Return Type $; sub get_local_hostname { # # See if there is a FIGdisk/config/hostname file. If there # is, force the hostname to be that. # my $hostfile = "$FIG_Config::fig_disk/config/hostname"; if (-f $hostfile) { my $fh; if (open($fh, $hostfile)) { my $hostname = <$fh>; chomp($hostname); return $hostname; } } # # First check to see if we our hostname is correct. # # Map it to an IP address, and try to bind to that ip. # my $tcp = getprotobyname('tcp'); my $hostname = `hostname`; chomp($hostname); my @hostent = gethostbyname($hostname); if (@hostent > 0) { my $sock; my $ip = $hostent[4]; socket($sock, PF_INET, SOCK_STREAM, $tcp); if (bind($sock, sockaddr_in(0, $ip))) { # # It worked. Reverse-map back to a hopefully fqdn. # my @rev = gethostbyaddr($ip, AF_INET); if (@rev > 0) { my $host = $rev[0]; # # Check to see if we have a FQDN. # if ($host =~ /\./) { # # Good. # return $host; } else { # # We didn't get a fqdn; bail and return the IP address. # return get_hostname_by_adapter() } } else { return inet_ntoa($ip); } } else { # # Our hostname must be wrong; we can't bind to the IP # address it maps to. # Return the name associated with the adapter. # return get_hostname_by_adapter() } } else { # # Our hostname isn't known to DNS. This isn't good. # Return the name associated with the adapter. # return get_hostname_by_adapter() } } =head3 get_hostname_by_adapter C<< my $name = FIG::get_hostname_by_adapter(); >> Return the local host name for the current network environment. =cut #: Return Type $; sub get_hostname_by_adapter { # # Attempt to determine our local hostname based on the # network environment. # # This implementation reads the routing table for the default route. # We then look at the interface config for the interface that holds the default. # # # Linux routing table: # [olson@yips 0.0.0]$ netstat -rn # Kernel IP routing table # Destination Gateway Genmask Flags MSS Window irtt Iface # 140.221.34.32 0.0.0.0 255.255.255.224 U 0 0 0 eth0 # 169.254.0.0 0.0.0.0 255.255.0.0 U 0 0 0 eth0 # 127.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 lo # 0.0.0.0 140.221.34.61 0.0.0.0 UG 0 0 0 eth0 # # Mac routing table: # # bash-2.05a$ netstat -rn # Routing tables # # Internet: # Destination Gateway Flags Refs Use Netif Expire # default 140.221.11.253 UGSc 12 120 en0 # 127.0.0.1 127.0.0.1 UH 16 8415486 lo0 # 140.221.8/22 link#4 UCS 12 0 en0 # 140.221.8.78 0:6:5b:f:51:c4 UHLW 0 183 en0 408 # 140.221.8.191 0:3:93:84:ab:e8 UHLW 0 92 en0 622 # 140.221.8.198 0:e0:98:8e:36:e2 UHLW 0 5 en0 691 # 140.221.9.6 0:6:5b:f:51:d6 UHLW 1 63 en0 1197 # 140.221.10.135 0:d0:59:34:26:34 UHLW 2 2134 en0 1199 # 140.221.10.152 0:30:1b:b0:ec:dd UHLW 1 137 en0 1122 # 140.221.10.153 127.0.0.1 UHS 0 0 lo0 # 140.221.11.37 0:9:6b:53:4e:4b UHLW 1 624 en0 1136 # 140.221.11.103 0:30:48:22:59:e6 UHLW 3 973 en0 1016 # 140.221.11.224 0:a:95:6f:7:10 UHLW 1 1 en0 605 # 140.221.11.237 0:1:30:b8:80:c0 UHLW 0 0 en0 1158 # 140.221.11.250 0:1:30:3:1:0 UHLW 0 0 en0 1141 # 140.221.11.253 0:d0:3:e:70:a UHLW 13 0 en0 1199 # 169.254 link#4 UCS 0 0 en0 # # Internet6: # Destination Gateway Flags Netif Expire # UH lo0 # fe80::%lo0/64 Uc lo0 # link#1 UHL lo0 # fe80::%en0/64 link#4 UC en0 # 0:a:95:a8:26:68 UHL lo0 # ff01::/32 U lo0 # ff02::%lo0/32 UC lo0 # ff02::%en0/32 link#4 UC en0 my($fh); if (!open($fh, "netstat -rn |")) { warn "Cannot run netstat to determine local IP address\n"; return "localhost"; } my $interface_name; while (<$fh>) { my @cols = split(); if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0") { $interface_name = $cols[$#cols]; } } close($fh); # print "Default route on $interface_name\n"; # # Find ifconfig. # my $ifconfig; for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin")) { if (-x "$dir/ifconfig") { $ifconfig = "$dir/ifconfig"; last; } } if ($ifconfig eq "") { warn "Ifconfig not found\n"; return "localhost"; } # print "Foudn $ifconfig\n"; if (!open($fh, "$ifconfig $interface_name |")) { warn "Could not run $ifconfig: $!\n"; return "localhost"; } my $ip; while (<$fh>) { # # Mac: # inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255 # Linux: # inet addr:140.221.34.37 Bcast:140.221.34.63 Mask:255.255.255.224 # chomp; s/^\s*//; # print "Have '$_'\n"; if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/) { # # Linux hit. # $ip = $1; # print "Got linux $ip\n"; last; } elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/) { # # Mac hit. # $ip = $1; # print "Got mac $ip\n"; last; } } close($fh); if ($ip eq "") { warn "Didn't find an IP\n"; return "localhost"; } return $ip; } =head3 get_seed_id C<< my $id = FIG::get_seed_id(); >> Return the Universally Unique ID for this SEED instance. If one does not exist, it will be created. =cut #: Return type $; sub get_seed_id { # # Retrieve the seed identifer from FIGdisk/config/seed_id. # # If it's not there, create one, and make it readonly. # my $id; my $id_file = "$FIG_Config::fig_disk/config/seed_id"; if (! -f $id_file) { my $newid = `uuidgen`; if (!$newid) { die "Cannot run uuidgen: $!"; } chomp($newid); my $fh = new FileHandle(">$id_file"); if (!$fh) { die "error creating $id_file: $!"; } print $fh "$newid\n"; $fh->close(); chmod(0444, $id_file); } my $fh = new FileHandle("<$id_file"); $id = <$fh>; chomp($id); return $id; } =head3 get_release_info C<< my ($name, $id, $inst, $email, $parent_id, $description) = FIG::get_release_info(); >> Return the current data release information.. The release info comes from the file FIG/Data/RELEASE. It is formatted as: <release-name> <unique id> <institution> <contact email> <unique id of data release this release derived from> <description> For instance: ----- SEED Data Release, 09/15/2004. 4148208C-1DF2-11D9-8417-000A95D52EF6 ANL/FIG olson@mcs.anl.gov Test release. ----- If no RELEASE file exists, this routine will create one with a new unique ID. This lets a peer optimize the data transfer by being able to cache ID translations from this instance. =cut #: Return Type @; sub get_release_info { my($fig, $no_create) = @_; my $rel_file = "$FIG_Config::data/RELEASE"; if (! -f $rel_file and !$no_create) { # # Create a new one. # my $newid = `uuidgen`; if (!$newid) { die "Cannot run uuidgen: $!"; } chomp($newid); my $relinfo = "Automatically generated release info " . localtime(); my $inst = "Unknown"; my $contact = "Unknown"; my $parent = ""; my( $a, $b, $e, $v, $env ) = $fig->genome_counts; my $description = "Automatically generated release info\n"; $description .= "Contains $a archaeal, $b bacterial, $e eukaryal, $v viral and $env environmental genomes.\n"; my $fh = new FileHandle(">$rel_file"); if (!$fh) { warn "error creating $rel_file: $!"; return undef; } print $fh "$relinfo\n"; print $fh "$newid\n"; print $fh "$inst\n"; print $fh "$contact\n"; print $fh "$parent\n"; print $fh $description; $fh->close(); chmod(0444, $rel_file); } if (open(my $fh, $rel_file)) { my(@lines) = <$fh>; close($fh); chomp(@lines); my($info, $id, $inst, $contact, $parent, @desc) = @lines; return ($info, $id, $inst, $contact, $parent, join("\n", @desc)); } return undef; } =head3 FIG C<< my $realFig = $fig->FIG(); >> Return this object. This method is provided for compatability with SFXlate. =cut sub FIG { my ($self) = @_; return $self; } =head3 get_peer_last_update C<< my $date = $fig->get_peer_last_update($peer_id); >> Return the timestamp from the last successful peer-to-peer update with the given peer. If the specified peer has made updates, comparing this timestamp to the timestamp of the updates can tell you whether or not the updates have been integrated into your SEED data store. We store this information in FIG/Data/Global/Peers/<peer-id>. =over 4 =item peer_id Universally Unique ID for the desired peer. =item RETURN Returns the date/time stamp for the last peer-to-peer updated performed with the identified SEED instance. =back =cut #: Return Type $; sub get_peer_last_update { my($self, $peer_id) = @_; my $dir = "$FIG_Config::data/Global/Peers"; &verify_dir($dir); $dir .= "/$peer_id"; &verify_dir($dir); my $update_file = "$dir/last_update"; if (-f $update_file) { my $time = file_head($update_file, 1); chomp $time; return $time; } else { return undef; } } =head3 set_peer_last_update C<< $fig->set_peer_last_update($peer_id, $time); >> Manually set the update timestamp for a specified peer. This informs the SEED that you have all of the assignments and updates from a particular SEED instance as of a certain date. =cut #: Return Type ; sub set_peer_last_update { my($self, $peer_id, $time) = @_; my $dir = "$FIG_Config::data/Global/Peers"; &verify_dir($dir); $dir .= "/$peer_id"; &verify_dir($dir); my $update_file = "$dir/last_update"; open(F, ">$update_file"); print F "$time\n"; close(F); } =head3 clean_spaces Remove any extra spaces from input fields. This will (currently) remove ^\s, \s$, and concatenate multiple spaces into one. my $input=$fig->clean_spaces($cgi->param('input')); =cut sub clean_spaces { my ($self, $s)=@_; # note at the moment I do not use \s because that recognizes \t and \n too. This should only remove multiple spaces. $s =~ s/^ +//; $s =~ s/ +$//; $s =~ s/ +/ /g; return $s; } =head3 cgi_url C<< my $url = FIG::$fig->cgi_url(); >> Return the URL for the CGI script directory. =cut #: Return Type $; sub cgi_url { # return &plug_url($FIG_Config::cgi_url); # # In order to globally make relative references work properly, return ".". # This might break some stuff in p2p, but this will get us most of the way there. # The things that break we can repair by inspecting the value of $ENV{SCRIPT_NAME} # return "."; } =head3 top_link C<< my $url = FIG::top_link(); >> Return the relative URL for the top of the CGI script directory. We determine this based on the SCRIPT_NAME environment variable, falling back to FIG_Config::cgi_base if necessary. =cut sub top_link { # # Determine if this is a toplevel cgi or one in one of the subdirs (currently # just /p2p). # my @parts = split(/\//, $ENV{SCRIPT_NAME}); my $top; if ($parts[-2] eq 'FIG') { $top = '.'; # warn "toplevel @parts\n"; } elsif ($parts[-3] eq 'FIG') { $top = '..'; # warn "subdir @parts\n"; } else { $top = $FIG_Config::cgi_base; # warn "other @parts\n"; } return $top; } =head3 temp_url C<< my $url = FIG::temp_url(); >> Return the URL of the temporary file directory. =cut #: Return Type $; sub temp_url { # return &plug_url($FIG_Config::temp_url); # # Similarly, make this relative. # return "../FIG-Tmp"; } =head3 plug_url C<< my $url2 = $fig->plug_url($url); >> or C<< my $url2 = $fig->plug_url($url); >> Change the domain portion of a URL to point to the current domain. This essentially relocates URLs into the current environment. =over 4 =item url URL to relocate. =item RETURN Returns a new URL with the base portion converted to the current operating host. If the URL does not begin with C<http://>, the URL will be returned unmodified. =back =cut #: Return Type $; sub plug_url { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($url) = @_; my $name; # Revised by GJO # First try to get url from the current http request if ( defined( $ENV{ 'HTTP_HOST' } ) # This is where $cgi->url gets its value && ( $name = $ENV{ 'HTTP_HOST' } ) && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter ) {} # Otherwise resort to alternative sources elsif ( ( $name = &get_local_hostname ) && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter ) {} return $url; } =head3 file_read C<< my $text = $fig->file_read($fileName); >> or C<< my @lines = $fig->file_read($fileName); >> or C<< my $text = FIG::file_read($fileName); >> or C<< my @lines = FIG::file_read($fileName); >> Read an entire file into memory. In a scalar context, the file is returned as a single text string with line delimiters included. In a list context, the file is returned as a list of lines, each line terminated by a line delimiter. (For a method that automatically strips the line delimiters, use C<Tracer::GetFile>.) =over 4 =item fileName Fully-qualified name of the file to read. =item RETURN In a list context, returns a list of the file lines. In a scalar context, returns a string containing all the lines of the file with delimiters included. =back =cut #: Return Type $; #: Return Type @; sub file_read { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($fileName) = @_; return file_head($fileName, '*'); } =head3 file_head C<< my $text = $fig->file_head($fileName, $count); >> or C<< my @lines = $fig->file_head($fileName, $count); >> or C<< my $text = FIG::file_head($fileName, $count); >> or C<< my @lines = FIG::file_head($fileName, $count); >> Read a portion of a file into memory. In a scalar context, the file portion is returned as a single text string with line delimiters included. In a list context, the file portion is returned as a list of lines, each line terminated by a line delimiter. =over 4 =item fileName Fully-qualified name of the file to read. =item count (optional) Number of lines to read from the file. If omitted, C<1> is assumed. If the non-numeric string C<*> is specified, the entire file will be read. =item RETURN In a list context, returns a list of the desired file lines. In a scalar context, returns a string containing the desired lines of the file with delimiters included. =back =cut #: Return Type $; #: Return Type @; sub file_head { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($file, $count) = @_; my ($n, $allFlag); if ($count eq '*') { Trace("Full file read for \"$file\".") if T(3); $allFlag = 1; $n = 0; } else { $allFlag = 0; $n = (!$count ? 1 : $count); Trace("Reading $n record(s) from \"$file\".") if T(3); } if (open(my $fh, "<$file")) { my(@ret, $i); $i = 0; while (<$fh>) { push(@ret, $_); $i++; last if !$allFlag && $i >= $n; } close($fh); if (wantarray) { return @ret; } else { return join("", @ret); } } } ################ Basic Routines [ existed since WIT ] ########################## =head3 min C<< my $min = FIG::min(@x); >> or C<< my $min = $fig->min(@x); >> Return the minimum numeric value from a list. =over 4 =item x1, x2, ... xN List of numbers to process. =item RETURN Returns the numeric value of the list entry possessing the lowest value. Returns C<undef> if the list is empty. =back =cut #: Return Type $; sub min { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my(@x) = @_; my($min,$i); (@x > 0) || return undef; $min = $x[0]; for ($i=1; ($i < @x); $i++) { $min = ($min > $x[$i]) ? $x[$i] : $min; } return $min; } =head3 max C<< my $max = FIG::max(@x); >> or C<< my $max = $fig->max(@x); >> Return the maximum numeric value from a list. =over 4 =item x1, x2, ... xN List of numbers to process. =item RETURN Returns the numeric value of t/he list entry possessing the highest value. Returns C<undef> if the list is empty. =back =cut #: Return Type $; sub max { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my(@x) = @_; my($max,$i); (@x > 0) || return undef; $max = $x[0]; for ($i=1; ($i < @x); $i++) { $max = ($max < $x[$i]) ? $x[$i] : $max; } return $max; } =head3 between C<< my $flag = FIG::between($x, $y, $z); >> or C<< my $flag = $fig->between($x, $y, $z); >> Determine whether or not $y is between $x and $z. =over 4 =item x First edge number. =item y Number to examine. =item z Second edge number. =item RETURN Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower, then the return is TRUE if I<$x> >= I$<$y> >= I<$z>. =back =cut #: Return Type $; sub between { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($x,$y,$z) = @_; if ($x < $z) { return (($x <= $y) && ($y <= $z)); } else { return (($x >= $y) && ($y >= $z)); } } =head3 standard_genetic_code C<< my $code = FIG::standard_genetic_code(); >> Return a hash containing the standard translation of nucleotide triples to proteins. Methods such as L</translate> can take a translation scheme as a parameter. This method returns the default translation scheme. The scheme is implemented as a reference to a hash that contains nucleotide triplets as keys and has protein letters as values. =cut #: Return Type $; sub standard_genetic_code { my $code = {}; $code->{"AAA"} = "K"; $code->{"AAC"} = "N"; $code->{"AAG"} = "K"; $code->{"AAT"} = "N"; $code->{"ACA"} = "T"; $code->{"ACC"} = "T"; $code->{"ACG"} = "T"; $code->{"ACT"} = "T"; $code->{"AGA"} = "R"; $code->{"AGC"} = "S"; $code->{"AGG"} = "R"; $code->{"AGT"} = "S"; $code->{"ATA"} = "I"; $code->{"ATC"} = "I"; $code->{"ATG"} = "M"; $code->{"ATT"} = "I"; $code->{"CAA"} = "Q"; $code->{"CAC"} = "H"; $code->{"CAG"} = "Q"; $code->{"CAT"} = "H"; $code->{"CCA"} = "P"; $code->{"CCC"} = "P"; $code->{"CCG"} = "P"; $code->{"CCT"} = "P"; $code->{"CGA"} = "R"; $code->{"CGC"} = "R"; $code->{"CGG"} = "R"; $code->{"CGT"} = "R"; $code->{"CTA"} = "L"; $code->{"CTC"} = "L"; $code->{"CTG"} = "L"; $code->{"CTT"} = "L"; $code->{"GAA"} = "E"; $code->{"GAC"} = "D"; $code->{"GAG"} = "E"; $code->{"GAT"} = "D"; $code->{"GCA"} = "A"; $code->{"GCC"} = "A"; $code->{"GCG"} = "A"; $code->{"GCT"} = "A"; $code->{"GGA"} = "G"; $code->{"GGC"} = "G"; $code->{"GGG"} = "G"; $code->{"GGT"} = "G"; $code->{"GTA"} = "V"; $code->{"GTC"} = "V"; $code->{"GTG"} = "V"; $code->{"GTT"} = "V"; $code->{"TAA"} = "*"; $code->{"TAC"} = "Y"; $code->{"TAG"} = "*"; $code->{"TAT"} = "Y"; $code->{"TCA"} = "S"; $code->{"TCC"} = "S"; $code->{"TCG"} = "S"; $code->{"TCT"} = "S"; $code->{"TGA"} = "*"; $code->{"TGC"} = "C"; $code->{"TGG"} = "W"; $code->{"TGT"} = "C"; $code->{"TTA"} = "L"; $code->{"TTC"} = "F"; $code->{"TTG"} = "L"; $code->{"TTT"} = "F"; return $code; } =head3 translate C<< my $aa_seq = &FIG::translate($dna_seq, $code, $fix_start); >> Translate a DNA sequence to a protein sequence using the specified genetic code. If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to C<M>. (In the standard genetic code, these two combinations normally translate to C<V> and C<L>, respectively.) =over 4 =item dna_seq DNA sequence to translate. Note that the DNA sequence can only contain known nucleotides. =item code Reference to a hash specifying the translation code. The hash is keyed by nucleotide triples, and the value for each key is the corresponding protein letter. If this parameter is omitted, the L</standard_genetic_code> will be used. =item fix_start TRUE if the first triple is to get special treatment, else FALSE. If TRUE, then a value of C<TTG> or C<GTG> in the first position will be translated to C<M> instead of the value specified in the translation code. =item RETURN Returns a string resulting from translating each nucleotide triple into a protein letter. =back =cut #: Return Type $; sub translate { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my( $dna,$code,$start ) = @_; my( $i,$j,$ln ); my( $x,$y ); my( $prot ); if (! defined($code)) { $code = &FIG::standard_genetic_code; } $ln = length($dna); $prot = "X" x ($ln/3); $dna =~ tr/a-z/A-Z/; for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) { $x = substr($dna,$i,3); if ($y = $code->{$x}) { substr($prot,$j,1) = $y; } } if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) { substr($prot,0,1) = 'M'; } return $prot; } =head3 reverse_comp C<< my $dnaR = FIG::reverse_comp($dna); >> or C<< my $dnaR = $fig->reverse_comp($dna); >> Return the reverse complement os the specified DNA sequence. NOTE: for extremely long DNA strings, use L</rev_comp>, which allows you to pass the strings around in the form of pointers. =over 4 =item dna DNA sequence whose reverse complement is desired. =item RETURN Returns the reverse complement of the incoming DNA sequence. =back =cut #: Return Type $; sub reverse_comp { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($seq) = @_; return ${&rev_comp(\$seq)}; } =head3 rev_comp C<< my $dnaRP = FIG::rev_comp(\$dna); >> or C<< my $dnaRP = $fig->rev_comp(\$dna); >> Return the reverse complement of the specified DNA sequence. The DNA sequence is passed in as a string reference rather than a raw string for performance reasons. If this is unnecessary, use L</reverse_comp>, which processes strings instead of references to strings. =over 4 =item dna Reference to the DNA sequence whose reverse complement is desired. =item RETURN Returns a reference to the reverse complement of the incoming DNA sequence. =back =cut #: Return Type $; sub rev_comp { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my( $seqP ) = @_; my( $rev ); $rev = reverse( $$seqP ); $rev =~ tr/A-Z/a-z/; $rev =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/; return \$rev; } =head3 verify_dir C<< FIG::verify_dir($dir); >> or C<< $fig->verify_dir($dir); >> Insure that the specified directory exists. If it must be created, the permissions will be set to C<0777>. =cut sub verify_dir { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($dir) = @_; if (-d $dir) { return } if ($dir =~ /^(.*)\/[^\/]+$/) { &verify_dir($1); } mkdir($dir,0777) || Confess("Could not make directory $dir: $!"); } =head3 run C<< FIG::run($cmd); >> or C<< $fig->run($cmd); >> Run a command. If the command fails, the error will be traced. =cut sub run { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($cmd) = @_; if ($ENV{FIG_VERBOSE}) { my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n"; } Trace("Running command: $cmd") if T(3); (system($cmd) == 0) || Confess("FAILED: $cmd"); } =head3 augment_path C<< FIG::augment_path($dirName); >> Add a directory to the system path. This method adds a new directory to the front of the system path. It looks in the configuration file to determine whether this is Windows or Unix, and uses the appropriate separator. =over 4 =item dirName Name of the directory to add to the path. =back =cut sub augment_path { my ($dirName) = @_; if ($FIG_Config::win_mode) { $ENV{PATH} = "$dirName;$ENV{PATH}"; } else { $ENV{PATH} = "$dirName:$ENV{PATH}"; } } =head3 read_fasta_record C<< my ($seq_id, $seq_pointer, $comment) = FIG::read_fasta_record(\*FILEHANDLE); >> or C<< my ($seq_id, $seq_pointer, $comment) = $fig->read_fasta_record(\*FILEHANDLE); >> Read and parse the next logical record of a FASTA file. A FASTA logical record consists of multiple lines of text. The first line begins with a C<< > >> symbol and contains the sequence ID followed by an optional comment. (NOTE: comments are currently deprecated, because not all tools handle them properly.) The remaining lines contain the sequence data. This method uses a trick to smooth its operation: the line terminator character is temporarily changed to C<< \n> >> so that a single read operation brings in the entire logical record. =over 4 =item FILEHANDLE Open handle of the FASTA file. If not specified, C<STDIN> is assumed. =item RETURN If we are at the end of the file, returns C<undef>. Otherwise, returns a three-element list. The first element is the sequence ID, the second is a pointer to the sequence data (that is, a string reference as opposed to as string), and the third is the comment. =back =cut #: Return Type @; sub read_fasta_record { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my ($file_handle) = @_; my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record); if (not defined($file_handle)) { $file_handle = \*STDIN; } $old_end_of_record = $/; $/ = "\n>"; if (defined($fasta_record = <$file_handle>)) { chomp $fasta_record; @lines = split( /\n/, $fasta_record ); $head = shift @lines; $head =~ s/^>?//; $head =~ m/^(\S+)/; $seq_id = $1; if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; } $sequence = join( "", @lines ); @parsed_fasta_record = ( $seq_id, \$sequence, $comment ); } else { @parsed_fasta_record = (); } $/ = $old_end_of_record; return @parsed_fasta_record; } =head3 display_id_and_seq C<< FIG::display_id_and_seq($id_and_comment, $seqP, $fh); >> or C<< $fig->display_id_and_seq($id_and_comment, \$seqP, $fh); >> Display a fasta ID and sequence to the specified open file. This method is designed to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as input a string pointer rather than a string. If the file handle is omitted it defaults to STDOUT. The output is formatted into a FASTA record. The first line of the output is preceded by a C<< > >> symbol, and the sequence is split into 60-character chunks displayed one per line. Thus, this method can be used to produce FASTA files from data gathered by the rest of the system. =over 4 =item id_and_comment The sequence ID and (optionally) the comment from the sequence's FASTA record. The ID =item seqP Reference to a string containing the sequence. The sequence is automatically formatted into 60-character chunks displayed one per line. =item fh Open file handle to which the ID and sequence should be output. If omitted, C<\*STDOUT> is assumed. =back =cut sub display_id_and_seq { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my( $id, $seqP, $fh ) = @_; if (! defined($fh) ) { $fh = \*STDOUT; } print $fh ">$id\n"; &display_seq($seqP, $fh); } =head3 display_seq C<< FIG::display_seq(\$seqP, $fh); >> or C<< $fig->display_seq(\$seqP, $fh); >> Display a fasta sequence to the specified open file. This method is designed to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as input a string pointer rather than a string. If the file handle is omitted it defaults to STDOUT. The sequence is split into 60-character chunks displayed one per line for readability. =over 4 =item seqP Reference to a string containing the sequence. =item fh Open file handle to which the sequence should be output. If omitted, C<STDOUT> is assumed. =back =cut sub display_seq { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my ( $seqP, $fh ) = @_; my ( $i, $n, $ln ); if (! defined($fh) ) { $fh = \*STDOUT; } $n = length($$seqP); # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) ); for ($i=0; ($i < $n); $i += 60) { if (($i + 60) <= $n) { $ln = substr($$seqP,$i,60); } else { $ln = substr($$seqP,$i,($n-$i)); } print $fh "$ln\n"; } } ########## I commented the pods on the following routines out, since they should not ########## be part of the SOAP/WSTL interface #=pod # #=head3 file2N # #usage: $n = $fig->file2N($file) # #In some of the databases I need to store filenames, which can waste a lot of #space. Hence, I maintain a database for converting filenames to/from integers. # #=cut # sub file2N :Scalar { my($self,$file) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table ")) && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0])) { my $fileno = $relational_db_response->[0]->[0] + 1; if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )")) { return $fileno; } } elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )")) { return 1; } return undef; } #=pod # #=head3 N2file # #usage: $filename = $fig->N2file($n) # #In some of the databases I need to store filenames, which can waste a lot of #space. Hence, I maintain a database for converting filenames to/from integers. # #=cut # sub N2file :Scalar { my($self,$fileno) = @_; # # Cache outputs. This results in a huge savings of time when files are # accessed multiple times (as in when a bunch of sims are requested). # my $fcache = $self->cached("_n2file"); my $fname; if (defined($fname = $fcache->{$fileno})) { return $fname; } my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )"); if ($relational_db_response and @$relational_db_response == 1) { $fname = $relational_db_response->[0]->[0]; $fcache->{$fileno} = $fname; return $fname; } return undef; } #=pod # #=head3 openF # #usage: $fig->openF($filename) # #Parts of the system rely on accessing numerous different files. The most obvious case is #the situation with similarities. It is important that the system be able to run in cases in #which an arbitrary number of files cannot be open simultaneously. This routine (with closeF) is #a hack to handle this. I should probably just pitch them and insist that the OS handle several #hundred open filehandles. # #=cut # sub openF { my($self,$file) = @_; my($fxs,$x,@fxs,$fh); $fxs = $self->cached('_openF'); if ($x = $fxs->{$file}) { $x->[1] = time(); return $x->[0]; } @fxs = keys(%$fxs); if (defined($fh = new FileHandle "<$file")) { if (@fxs >= 50) { @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs; $x = $fxs->{$fxs[0]}; undef $x->[0]; delete $fxs->{$fxs[0]}; } $fxs->{$file} = [$fh,time()]; return $fh; } return undef; } #=pod # #=head3 closeF # #usage: $fig->closeF($filename) # #Parts of the system rely on accessing numerous different files. The most obvious case is #the situation with similarities. It is important that the system be able to run in cases in #which an arbitrary number of files cannot be open simultaneously. This routine (with openF) is #a hack to handle this. I should probably just pitch them and insist that the OS handle several #hundred open filehandles. # #=cut # sub closeF { my($self,$file) = @_; my($fxs,$x); if (($fxs = $self->{_openF}) && ($x = $fxs->{$file})) { undef $x->[0]; delete $fxs->{$file}; } } =head3 ec_name C<< my $enzymatic_function = $fig->ec_name($ec); >> Returns the enzymatic name corresponding to the specified enzyme code. =over 4 =item ec Code number for the enzyme whose name is desired. The code number is actually a string of digits and periods (e.g. C<1.2.50.6>). =item RETURN Returns the name of the enzyme specified by the indicated code, or a null string if the code is not found in the database. =back =cut sub ec_name { my($self,$ec) = @_; ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return ""; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )"); return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : ""; return ""; } =head3 all_roles C<< my @roles = $fig->all_roles; >> Return a list of the known roles. Currently, this is a list of the enzyme codes and names. The return value is a list of list references. Each element of the big list contains an enzyme code (EC) followed by the enzymatic name. =cut sub all_roles { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names"); return @$relational_db_response; } =head3 expand_ec C<< my $expanded_ec = $fig->expand_ec($ec); >> Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that. =cut sub expand_ec { my($self,$ec) = @_; my($name); return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec; } =head3 clean_tmp C<< FIG::clean_tmp(); >> Delete temporary files more than two days old. We store temporary files in $FIG_Config::temp. There are specific classes of files that are created and should be saved for at least a few days. This routine can be invoked to clean out those that are over two days old. =cut sub clean_tmp { my($file); if (opendir(TMP,"$FIG_Config::temp")) { # change the pattern to pick up other files that need to be cleaned up my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP); foreach $file (@temp) { if (-M "$FIG_Config::temp/$file" > 2) { unlink("$FIG_Config::temp/$file"); } } } } ################ Routines to process genomes and genome IDs ########################## =head3 genomes C<< my @genome_ids = $fig->genomes($complete, $restrictions, $domain); >> Return a list of genome IDs. If called with no parameters, all genome IDs in the database will be returned. Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by NCBI for the species (not the specific strain), and Y is a sequence digit assigned to this particular genome (as one of a set with the same genus/species). Genomes also have versions, but that is a separate issue. =over 4 =item complete TRUE if only complete genomes should be returned, else FALSE. =item restrictions TRUE if only restriction genomes should be returned, else FALSE. =item domain Name of the domain from which the genomes should be returned. Possible values are C<Bacteria>, C<Virus>, C<Eukaryota>, C<unknown>, C<Archaea>, and C<Environmental Sample>. If no domain is specified, all domains will be eligible. =item RETURN Returns a list of all the genome IDs with the specified characteristics. =back =cut #: Return Type @; sub genomes :Remote :List { my( $self, $complete, $restrictions, $domain ) = @_; my $rdbH = $self->db_handle; my @where = (); if ($complete) { push(@where, "( complete = \'1\' )") } if ($restrictions) { push(@where, "( restrictions = \'1\' )") } if ($domain) { push( @where, "( maindomain = '$domain' )" ) } my $relational_db_response; if (@where > 0) { my $where = join(" AND ",@where); $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where $where"); } else { $relational_db_response = $rdbH->SQL("SELECT genome FROM genome"); } my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response; return @genomes; } =head3 is_complete C<< my $flag = $fig->is_complete($genome); >> Return TRUE if the genome with the specified ID is complete, else FALSE. =over 4 =item genome ID of the relevant genome. =item RETURN Returns TRUE if there is a complete genome in the database with the specified ID, else FALSE. =back =cut sub is_complete { my($self,$genome) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where (genome = '$genome') AND (complete = '1')"); return (@$relational_db_response == 1) } =head3 genome_counts C<< my ($arch, $bact, $euk, $vir, $env, $unk) = $fig->genome_counts($complete); >> Count the number of genomes in each domain. If I<$complete> is TRUE, only complete genomes will be included in the counts. =over 4 =item complete TRUE if only complete genomes are to be counted, FALSE if all genomes are to be counted =item RETURN A six-element list containing the number of genomes in each of six categories-- Archaea, Bacteria, Eukaryota, Viral, Environmental, and Unknown, respectively. =back =cut sub genome_counts { my($self,$complete) = @_; my($x,$relational_db_response); my $rdbH = $self->db_handle; if ($complete) { $relational_db_response = $rdbH->SQL("SELECT genome, maindomain FROM genome where complete = '1'"); } else { $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome"); } my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0); if (@$relational_db_response > 0) { foreach $x (@$relational_db_response) { if ($x->[1] =~ /^archaea/i) { ++$arch } elsif ($x->[1] =~ /^bacter/i) { ++$bact } elsif ($x->[1] =~ /^eukar/i) { ++$euk } elsif ($x->[1] =~ /^vir/i) { ++$vir } elsif ($x->[1] =~ /^env/i) { ++$env } else { ++$unk } } } return ($arch, $bact, $euk, $vir, $env, $unk); } =head3 genome_domain C<< my $domain = $fig->genome_domain($genome_id); >> Find the domain of a genome. =over 4 =item genome_id ID of the genome whose domain is desired. =item RETURN Returns the name of the genome's domain (archaea, bacteria, etc.), or C<undef> if the genome is not in the database. =back =cut sub genome_domain { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { # die Dumper($relational_db_response); return $relational_db_response->[0]->[1]; } } return undef; } =head3 genome_pegs C<< my $num_pegs = $fig->genome_pegs($genome_id); >> Return the number of protein-encoding genes (PEGs) for a specified genome. =over 4 =item genome_id ID of the genome whose PEG count is desired. =item RETURN Returns the number of PEGs for the specified genome, or C<undef> if the genome is not indexed in the database. =back =cut sub genome_pegs { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT pegs FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =head3 genome_rnas C<< my $num_rnas = $fig->genome_rnas($genome_id); >> Return the number of RNA-encoding genes for a genome. "$genome_id" is indexed in the "genome" database, and 'undef' otherwise. =over 4 =item genome_id ID of the genome whose RNA count is desired. =item RETURN Returns the number of RNAs for the specified genome, or C<undef> if the genome is not indexed in the database. =back =cut sub genome_rnas { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT rnas FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =head3 genome_szdna usage: $szdna = $fig->genome_szdna($genome_id); Return the number of DNA base-pairs in a genome's contigs. =over 4 =item genome_id ID of the genome whose base-pair count is desired. =item RETURN Returns the number of base pairs in the specified genome's contigs, or C<undef> if the genome is not indexed in the database. =back =cut sub genome_szdna { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT szdna FROM genome WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =head3 genome_version C<< my $version = $fig->genome_version($genome_id); >> Return the version number of the specified genome. Versions are incremented for major updates. They are put in as major updates of the form 1.0, 2.0, ... Users may do local "editing" of the DNA for a genome, but when they do, they increment the digits to the right of the decimal. Two genomes remain comparable only if the versions match identically. Hence, minor updating should be committed only by the person/group responsible for updating that genome. We can, of course, identify which genes are identical between any two genomes (by matching the DNA or amino acid sequences). However, the basic intent of the system is to support editing by the main group issuing periodic major updates. =over 4 =item genome_id ID of the genome whose version is desired. =item RETURN Returns the version number of the specified genome, or C<undef> if the genome is not in the data store or no version number has been assigned. =back =cut sub genome_version :Scalar { my($self,$genome) = @_; my(@tmp); if ((-s "$FIG_Config::organisms/$genome/VERSION") && (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) && ($tmp[0] =~ /^(\S+)$/)) { return $1; } return undef; } =head3 genome_md5sum C<< my $md5sum = $fig->genome_md5sum($genome_id); >> Returns the MD5 checksum of the specified genome. The checksum of a genome is defined as the checksum of its signature file. The signature file consists of tab-separated lines, one for each contig, ordered by the contig id. Each line contains the contig ID, the length of the contig in nucleotides, and the MD5 checksum of the nucleotide data, with uppercase letters forced to lower case. The checksum is indexed in the database. If you know a genome's checksum, you can use the L</genome_with_md5sum> method to find its ID in the database. =over 4 =item genome ID of the genome whose checksum is desired. =item RETURN Returns the specified genome's checksum, or C<undef> if the genome is not in the database. =back =cut sub genome_md5sum :Scalar { my($self,$genome) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL("SELECT md5sum FROM genome_md5sum WHERE ( genome = \'$genome\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =head3 genome_with_md5sum C<< my $genome = $fig->genome_with_md5sum($cksum); >> Find a genome with the specified checksum. The MD5 checksum is computed from the content of the genome (see L</genome_md5sum>). This method can be used to determine if a genome already exists for a specified content. =over 4 =item cksum Checksum to use for searching the genome table. =item RETURN The ID of a genome with the specified checksum, or C<undef> if no such genome exists. =back =cut sub genome_with_md5sum :Scalar { my($self,$cksum) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT genome FROM genome_md5sum WHERE ( md5sum = \'$cksum\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return undef; } =head3 contig_md5sum C<< my $cksum = $fig->contig_md5sum($genome, $contig); >> Return the MD5 checksum for a contig. The MD5 checksum is computed from the content of the contig. This method retrieves the checksum stored in the database. The checksum can be compared to the checksum of an external contig as a cheap way of seeing if they match. =over 4 =item genome ID of the genome containing the contig. =item contig ID of the relevant contig. =item RETURN Returns the checksum of the specified contig, or C<undef> if the contig is not in the database. =back =cut sub contig_md5sum :Scalar { my($self, $genome, $contig) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($genome) { if (($relational_db_response = $rdbH->SQL(qq(SELECT md5 FROM contig_md5sums WHERE (genome = ? AND contig = ?)), undef, $genome, $contig)) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =head3 genus_species C<< my $gs = $fig->genus_species($genome_id); >> Return the genus, species, and possibly also the strain of a specified genome. This method converts a genome ID into a more recognizble species name. The species name is stored directly in the genome table of the database. Essentially, if the strain is present in the database, it will be returned by this method, and if it's not present, it won't. =over 4 =item genome_id ID of the genome whose name is desired. =item RETURN Returns the scientific species name associated with the specified ID, or C<undef> if the ID is not in the database. =back =cut #: Return Type $; sub genus_species :Scalar { my ($self,$genome) = @_; my $ans; my $genus_species = $self->cached('_genus_species'); if (! ($ans = $genus_species->{$genome})) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,gname FROM genome"); my $pair; foreach $pair (@$relational_db_response) { $genus_species->{$pair->[0]} = $pair->[1]; } $ans = $genus_species->{$genome}; } return $ans; } =head3 org_of C<< my $org = $fig->org_of($prot_id); >> Return the genus/species name of the organism containing a protein. Note that in this context I<protein> is not a certain string of amino acids but a protein encoding region on a specific contig. For a FIG protein ID (e.g. C<fig|134537.1.peg.123>), the organism and strain information is always available. In the case of external proteins, we can usually determine an organism, but not anything more precise than genus/species (and often not that). When the organism name is not present, a null string is returned. =over 4 =item prot_id Protein or feature ID. =item RETURN Returns the displayable scientific name (genus, species, and strain) of the organism containing the identified PEG. If the name is not available, returns a null string. If the PEG is not found, returns C<undef>. =back =cut sub org_of { my($self,$prot_id) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($prot_id =~ /^fig\|/) { return $self->is_deleted_fid( $prot_id) ? undef : $self->genus_species( $self->genome_of( $prot_id ) ) || ""; } if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && (@$relational_db_response >= 1)) { $relational_db_response->[0]->[0] =~ s/^\d+://; return $relational_db_response->[0]->[0]; } return ""; } =head3 genus_species_domain C<< my ($gs, $domain) = $fig->genus_species_domain($genome_id); >> Returns a genome's genus and species (and strain if that has been properly recorded) in a printable form, along with its domain. This method is similar to L</genus_species>, except it also returns the domain name (archaea, bacteria, etc.). =over 4 =item genome_id ID of the genome whose species and domain information is desired. =item RETURN Returns a two-element list. The first element is the species name and the second is the domain name. =back =cut sub genus_species_domain { my ($self, $genome) = @_; my $genus_species_domain = $self->cached('_genus_species_domain'); if ( ! $genus_species_domain->{ $genome } ) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,gname,maindomain FROM genome"); my $triple; foreach $triple ( @$relational_db_response ) { $genus_species_domain->{ $triple->[0] } = [ $triple->[1], $triple->[2] ]; } } my $gsdref = $genus_species_domain->{ $genome }; return $gsdref ? @$gsdref : ( "", "" ); } =head3 domain_color C<< my $web_color = FIG::domain_color($domain); >> Return the web color string associated with a specified domain. The colors are extremely subtle (86% luminance), so they absolutely require a black background. Archaea are slightly cyan, bacteria are slightly magenta, eukaryota are slightly yellow, viruses are slightly silver, environmental samples are slightly gray, and unknown or invalid domains are pure white. =over 4 =item domain Name of the domain whose color is desired. =item RETURN Returns a web color string for the specified domain (e.g. C<#FFDDFF> for bacteria). =back =cut my %domain_color = ( AR => "#DDFFFF", BA => "#FFDDFF", EU => "#FFFFDD", VI => "#DDDDDD", EN => "#BBBBBB" ); sub domain_color { my ($domain) = @_; defined $domain || return "#FFFFFF"; return $domain_color{ uc substr($domain, 0, 2) } || "#FFFFFF"; } =head3 org_and_color_of C<< my ($org, $color) = $fig->org_and_domain_of($prot_id); >> Return the best guess organism and domain html color string of an organism. In the case of external proteins, we can usually determine an organism, but not anything more precise than genus/species (and often not that). =over 4 =item prot_id Relevant protein or feature ID. =item RETURN Returns a two-element list. The first element is the displayable organism name, and the second is an HTML color string based on the domain (see L</domain_color>). =back =cut sub org_and_color_of { my($self,$prot_id) = @_; my $relational_db_response; my $rdbH = $self->db_handle; if ($prot_id =~ /^fig\|/) { my( $gs, $domain ) = $self->genus_species_domain($self->genome_of($prot_id)); return ( $gs, domain_color( $domain ) ); } if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && (@$relational_db_response >= 1)) { return ($relational_db_response->[0]->[0], "#FFFFFF"); } return ("", "#FFFFFF"); } =head3 partial_genus_matching Return a list of genome IDs that match a partial genus. For example partial_genus_matching("Listeria") will return all genome IDs that begin with Listeria, and this can also be restricted to complete genomes with another argument like this partial_genus_matching("Listeria", 1) =cut sub partial_genus_matching { my ($self, $gen, $complete)=@_; return grep {$self->genus_species($_) =~ /$gen/i} $self->genomes($complete); } =head3 abbrev C<< my $abbreviated_name = FIG::abbrev($genome_name); >> or C<< my $abbreviated_name = $fig->abbrev($genome_name); >> Abbreviate a genome name to 10 characters or less. For alignments and such, it is very useful to be able to produce an abbreviation of genus/species. That's what this does. Note that multiple genus/species might reduce to the same abbreviation, so be careful (disambiguate them, if you must). The abbreviation is formed from the first three letters of the species name followed by the first three letters of the genus name followed by the first three letters of the species name and then the next four nonblank characters. =over 4 =item genome_name The name to abbreviate. =item RETURN An abbreviated version of the specified name. =back =cut sub abbrev :Scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($genome_name) = @_; $genome_name =~ s/^(\S{3})\S+/$1./; $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./; $genome_name =~ s/ //g; if (length($genome_name) > 10) { $genome_name = substr($genome_name,0,10); } return $genome_name; } ################ Routines to process Features and Feature IDs ########################## =head3 ftype C<< my $type = FIG::ftype($fid); >> or C<< my $type = $fig->ftype($fid); >> Returns the type of a feature, given the feature ID. This just amounts to lifting it out of the feature ID, since features have IDs of the form fig|x.y.f.n where x.y is the genome ID f is the type of feature n is an integer that is unique within the genome/type =over 4 =item fid FIG ID of the feature whose type is desired. =item RETURN Returns the feature type (e.g. C<peg>, C<rna>, C<pi>, or C<pp>), or C<undef> if the feature ID is not a FIG ID. =back =cut sub ftype { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($feature_id) = @_; if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/) { return $1; } return undef; } =head3 genome_of C<< my $genome_id = $fig->genome_of($fid); >> or C<< my $genome_id = FIG::genome_of($fid); >> Return the genome ID from a feature ID. =over 4 =item fid ID of the feature whose genome ID is desired. =item RETURN If the feature ID is a FIG ID, returns the genome ID embedded inside it; otherwise, it returns C<undef>. =back =cut sub genome_of :Scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $prot_id = (@_ == 1) ? $_[0] : $_[1]; if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; } return undef; } =head3 genome_and_peg_of C<< my ($genome_id, $peg_number = FIG::genome_and_peg_of($fid); >> C<< my ($genome_id, $peg_number = $fig->genome_and_peg_of($fid); >> Return the genome ID and peg number from a feature ID. =over 4 =item prot_id ID of the feature whose genome and PEG number as desired. =item RETURN Returns the genome ID and peg number associated with a feature if the feature is represented by a FIG ID, else C<undef>. =back =cut sub genome_and_peg_of { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $prot_id = (@_ == 1) ? $_[0] : $_[1]; if ($prot_id =~ /^fig\|(\d+\.\d+)\.peg\.(\d+)/) { return ($1, $2); } return undef; } =head3 by_fig_id C<< my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids; >> Compare two feature IDs. This function is designed to assist in sorting features by ID. The sort is by genome ID followed by feature type and then feature number. =over 4 =item a First feature ID. =item b Second feature ID. =item RETURN Returns a negative number if the first parameter is smaller, zero if both parameters are equal, and a positive number if the first parameter is greater. =back =cut sub by_fig_id { my($a,$b) = @_; my($g1,$g2,$t1,$t2,$n1,$n2); if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) && ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) { ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2); } else { $a cmp $b; } } =head3 by_genome_id C<< my @sorted_by_genome_id = sort { FIG::by_genome_id($a,$b) } @genome_ids; >> Compare two genome IDs. This function is designed to assist in sorting genomes by ID. =over 4 =item a First genome ID. =item b Second genome ID. =item RETURN Returns a negative number if the first parameter is smaller, zero if both parameters are equal, and a positive number if the first parameter is greater. =back =cut sub by_genome_id { my($a,$b) = @_; my($g1,$g2,$s1, $s2); if (($a =~ /^(\d+)\.(\d+)$/) && (($g1, $s1) = ($1, $2)) && ($b =~ /^(\d+)\.(\d+)$/) && (($g2, $s2) = ($1, $2))) { ($g1 <=> $g2) or ($s1 <=> $s2); } else { $a cmp $b; } } =head3 genes_in_region C<< my ($features_in_region, $beg1, $end1) = $fig->genes_in_region($genome, $contig, $beg, $end, size_limit); >> Locate features that overlap a specified region of a contig. This includes features that begin or end outside that region, just so long as some part of the feature can be found in the region of interest. It is often important to be able to find the genes that occur in a specific region on a chromosome. This routine is designed to provide this information. It returns all genes that overlap positions from I<$beg> through I<$end> in the specified contig. The I<$size_limit> parameter limits the search process. It is presumed that no features are longer than the specified size limit. A shorter size limit means you'll miss some features; a longer size limit significantly slows the search process. For prokaryotes, a value of C<10000> (the default) seems to work best. =over 4 =item genome ID of the genome containing the relevant contig. =item contig ID of the relevant contig. =item beg Position of the first base pair in the region of interest. =item end Position of the last base pair in the region of interest. =item size_limit Maximum allowable size for a feature. If omitted, C<10000> is assumed. =item RETURN Returns a three-element list. The first element is a reference to a list of the feature IDs found. The second element is the position of the leftmost base pair in any feature found. This may be well before the region of interest begins or it could be somewhere inside. The third element is the position of the rightmost base pair in any feature found. Again, this can be somewhere inside the region or it could be well to the right of it. =back =cut #: Return Type @; sub genes_in_region { my($self, $genome, $contig, $beg, $end, $pad) = @_; if (!defined $pad) { $pad = 10000; } my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u); my $rdbH = $self->db_handle; my $minV = $beg - $pad; my $maxV = $end + $pad; if (($relational_db_response = $rdbH->SQL("SELECT id FROM features " . " WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND ( maxloc < $maxV) AND " . " ( genome = \'$genome\' ) AND ( contig = \'$contig\' );")) && (@$relational_db_response >= 1)) { @tmp = sort { ($a->[1] cmp $b->[1]) or (($a->[2]+$a->[3]) <=> ($b->[2]+$b->[3])) } map { $feature_id = $_->[0]; $x = $self->feature_location($feature_id); $x ? [$feature_id,&boundaries_of($x)] : () } @$relational_db_response; ($l,$u) = (10000000000,0); foreach $x (@tmp) { ($feature_id,undef,$b1,$e1) = @$x; if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1))) { if (! $self->is_deleted_fid($feature_id)) { push(@feat,$feature_id); $l = &min($l,&min($b1,$e1)); $u = &max($u,&max($b1,$e1)); } } } (@feat <= 0) || return ([@feat],$l,$u); } return ([],$l,$u); } #============================================================================= # Using the following version is better, but it brings out a very annoying # issue with some genomes. It already exists in the current code (above) # for some genes in some genomes. For example, visit fig|70601.1.peg.1. # This is true for any genome that has a feature that crosses the origin. # The root of the problem lies in boundaries_of. I am working on a fix that # replaces boundaries_of with a more sophisticated function. When it is # all done, genes_in_retion should behave as desired. -- GJO, Aug. 22, 2004 #============================================================================= # # =pod # # =head3 genes_in_region # # usage: ( $features_in_region, $min_coord, $max_coord ) # = $fig->genes_in_region( $genome, $contig, $beg, $end ) # # It is often important to be able to find the genes that occur in a specific # region on a chromosome. This routine is designed to provide this information. # It returns all genes that overlap the region ( $genome, $contig, $beg, $end ). # $min_coord is set to the minimum coordinate of the returned genes (which may # preceed the given region), and $max_coord the maximum coordinate. Because # the database is indexed by the leftmost and rightmost coordinates of each # feature, the function makes no assumption about the length of the feature, but # it can (and probably will) miss features spanning multiple contigs. # # =cut # # # sub genes_in_region { # my ( $self, $genome, $contig, $beg, $end ) = @_; # my ( $x, $db_response, $feature_id, $b1, $e1, @tmp, @bounds ); # my ( $min_coord, $max_coord ); # # my @features = (); # my $rdbH = $self->db_handle; # # if ( ( $db_response = $rdbH->SQL( "SELECT id # FROM features # WHERE ( contig = '$contig' ) # AND ( genome = '$genome' ) # AND ( minloc <= $end ) # AND ( maxloc >= $beg );" # ) # ) # && ( @$db_response > 0 ) # ) # { # # The sort is unnecessary, but provides a consistent ordering # # @tmp = sort { ( $a->[1] cmp $b->[1] ) # contig # || ( ($a->[2] + $a->[3] ) <=> ( $b->[2] + $b->[3] ) ) # midpoint # } # map { $feature_id = $_->[0]; # ( ( ! $self->is_deleted_fid( $feature_id ) ) # not deleted # && ( $x = $self->feature_location( $feature_id ) ) # and has location # && ( ( @bounds = boundaries_of( $x ) ) == 3 ) # and has bounds # ) ? [ $feature_id, @bounds ] : () # } @$db_response; # # ( $min_coord, $max_coord ) = ( 10000000000, 0 ); # # foreach $x ( @tmp ) # { # ( $feature_id, undef, $b1, $e1 ) = @$x; # push @features, $feature_id; # my ( $min, $max ) = ( $b1 <= $e1 ) ? ( $b1, $e1 ) : ( $e1, $b1 ); # ( $min_coord <= $min ) || ( $min_coord = $min ); # ( $max_coord >= $max ) || ( $max_coord = $max ); # } # } # # return ( @features ) ? ( [ @features ], $min_coord, $max_coord ) # : ( [], undef, undef ); # } # These will be part of the fix to genes_in_region. -- GJO =head3 regions_spanned C<< my ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc ); >> or C<< my ( [ $contig, $beg, $end ], ... ) = FIG::regions_spanned( $loc ); >> The location of a feature in a scalar context is contig_b1_e1, contig_b2_e2, ... [one contig_b_e for each segment] This routine takes as input a scalar location in the above form and reduces it to one or more regions spanned by the gene. This involves combining regions in the location string that are on the same contig and going in the same direction. Unlike L</boundaries_of>, which returns one region in which the entire gene can be found, B<regions_spanned> handles wrapping through the orgin, features split over contigs and exons that are not ordered nicely along the chromosome (ugly but true). =over 4 =item loc The location string for a feature. =item RETURN Returns a list of list references. Each inner list contains a contig ID, a starting position, and an ending position. The starting position may be numerically greater than the ending position (which indicates a backward-traveling gene). It is guaranteed that the entire feature is covered by the regions in the list. =back =cut sub regions_spanned { shift if UNIVERSAL::isa( $_[0], __PACKAGE__ ); my( $location ) = ( @_ == 1 ) ? $_[0] : $_[1]; defined( $location ) || return undef; my @regions = (); my ( $cur_contig, $cur_beg, $cur_end, $cur_dir ); my ( $contig, $beg, $end, $dir ); my @segs = split( /\s*,\s*/, $location ); # should not have space, but ... @segs || return undef; # Process the first segment my $seg = shift @segs; ( ( $cur_contig, $cur_beg, $cur_end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || return undef; $cur_dir = ( $cur_end >= $cur_beg ) ? 1 : -1; foreach $seg ( @segs ) { ( ( $contig, $beg, $end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || next; $dir = ( $end >= $beg ) ? 1 : -1; # Is this a continuation? Update end if ( ( $contig eq $cur_contig ) && ( $dir == $cur_dir ) && ( ( ( $dir > 0 ) && ( $end > $cur_end ) ) || ( ( $dir < 0 ) && ( $end < $cur_end ) ) ) ) { $cur_end = $end; } # Not a continuation. Report previous and update current. else { push @regions, [ $cur_contig, $cur_beg, $cur_end ]; ( $cur_contig, $cur_beg, $cur_end, $cur_dir ) = ( $contig, $beg, $end, $dir ); } } # There should alwasy be a valid, unreported region. push @regions, [ $cur_contig, $cur_beg, $cur_end ]; return wantarray ? @regions : \@regions; } =head3 filter_regions C<< my @regions = FIG::filter_regions( $contig, $min, $max, @regions ); >> or C<< my \@regions = FIG::filter_regions( $contig, $min, $max, @regions ); >> or C<< my @regions = FIG::filter_regions( $contig, $min, $max, \@regions ); >> or C<< my \@regions = FIG::filter_regions( $contig, $min, $max, \@regions ); >> Filter a list of regions to those that overlap a specified section of a particular contig. Region definitions correspond to those produced by L</regions_spanned>. That is, C<[>I<contig>C<,>I<beg>C<,>I<end>C<]>. In the function call, either I<$contig> or I<$min> and I<$max> can be undefined (permitting anything). So, for example, my @regions = FIG::filter_regions(undef, 1, 5000, $regionList); would return all regions in C<$regionList> that overlap the first 5000 base pairs in any contig. Conversely, my @regions = FIG::filter_regions('NC_003904', undef, undef, $regionList); would return all regions on the contig C<NC_003904>. =over 4 =item contig ID of the contig whose regions are to be passed by the filter, or C<undef> if the contig doesn't matter. =item min Leftmost position of the region used for filtering. Only regions which contain at least one base pair at or beyond this position will be passed. A value of C<undef> is equivalent to zero. =item max Rightmost position of the region used for filtering. Only regions which contain at least one base pair on or before this position will be passed. A value of C<undef> is equivalent to the length of the contig. =item regionList A list of regions, or a reference to a list of regions. Each region is a reference to a three-element list, the first element of which is a contig ID, the second element of which is the start position, and the third element of which is the ending position. (The ending position can be before the starting position if the region is backward-traveling.) =item RETURN In a scalar context, returns a reference to a list of the filtered regions. In a list context, returns the list itself. =back =cut sub filter_regions { my ( $contig, $min, $max, @regions ) = @_; @regions || return (); ( ref( $regions[0] ) eq "ARRAY" ) || return undef; # Is it a region list, or a reference to a region list? if ( ref( $regions[0]->[0] ) eq "ARRAY" ) { @regions = @{ $regions[0] } } if ( ! defined( $contig ) ) { ( defined( $min ) && defined( $max ) ) || return undef; } else # with a defined contig name, allow undefined range { defined( $min ) || ( $min = 1 ); defined( $max ) || ( $max = 1000000000 ); } ( $min <= $max ) || return (); my ( $c, $b, $e ); my @filtered = grep { ( @$_ >= 3 ) # Allow extra fields? && ( ( $c, $b, $e ) = @$_ ) && ( ( ! defined( $contig ) ) || ( $c eq $contig ) ) && ( ( $e >= $b ) || ( ( $b, $e ) = ( $e, $b ) ) ) && ( ( $b <= $max ) && ( $e >= $min ) ) } @regions; return wantarray ? @filtered : \@filtered; } =head3 close_genes C<< my @features = $fig->close_genes($fid, $dist); >> Return all features within a certain distance of a specified other feature. This method is a quick way to get genes that are near another gene. It calls L</boundaries_of> to get the boundaries of the incoming gene, then passes the region computed to L</genes_in_region>. So, for example, if the specified I<$dist> is 500, the method would select a region that extends 500 base pairs to either side of the boundaries for the gene I<$fid>, and pass it to C<genes_in_region> for analysis. The features returned would be those that overlap the selected region. Note that the flaws inherent in C<genes_in_region> are also inherent in this method: if a feature is more than 10000 base pairs long, it may not be caught even though it has an overlap in the specified region. =over 4 =item fid ID of the relevant feature. =item dist Desired maximum distance. =item RETURN Returns a list of feature IDs for genes that overlap or are close to the boundaries for the specified incoming feature. =back =cut sub close_genes { my($self,$fid,$dist) = @_; # warn "In close_genes, self=$self, fid=$fid"; my $loc = $self->feature_location($fid); if ($loc) { my($contig,$beg,$end) = &FIG::boundaries_of($loc); if ($contig && $beg && $end) { my $min = &min($beg,$end) - $dist; my $max = &max($beg,$end) + $dist; my $feat; ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max); return @$feat; } } return (); } =head3 adjacent_genes C<< my ($left_fid, $right_fid) = $fig->adjacent_genes($fid, $dist); >> Return the IDs of the genes immediately to the left and right of a specified feature. This method gets a list of the features within the specified distance of the incoming feature (using L</close_genes>), and then chooses the two closest to the feature found. If the incoming feature is on the + strand, these are features to the left and the right. If the incoming feature is on the - strand, the features will be returned in reverse order. =over 4 =item fid ID of the feature whose neighbors are desired. =item dist Maximum permissible distance to the neighbors. =item RETURN Returns a two-element list containing the IDs of the features on either side of the incoming feature. =back =cut sub adjacent_genes { my ($self, $fid, $dist) = @_; my (@close, $strand, $i); # warn "In adjacent_genes, self=$self, fid=$fid"; $strand = $self->strand_of($fid); $dist = $dist || 2000; @close = $self->close_genes($fid, $dist); for ($i=0; $i < @close; ++$i) { last if ($close[$i] eq $fid); } # RAE note that if $self->strand_of($close[$i-1]) ne $strand then left/right neighbors # were never set! oops! # I think the concept of Left and right is confused here. In my mind, left and right # are independent of strand ?? E.g. take a look at PEG fig|196600.1.peg.1806 # this is something like # # ---> <--1805--- --1806--> <--1807-- <---- # # 1805 is always the left neighbor, no? my ($left_neighbor, $right_neighbor) = ($close[$i-1], $close[$i+1]); # if (0) # this was if ($i > 0) I just skip this whole section! # { # if ($self->strand_of($close[$i-1]) eq $strand) { $left_neighbor = $close[$i-1]; } # } if ($i < $#close) { if ($self->strand_of($close[$i+1]) eq $strand) { $right_neighbor = $close[$i+1]; } } # ...return genes in transcription order... if ($strand eq '-') { ($left_neighbor, $right_neighbor) = ($right_neighbor, $left_neighbor); } return ($left_neighbor, $right_neighbor) ; } =head3 feature_location C<< my $loc = $fig->feature_location($fid); >> or C<< my @loc = $fig->feature_location($fid);; >> Return the location of a feature. The location consists of a list of (contigID, begin, end) triples encoded as strings with an underscore delimiter. So, for example, C<NC_002755_100_199> indicates a location starting at position 100 and extending through 199 on the contig C<NC_002755>. If the location goes backward, the start location will be higher than the end location (e.g. C<NC_002755_199_100>). In a scalar context, this method returns the locations as a comma-delimited string NC_002755_100_199,NC_002755_210_498 In a list context, the locations are returned as a list (NC_002755_100_199, NC_002755_210_498) =over 4 =item fid ID of the feature whose location is desired. =item RETURN Returns the locations of a feature, either as a comma-delimited string or a list. =back =cut sub feature_location :Scalar :List { my($self,$feature_id) = @_; my($relational_db_response,$locations,$location); # warn "In feature_location, self=$self, fid=$feature_id"; if ($self->is_deleted_fid($feature_id)) { return undef } $locations = $self->cached('_location'); if (! ($location = $locations->{$feature_id})) { my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response == 1)) { $locations->{$feature_id} = $location = $relational_db_response->[0]->[0]; } } if ($location) { return wantarray() ? split(/,/,$location) : $location; } return undef; } =head3 contig_of C<< my $contigID = $fig->contig_of($location); >> Return the ID of the contig containing a location. This method only works with SEED-style locations (I<contigID>C<_>I<beg>C<_>I<end>). For more comprehensive location parsing, use the B<Location> object. =over 4 =item location A SEED-style location (I<contigID>C<_>I<beg>C<_>I<end>), or a comma-delimited list of SEED-style locations. In the latter case, only the first location in the list will be processed. =item RETURN Returns the contig ID from the first location in the incoming string. =back =cut sub contig_of { my ($self, $locus) = @_; $locus =~ m/^([^,]+)_\d+_\d+/; return $1; } =head3 beg_of C<< my $beg = $fig->beg_of($location); >> Return the beginning point of a location. This method only works with SEED-style locations (I<contigID>C<_>I<beg>C<_>I<end>). For more comprehensive location parsing, use the B<Location> object. =over 4 =item location A SEED-style location (I<contigID>C<_>I<beg>C<_>I<end>), or a comma-delimited list of SEED-style locations. In the latter case, only the first location in the list will be processed. =item RETURN Returns the beginning point from the first location in the incoming string. =back =cut sub beg_of { my ($self, $locus) = @_; $locus =~ m/^[^,]+_(\d+)_\d+/; return $1; } =head3 end_of C<< my $end = $fig->end_of($location); >> Return the ending point of a location. This method only works with SEED-style locations (I<contigID>C<_>I<beg>C<_>I<end>). For more comprehensive location parsing, use the B<Location> object. =over 4 =item location A SEED-style location (I<contigID>C<_>I<beg>C<_>I<end>), or a comma-delimited list of SEED-style locations. In the latter case, only the first location in the list will be processed. =item RETURN Returns the contig ID from the first location in the incoming string. =back =cut sub end_of { my ($self, $locus) = @_; $locus =~ m/\S+_\d+_(\d+)$/; return $1; } =head3 upstream_of C<< my $dna = $fig->upstream_of($peg, $upstream, $coding); >> Return the DNA immediately upstream of a feature. This method contains code lifted from the C<upstream.pl> script. =over 4 =item peg ID of the feature whose upstream DNA is desired. =item upstream Number of base pairs considered upstream. =item coding Number of base pairs inside the feature to be included in the upstream region. =item RETURN Returns the DNA sequence upstream of the feature's begin point and extending into the coding region. Letters inside a feature are in upper case and inter-genic letters are in lower case. A hyphen separates the true upstream letters from the coding region. =back =cut #: Return Type $; sub upstream_of { # Get the parameters. my ($self, $peg, $upstream, $coding) = @_; # Declare the work variables. my ($gene_before, $inter_genic, $c_seq) = ("", "", ""); # Declare the return variable. my $retVal; # Compute the upstream. my($contig,$beg,$end) = $self->boundaries_of(scalar $self->feature_location($peg)); my $genome = $self->genome_of($peg); if (defined($contig) && (my $contig_ln = $self->contig_ln($genome,$contig))) { if ($beg < $end) { # Here the feature goes from left to right. Set "b" and "e" to the start and # end of the upstream region, which precedes the feature. my $b = max(1,$beg-$upstream); my $e = max(1,$beg-1); if ($e > $b) { # Here an upstream region exists. We put it in u_seq. my $u_seq = lc $self->dna_seq($genome,join("_",($contig,$b,$e))); # Now we look for a coding region inside the feature. It would go from # "$beg" to "$end1". my $end1 = min($end,$beg+$coding-1); # If a coding region exists, we put it in c_seq. $c_seq = ($end1 > $beg) ? uc $self->dna_seq($genome,join("_",($contig,$beg,$end1))) : ""; # Now we look for a gene in the upstream region. If one exists, we put its # rightmost point in "$e_ov". my (undef,undef,$e_ov) = $self->genes_in_region($genome,$contig,$b,$e); if ($e_ov && ($e_ov >= $b)) { # Get the length of the upstream region that overlaps the preceding gene. # We need to be careful here, because it might actually extend into the # coding region. my $ov_ln = min($e_ov - $b + 1, length $u_seq); $gene_before = uc substr($u_seq,0,$ov_ln); $inter_genic = substr($u_seq,$ov_ln); } else { # No overlap. The entire upstream is considered inter-genic. $gene_before = ""; $inter_genic = $u_seq; } } } else { # Here the feature goes from right to left. Set "$b" and "$e" to the beginning # and end of the upstream region, which is to the right of the feature. my $b = min($contig_ln,$beg+$upstream); my $e = min($contig_ln,$beg+1); if ($e < $b) { # Here an upstream region exists. We put it in u_seq. my $u_seq = lc $self->dna_seq($genome,join("_",($contig,$b,$e))); # Now we look for a coding region inside the feature. If one exists we # put it in c_seq. my $end1 = max($end,$beg-$coding+1); $c_seq = ($end1 < $beg) ? uc $self->dna_seq($genome,join("_",($contig,$beg,$end1))) : ""; # Now we look for a gene that overlaps the upstream region. If it exists, # we put its leftmost point in b_ov. my (undef,$b_ov,undef) = $self->genes_in_region($genome,$contig,$e,$b); if ($b_ov && ($b_ov <= $b)) { # Get the length of the upstream region that overlaps the preceding gene. # We need to be careful here, because it might actually extend into the # coding region. my $ov_ln = min($b - $b_ov + 1, length $u_seq); $gene_before = uc substr($u_seq,0,$ov_ln); $inter_genic = substr($u_seq,$ov_ln); } else { $gene_before = ""; $inter_genic = $u_seq; } } } $retVal = join(($c_seq ? "-" : ""), ($gene_before . $inter_genic, $c_seq)); } # Return the result. return $retVal; } =head3 strand_of C<< my $strand = $fig->contig_of($location); >> Return the strand (C<+> or C<->) of a location. This method only works with SEED-style locations (I<contigID>C<_>I<beg>C<_>I<end>). For more comprehensive location parsing, use the B<Location> object. =over 4 =item location A comma-delimited list of SEED-style location (I<contigID>C<_>I<beg>C<_>I<end>). =item RETURN Returns C<+> if the list describes a forward-oriented location, and C<-> if the list described a backward-oriented location. =back =cut sub strand_of { my ($self, $fid) = @_; my ($beg, $end); # warn "In strand_of, self=$self, fid=$fid"; (undef, $beg, $end) = $self->boundaries_of($self->feature_location($fid)); if ($beg < $end) { return '+'; } else { return '-'; } } =head3 find_contig_with_checksum C<< my $contigID = $fig->find_contig_with_checksum($genome, $checksum); >> Find a contig in the given genome with the given checksum. This method is useful for determining if a particular contig has already been recorded for the given genome. The checksum is computed from the contig contents, so a matching checksum indicates that the contigs may have the same content. =over 4 =item genome ID of the genome whose contigs are to be examined. =item checksum Checksum value for the desired contig. =item RETURN Returns the ID of a contig in the given genome that has the caller-specified checksum, or C<undef> if no such contig exists. =back =cut sub find_contig_with_checksum { my($self, $genome, $checksum) = @_; # # This implementation scans all the contig files for the organism; when # we have contig checksums in the database this will simplify # significantly. # # For some efficiency, we cache the checksums we compute along the way since # it's probably likely we'll poke at other contigs for this organism. # my $gdir = "$FIG_Config::organisms/$genome"; my $cached_cksum = $self->cached('_contig_checksum'); if (opendir(my $dh, $gdir)) { for my $file (map { "$gdir/$_" } grep { $_ =~ /^contigs\d*$/ } readdir($dh)) { local $/ = "\n>"; if (open(my $fh, "<$file")) { while (<$fh>) { chomp; # # Pull the contig identifier from the first line. # We need the >? to handle the first line in the file; # the others are removed by the chomp above because # $/ is set to "\n>"; # if (s/^>?\s*(\S+)([^\n]*)\n//) { my $ident = $1; my $contig_txt = $_; $contig_txt =~ s/\s//sg; $contig_txt = uc($contig_txt); # # See if we have a cached value. # my $this_checksum; $this_checksum = $cached_cksum->{$genome, $ident}; if (!$this_checksum) { my($rd, $wr, $pid); if (!($pid = open2($rd, $wr, "cksum"))) { Confess("Cannot run open2 cksum: $!"); } $wr->write($contig_txt, length($contig_txt)); close($wr); $_ = <$rd>; close($rd); waitpid $pid, 0; chomp; my @vals = split(/\s+/, $_); $this_checksum = $vals[0]; $cached_cksum->{$genome, $ident} = $this_checksum; } if ($this_checksum == $checksum) { return $ident; } } } } } } } =head3 contig_checksum C<< my $checksum = $fig->contig_checksum($genome, $contig); >> or C<< my @checksum = $fig->contig_checksum($genome, $contig); >> Return the checksum of the specified contig. The checksum is computed from the contig's content in a parallel process. The process returns a space-delimited list of numbers. The numbers can be split into a real list if the method is invoked in a list context. For b =cut sub contig_checksum { my($self, $genome, $contig) = @_; my $contig_txt = $self->read_contig($genome, $contig); $contig_txt =~ s/\s//sg; $contig_txt = uc($contig_txt); my($rd, $wr, $pid); if (!($pid = open2($rd, $wr, "cksum"))) { Confess("Cannot run open2 cksum: $!"); } $wr->write($contig_txt, length($contig_txt)); close($wr); $_ = <$rd>; close($rd); waitpid $pid, 0; chomp; my @vals = split(/\s+/, $_); if (wantarray) { return @vals; } else { return $vals[0]; } } =head3 read_contig Read a single contig from the contigs file. =cut sub read_contig { my($self, $genome, $contig) = @_; # # Read the contig. The database has it in a set of chunks, but we # just use the seek to the starting point, and read up to the next "\n>". # my $ret = $self->db_handle->SQL(qq!SELECT fileno, seek FROM contig_seeks WHERE genome = '$genome' and contig = '$contig' and startn = 0!); if (!$ret or @$ret != 1) { return undef; } my($fileno, $seek) = @{$ret->[0]}; my $file = $self->N2file($fileno); my($fh, $contig_txt); if (!open($fh, "<$file")) { warn "contig_checksum: could not open $file: $!\n"; return undef; } seek($fh, $seek, 0); { local $/ = "\n>"; $contig_txt = <$fh>; chomp($contig_txt); } return $contig_txt; } =head3 boundaries_of usage: ($contig,$beg,$end) = $fig->boundaries_of($loc) The location of a feature in a scalar context is contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] This routine takes as input such a location and reduces it to a single description of the entire region containing the gene. =cut sub boundaries_of { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($location) = (@_ == 1) ? $_[0] : $_[1]; my($contigQ); if (defined($location)) { my @exons = split(/,/,$location); my($contig,$beg,$end); if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) && ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) && ($end = $1)) { return ($contig,$beg,$end); } } return undef; } =head3 all_features_detailed C<< my $featureList = $fig->all_features_detailed($genome); >> Returns a list of all features in the designated genome, with their location, alias, and type information included. This is used in the GenDB import and Sprout load to speed up the process. Deleted features are not returned! =over 4 =item genome ID of the genome whose features are desired. =item RETURN Returns a reference to a list of tuples. Each tuple consists of four elements: (1) the feature ID, (2) the feature location (as a comma-delimited list of location specifiers), (3) the feature aliases (as a comma-delimited list of named aliases), and (4) the feature type. =back =cut #: Return Type $@@; sub all_features_detailed { my($self,$genome) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT id, location, aliases, type FROM features WHERE (genome = \'$genome\')"); my @features; foreach my $tuple (@$relational_db_response) { push @features, $tuple unless ($self->is_deleted_fid($tuple->[0])); } return \@features; } =head3 all_features usage: $fig->all_features($genome,$type) Returns a list of all feature IDs of a specified type in the designated genome. You would usually use just $fig->pegs_of($genome) or $fig->rnas_of($genome) which simply invoke this routine. =cut sub all_features { my($self,$genome,$type) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE (genome = \'$genome\' AND (type = \'$type\'))"); if (@$relational_db_response > 0) { return grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response; } return (); } =head3 pegs_of usage: $fig->pegs_of($genome) Returns a list of all PEGs in the specified genome. Note that order is not specified. =cut sub pegs_of { my($self,$genome) = @_; return $self->all_features($genome,"peg"); } =head3 rnas_of usage: $fig->rnas_of($genome) Returns a list of all RNAs for the given genome. =cut sub rnas_of { my($self,$genome) = @_; return $self->all_features($genome,"rna"); } =head3 feature_aliases usage: @aliases = $fig->feature_aliases($fid) OR $aliases = $fig->feature_aliases($fid) Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature. These must come from the tbl files, so add them there if you want to see them here. In a scalar context, the aliases come back with commas separating them. =cut sub feature_aliases { my($self,$feature_id) = @_; my($rdbH,$relational_db_response,@aliases,$aliases,%aliases,$x); if ($self->is_deleted_fid($feature_id)) { return undef } $rdbH = $self->db_handle; @aliases = (); if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response == 1)) { $aliases = $relational_db_response->[0]->[0]; %aliases = map { $_ => 1 } split(/,/,$aliases); } if (($relational_db_response = $rdbH->SQL("SELECT alias FROM ext_alias WHERE ( id = \'$feature_id\' )")) && (@$relational_db_response > 0)) { foreach $x (@$relational_db_response) { $aliases{$x->[0]} = 1; } } @aliases = sort keys(%aliases); return wantarray() ? @aliases : join(",",@aliases); } =head3 by_alias usage: $peg = $fig->by_alias($alias) Returns a FIG id if the alias can be converted. Right now we convert aliases of the form NP_* (RefSeq IDs), gi|* (GenBank IDs), sp|* (Swiss Prot), uni|* (UniProt), kegg|* (KEGG) and maybe a few more =cut sub by_alias { my($self,$alias,$genome) = @_; my($rdbH,$relational_db_response); my ($peg, $flag) = FIGRules::NormalizeAlias($alias); if ($flag) { return $peg; } else { my $genomeQ = $genome ? quotemeta $genome : ""; $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT id FROM ext_alias WHERE ( alias = ? )", undef, $peg)) && (@$relational_db_response > 0)) { if (@$relational_db_response == 1) { $peg = $relational_db_response->[0]->[0]; return wantarray() ? ($peg) : $peg; } elsif (wantarray()) { return map { $_->[0] } @$relational_db_response; } } return wantarray() ? () : ""; } } sub to_alias { my($self,$fid,$type) = @_; my @aliases = $self->feature_aliases($fid); if ($type) { @aliases = grep { $_ =~ /^$type\|/ } @aliases; } if (wantarray()) { return @aliases; } elsif (@aliases > 0) { return $aliases[0]; } else { return ""; } } =head3 possibly_truncated usage: $fig->possibly_truncated($feature_id) or $fig->possibly_truncated($genome, $loc) Returns true iff the feature or location occurs near the end of a contig. =cut sub possibly_truncated { my($self, $arg1, $arg2) = @_; my($fid, $loc, $genome); if (($arg1 =~ m/^fig\|\d+\.\d+\.([^\.]+)\.\d+$/) && (not defined($arg2))) { $fid = $arg1; $loc = $self->feature_location($fid); $genome = $self->genome_of($fid); } elsif (($arg1 =~ m/^\d+\.\d+$/) && ($arg2 =~ m/\S+_\d+_\d+/)) { $genome = $arg1; $loc = $arg2; } else { confess "Invalid Arguments ", join(", ", @_), " to FIG::possibly_truncated"; } my ($contig,$beg,$end) = $self->boundaries_of($loc); if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end))) { return 0; } return 1; } sub near_end { my($self,$genome,$contig,$x) = @_; return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300))); } sub is_real_feature { my($self,$fid) = @_; my($relational_db_response); if ($self->is_deleted_fid($fid)) { return 0 } my $rdbH = $self->db_handle; return (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( id = \'$fid\' )")) && (@$relational_db_response == 1)) ? 1 : 0; } =head3 map_peg_to_ids C<<my $gnum, $pnum = $fig->map_peg_to_ids($peg)>> Map a peg ID to a pair of numbers describing that peg. In order to conserve storage and increase performance for some operations (the functional coupling computation, for instance), we provide a mechanism by which a full peg (of the form fig|X.Y.peg.Z) is mapped to a pair of integers: a genome number and a PEG index. We maintain a table genome_mapping that retains the mapping between genome ID and local genome number. No effort is expended to ensure this mapping is at all coherent between SEED instances; this is purely a local mechanism for performance enhancement. =over 4 =item $peg ID of the peg to be mapped. =item RETURN A pair of numbers ($gnum, $pnum) =back =cut sub map_peg_to_ids { my($self, $peg) = @_; my $mapperc = $self->cached("_genome_mapper"); my $mapper = $mapperc->{mapper_obj}; if (!defined($mapper)) { $mapper = new GenomeIDMap($self); $mapperc->{mapper_obj} = $mapper; } return $mapper->map_peg_to_nums($peg); } sub map_ids_to_peg { my($self, @ids) = @_; my $mapperc = $self->cached("_genome_mapper"); my $mapper = $mapperc->{mapper_obj}; if (!defined($mapper)) { $mapper = new GenomeIDMap($self); $mapperc->{mapper_obj} = $mapper; } return $mapper->map_nums_to_peg(@ids); } sub map_genome_to_id { my($self, $genome) = @_; my $mapperc = $self->cached("_genome_mapper"); my $mapper = $mapperc->{mapper_obj}; if (!defined($mapper)) { $mapper = new GenomeIDMap($self); $mapperc->{mapper_obj} = $mapper; } return $mapper->map_genome_id_to_gnum($genome); } sub map_id_to_genome { my($self, $id) = @_; my $mapperc = $self->cached("_genome_mapper"); my $mapper = $mapperc->{mapper_obj}; if (!defined($mapper)) { $mapper = new GenomeIDMap($self); $mapperc->{mapper_obj} = $mapper; } return $mapper->map_gnum_to_gid($id); } ################ Routines to process functional coupling for PEGs ########################## ################ Routines to process functional coupling for PEGs ########################## =head3 coupled_to C<< my @coupled_to = $fig->coupled_to($peg); >> Return a list of functionally coupled PEGs. The new form of coupling and evidence computation is based on precomputed data. The old form took minutes to dynamically compute things when needed. The old form still works, if the directory B<Data/CouplingData> is not present. If it is present, it theis assumed to contain comprehensive coupling data in the form of precomputed scores and PCHs. If B<Data/CouplingData> is present, this routine returns a list of 2-tuples [Peg,Sc]. It returns the empty list if the peg is not coupled. It returns C<undef> if B<Data/CouplingData> is not there. =over 4 =item peg ID of the protein encoding group whose functionally-coupled proteins are desired. =item RETURN Returns a list of 2-tuples, each consisting of the ID of a coupled PEG and a score. If there are no PEGs functionally coupled to the incoming PEG, it will return an empty list. If the PEG data is not present, it will return C<undef>. =back =cut sub coupled_to { my($self, $peg) = @_; my $rdbH = $self->db_handle; if (! $rdbH->table_exists('fc_pegs')) { return undef } my $relational_db_response = $rdbH->SQL("SELECT peg2,score FROM fc_pegs WHERE peg1 = \'$peg\' "); return @$relational_db_response; } =head3 coupling_evidence usage: @evidence = $fig->coupling_evidence($peg1,$peg2) The new form of coupling and evidence computation is based on precomputed data. The old form took minutes to dynamically compute things when needed. The old form still works, ikf the directory Data/CouplingData is not present. If it is present, it is assumed to contain comprehensive coupling data in the form of precomputed scores and PCHs. If Data/CouplingData is present, this routine returns a list of 3-tuples [Peg3,Peg4,Rep]. Here, Peg3 is similar to Peg1, Peg4 is similar to Peg2, and Rep == 1 iff this is a "representative pair". That is, we take all pairs and create a representative set in which each pair is not "too close" to any other pair in the representative set. Think of "too close" as being roughly 95% identical at the DNA level. This keeps (usually) a single pair from a set of different genomes from the same species. It returns the empty list if the peg is not coupled. It returns undef, if Data/CouplingData is not there. =cut sub coupling_evidence { my($self,$peg1,$peg2) = @_; my $rdbH = $self->db_handle; if (! $rdbH->table_exists('pchs')) { return undef } my $relational_db_response = $rdbH->SQL("SELECT peg3,peg4,rep FROM pchs WHERE peg1 = \'$peg1\' AND peg2 = \'$peg2\'"); return @$relational_db_response; } =head3 coupling_and_evidence usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) A computation of couplings and evidence starts with a given peg and produces a list of 3-tuples. Each 3-tuple is of the form [Score,CoupledToFID,Evidence] Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing a "pair of close homologs" of [$peg,CoupledToFID]). The maximum score for a single PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs. NOTE: once the new version of precomputed coupling is installed (i.e., when Data/CouplingData is filled with the precomputed relations), the parameters on computing evidence are ignored. If $keep_record is true, the system records the information, asserting coupling for each of the pairs in the set of evidence, and asserting a pin from the given $fd through all of the PCH entries used in forming the score. =cut sub coupling_and_evidence { my($self,$peg1,$bound,$sim_cutoff,$coupling_cutoff,$keep_record,$try_old) = @_; my $rdbH = $self->db_handle; if ($rdbH->table_exists('fc_pegs') && $self->is_complete(&FIG::genome_of($peg1))) { my @ans = (); my $tuple; foreach $tuple ($self->coupled_to($peg1)) { my($peg2,$sc) = @$tuple; my $evidence = [map { [$_->[0],$_->[1]] } $self->coupling_evidence($peg1,$peg2)]; push(@ans,[$sc,$peg2,$evidence]); } if ((@ans > 0) || (! $try_old)) { return @ans; } } return &coupling_and_evidence1($self,$peg1,$bound,$sim_cutoff,$coupling_cutoff,$keep_record); } sub coupling_and_evidence1 { my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_; my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1); if ($self->is_deleted_fid($feature_id)) { return undef } if ($feature_id =~ /^fig\|(\d+\.\d+)/) { $genome1 = $1; } else { return undef; } my $locations = $self->feature_location($feature_id); my($contig,$beg,$end) = $self->boundaries_of($locations); if (! $contig) { return () } ($neighbors,undef,undef) = $self->genes_in_region($self->genome_of($feature_id), $contig, &min($beg,$end) - $bound, &max($beg,$end) + $bound); if (@$neighbors == 0) { return () } $similar1 = $self->acceptably_close($feature_id,$sim_cutoff); @hits = (); foreach $neigh (grep { $_ =~ /peg/ } @$neighbors) { next if ($neigh eq $feature_id); $similar2 = $self->acceptably_close($neigh,$sim_cutoff); ($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound); if ($sc >= $coupling_cutoff) { push(@hits,[$sc,$neigh,$ev]); } } if ($keep_record) { $self->add_chr_clusters_and_pins($feature_id,\@hits); } return sort { $b->[0] <=> $a->[0] } @hits; } sub fast_coupling { my($self,$peg,$bound,$coupling_cutoff) = @_; my($genome,$genome1,$genome2,$peg1,$peg2,$peg3,%maps,$loc,$loc1,$loc2,$loc3); my($pairs,$sc,%ev); if ($self->is_deleted_fid($peg)) { return undef } my @ans = (); $genome = &genome_of($peg); foreach $peg1 ($self->in_pch_pin_with($peg)) { $peg1 =~ s/,.*$//; if ($peg ne $peg1) { $genome1 = &genome_of($peg1); $maps{$peg}->{$genome1} = $peg1; } } $loc = [&boundaries_of(scalar $self->feature_location($peg))]; foreach $peg1 ($self->in_cluster_with($peg)) { if ($peg ne $peg1) { # print STDERR "peg1=$peg1\n"; $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))]; if (&close_enough($loc,$loc1,$bound)) { foreach $peg2 ($self->in_pch_pin_with($peg1)) { $genome2 = &genome_of($peg2); if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3)) { $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))]; $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))]; if (&close_enough($loc2,$loc3,$bound)) { push @{$ev{$peg1}}, [$peg3,$peg2]; } } } } } } foreach $peg1 (keys(%ev)) { $pairs = $ev{$peg1}; my @pegMap = $peg, map { $_->[0] } @$pairs; $sc = $self->score(\@pegMap); if ($sc >= $coupling_cutoff) { push(@ans,[$sc,$peg1]); } } return sort { $b->[0] <=> $a->[0] } @ans; } sub score { my($self,$pegs) = @_; my(@ids); if ($self->{_no9s_scoring}) { @ids = map { $self->maps_to_id($_) } grep { $_ !~ /^fig\|999999/ } @$pegs; } else { @ids = map { $self->maps_to_id($_) } @$pegs; } return &score1($self,\@ids) - 1; } sub score1 { my($self,$pegs) = @_; my($sim); my($iden_cutoff) = 97; my($iden_cutoff_gap) = 100 - $iden_cutoff; my($first,@rest) = @$pegs; my $count = 1; my %hits = map { $_ => 1 } @rest; my @ordered = sort { $b->[0] <=> $a->[0] } map { $sim = $_; [$sim->iden,$sim->id2] } grep { $hits{$_->id2} } $self->sims($first,1000,1,"raw"); my %ordered = map { $_->[1] => 1 } @ordered; foreach $_ (@rest) { if (! $ordered{$_}) { push(@ordered,[0,$_]); } } while ((@ordered > 0) && ($ordered[0]->[0] >= $iden_cutoff)) { shift @ordered ; } while (@ordered > 0) { my $start = $ordered[0]->[0]; $_ = shift @ordered; my @sub = ( $_->[1] ); while ((@ordered > 0) && ($ordered[0]->[0] > ($start-$iden_cutoff_gap))) { $_ = shift @ordered; push(@sub, $_->[1]); } if (@sub == 1) { $count++; } else { $count += &score1($self,\@sub); } } return $count; } =head3 add_chr_clusters_and_pins usage: $fig->add_chr_clusters_and_pins($peg,$hits) The system supports retaining data relating to functional coupling. If a user computes evidence once and then saves it with this routine, data relating to both "the pin" and the "clusters" (in all of the organisms supporting the functional coupling) will be saved. $hits must be a pointer to a list of 3-tuples of the sort returned by $fig->coupling_and_evidence. =cut sub add_chr_clusters_and_pins { my($self,$peg,$hits) = @_; my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection); my($genome,$cluster,$pin,$peg2); if (@$hits > 0) { @clusters = (); @pins = (); my @pinMap = ($peg, map { $_->[1] } @$hits); push @clusters, \@pinMap; foreach $x (@$hits) { ($sc,$neigh,$pairs) = @$x; my @mapped = ($neigh, map { $_->[1] } @$pairs); push @pins, \@mapped; foreach $y (@$pairs) { $peg2 = $y->[0]; if ($peg2 =~ /^fig\|(\d+\.\d+)/) { $projection{$1}->{$peg2} = 1; } } } @corr = (); @orgs = keys(%projection); if (@orgs > 0) { foreach $genome (sort { $a <=> $b } @orgs) { push @corr, sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}}); } push(@pins,[$peg,@corr]); } foreach $cluster (@clusters) { $self->add_chromosomal_cluster($cluster); } foreach $pin (@pins) { $self->add_pch_pin($pin); } } } sub coupling_ev { my($self,$genome1,$sim1,$sim2,$bound) = @_; my($ev,$sc,$i,$j); $ev = []; $i = 0; $j = 0; while (($i < @$sim1) && ($j < @$sim2)) { if ($sim1->[$i]->[0] < $sim2->[$j]->[0]) { $i++; } elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0]) { $j++; } else { $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev); $i++; $j++; } } my @mapped = map { $_->[0] } @$ev; return ($self->score(\@mapped),$ev); } sub accumulate_ev { my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_; my($genome2,@locs1,@locs2,$i,$j,$x); if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 } $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/; $genome2 = $1; @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1; @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2; for ($i=0; ($i < @$feature_ids1); $i++) { for ($j=0; ($j < @$feature_ids2); $j++) { if (($feature_ids1->[$i] ne $feature_ids2->[$j]) && &close_enough($locs1[$i],$locs2[$j],$bound)) { push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]); } } } } sub close_enough { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($locs1,$locs2,$bound) = @_; # print STDERR &Dumper(["close enough",$locs1,$locs2]); return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound)); } sub acceptably_close { my($self,$feature_id,$sim_cutoff) = @_; my(%by_org,$id2,$genome,$sim); my($ans) = []; foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig")) { $id2 = $sim->id2; if ($id2 =~ /^fig\|(\d+\.\d+)/) { my $genome = $1; if (! $self->is_eukaryotic($genome)) { push(@{$by_org{$genome}},$id2); } } } foreach $genome (sort { $a <=> $b } keys(%by_org)) { push(@$ans,[$genome,$by_org{$genome}]); } return $ans; } ################ Translations of PEGsand External Protein Sequences ########################## =head3 translatable usage: $fig->translatable($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine simply returns true iff info on the translation exists. =cut sub translatable { my($self,$prot) = @_; return &translation_length($self,$prot) ? 1 : 0; } =head3 translation_length usage: $len = $fig->translation_length($prot_id) The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine returns the length of a translation. This does not require actually retrieving the translation. =cut sub translation_length { my($self,$prot) = @_; if ($self->is_deleted_fid($prot)) { return undef } $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT slen,seek FROM protein_sequence_seeks WHERE id = \'$prot\' "); my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response; return (@vals > 0) ? $vals[0]->[0] : undef; } =head3 get_translation C<< my $translation = $fig->get_translation($prot_id); >> The system takes any number of sources of protein sequences as input (and builds an nr for the purpose of computing similarities). For each of these input fasta files, it saves (in the DB) a filename, seek address and length so that it can go get the translation if needed. This routine returns the stored protein sequence of the specified PEG feature. =over 4 =item prot_id ID of the feature (PEG) whose translation is desired. =item RETURN Returns the protein sequence string for the specified feature. =back =cut #: Return Type $; sub get_translation { my($self,$id) = @_; my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran); if ($self->is_deleted_fid($id)) { return '' } $rdbH = $self->db_handle; $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); if ((! ($relational_db_response && @$relational_db_response > 0)) && ($id !~ /^fig\|/) && ($id = $self->by_alias($id))) { $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); } if ($relational_db_response && @$relational_db_response > 0) { my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response; ($fileN,$seek,$ln) = @{$vals[0]}; if (($fh = $self->openF($self->N2file($fileN))) && ($ln > 10)) { seek($fh,$seek,0); read($fh,$tran,$ln-1); $tran =~ s/\s//g; return $tran; } } return ''; } =head3 mapped_prot_ids usage: @mapped = $fig->mapped_prot_ids($prot) This routine is at the heart of maintaining synonyms for protein sequences. The system determines which protein sequences are "essentially the same". These may differ in length (presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended). Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted by length. =cut sub mapped_prot_ids { my($self,$id) = @_; my $rdbH = $self->db_handle; my $dbh = $rdbH->{_dbh}; if ($self->is_deleted_fid($id)) { return () } # # Manage cached statement handles to accelerate multiple queries into the db. # my $query_cache = $self->cached("_mapped_prot_ids_cache"); if (not exists($query_cache->{q1})) { $query_cache->{q1} = $dbh->prepare(qq(SELECT maps_to FROM peg_synonyms WHERE syn_id = ?)); } if (not exists($query_cache->{q2})) { # # Select distinct to work around the duplicate-rows bug in build_nr. # $query_cache->{q2} = $dbh->prepare(qq(SELECT distinct syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = ?)); } my $q1_sth = $query_cache->{q1}; my $q2_sth = $query_cache->{q2}; # # Determine the principal synonym for $id. # $q1_sth->execute($id); my $relational_db_response = $q1_sth->fetchall_arrayref(); # my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); if ($relational_db_response && (@$relational_db_response)) { $id = $relational_db_response->[0]->[0]; # # if we have more than one, we have duplicate lines. Warn and let it still work. # if (@$relational_db_response > 1) { warn "Duplicates found in peg_synonyms for syn_id $id\n"; } } # # Retrieve the list of synonyms for the principal synonym. # $q2_sth->execute($id); $relational_db_response = $q2_sth->fetchall_arrayref(); # $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' "); if ($relational_db_response && (@$relational_db_response > 0)) { return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response); } else { # # If the sequence is a singleton, return it as such. # my $len = $self->translation_length($id); if ($len) { return ([$id,$len]); } else { return (); } } } sub maps_to_id { my($self,$id) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); return ($relational_db_response && (@$relational_db_response == 1)) ? $relational_db_response->[0]->[0] : $id; } ################ GFF3 utilities ########################## sub get_gff_writer { my($self, %options) = @_; my $w = GFFWriter->new($self, %options); return $w; } ################ Assignments of Function to PEGs ########################## # set to undef to unset user # sub set_user { my($self,$user) = @_; $self->{_user} = $user; } sub get_user { my($self) = @_; return $self->{_user}; } =head3 function_of C<< my $function = $fig->function_of($id, $user); >> or C<< my @functions = $fig->function_of($id); >> In a scalar context, returns the most recently-determined functional assignment of a specified feature by a particular user. In a list context, returns a list of 2-tuples, each consisting of a user ID followed by a functional assighment by that user. In this case, the list contains all the functional assignments for the feature. =over 4 =item id ID of the relevant feature. =item user ID of the user whose assignment is desired (scalar context only) =item RETURN Returns the most recent functional assignment by the given user in scalar context, and a list of functional assignments in list context. Each assignment in the list context is a 2-tuple of the form [$user, $assignment]. =back =cut # Note that we do not return confidence. I propose a separate function to get both # function and confidence # sub function_of { my($self,$id,$user) = @_; my($relational_db_response,@tmp,$entry,$i); my $wantarray = wantarray(); my $rdbH = $self->db_handle; if ($self->is_deleted_fid($id)) { return $wantarray ? () : "" } if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user)) { if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) && (@$relational_db_response >= 1)) { @tmp = sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/^\s//; $_->[1] =~ s/(\t\S)?\s*$//; [$_->[0],$_->[1]] } @$relational_db_response; for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {} if ($i < @tmp) { $entry = splice(@tmp,$i,1); unshift @tmp, ($entry); } my $val; if ($wantarray) { return @tmp } elsif ($user && ($val = &extract_by_who(\@tmp,$user))) { return $val } elsif ($user && ($val = &extract_by_who(\@tmp,"master"))) { return $val } else { return "" } } } else { if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) && (@$relational_db_response >= 1)) { $relational_db_response->[0]->[0] =~ s/^\s//; $relational_db_response->[0]->[0] =~ s/(\t\S)?\s*$//; return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0]; } } return $wantarray ? () : ""; } =head3 translated_function_of usage: $function = $fig->translated_function_of($peg,$user) You get just the translated function. =cut sub translated_function_of { my($self,$id,$user) = @_; my $func = $self->function_of($id,$user); if ($func) { $func = $self->translate_function($func); } return $func; } sub extract_by_who { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($xL,$who) = @_; my($i); for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {} return ($i < @$xL) ? $xL->[$i]->[1] : ""; } =head3 translate_function usage: $translated_func = $fig->translate_function($func) Translates a function based on the function.synonyms table. =cut sub translate_function { my($self,$function) = @_; my ($tran,$from,$to,$line); if (! ($tran = $self->{_function_translation})) { $tran = {}; if (open(TMP,"<$FIG_Config::global/function.synonyms")) { while (defined($line = <TMP>)) { chomp $line; ($from,$to) = split(/\t/,$line); $tran->{$from} = $to; } close(TMP); } foreach $from (keys(%$tran)) { $to = $tran->{$from}; if ($tran->{$to}) { delete $tran->{$from}; } } $self->{_function_translation} = $tran; } while ($to = $tran->{$function}) { $function = $to; } return $function; } =head3 assign_function usage: $fig->assign_function($peg,$user,$function,$confidence) Assigns a function. Note that confidence can (and should be if unusual) included. Note that no annotation is written. This should normally be done in a separate call of the form ???? =cut sub assign_function { my($self,$peg,$user,$function,$confidence) = @_; my($role,$roleQ,$kvs,$kv,$k,$v); if (! $self->is_real_feature($peg)) { return 0 } my $genome = $self->genome_of($peg); $function =~ s/\s+/ /sg; $function =~ s/^\s+//; $function =~ s/\s+$//; if ($function =~ /^(.*?)\!(.*)/) { ($function,$kvs) = ($1,$2); if ($kvs) { $kvs =~ s/^\s+//; $kvs =~ s/\s+$//; foreach $kv (split(/\s+\!\s+/,$kvs)) { if ($kv =~ /^([A-Za-z0-9._\-\+\%]+)\s+\^\s+(.*)$/) { ($k,$v) = ($1,$2); if ($v !~ /\S/) { &replace_peg_key_value($self,$peg,$k,""); } else { &replace_peg_key_value($self,$peg,$k,$v); } } elsif ($kv =~ /^([A-Za-z0-9._\-\+\%]+)$/) { &replace_peg_key_value($self,$peg,$1,1); } } } } my $rdbH = $self->db_handle; $confidence = $confidence ? $confidence : ""; $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); my $funcQ = quotemeta $function; $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )"); $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); foreach $role (&roles_of_function($function)) { $roleQ = quotemeta $role; $rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\', \'$genome\' )"); } &verify_dir("$FIG_Config::organisms/$genome/UserModels"); if ($user ne "master") { &verify_dir("$FIG_Config::organisms/$genome/UserModels/$user"); } my $file; if ((($user eq "master") && ($file = "$FIG_Config::organisms/$genome/assigned_functions") && open(TMP,">>$file")) || (($user ne "master") && ($file = "$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions") && open(TMP,">>$file"))) { flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; print TMP "$peg\t$function\t$confidence\n"; close(TMP); chmod(0777,$file); return 1; } else { print STDERR "FAILED ASSIGNMENT: $peg\t$function\t$confidence\n"; } return 0; } sub hypo { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $x = (@_ == 1) ? $_[0] : $_[1]; if (! $x) { return 1 } if ($x =~ /hypoth/i) { return 1 } if ($x =~ /conserved protein/i) { return 1 } if ($x =~ /gene product/i) { return 1 } if ($x =~ /interpro/i) { return 1 } if ($x =~ /B[sl][lr]\d/i) { return 1 } if ($x =~ /^U\d/) { return 1 } if ($x =~ /^orf/i) { return 1 } if ($x =~ /uncharacterized/i) { return 1 } if ($x =~ /psedogene/i) { return 1 } if ($x =~ /^predicted/i) { return 1 } if ($x =~ /AGR_/) { return 1 } if ($x =~ /similar to/i) { return 1 } if ($x =~ /similarity/i) { return 1 } if ($x =~ /glimmer/i) { return 1 } if ($x =~ /unknown/i) { return 1 } if (($x =~ /domain/i) || ($x =~ /^y[a-z]{2,4}\b/i) || ($x =~ /complete/i) || ($x =~ /ensang/i) || ($x =~ /unnamed/i) || ($x =~ /EG:/) || ($x =~ /orf\d+/i) || ($x =~ /RIKEN/) || ($x =~ /Expressed/i) || ($x =~ /[a-zA-Z]{2,3}\|/) || ($x =~ /predicted by Psort/) || ($x =~ /^bh\d+/i) || ($x =~ /cds_/i) || ($x =~ /^[a-z]{2,3}\d+/i) || ($x =~ /similar to/i) || ($x =~ / identi/i) || ($x =~ /ortholog of/i) || ($x =~ /structural feature/i)) { return 1 } return 0; } ############################ Similarities ############################### =head3 sims usage: @sims = $fig->sims($peg,$maxN,$maxP,$select) Returns a list of similarities for $peg such that there will be at most $maxN similarities, each similarity will have a P-score <= $maxP, and $select gives processing instructions: "raw" means that the similarities will not be expanded (by far fastest option) "fig" means return only similarities to fig genes "all" means that you want all the expanded similarities. By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant protein collection, and converting it to a set of similarities (one for each of the proteins that are essentially identical to the representative in the nr). Each entry in @sims is a refence to an array. These are the values in each array position: 0. The query peg 1. The similar peg 2. The percent id 3. Alignment length 4. Mismatches 5. Gap openings 6. The start of the match in the query peg 7. The end of the match in the query peg 8. The start of the match in the similar peg 9. The end of the match in the similar peg 10. E value 11. Bit score 12. Length of query peg 13. Length of similar peg 14. Method =cut sub sims { my ( $self, $id, $maxN, $maxP, $select, $max_expand, $filters ) = @_; my( $sim ); $max_expand = defined( $max_expand ) ? $max_expand : $maxN; return () if $self->is_deleted_fid( $id ); # # Retrieve the list of synonyms for this peg. The first in the list # is the principal synonym. # my @maps_to = $self->mapped_prot_ids( $id ); ( @maps_to > 0 ) or return (); my $rep_id = $maps_to[0]->[0]; if ( ! defined( $maps_to[0]->[1] ) ) { print STDERR &Dumper( \@maps_to ); confess "bad"; } # # Find my entry in the list. # my @entry = grep { $_->[0] eq $id } @maps_to; ( @entry == 1 ) and defined( $entry[0]->[1] ) or return (); # # Get the similarities. They are based on the principal synonym. # my @raw_sims = get_raw_sims( $self, $rep_id, $maxP, $filters ); # If the query is not the representative, make sims look like it is # by replacing id1 and fixing match coordinates if lengths differ. my $delta = $maps_to[0]->[1] - $entry[0]->[1]; if ( $id ne $rep_id ) { foreach $sim ( @raw_sims ) { $sim->[0] = $id; $sim->[6] -= $delta; $sim->[7] -= $delta; } } # The query must be present for expanding matches to identical sequences. if ( ( $max_expand > 0 ) && ( $select ne "raw" ) ) { unshift( @raw_sims, bless( [ $id, $rep_id, "100.00", $entry[0]->[1], 0, 0, 1, $entry[0]->[1], $delta+1, $maps_to[0]->[1], 0.0, 2 * $entry[0]->[1], $entry[0]->[1], $maps_to[0]->[1], "blastp" ], 'Sim' ) ); $max_expand++; } # print STDERR "\n\n"; for ( @raw_sims ) { print STDERR join( ", ", @{ $_ } ), "\n" } # expand_raw_sims now handles sanity checks on id1 eq id2 and id2 # is not deleted. This lets it keep count of the actual number of # sims reported! return expand_raw_sims( $self, \@raw_sims, $maxP, $select, 1, $max_expand, $filters ); } # maxP is not used. It is checked by both functions that call here. sub expand_raw_sims { my( $self, $raw_sims, $maxP, $select, $dups, $max_expand, $filters ) = @_; my( $sim, $id1, $id2, %others, $x ); # Set up behavior defaults (pretty wide open): my ( $maxN, $show_env ); if ( $filters && ref( $filters ) eq "HASH" ) { defined( $filters->{ maxN } ) and $maxN = $filters->{ maxN }; defined( $filters->{ select } ) and $select = $filters->{ select }; defined( $filters->{ max_expand } ) and $max_expand = $filters->{ max_expand }; defined( $filters->{ show_env } ) and $show_env = $filters->{ show_env }; defined( $filters->{ dups } ) and $dups = $filters->{ dups }; } defined( $maxN ) or $maxN = 1000000; # Unlimited sims defined( $select ) or $select = 'all'; # Show all expansions defined( $max_expand ) or $max_expand = 0; # But none by default defined( $show_env ) or $show_env = 1; # Show environmental by default $max_expand = 1000000000 if ( $select =~ /^figx/ ); # figx forces unlimited expand my @sims = (); foreach $sim ( @$raw_sims ) { $id2 = $sim->id2; if ( ! $dups ) { next if $others{ $id2 }; $others{ $id2 } = 1; } $id1 = $sim->id1; if ( ( $select eq "raw" ) || ( $max_expand <= 0 ) ) { next if ( ! $show_env && ( $id2 =~ /^fig\|9999999/ ) ); next if ( $id1 eq $id2 ) || $self->is_deleted_fid( $id2 ); push( @sims, $sim ); return @sims if ( @sims >= $maxN ); } else { my @relevant = (); $max_expand--; # # If we are expanding, determine the set of proteins that # are equivalent to the protein that we are similar to. # # Depending on the options passed in, we filter the # equivalent proteins found. # my @maps_to = grep { $_->[0] !~ /^xxx\d+/ } $self->mapped_prot_ids( $id2 ); if ( $select =~ /^figx?$/ ) # Only fig { @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; } elsif ( $select =~ /^figx?_?pref/ ) # FIG preferred { @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; if ( ! @relevant and $id2 !~ /^xxx\d+$/) { push @sims, $sim; return @sims if ( @sims >= $maxN ); next; } } elsif ( $select =~ /^ext/i ) # Not fig { @relevant = grep { $_->[0] !~ /^fig/ } @maps_to; } else # All { @relevant = @maps_to; } # # Include the relevant sims. # foreach $x ( @relevant ) { my ( $x_id, $x_ln ) = @$x; defined( $x_ln ) || confess "x_ln id2='$id2' x_id='$x_id'"; next if ( ! $show_env && ( $x_id =~ /^fig\|9999999/ ) ); next if ( $id1 eq $x_id ) || $self->is_deleted_fid( $x_id ); defined( $maps_to[0]->[1] ) || confess "maps_to"; my $delta2 = $maps_to[0]->[1] - $x_ln; # Coordinate shift my $sim1 = [ @$sim ]; # Make a copy $sim1->[1] = $x_id; $sim1->[8] -= $delta2; $sim1->[9] -= $delta2; bless( $sim1, "Sim" ); push( @sims, $sim1 ); return @sims if ( @sims >= $maxN ); } } } return @sims; } sub get_raw_sims { my ( $self, $rep_id, $maxP, $filters ) = @_; my ( $sim_chunk, $seek, $fileN, $ln, $fh, $file, @lines, $sim ); # Set up behavior defaults (pretty wide open): my ( $show_env, $min_sim, $sim_meas, $min_q_cov, $min_s_cov, $sort_by ); if ( $filters && ref( $filters ) eq "HASH" ) { defined( $filters->{ maxP } ) and $maxP = $filters->{ maxP }; defined( $filters->{ show_env } ) and $show_env = $filters->{ show_env }; defined( $filters->{ min_sim } ) and $min_sim = $filters->{ min_sim }; defined( $filters->{ sim_meas } ) and $sim_meas = $filters->{ sim_meas }; defined( $filters->{ min_q_cov } ) and $min_q_cov = $filters->{ min_q_cov }; defined( $filters->{ min_s_cov } ) and $min_s_cov = $filters->{ min_s_cov }; defined( $filters->{ sort_by } ) and $sort_by = $filters->{ sort_by }; } defined( $maxP ) or $maxP = 10; defined( $show_env ) or $show_env = 1; defined( $min_sim ) or $min_sim = 0; defined( $sim_meas ) or $sim_meas = 'id'; defined( $min_q_cov ) or $min_q_cov = 0; defined( $min_s_cov ) or $min_s_cov = 0; defined( $sort_by ) or $sort_by = 'bits'; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' "); # Gather all of the acceptable "lines" from the sim chunks foreach $sim_chunk ( @$relational_db_response ) { ( $seek, $fileN, $ln ) = @$sim_chunk; $file = $self->N2file( $fileN ); $fh = $self->openF( $file ); $fh or confess "could not open sims for $file"; # Read file, parse lines, sanity check values, and filter E-value # 0. The query peg # 1. The similar peg # 2. The percent id # 3. Alignment length # 4. Mismatches # 5. Gap openings # 6. The start of the match in the query peg # 7. The end of the match in the query peg # 8. The start of the match in the similar peg # 9. The end of the match in the similar peg # 10. E-value # 11. Bit score # 12. Length of query peg # 13. Length of similar peg # 14. Method push @lines, grep { ( @$_ >= 15 ) && ( $_->[10] =~ /^[0-9.e-]+$/ ) && # E-value ( $_->[10] <= $maxP ) && # E-value test ( $_->[11] =~ /^[0-9.]+$/ ) && # bit score ( $_->[12] =~ /^\d+$/ ) && # query len ( $_->[13] =~ /^\d+$/ ) && # subj len ( $_->[6] =~ /^\d+$/ ) && # q-match start ( $_->[7] =~ /^\d+$/ ) && # q-match end ( $_->[8] =~ /^\d+$/ ) && # s-match start ( $_->[9] =~ /^\d+$/ ) && # s-match end ( $_->[2] =~ /^[0-9.]+$/ ) # percent id } map { [ split( /\t/, $_ ), "blastp" ] } @{ read_block( $fh, $seek, $ln-1 ) }; } # Similarity filter if ( $min_sim > 0 ) { if ( $sim_meas eq 'id' ) { @lines = grep { $_->[2] >= $min_sim } @lines; } elsif ( $sim_meas eq 'bpp' ) { @lines = grep { $_->[11] >= $min_sim * ( $_->[7] - $_->[6] + 1 ) } @lines; } } # Query coverage filter if ( $min_q_cov > 0 ) { my $thresh = 0.01 * $min_q_cov; @lines = grep { ( abs( $_->[7] - $_->[6] ) + 1 ) >= ( $thresh * $_->[12] ) } @lines; } # Subject coverage filter if ( $min_s_cov > 0 ) { my $thresh = 0.01 * $min_s_cov; @lines = grep { ( abs( $_->[9] - $_->[8] ) + 1 ) >= ( $thresh * $_->[13] ) } @lines; } # Order the surviving raw sims by requested criterion: if ( $sort_by eq 'id' ) # Percent identity { @lines = sort { $b->[2] <=> $a->[2] } @lines; } elsif ( $sort_by eq 'id2' ) # Percent identity adjusted { # Lower percent identity by 2 standard deviations to prevent random # fluctuation in short sequences from moving them up so often. my ( $p, $len, $sigma ); @lines = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { $p = 0.01 * $_->[2]; # fraction identity $len = abs( $_->[7] - $_->[6] ) + 1; # seq len $sigma = sqrt( $p * ( 1 - $p ) / $len ); # binomial sigma [ $_, $_->[2] - 200 * $sigma ] } @lines; } elsif ( $sort_by eq 'bpp' ) # Bits per position { @lines = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { [ $_, $_->[11] / abs( $_->[7] - $_->[6] ) ] } @lines; } elsif ( $sort_by eq 'bpp2' ) # Bits per position adjusted { # Lower score by 2 standard deviations to prevent random # fluctuation in short sequences from moving them up so often. my ( $bpp, $len, $sigma ); @lines = map { $_->[0] } sort { $b->[1] <=> $a->[1] } map { $len = abs( $_->[7] - $_->[6] ) + 1; # seq len $bpp = $_->[11] / $len; # bit per pos $sigma = 2.5 * sqrt( 1 / $len ); # simple estimate [ $_, $bpp - 2 * $sigma ] } @lines; } else # Bit score (bits) { @lines = sort { $b->[11] <=> $a->[11] } @lines; } # Bless the raw sims: return map { bless( $_, 'Sim' ); $_ } @lines; } sub read_block { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($fh,$seek,$ln) = @_; my($piece,$readN); seek($fh,$seek,0); my @lines = (); $readN = read($fh,$piece,$ln); ($readN == $ln) || confess "could not read the block of sims at $seek for $ln characters; $readN actually read"; return [ split( /\n/, $piece ) ]; } =head3 bbhs C<< my @bbhList = $fig->bbhs($peg); >> Return a list of the bi-directional best hits relevant to the specified PEG. =over 4 =item peg ID of the feature whose bidirectional best hits are desired. =item RETURN Returns a list of 2-tuples. The first element of the list is the best-hit PEG; the second element is the score. A lower score indicates a better match. =back =cut #: Return Type @@; sub bbhs { # ???? The cutoff is not used for anything ???? my($self,$peg,$cutoff) = @_; my($sim,$peg2,$genome2,$i,@sims2,%seen); if ($self->is_deleted_fid($peg)) { return () } $cutoff = defined($cutoff) ? $cutoff : 1.0e-10; my @bbhs = (); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT peg2, psc FROM bbh WHERE peg1 = \'$peg\' "); return sort { $a->[1] <=> $b->[1] } @{$relational_db_response}; } =head3 bbh_list C<< my $bbhHash = $fig->bbh_list($genomeID, \@featureList); >> Return a hash mapping the features in a specified list to their bidirectional best hits on a specified target genome. (Modeled after the Sprout call of the same name.) =over 4 =item genomeID ID of the genome from which the best hits should be taken. =item featureList List of the features whose best hits are desired. =item RETURN Returns a reference to a hash that maps the IDs of the incoming features to the best hits on the target genome. =back =cut #: Return Type %; sub bbh_list { my($self, $genome, $features) = @_; my $cutoff = 1.0e-10; my $out = {}; for my $feature (@$features) { my @bbhs = $self->bbhs($feature, $cutoff); my @featureList = grep { /fig\|$genome\.peg/ } map { $_->[0] } @bbhs; $out->{$feature} = \@featureList; } return $out; } =head3 dsims usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select) Returns a list of similarities for $peg such that there will be at most $maxN similarities, each similarity will have a P-score <= $maxP, and $select gives processing instructions: "raw" means that the similarities will not be expanded (by far fastest option) "fig" means return only similarities to fig genes "all" means that you want all the expanded similarities. By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant protein collection, and converting it to a set of similarities (one for each of the proteins that are essentially identical to the representative in the nr). The "dsims" or "dynamic sims" are not precomputed. They are computed using a heuristic which is much faster than blast, but misses some similarities. Essentially, you have an "index" or representative sequences, a quick blast is done against it, and if there are any hits these are used to indicate which sub-databases to blast against. =cut sub dsims { my($self,$id,$seq,$maxN,$maxP,$select) = @_; my($sim,$sub_dir,$db,$hit,@hits,%in); my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3); foreach $sim (@index) { if ($sim->id2 =~ /_(\d+)$/) { $in{$1}++; } } @hits = (); foreach $db (keys(%in)) { $sub_dir = $db % 1000; push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP)); } if (@hits == 0) { push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP)); } @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits; if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 } return expand_raw_sims( $self, \@hits, $maxP, $select ); } sub blastit { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($id,$seq,$db,$maxP) = @_; if (! $maxP) { $maxP = 1.0e-5 } my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP"); my $tmp1 = $tmp->{$id}; if ($tmp1) { return @$tmp1; } return (); } sub related_by_func_sim { my($self,$peg,$user) = @_; my($func,$sim,$id2,%related); if ($self->is_deleted_fid($peg)) { return () } if (($func = $self->function_of($peg,$user)) && (! &FIG::hypo($func))) { foreach $sim ($self->sims($peg,500,1,"fig",500)) { $id2 = $sim->id2; if ($func eq $self->function_of($id2,$user)) { $related{$id2} = 1; } } } return keys(%related); } ################################# chromosomal clusters #################################### =head3 in_cluster_with usage: @pegs = $fig->in_cluster_with($peg) Returns the set of pegs that are thought to be clustered with $peg (on the chromosome). =cut sub in_cluster_with { my($self,$peg) = @_; my($set,$id,%in); return $self->in_set_with($peg,"chromosomal_clusters","cluster_id"); } =head3 add_chromosomal_clusters usage: $fig->add_chromosomal_clusters($file) The given file is supposed to contain one predicted chromosomal cluster per line (either comma or tab separated pegs). These will be added (to the extent they are new) to those already in $FIG_Config::global/chromosomal_clusters. =cut sub add_chromosomal_clusters { my($self,$file) = @_; my($set,$added); open(TMPCLUST,"<$file") || die "aborted"; while (defined($set = <TMPCLUST>)) { print STDERR "."; chomp $set; $added += $self->add_chromosomal_cluster([split(/[\t,]+/,$set)]); } close(TMPCLUST); if ($added) { my $rdbH = $self->db_handle; $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); return 1; } return 0; } #=pod # #=head3 export_chromosomal_clusters # #usage: $fig->export_chromosomal_clusters # #Invoking this routine writes the set of chromosomal clusters as known in the #relational DB back to $FIG_Config::global/chromosomal_clusters. # #=cut # sub export_chromosomal_clusters { my($self) = @_; $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); } sub add_chromosomal_cluster { my($self,$ids) = @_; my($id,$set,%existing,%in,$new,$existing,$new_id); # print STDERR "adding cluster ",join(",",@$ids),"\n"; foreach $id (@$ids) { foreach $set ($self->in_sets($id,"chromosomal_clusters","cluster_id")) { $existing{$set} = 1; foreach $id ($self->ids_in_set($set,"chromosomal_clusters","cluster_id")) { $in{$id} = 1; } } } # print &Dumper(\%existing,\%in); $new = 0; foreach $id (@$ids) { if (! $in{$id}) { $in{$id} = 1; $new++; } } # print STDERR "$new new ids\n"; if ($new) { foreach $existing (keys(%existing)) { $self->delete_set($existing,"chromosomal_clusters","cluster_id"); } $new_id = $self->next_set("chromosomal_clusters","cluster_id"); # print STDERR "adding new cluster $new_id\n"; $self->insert_set($new_id,[keys(%in)],"chromosomal_clusters","cluster_id"); return 1; } return 0; } ################################# PCH pins #################################### =head3 in_pch_pin_with usage: $fig->in_pch_pin_with($peg) Returns the set of pegs that are believed to be "pinned" to $peg (in the sense that PCHs occur containing these pegs over significant phylogenetic distances). =cut sub in_pch_pin_with { my($self,$peg) = @_; my($set,$id,%in); return $self->in_set_with($peg,"pch_pins","pin"); } =head3 add_pch_pins usage: $fig->add_pch_pins($file) The given file is supposed to contain one set of pinned pegs per line (either comma or tab seprated pegs). These will be added (to the extent they are new) to those already in $FIG_Config::global/pch_pins. =cut sub add_pch_pins { my($self,$file) = @_; my($set,$added); open(TMPCLUST,"<$file") || die "aborted"; while (defined($set = <TMPCLUST>)) { print STDERR "."; chomp $set; my @tmp = split(/[\t,]+/,$set); if (@tmp < 200) { $added += $self->add_pch_pin([@tmp]); } } close(TMPCLUST); if ($added) { my $rdbH = $self->db_handle; $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); return 1; } return 0; } sub export_pch_pins { my($self) = @_; $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); } sub add_pch_pin { my($self,$ids) = @_; my($id,$set,%existing,%in,$new,$existing,$new_id); # print STDERR "adding cluster ",join(",",@$ids),"\n"; foreach $id (@$ids) { foreach $set ($self->in_sets($id,"pch_pins","pin")) { $existing{$set} = 1; foreach $id ($self->ids_in_set($set,"pch_pins","pin")) { $in{$id} = 1; } } } # print &Dumper(\%existing,\%in); $new = 0; foreach $id (@$ids) { if (! $in{$id}) { $in{$id} = 1; $new++; } } if ($new) { if (keys(%in) < 300) { foreach $existing (keys(%existing)) { $self->delete_set($existing,"pch_pins","pin"); } $new_id = $self->next_set("pch_pins","pin"); # print STDERR "adding new pin $new_id\n"; $self->insert_set($new_id,[keys(%in)],"pch_pins","pin"); } else { $new_id = $self->next_set("pch_pins","pin"); # print STDERR "adding new pin $new_id\n"; $self->insert_set($new_id,$ids,"pch_pins","pin"); } return 1; } return 0; } ################################# Annotations #################################### =head3 add_annotation C<< my $okFlag = $fig->add_annotation($fid, $user, $annotation, $time_made); >> Add an annotation to a feature. =over 4 =item fid ID of the feature to be annotated. =item user Name of the user making the annotation. =item annotation Text of the annotation. =item time_made (optional) Time of the annotation, in seconds since the epoch. If omitted, the current time is used. =item RETURN Returns 1 if successful, 0 if any of the parameters are invalid or an error occurs. =back =cut sub add_annotation { my($self,$feature_id,$user,$annotation, $time_made) = @_; my($genome); $time_made = time unless $time_made =~ /^\d+$/; if ($self->is_deleted_fid($feature_id)) { return 0 } # print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n"; if ($genome = $self->genome_of($feature_id)) { my $file = "$FIG_Config::organisms/$genome/annotations"; my $fileno = $self->file2N($file); my $ma = ($annotation =~ /^Set master function to/); if (open(TMP,">>$file")) { flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; seek(TMP,0,2) || confess "failed to seek to the end of the file"; # Tweaked this section for Windows compatability. The size on disk of # "\n" is not constant. my $seek1 = tell TMP; my $dataLine = "$feature_id\n$time_made\n$user\n$annotation" . ((substr($annotation,-1) eq "\n") ? "" : "\n"); print TMP $dataLine . "//\n"; close(TMP); chmod 0777, $file; my $ln = length($dataLine); my $rdbH = $self->db_handle; if ($rdbH->SQL("INSERT INTO annotation_seeks ( fid, dateof, who, ma, fileno, seek, len ) VALUES ( \'$feature_id\', $time_made, \'$user\', \'$ma\', $fileno, $seek1, $ln )")) { return 1; } } } return 0; } =head3 add_annotation_batch C<< my ($n_added, $badList) = $fig->add_annotation_batch($file); >> Install a batch of annotations. =over 4 =item file File containing annotations. =item RETURN Returns the number of annotations successfully added in $n_added. If annotations failed, they are returned in $badList as a tuple [$peg, $error_msg, $entry]. =back =cut # # This method exists because it is hugely slow to add a large number # of annotations with add_annotation (it opens and closes the annotation # file for each individual annotation, and uses individual INSERT statements # to update the database). This method batches updates to the files and creates # a load file for the database update. # # if the annotations are sorted by genome, so much the better: it will # do a single file open for the annotation file for that genome. # sub add_annotation_batch { my($self, $file) = @_; my $anno_fh = new FileHandle("<$file"); if (not $anno_fh) { confess "Cannot open $file for reading: $!\n"; } my $dbtmp = "$FIG_Config::temp/add_anno_db.$$"; my $dbfh = new FileHandle(">$dbtmp"); if (not $dbfh) { confess "Cannot write database tmpfile $dbtmp for writing: $!\n"; } local $/ = "///\n"; my $count = 0; my $last_file; my $anno_out_fh; my $errors = []; while (my $anno = <$anno_fh>) { chomp $anno; my ($feature_id, $time_made, $user, $annotation) = split(/\n/, $anno, 4); if ($feature_id eq '' or $time_made eq '' or $user eq '' or $annotation eq '') { push(@$errors, [$feature_id, "Empty fields in annotation", $anno]); next; } next if $self->is_deleted_fid($feature_id); my $genome = $self->genome_of($feature_id); if (not $genome) { push(@$errors, [$feature_id, "no genome found for fid '$feature_id'", $anno]); next; } my $file = "$FIG_Config::organisms/$genome/annotations"; my $fileno = $self->file2N($file); my $ma = ($annotation =~ /^Set master function to/) ? 1 : 0; # # if this is the first time through or if we have a new file, close and reopen. # if (not $last_file or $file ne $last_file) { close($anno_out_fh) if $anno_out_fh; chmod 0777, $last_file; print "Close $last_file, open $file\n"; $anno_out_fh = new FileHandle(">>$file"); if (not $anno_out_fh) { push(@$errors, [$feature_id, "cannot open annotation file $file: $!", $anno]); next; } $last_file = $file; flock($anno_out_fh, LOCK_EX) or confess "cannot lock assigned_functions $file: $!"; seek($anno_out_fh, 0, 2) or confess "failed to seek to the end of the file $file: $!"; } # Tweaked this section for Windows compatability. The size on disk of # "\n" is not constant. my $seek1 = tell $anno_out_fh; my $dataLine = "$feature_id\n$time_made\n$user\n$annotation" . ((substr($annotation,-1) eq "\n") ? "" : "\n"); print $anno_out_fh $dataLine . "//\n"; my $ln = length($dataLine); print $dbfh join("\t", $feature_id, $time_made, $user, $ma, $fileno, $seek1, $ln), "\n"; $count++; } close($anno_out_fh); chmod 0777, $last_file; print "Loading $count annotations into database from $dbtmp\n"; close($dbfh); my $rows = $self->db_handle()->load_table(file => $dbtmp, tbl => 'annotation_seeks'); print "Loaded $rows rows\n"; return $count, $errors; } =head3 merged_related_annotations usage: @annotations = $fig->merged_related_annotations($fids) The set of annotations of a set of PEGs ($fids) is returned as a list of 4-tuples. Each entry in the list is of the form [$fid,$timestamp,$user,$annotation]. =cut sub merged_related_annotations { my($self,$fids) = @_; my($fid); my(@ann) = (); foreach $fid (@$fids) { push(@ann,$self->feature_annotations1($fid)); } return map { $_->[1] = localtime($_->[1]); $_ } sort { $a->[1] <=> $b->[1] } @ann; } =head3 feature_annotations C<< my @annotations = $fig->feature_annotations($fid, $rawtime); >> Return a list of the specified feature's annotations. Each entry in the list returned is a 4-tuple containing the feature ID, time stamp, user ID, and annotation text. These are exactly the values needed to add the annotation using L</add_annotation>, though in a different order. =over 4 =item fid ID of the features whose annotations are to be listed. =item rawtime (optional) If TRUE, the times will be returned as PERL times (seconds since the epoch); otherwise, they will be returned as formatted time strings. =item RETURN Returns a list of 4-tuples, one per annotation. Each tuple is of the form I<($fid, $timeStamp, $user, $annotation)> where I<$fid> is the feature ID, I<$timeStamp> is the time the annotation was made, I<$user> is the name of the user who made the annotation, and I<$annotation> is the text of the annotation. =back =cut sub feature_annotations { my($self,$feature_id,$rawtime) = @_; if ($self->is_deleted_fid($feature_id)) { return () } if ($rawtime) { return $self->feature_annotations1($feature_id); } else { return map { $_->[1] = localtime($_->[1]); $_ } $self->feature_annotations1($feature_id); } } sub feature_annotations1 { my($self,$feature_id) = @_; my($tuple,$fileN,$seek,$ln,$annotation,$feature_idQ); my($file,$fh); if ($self->is_deleted_fid($feature_id)) { return () } my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM annotation_seeks WHERE fid = \'$feature_id\' "); my @annotations = (); foreach $tuple (@$relational_db_response) { ($fileN,$seek,$ln) = @$tuple; $annotation = $self->read_annotation($fileN,$seek,$ln); $feature_idQ = quotemeta $feature_id; if ($annotation =~ /^$feature_idQ\n(\d+)\n([^\n]+)\n(.*)/s) { push(@annotations,[$feature_id,$1,$2,$3]); } else { print STDERR "malformed annotation\n$annotation\n"; } } return sort { $a->[1] <=> $b->[1] } @annotations; } sub read_annotation { my($self,$fileN,$seek,$ln) = @_; my($readN,$readC); my $file = $self->N2file($fileN); if (! $file) { return "" } my $fh = $self->openF($file); if (! $fh) { confess "could not open annotations for $file"; } seek($fh,$seek,0); $readN = read($fh,$readC,$ln); my $len2 = length $readC; ($readN == $ln) || confess "could not read the block of annotations at $seek for $ln characters; $readN actually read from $file\n$readC"; return $readC; } sub epoch_to_readable { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($epoch) = @_; my($sec,$min,$hr,$dd,$mm,$yr) = localtime($epoch); $mm++; $yr += 1900; return "$mm-$dd-$yr:$hr:$min:$sec"; } =head3 parse_date usage: $date = $fig->parse_date(date-string) Parse a date string, returning seconds-since-the-epoch, or undef if the date did not parse. Accepted formats include an integer, which is assumed to be seconds-since-the-epoch an is just returned; MM/DD/YYYY; or a date that can be parsed by the routines in the Date::Parse module. =cut sub parse_date { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($date) = @_; $date or return undef; my $epoch_date; if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) { my($mm,$dd,$yyyy) = ($1,$2,$3); $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); } elsif ($date =~ /^\d+$/) { $epoch_date = $date; } elsif ($haveDateParse) { $epoch_date = str2time($date); } return $epoch_date; } # # This now calls assignments_made_full and remaps the output. # sub assignments_made { my($self,$genomes,$who,$date) = @_; my @a = $self->assignments_made_full($genomes, $who, $date); return map { [ @{$_}[0,1]] } @a; } # # Looks up and returns assignments made; return is a list of # tuples [peg, assignment, date, who] # sub assignments_made_full { my($self,$genomes,$who,$date) = @_; my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); my($epoch_date,$when,%sofar,$x); if (! defined($genomes)) { $genomes = [$self->genomes] } my %genomes = map { $_ => 1 } @$genomes; $epoch_date = $self->parse_date($date); $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; my @assignments = (); my $rdbH = $self->db_handle; if ($who eq "master") { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); } else { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); } if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid))) { if ($len < 4) { print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n"; next; } $ann = $self->read_annotation($fileno,$seek,$len); if (($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) && (($who eq $3) || (($4 eq "master ") && ($who eq "master"))) && ($2 >= $epoch_date)) { if ((! $sofar{$1}) || (($x = $sofar{$1}) && ($when > $x->[0]))) { $sofar{$1} = [$when, $5, $3]; } } } } } @assignments = map { $x = $sofar{$_}; [$_,$x->[1], $x->[0], $x->[2]] } keys(%sofar); return @assignments; } =head3 extract_assignments_from_annotations Extract a list of assignments from an annotations package as created by annotations_made_fast. Assumes that the user and date filtering was done by the annotations extraction, so all this has to do is to sort the lists of annotations by date and grab the latest one. Return value is a list of tuples [$peg, $assignment, $date, $who]. =cut sub extract_assignments_from_annotations { my($self, $annos) = @_; # # $annos is a list of pairs [$genome, $genomeannos] # $genomeannos is a hash keyed on peg. value is a list of lists [$peg, $time, $who, $anno]. # # # Sort on genome. # my @annos = sort { &FIG::by_genome_id($a->[0], $b->[0]) } @$annos; my @out; for my $gent (@annos) { my($genome, $genome_anno_list) = @$gent; # # Sort on peg id. for my $peg (sort { &FIG::by_fig_id($a, $b) } keys %$genome_anno_list) { my $anno_list = $genome_anno_list->{$peg}; # # Pull assignment annotations. # my @a = grep { $_->is_assignment() } @$anno_list; next unless @a > 0; # # and sort by date, descending. # @a = sort { $b->anno_time() <=> $a->anno_time() } @a; my $winner = $a[0]; $winner->fid() eq $peg or confess "KEY mismatch in annotations_made_fast output"; push(@out, $winner); } } return @out; } sub assignments_made_for_protein { my($self, $fid) = @_; my($relational_db_response,$entry,$fileno,$seek,$len,$ann); my($epoch_date,$when,%sofar,$x); if ($self->is_deleted_fid($fid)) { return () } my @assignments = (); my $rdbH = $self->db_handle; $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (fid = '$fid')"); if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if ($len < 4) { print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n"; next; } $ann = $self->read_annotation($fileno,$seek,$len); if (my ($peg, $when, $who, $what, $func) = $ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) { push(@assignments, [$peg, $when, $who, $what, $func]); } } } return @assignments; } =head3 annotations_made usage: @annotations = $fig->annotations_made($genomes, $who, $date) Return the list of annotations on the genomes in @$genomes made by $who after $date. Each returned annotation is of the form [$fid,$timestamp,$user,$annotation]. =cut sub annotations_made { my($self,$genomes,$who,$date) = @_; my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); my($epoch_date,$when,@annotations); if (! defined($genomes)) { $genomes = [$self->genomes] } my %genomes = map { $_ => 1 } @$genomes; if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) { my($mm,$dd,$yyyy) = ($1,$2,$3); $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); } elsif ($date =~ /^\d+$/) { $epoch_date = $date; } else { $epoch_date = 0; } $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; @annotations = (); my $rdbH = $self->db_handle; if ($who eq "master") { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); } else { $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); } if ($relational_db_response && (@$relational_db_response > 0)) { foreach $entry (@$relational_db_response) { ($fid,$when,$fileno,$seek,$len) = @$entry; if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid))) { $ann = $self->read_annotation($fileno,$seek,$len); if ($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\n(.*\S)/s) { push(@annotations,[$1,$2,$3,$4]); } } } } return @annotations; } sub annotations_made_fast { my($self, $genomes, $start_time, $end_time, $anno_by, $replace_master_with_group) = @_; if (!defined($anno_by)) { $anno_by = 'master'; } if (!defined($genomes)) { $genomes = [$self->genomes()]; } my $group = $FIG_Config::group; $group = 'FIG' unless $group; my $annos; my $pegs = {}; if ($start_time !~ /^\d+$/) { my $st = parse_date($start_time); if (!defined($st)) { confess "annotations_made_fast: unparsable start time '$start_time'"; } $start_time = $st; } if (defined($end_time)) { if ($end_time !~ /^\d+$/) { my $et = parse_date($end_time); if (!defined($et)) { confess "annotations_made_fast: unparsable end time '$end_time'"; } $end_time = $et; } } else { $end_time = time + 60; } # # We originally used a query to get the PEGs that needed to have annotations # sent. Unfortunately, this performed very poorly due to all of the resultant # seeking around in the annotations files. # # The code below just runs through all of the anno files looking for annos. # # A better way to do this would be to do a query to retrieve the genome id's for # genomes that have updates. The problem here is that the annotation_seeks # table doesn't have an explicit genome field. # # Surprisingly, to me anyway, the following query appers to run quickly, in both # postgres and mysql: # # SELECT distinct(substring(fid from 5 for position('.peg.' in fid) - 5)) # FROM annotation_seeks # WHERE dateof > some-date. # # The output of that can be parsed to get the genome id and just those # annotations files searched. # my $master_anno = $anno_by eq 'master'; for my $genome (@$genomes) { my $genome_dir = "$FIG_Config::organisms/$genome"; next unless -d $genome_dir; my $gpegs = {}; my $afh; my $afh = new FileHandle("<$genome_dir/annotations"); if ($afh) { my($fid, $anno_time, $who, $anno_text,$anno_who, @rest); while (not $afh->eof()) { chomp($fid = <$afh>); next if $fid eq "//"; chomp($anno_time = <$afh>); next if $anno_time eq "//"; chomp($who = <$afh>); next if $who eq "//"; @rest = (); while (<$afh>) { chomp; last if $_ eq "//"; push(@rest, $_); } # # Validate. # if ($fid !~ /^fig\|\d+\.\d+\.peg\.\d+$/) { #warn "Invalid fid '$fid' in annotations ($genome_dir/annotations line $.)\n"; next; } elsif ($anno_time !~ /^\d+$/) { warn "Invalid annotation time '$anno_time' in annotations ($genome_dir/annotations line $.)\n"; next; } # # Filter deleted fids. # next if $self->is_deleted_fid($fid); # # Filter on date. # next if $anno_time < $start_time or $anno_time > $end_time; my $aobj = new Annotation($fid, $anno_time, $who, @rest); if ($aobj->is_assignment()) { my $anno_who = $aobj->assignment_who(); # # Filter on annotator. # if ($anno_by eq 'all' or ($master_anno ? ($anno_who eq 'FIG' or $anno_who eq 'master') : ($who eq $anno_by))) { if ($replace_master_with_group) { $aobj->set_assignment_who($group); } } else { next; } } # # Non-assignment annotations are filtered such that: # If master annotations are requested, we take all non-assignment annotations. # Otherwise, we take only those where $who eq $anno_by. # elsif (not($master_anno or $anno_by eq 'all' or $anno_by eq $who)) { next; } # # Fall through: save this anno. Note that we do not save the final newline. # push(@{$gpegs->{$fid}}, $aobj); } # while (my $ann = <$afh>) # { # chomp $ann; # if ((($fid, $anno_time, $who, $anno_text, $anno_who) = # ($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\n(Set\s+(\S+)\s+function\s+to.*\S)/s)) and # not $self->is_deleted_fid($fid) and # $anno_time >= $start_time and # $anno_time <= $end_time and # ($anno_by eq 'all' or ($master_anno ? ($anno_who eq 'FIG' or $anno_who eq 'master') : ($who eq $anno_by)))) # { # # # # Update users list. # # # { # my $d = $self->is_deleted_fid($fid); # } # if ($replace_master_with_group) # { # $anno_text =~ s/Set master function to/Set $group function to/; # } # my $anno = [$fid, $anno_time, $who, $anno_text]; # push(@{$gpegs->{$fid}}, $anno); # } # } } push(@$annos, [$genome, $gpegs]); } return $annos; } ################################### ATTRIBUTES =head2 Attributes The attributes methods have now been rewritten for handling all kinds of attributes. The key/value pairs can be associated with a feature like a peg, rna, or prophage, or a genome. There are several base attribute methods: get_attributes add_attribute delete_attribute change_attribute There are also methods for more complex things: get_keys get_values guess_value_format By default all keys are case sensitive, and all keys have leading and trailing white space removed. Keys can not contain any of the following characters: space, tab, newline, @$!#%^&*()`~{}[]|\:;"'<>?,./ A method (clean_keys) is called for removing these characters from keys. Attributes are not on a 1:1 correlation, so a single key can have several values. Most attributes files are stored in the genome specific directories. These are in Organisms/nnnnn.n/Attributes for the organisms, and Organisms/nnnnn.n/Feaures/*/Attributes for the features. Attributes can also be stored in Global/Attributes where they will be loaded, but users are discouraged from doing this since there will be no packaging and sharing of those attibutes. Global should be reserved for those attributes that are calculated on a database-wide instance. There are several "special" files that we are using: 1. Definition files These are the raw text files stored in the appropriate locations (Organisms/nnnnn.n/Attributes, Organisms/nnnnn.n/Feaures/*/Attributes, and Global/Attributes). The files should consist of ONLY feature, key, value, and optional URL. Any other columns will be ignored and not loaded into the database. 2. Global/Attributes/attribute_keys This contains the definition of the attribute keys. There are currently 3 defined columns although others may be added and this file can contain lines of an arbitrary length. 3. Global/Attributes/transaction_log, Organisms/nnnnnn.n/Attributes/transaction_log, and Organisms/nnnnnn.n/Features/*/Attributes/transaction_log These are the transaction logs that contain any modifications to the data. In general the data is loaded from a single definition file this is not modified by the software. Any changes to the attributes are made in the Database and then written to the transaction log. The transaction log has the following columns 1. command. This can be one of ADD/DELETE/CHANGE 2. feature. The feature id to be modified 3. key. The key to be modified 4. old value. The original value of the key 5. old url. The original URL 6. new value. The new value for the key. Ignored if the key is deleted. 7. new url. The new value for the URL. Ignored if the key is deleted. Note that the old value and old url are optional. If they are not provided ALL instances of the key will be affected. Notice also that the old file assigned_attributes is no longer used. This is replaced by the transaction log. Finally, in the parsing of all files any line beginning with a pound sign is ignored as a comment. A method, read_attribute_transaction_log, is provided to read the transaction_logs and implement the changes therein. In each of the methods add_attribute, delete_attribute, and change_attribute there is an optional boolean that can be set to prevent writing of the transaction_log. The read_attribute_transaction_log reads the log and then adds/changes/deletes the records as appropriate. Without this boolean there is a circular reference. =head3 get_attributes Get attributes requires one of four keys: fid (which can be genome, peg, rna, or other id), key, value, url It will find any attribute that has the characteristics that you request, and if any values match it will return a four-ple of: [fid, key, value, url] You can request an E. coli key like this $fig->get_attributes('83333.1'); You can request any PIRSF key like this $fig->get_attributes(undef, 'PIRSF'); You can request any google url like this $fig->get_attributes(undef, undef, undef, 'http://www.google.com'); NOTE: If there are no attributes an empty array will be returned. You need to check for this and not assume that it will be undef. =cut sub feature_attributes { my $self=shift; # RAE. Since I was in a changing mood, I renamed this from feature_attributes to get_attributes (we have genomes, now too). # however I left this in here so as not to break things. return $self->get_attributes(@_); } sub get_attributes { # Esoteric question: get_attribute or get_attributes. Everything below (add/change/delete) is singular, but this does make # more sense in the plural. Oh well, I could do both. my($self,@request) = @_; my($rdbH,$relational_db_response); # clean the keys if there is one $request[1] && ($request[1] = $self->clean_attribute_key($request[1])); $rdbH = $self->db_handle; return () unless ($rdbH); # build the SQL statement depending on what we have my @col=('fid','tag','val','url'); my $select = "SELECT DISTINCT fid,tag,val,url FROM attribute "; my $first=1; for (my $i=0; $i<=3; $i++) { next if (!$request[$i]); if ($first) { $select .= "WHERE ( "; undef($first); } else { $select .= " AND "; } $select .= $col[$i] . " = \'" . $request[$i] . "\' "; } if (!$first) {$select .= " )"} return @{$rdbH->SQL($select)}; } # RAE I moved this here as it made no sense where it was. Keeping all the stuff together. # but then I went and renamed it :) sub replace_peg_key_value { my $self=shift; # RAE I deprecated this and replaced it with change_attributes. This should have no effect on existing code, # but the name is more consistent with the other routines that I am adding (add, change, delete) # we should have some verbose switch like $self->{'verbose'} to allow warning of things like this that users shouldn't see #if ($self->{'verbose'}) {print STDERR "replace_peg_key_value has been deprecated and changed to change_attributes\n"} return $self->change_attribute(@_); } # RAE Style: There are two (or more?) ways that I could have coded this. I though about making another subroutine with the SQL and file calls, # and could do so easily, but I opted for the copy paste method which makes the code longer but probably (?) clearer. =head3 add_attribute Add a new key/value pair to something. Something can be a genome id, a peg, an rna, prophage, whatever. Arguments: feature id, this can be a peg, genome, etc, key name. This is case sensitive and has the leading and trailing white space removed value optional URL to add boolean to prevent writing to the transaction log. See above =cut sub find_by_attribute { # search by substrings in attribute values or attribute tags. # This might replace the present search-for-attributes that works by # glimpse. The problem with the present approach is that you can't # search until you rebuild indices with make_attribute_index # my($self,$searchTerm)=@_; return unless( $searchTerm); my $rdbH = $self->db_handle; my $relational_db_response=$rdbH->SQL("SELECT fid,tag, val from attribute WHERE tag LIKE '%$searchTerm%' OR val LIKE '%$searchTerm%'"); my @results; foreach my $res (@$relational_db_response) { my ($fid, $tag, $value)=@$res; $tag =~ s/^\s+//; $tag =~ s/\s+$//; push (@results, [$fid, $tag, $value]); } return @results; } sub add_attribute { my($self,$peg,$k,$v, $url, $notl) = @_; return unless ($peg && $k); # we must have at least a peg and a tag to add (though other things can be undef) $k = $self->clean_attribute_key($k); my $rdbH = $self->db_handle; $rdbH->SQL("INSERT INTO attribute ( fid,tag,val,url ) VALUES ( ?,?,?,?)", undef, $peg,$k,$v,$url); my $location=$self->attribute_location($peg); &verify_dir("$location"); if (!$notl && open(TMPATTR,">>$location/transaction_log")) { print TMPATTR "ADD\t$peg\t$k\t\t\t$v\t$url\n"; close(TMPATTR); } return 1; } =head3 delete_attribute Remove a key from a feature. Arguments: feature id, this can be a peg, genome, etc, key name to delete optional value of the key to be deleted optional url of the key to be deleted don't write to transaction_log. This is usually false which means that the changes are written to the transaction log. It is set to true when parsing the transaction log. If the optional value is ommited all keys with this name for this feature will be deleted. Otherwise, only those keys with that value will be deleted. =cut sub delete_attribute { my($self,$peg,$k, $oldval, $oldurl, $notl) = @_; # we need a feature and a key to delete return unless ($peg && $k); # clean the key $k = $self->clean_attribute_key($k); # get the transaction log my $location=$self->attribute_location($peg); &verify_dir("$location"); if (!$notl && open(TMPATTR,">>$location/transaction_log")) { print TMPATTR "DELETE\t$peg\t$k\t$oldval\n"; close(TMPATTR); } return $self->change_attribute($peg, $k, $oldval, $oldurl, undef, undef); } =head3 change_attribute Change the value of a key/value pair (and optionally its url). Arguments: feature id, this can be a peg, genome, etc, key name whose value to replace value of the original key URL of the original key (optional) value to replace it with optional URL to add don't write to transaction_log. This is usually used when parsing the transaction log to avoid circular references Returns 1 on success. =cut sub change_attribute { my($self,$peg,$k,$oldval, $oldurl, $newval, $newurl, $notl) = @_; return (0) unless ($peg && $k); # we must have at least a peg and a key. $k = $self->clean_attribute_key($k); my $rdbH = $self->db_handle; # Build the delete statement "@boundValues" will be the values replacing the # parameter marks. my @boundValues = ($peg, $k); my $exc="DELETE FROM attribute WHERE fid = ? and tag = ?"; if ($oldval) { $exc .= " and val = ?"; push @boundValues, $oldval; if ($oldurl) { $exc .= " AND url = ?"; push @boundValues, $oldurl; } } $rdbH->SQL($exc, undef, @boundValues); if (defined $newval) { $exc = "INSERT INTO attribute ( fid,tag,val,url ) VALUES ( ?,?,?,? )"; $rdbH->SQL($exc, undef, $peg, $k, $newval, $newurl); # write to the transaction log if we add a new value (writing deletes is handled above) my $location = $self->attribute_location($peg); &verify_dir("$location"); if (!$notl && open(TMPATTR,">>$location/transaction_log")) { print TMPATTR "CHANGE\t$peg\t$oldval\t$oldurl\t$newval\t$newurl\n"; close(TMPATTR); } } return 1; } =head3 clean_attribute_key() use $key=$fig->clean_attribute_key($key) Keys for attributes are used as filenames in the code, and there are limitations on the characters that can be used in the key name. We provide an extended explanation of each key, so the key does not necessarily need to be person-readable. Keys are not allowed to contain any non-word character (i.e. they must only contain [a-zA-Z0-9] and _ This method will remove these. =cut sub clean_attribute_key { my ($self, $key)=@_; #$key =~ s/[\s\n\t\$\@\/\\\Q!#%^&*()`~{}[]|:;"'<>?,.\E]//g; # the \Q .. \E just allows not escaping all the intermediate metacharacters $key =~ s/\W//g; return $key; } =head3 split_attribute_oid() use my ($genome, $type, $id)=split_attribute_feature($id); splits an id into genome, type, and id if it is a feature, or just genome and '', '' if it is a genome, and just the id and undef undef if it is not known =cut sub split_attribute_oid { my($self, $id)=@_; if ($id =~ /^\d+\.\d+$/) { # it appears to be a genome id return ($id, "", ""); } elsif ($id =~ /^fig\|(\d+\.\d+)\.(\w+)\.(\d+)/) { # it appears to be a feature return ($1, $2, $3); } else { # not sure what it is return ($id, undef, undef); } } =head3 read_attribute_transaction_log use: $fig->read_attribute_transaction_log($logfile); This method reads the transaction_log described in $logfile and enacts the changes described therein. The changes must be one of add, delete, or change. =cut sub read_attribute_transaction_log { my ($self, $file)=@_; return unless (-e $file); open(IN, $file) || die "Can't open $file"; while (<IN>) { next if (/^\s*\#/); chomp; my @line=split /\t/; my $type=shift @line; if (uc($type) eq "DELETE") { $line[4]=1; $self->delete_attribute(@line); } elsif (uc($type) eq "ADD") { $line[4]=1; $self->add_attribute(@line); } elsif (uc($type) eq "CHANGE") { $line[7]=1; $self->change_attribute(@line); } else { print STDERR "Do not understand this line from $file. It doesn't appear to be a transaction record:\n$_\n"; } } } =head3 erase_attribute_entirely This method will remove any notion of the attribute that you give it. It is different from delete as that just removes a single attribute associated with a peg. This will remove the files and uninstall the attributes from the database so there is no memory of that type of attribute. All of the attribute files are moved to FIG_Tmp/Attributes/deleted_attributes, and so you can recover the data for a while. Still, you should probably use this carefully! I use this to clean out old PIR superfamily attributes immediately before installing the new correspondence table. e.g. my $status=$fig->erase_attribute_entirely("pirsf"); This will return the number of files that were moved to the new location =cut sub erase_attribute_entirely { my ($self, $attr)=@_; return 0 unless ($attr); my %path_to_files; # this hash has the path as the key and the genome id as the value # get all the tags we know about my $tags=$self->get_keys(); foreach my $type (keys %$tags) { foreach my $label (keys %{$tags->{$type}}) { next unless ($label eq $attr); foreach my $peg (@{$tags->{$type}->{$label}}) { # delete the attribute from the database my $rdbH = $self->db_handle; $rdbH->SQL("DELETE FROM attribute WHERE fid = \'$peg\' and tag = \'$label\'"); # make a hash of all files that we should delete, and then when we are done we can go move them $path_to_files{$self->attribute_location($peg)}=$self->genome_of($peg); } } } # now we need to check that we have the files to delete # we are going to see if there are files to delete, and then we will make temp dirs and move them. If there are no files # to do we don't need to make the dir my @files; foreach my $path (keys %path_to_files) { if (-e "$path/$attr") {push @files, $path} } return 1 unless (scalar @files); # don't continue if there are no files to move `mkdir -p $FIG_Config::temp/Attributes/deleted_attributes`, 0755 unless (-e "$FIG_Config::temp/Attributes/deleted_attributes"); unless (-e "$FIG_Config::temp/Attributes/deleted_attributes") { die "directory structure not constructed. wanted:\n$FIG_Config::temp/Attributes/deleted_attributes\nCan't proceed"; } foreach my $path (@files) { my $genome=$path_to_files{$path}; unless ($genome) {$genome='unknown'} my $dest="$FIG_Config::temp/Attributes/deleted_attributes/$genome"; mkdir "$FIG_Config::temp/Attributes/deleted_attributes/$genome", 0755 unless (-e "$FIG_Config::temp/Attributes/deleted_attributes/$genome"); $dest .= "/".$attr; if (-e $dest) { # don't overwrite the file my $count=1; while (-e "$dest.$count") {$count++} $dest .= ".$count"; } system("mv $path/$attr $dest"); } return scalar @files; } =head3 get_keys Get all the keys that we know about. Without any arguments: Returns a reference to a hash, where the key is the type of feature (peg, genome, rna, prophage, etc), and the value is a reference to a hash where the key is the key name and the value is a reference to an array of all features with that id. e.g. print "There are " , scalar @{{$fig->get_keys}->{'peg'}->{'PIRSF'}}, " PIRSF keys in the database\n"; my $keys=$fig->get_keys; foreach my $type (keys %$keys) { foreach my $label (keys %{$keys->{$type}}) { foreach my $peg (@{$keys->{$type}->{$label}}) { .. do something to each peg and genome here } } } With an argument (that should be a recognized type like peg, rna, genome, etc): Returns a reference to a hash where the key is the key name and the value is the reference to the array. This should use less memory than above. The argument should be (currently) peg, rna, pp, genome, or any other recognized feature type (generally defined as the .peg. in the fid). The default is to return all keys, and this can also be specified with all =cut sub get_tags { my $self=shift @_; # deprecated method replaced with get_keys return $self->get_keys(@_); } sub get_keys { my($self, $want)=@_; unless ($want) {$want = "all"} my $rdbH = $self->db_handle; my $relational_db_response=$rdbH->SQL("SELECT fid,tag from attribute"); my $tags; foreach my $res (@$relational_db_response) { my ($fid, $tag)=@$res; $tag =~ s/^\s+//; $tag =~ s/\s+$//; my $type=$self->ftype($fid); if ($type && ($want eq $type || $want eq "all")) { push (@{$tags->{$type}->{$tag}}, $fid); } elsif (($fid =~ /^\d+\.\d+$/) && (lc($want) eq "genome" || $want eq "all")) { push (@{$tags->{'genome'}->{$tag}}, $fid); } } if ($want eq "all") {return $tags} else {return $tags->{$want}} } =head2 get_all_keys() Just get all the keys and return them. No processing involved. =cut sub get_all_keys { my($self)=@_; my $rdbH = $self->db_handle; return @{$rdbH->SQL("SELECT fid,tag from attribute")}; } =head3 get_values Get all the values that we know about Without any arguments: Returns a reference to a hash, where the key is the type of feature (peg, genome, rna, prophage, etc), and the value is a reference to a hash where the key is the value and the value is the number of occurences e.g. print "There are " , {$fig->get_values}->{'peg'}->{'100'}, " keys with the value 100 in the database\n"; With a single argument: The argument is assumed to be the type (rna, peg, genome, etc). With two arguments: The first argument is the type (rna, peg, genome, etc), and the second argument is the key. In each case it will return a reference to a hash. E.g. $fig->get_values(); # will get all values $fig->get_values('peg'); # will get all values for pegs $fig->get_values('peg', 'pirsf'); # will get all values for pegs with attribute pirsf $fig->get_values(undef, 'pirsf'); # will get all values for anything with that attribute =cut sub get_values { my ($self, $want, $tag)=@_; unless ($want) {$want="all"} my $rdbH = $self->db_handle; $tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag); my $sql="SELECT fid,val FROM attribute"; if ($tag) {$sql .= " WHERE tag = \'$tag\'"} my $relational_db_response=$rdbH->SQL($sql); my $tags; foreach my $res (@$relational_db_response) { my ($fid, $val)=@$res; my $type=$self->ftype($fid); if ($type && ($want eq $type || $want eq "all")) { $tags->{$type}->{$val}++; } elsif (($fid =~ /^\d+\.\d+$/) && (lc($want) eq "genome" || $want eq "all")) { $tags->{'genome'}->{$val}++; } } if ($want eq "all") {return $tags} else {return $tags->{$want}} } =head3 key_info Access a hash of key information. The data that are returned are: hash key name what is it data type single [boolean] description Explanation of key [free text] readonly whether to allow read/write [boolean] is_cv attribute is a cv term [boolean] Single is a boolean, if it is true only the last value returned should be used. Note that the other methods willl still return all the values, it is upto the implementer to ensure that only the last value is used. Explanation is a user-derived explanation that can be free text If a reference to a hash is provided, along with the key, those values will be set to the attribute_keys file Returns an empty hash if the key is not provieded or doesn't exist e.g. $fig->key_info($key, \%data); # set the data $data=$fig->key_info($key); # get the data =cut sub key_info { my ($self, $key, $data)=@_; return {} unless ($key); $key = $self->clean_attribute_key($key); return $self->{'key_info'}->{$key} if (!$data && $self->{'key_info'}->{$key}); my $version=1; my $attr; if (-e "$FIG_Config::global/Attributes/attribute_keys") { open(IN, "$FIG_Config::global/Attributes/attribute_keys") || die "Can't open $FIG_Config::global/Attributes/attribute_keys although it exists"; while (<IN>) { if (/^\#\s*Version\s*(\d+)/) {$version=$1} next if (/^\s*\#/); chomp; next unless ($_); my @a=split /\t/; # fix old versions of attribute_keys if ($version==1) {$attr->{$a[0]}->{'single'}=$a[1]; $attr->{$a[0]}->{'description'}=$a[2]; next} $attr->{$a[0]}->{$a[1]}=$a[2]; } close IN; } if ($data) { $attr->{$key}=$data; open(OUT, ">$FIG_Config::global/Attributes/attribute_keys") || die "Can't open $FIG_Config::global/Attributes/attribute_keys for writing"; print OUT "# Version 2\n# This file contains information about the attribute keys in this database. The columns are:\n"; print OUT "# attribute key\n# tag associated for that key\n# value of that tag\n"; print OUT "# Each attribute key can have as many of these as you want. The last one in the file will be used. This is used to store data applicable to\n"; print OUT "# every key in the attributes\n"; #whatever this does, it wasn't quite right, so we wrote in enlgish, below. #map {my $k=$_; map {print OUT "$k\t$_\t", $attr->{$k}->{$_}, "\n"} keys %{$attr->{$k}}} keys %$attr; foreach my $keyName (keys %$attr) { foreach my $attrName (keys %{$attr->{$keyName}} ) { print OUT "$keyName\t$attrName\t$attr->{$keyName}->{$attrName}\n"; } } close OUT; } $self->{'key_info'}=$attr; if (exists $attr->{$key}) {return $attr->{$key}} else {return {}} } =head3 get_key_value Given a key and a value will return anything that has both E.g. my @nonmotile_genomes = $fig->get_key_value('motile', 'non-motile'); my @bluepegs = $fig->get_key_value('color', 'blue'); If either the key or the value is ommitted will return all the matching sets. =cut sub get_tag_value { # deprecated to get_key_value my $self=shift @_; return $self->get_key_value(@_); } sub get_key_value { my ($self, $key, $val) = @_; $key = $self->clean_attribute_key($key); my $sql="SELECT fid,tag,val FROM attribute"; # AT THE MOMENT THIS IS NOT WORKING. Not sure why # so I am going to do it in two stages. #if ($key && $val) {$sql .= " WHERE ( tag = \'$key\' and val = \'$val\' )"} if ($key && $val) {$sql .= " WHERE (tag = \'$key\' and val = \'$val\')"} elsif ($key) {$sql .= " WHERE tag = \'".$key."\'"} elsif ($val) {$sql .= " WHERE val = \'$val\'"} else { # neither key nor value requested warn("neither key nor value sent to get_key_value"); return(); } my $rdbH = $self->db_handle; my $relational_db_response=$rdbH->SQL($sql); my @results; map {push @results, $_->[0]} @$relational_db_response; return @results; } =head3 guess_value_format There are occassions where I want to know what a value is for a key. I have three scenarios right now: 1. strings 2. numbers 3. percentiles ( a type of number, I know) In these cases, I may want to know something about them and do something interesting with them. This will try and guess what the values are for a given key so that you can try and limit what people add. At the moment this is pure guess work, although I suppose we could put some restrictions on t/v pairs I don't feel like. This method will return a reference to an array. If the element is a string there will only be one element in that array, the word "string". If the value is a number, there will be three elements, the word "float" in position 0, and then the minimum and maximum values. You can figure out if it is a percent :-) =cut sub guess_value_format { my ($self, $tag)=@_; return unless ($tag); # I am using self->{'value_format'} to save the format so if this is called multiple times it is not recalculated each time return $self->{'value_format'}->{$tag} if (defined $self->{'value_format'}->{$tag}); my $hash = $self->get_values(undef, $tag); return if (!$hash || !scalar keys %$hash); # don't carry on if there is nothing to look at my ($min, $max)=(100000000, 0); foreach my $type (keys %$hash) { foreach my $val (keys %{$hash->{$type}}) { next unless ($val); # this code is taken from the perl cookbook pg 44 # it should detect for all nummbers if ($val !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) { undef $min; undef $max; last; } else { if ($val > $max) {$max=$val} if ($val < $min) {$min=$val} } } } # if $min and $max are defined then the value is a number # if not, then it is a string; if (defined $min && defined $max) {$self->{'value_format'}->{$tag} = ["float", $min, $max]} else {$self->{'value_format'}->{$tag}=["string"]} return $self->{'value_format'}->{$tag}; } =head3 attribute_location This is just an internal method to find the appropriate location of the attributes file depending on whether it is a peg, an rna, or a genome or whatever. =cut sub attribute_location { my ($self, $peg)=@_; return unless ($peg); my $type=$self->ftype($peg); # we need to know whether it is a peg, prophage, rna, etc my $location; if ($type) { my $genome = &genome_of($peg); $location="$FIG_Config::organisms/$genome/Features/$type/Attributes"; } elsif ($peg =~ /^\d+\.\d+$/ && (-e "$FIG_Config::organisms/$peg")) { # $peg is a genome number and we know about it #$location="$FIG_Config::organisms/$peg/Attributes"; # we want to put things in global again $location="$FIG_Config::global/Attributes/"; } else { print STDERR "Can't figure out what $peg is. It is neither a known feature or a genome id. Added to $FIG_Config::global/Attributes/\n"; $location="$FIG_Config::global/Attributes/"; } return $location; } ################################# Indexing Features and Functional Roles #################################### =head3 search_index usage: ($pegs,$roles) = $fig->search_pattern($pattern) All pegs that "match" $pattern are put into a list, and $pegs will be a pointer to that list. All roles that "match" $pattern are put into a list, and $roles will be a pointer to that list. The notion of "match $pattern" is intentionally left undefined. For now, you will probably get only entries in which each word id $pattern occurs exactly, but that is not a long term commitment. =cut sub search_index { my($self,$pattern, $non_word_search) = @_; my($patternQ,@raw,@pegs,@roles); &clean_tmp; $patternQ = $pattern; $patternQ =~ s/\s+/;/g; $patternQ =~ s/\./\\./g; my $glimpse_args = "-y -H $FIG_Config::data/Indexes -i"; $glimpse_args .= " -w" unless $non_word_search; $glimpse_args .= " \'$patternQ\'"; # print STDERR "pattern=$pattern patternQ=$patternQ\n"; # warn "args: $glimpse_args\n"; @raw = `$FIG_Config::ext_bin/glimpse $glimpse_args`; @pegs = grep { ! $self->is_deleted_fid($_->[0]) } sort { &FIG::by_fig_id($a->[0],$b->[0]) } map { $_ =~ /^\S+:\s+(\S.*\S)/; [split(/\t/,$1)] } #grep { $_ =~ /^\S+peg\.index/ } @raw; grep { $_ =~ /^\S+(peg|attribute).index/ } @raw; my @rolesT = grep { $_ =~ /^\S+role.index/ } @raw; my %roles = map { $_ =~ /^\S+:\s+(\S.*\S)/; $1 => 1;} @rolesT; my @roles = keys(%roles); return ([@pegs],[@roles]); } ################################# Loading Databases #################################### =head3 load_all_list C<< my @packages = FIG::load_all_list(); >> Return a list of the commands to be executed in order to load the SEED database. =cut sub load_all_list { my @packages = qw(load_peg_mapping index_contigs compute_genome_counts load_features index_sims index_translations add_assertions_of_function load_protein_families load_external_orgs load_chromosomal_clusters load_pch_pins index_neighborhoods index_annotations load_ec_names init_maps load_kegg load_distances make_indexes format_peg_dbs load_links index_subsystems load_attributes load_bbhs load_literature load_coupling ); push(@packages, "pegs_in_conflict | peg_to_subsystems > $FIG_Config::global/conflicted.pegs"); return @packages; } #=pod # #=head3 load_all # #usage: load_all # #This function is supposed to reload all entries into the database and do #whatever is required to properly support indexing of pegs and roles. # #=cut sub load_all { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($skip_to) = @_; my $start; my $just_list; # # If skip_to is numeric, start with that package. # # If it is the string "list", list the packages with their numbers. # if ($skip_to eq "list") { $just_list = 1; } elsif ($skip_to =~ /^\d+$/) { $start = $skip_to - 1; } else { $start = 0; } Trace("Loading SEED data.") if T(2); my @packages = load_all_list; my $pn = @packages; for my $i ($start..@packages - 1) { my $i1 = $i + 1; my $pkg = $packages[$i]; my $date = `date`; chomp $date; print "$date: Running $pkg ($i1 of $pn)\n"; if (!$just_list) { &run($pkg); } } print "\n\nLoad complete.\n\n"; } ################################# Automated Assignments #################################### =head3 auto_assign usage: $assignment = &FIG::auto_assign($peg,$seq) This returns an automated assignment for $peg. $seq is optional; if it is not present, then it is assumed that similarities already exist for $peg. $assignment is set to either Function or Function\tW if it is felt that the assertion is pretty weak. =cut sub auto_assign { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($peg,$seq) = @_; my $cmd = $seq ? "echo \"$peg\t$seq\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls" : "echo \"$peg\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls"; # print STDERR $cmd; my(@tmp) = `$cmd`; if ((@tmp == 1) && ($tmp[0] =~ /^\S+\t(\S.*\S)/)) { return $1; } else { return "hypothetical protein"; } } ################################# Protein Families #################################### =head2 Protein Families In the protein families we have our own concept of an id that I have called an cid. This is entirely internal and does not map to any known database except our own, however it is used to store the correspondence between different protein families. Therefore, to find out what family any protein is in you need to convert that protein to an cid. You can start with a KEGG, COG, TIGR, SP, GI, or FIG id, and get an cid back. From there, you can find out what other proteins that cid maps to, and what families that protein is also in. =head3 all_protein_families usage: @all = $fig->all_protein_families Returns a list of the ids of all of the protein families currently defined. =cut sub all_protein_families { my($self) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localfam_function') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT family FROM localfam_function")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } =head3 families_for_protein use: @families = $fig->families_for_protein($peg) This takes any type of protein identifier (e.g. fig, swissprot, kegg, etc) and returns an array of families that this protein is in. Notes: 1. This is really a wrapper to prot_to_cid and then in_family, but this is what you will likely use most of the time =cut sub families_for_protein { my($self, $peg)=@_; return unless ($peg); my $cid=$self->prot_to_cid($peg); return unless ($cid); return $self->in_family($cid); } =head3 proteins_in_family use @proteins = $fig->proteins_in_family($family); This takes a family id as an argument and returns every protein id known to be in that family. Note: this is also really a wrapper around a couple of other methods, but they will always be called together so this makes it a little cleaner. =cut sub proteins_in_family { my($self, $family)=@_; return unless ($family); my @prots; foreach my $cid ($self->ids_in_family($family)) { push @prots, $self->cid_to_prots($cid); } # note that some proteins may be duplicated, so we flatten this array and return only those things that are unique my %seen; # only return the first occurence of anyting. return grep {!$seen{$_}++} @prots; } =head3 family_function usage: $func = $fig->family_function($family) Returns the putative function of all of the pegs in $family. Remember, we are defining "protein family" as a set of homologous proteins that have the same function. =cut sub family_function { my($self,$family) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localfam_function') && ($relational_db_response = $rdbH->SQL("SELECT function from localfam_function WHERE family = '$family'")) && (@$relational_db_response >= 1)) { return $relational_db_response->[0]->[0]; } return ""; } =head3 sz_family usage: $n = $fig->sz_family($family) Returns the number of cids in $family. =cut sub sz_family { my($self,$family) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localfam_function') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT cid from localfam_cid WHERE family = '$family'"))) { return scalar @$relational_db_response; } return 0; } =head3 ext_sz_family usage: $n = $fig->ext_sz_family($family) Returns the number of external IDs in $family. =cut sub ext_sz_family { my($self,$family) = @_; my @proteins=$self->ext_ids_in_family($family); return scalar(@proteins); } =head3 all_cids usage: @all_cids=$fig->all_cids(); Returns a list of all the ids we know about. =cut sub all_cids { my($self) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localfam_cid') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM localfam_cid")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } =head3 ids_in_family usage: @pegs = $fig->ids_in_family($family) Returns a list of the cids in $family. =cut sub ids_in_family { my($self,$family) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localfam_function') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT cid from localfam_cid WHERE family = '$family'")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } =head3 in_family usage: @families = $fig->in_family($cid) Returns an array containing the families containing an cid. =cut sub in_family { my($self,$cid) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localfam_function') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT family from localfam_cid WHERE cid = $cid"))) { my %seen; # only return the first occurence of anyting. return grep {!$seen{$_}++} map { $_->[0] } @$relational_db_response; } return (); } =head3 ext_ids_in_family usage: @exts = $fig->ext_ids_in_family($family) Returns a list of the external ids in an external family name. =cut sub ext_ids_in_family { my($self,$family) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localid_map') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT localid from localid_map WHERE family = '$family'")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } =head3 ext_in_family usage: @ext_families = $fig->in_family($id) Returns an array containing the external families containing an id. The ID is the one from the original database (e.g. npfam|PB129746) =cut sub ext_family_for_id { my($self,$id) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localid_map') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT family from localid_map WHERE localid = '$id'"))) { my %seen; # only return the first occurence of anyting. return grep {!$seen{$_}++} map { $_->[0] } @$relational_db_response; } return (); } =head3 prot_to_cid Convert a protein to a global ID my $cid=$fig->prot_to_cid($proteinid) $proteinid can be a FIG ID, a SP, tigr, or one of many other IDs returns "" if not known =cut sub prot_to_cid { my($self,$prot) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localid_cid') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT cid from localid_cid WHERE localid = '$prot'")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return ""; } =head3 cid_to_prots Convert an internal ID to the proteins that map to that ID. my @proteins=$fig->cid_to_prots($cid); =cut sub cid_to_prots { my($self,$cid) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if ($rdbH->table_exists('localid_cid') && ($relational_db_response = $rdbH->SQL("SELECT DISTINCT localid from localid_cid WHERE cid = $cid")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } ################################# Abstract Set Routines #################################### =head2 Abstract Set Routines =cut sub all_sets { my($self,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT $set_name FROM $relation")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } sub next_set { my($self,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT MAX($set_name) FROM $relation")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0] + 1; } } sub ids_in_set { my($self,$which,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (defined($which) && ($which =~ /^\d+$/)) { if (($relational_db_response = $rdbH->SQL("SELECT id FROM $relation WHERE ( $set_name = $which)")) && (@$relational_db_response >= 1)) { return grep { ! $self->is_deleted_fid($_) } sort { by_fig_id($a,$b) } map { $_->[0] } @$relational_db_response; } } return (); } sub in_sets { my($self,$id,$relation,$set_name) = @_; my($relational_db_response); if ($self->is_deleted_fid($id)) { return () } my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT $set_name FROM $relation WHERE ( id = \'$id\' )")) && (@$relational_db_response >= 1)) { return map { $_->[0] } @$relational_db_response; } return (); } sub sz_set { my($self,$which,$relation,$set_name) = @_; my($relational_db_response); my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT COUNT(*) FROM $relation WHERE ( $set_name = $which)")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return 0; } sub delete_set { my($self,$set,$relation,$set_name) = @_; # print STDERR "deleting set $set\n"; my $rdbH = $self->db_handle; return $rdbH->SQL("DELETE FROM $relation WHERE ( $set_name = $set )"); } sub insert_set { my($self,$set,$ids,$relation,$set_name) = @_; my($id); # print STDERR "inserting set $set containing ",join(",",@$ids),"\n"; my $rdbH = $self->db_handle; my @ids = grep { length($_) < 255 } @$ids; if (@ids < 2) { return 0 } my $rc = 1; foreach $id (@ids) { next if ($self->is_deleted_fid($id)); if (! $rdbH->SQL("INSERT INTO $relation ( $set_name,id ) VALUES ( $set,\'$id\' )")) { $rc = 0; } } # print STDERR " rc=$rc\n"; return $rc; } sub in_set_with { my($self,$peg,$relation,$set_name) = @_; my($set,$id,%in); foreach $set ($self->in_sets($peg,$relation,$set_name)) { foreach $id ($self->ids_in_set($set,$relation,$set_name)) { $in{$id} = 1; } } return sort { &by_fig_id($a,$b) } keys(%in); } sub export_set { my($self,$relation,$set_name,$file) = @_; my($pair); my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT $set_name, id FROM $relation"); open(TMPSET,">$file") || die "could not open $file"; flock(TMPSET,LOCK_EX) || confess "cannot lock $file"; seek(TMPSET,0,2) || confess "failed to seek to the end of the file"; foreach $pair (sort { ($a->[0] <=> $b->[0]) or &by_fig_id($a->[1],$b->[1]) } @$relational_db_response) { if (! $self->is_deleted_fid($pair->[1])) { print TMPSET join("\t",@$pair),"\n"; } } close(TMPSET); return 1; } ################################# KEGG Stuff #################################### =head2 KEGG methods =head3 all_compounds C<< my @compounds = $fig->all_compounds(); >> Return a list containing all of the KEGG compounds. =cut sub all_compounds { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM comp_name"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =head3 names_of_compound C<< my @names = $fig->names_of_compound($cid); >> Returns a list containing all of the names assigned to the specified KEGG compound. The list will be ordered as given by KEGG. =over 4 =item cid ID of the desired compound. =item RETURN Returns a list of names for the specified compound. =back =cut sub names_of_compound { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT pos,name FROM comp_name where cid = \'$cid\'"); if (@$relational_db_response > 0) { return map { $_->[1] } sort { $a->[0] <=> $b->[0] } @$relational_db_response; } return (); } =head3 comp2react C<< my @rids = $fig->comp2react($cid); >> Returns a list containing all of the reaction IDs for reactions that take $cid as either a substrate or a product. =cut sub comp2react { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound where cid = \'$cid\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =head3 valid_reaction_id C<< my $flag = $fig->valid_reaction_id($rid); >> Returns true iff the specified ID is a valid reaction ID. This will become important as we include non-KEGG reactions =over 4 =item rid Reaction ID to test. =item RETURN Returns TRUE if the reaction ID is in the data store, else FALSE. =back =cut sub valid_reaction_id { my($self,$rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound WHERE rid = '$rid'"); return (@$relational_db_response > 0); } =head3 cas C<< my $cas = $fig->cas($cid); >> Return the Chemical Abstract Service (CAS) ID for the compound, if known. =over 4 =item cid ID of the compound whose CAS ID is desired. =item RETURN Returns the CAS ID of the specified compound, or an empty string if the CAS ID is not known or does not exist. =back =cut sub cas { my($self,$cid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cas FROM comp_cas where cid = \'$cid\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } =head3 cas_to_cid C<< my $cid = $fig->cas_to_cid($cas); >> Return the compound id (cid), given the Chemical Abstract Service (CAS) ID. =over 4 =item cas CAS ID of the desired compound. =item RETURN Returns the ID of the compound corresponding to the specified CAS ID, or an empty string if the CAS ID is not in the data store. =back =cut sub cas_to_cid { my($self,$cas) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cid FROM comp_cas where cas = \'$cas\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } =head3 all_reactions << my @rids = $fig->all_reactions(); >> Return a list containing all of the KEGG reaction IDs. =cut sub all_reactions { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT rid FROM reaction_to_compound"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =head3 reversible C<< my $flag = $fig->reversible($rid); >> Return TRUE if the specified reaction is reversible. A reversible reaction has no main direction. The connector is symbolized by C<< <=> >> instead of C<< => >>. =over 4 =item rid ID of the ralevant reaction. =item RETURN Returns TRUE if the specified reaction is reversible, else FALSE. If the reaction does not exist, returns TRUE. =back =cut sub reversible { my ($self, $rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT reversible FROM reversible where rid = \'$rid\'"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return 1; } =head3 reaction2comp C<< my @tuples = $fig->reaction2comp($rid, $which); >> Return the substrates or products for a reaction. In any event (i.e., whether you ask for substrates or products), you get back a list of 3-tuples. Each 3-tuple will contain [$cid,$stoich,$main] Stoichiometry indicates how many copies of the compound participate in the reaction. It is normally numeric, but can be things like "n" or "(n+1)". $main is 1 iff the compound is considered "main" or "connectable". =over 4 =item rid ID of the raction whose compounds are desired. =item which TRUE if the products (right side) should be returned, FALSE if the substrates (left side) should be returned. =item RETURN Returns a list of 3-tuples. Each tuple contains the ID of a compound, its stoichiometry, and a flag that is TRUE if the compound is one of the main participants in the reaction. =back =cut sub reaction2comp { my($self,$rid,$which) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT cid,stoich,main FROM reaction_to_compound where rid = \'$rid\' and setn = \'$which\'"); if (@$relational_db_response > 0) { return sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/\s+//g; $_ } @$relational_db_response; } return (); } =head3 catalyzed_by C<< my @ecs = $fig->catalyzed_by($rid); >> Return the ECs (roles) that are reputed to catalyze the reaction. Note that we are currently just returning the ECs that KEGG gives. We need to handle the incompletely specified forms (e.g., 1.1.1.-), but we do not do it yet. =over 4 =item rid ID of the reaction whose catalyzing roles are desired. =item RETURN Returns the IDs of the roles that catalyze the reaction. =back =cut sub catalyzed_by { my($self,$rid) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT role FROM reaction_to_enzyme where rid = \'$rid\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =head3 catalyzes usage: @ecs = $fig->catalyzes($role) Returns the rids of the reactions catalyzed by the "role" (normally an EC). =cut sub catalyzes { my($self,$role) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_enzyme where role = \'$role\'"); if (@$relational_db_response > 0) { return sort map { $_->[0] } @$relational_db_response; } return (); } =head3 displayable_reaction usage: $display_format = $fig->displayable_reaction($rid) Returns a string giving the displayable version of a reaction. =cut sub displayable_reaction { my($self,$rid) = @_; my @tmp = `grep $rid $FIG_Config::data/KEGG/reaction_name.lst`; if (@tmp > 0) { chomp $tmp[0]; return $tmp[0]; } return $rid; } =head3 all_maps usage: @maps = $fig->all_maps Returns a list containing all of the KEGG maps that the system knows about (the maps need to be periodically updated). =cut sub all_maps { my($self) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT DISTINCT map FROM ec_map "); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =head3 ec_to_maps usage: @maps = $fig->ec_to_maps($ec) Returns the set of maps that contain $ec as a functional role. $ec is usually an EC number, but in the more general case, it can be a functional role. =cut sub ec_to_maps { my($self,$ec) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT map FROM ec_map WHERE ( ec = \'$ec\' )"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } sub role_to_maps { my($self, $role) = @_; return $self->ec_to_maps($role); } =head3 map_to_ecs usage: @ecs = $fig->map_to_ecs($map) Returns the set of functional roles (usually ECs) that are contained in the functionality depicted by $map. =cut sub map_to_ecs { my($self,$map) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT ec FROM ec_map WHERE ( map = \'$map\' )"); if (@$relational_db_response > 0) { return map { $_->[0] } @$relational_db_response; } return (); } =head3 map_name usage: $name = $fig->map_name($map) Returns the descriptive name covering the functionality depicted by $map. =cut sub map_name { my($self,$map) = @_; my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT mapname FROM map_name WHERE ( map = \'$map\' )"); if (@$relational_db_response == 1) { return $relational_db_response->[0]->[0]; } return ""; } ################################# Functional Roles #################################### =head2 Functional Roles =head3 neighborhood_of_role usage: @roles = $fig->neighborhood_of_role($role) Returns a list of functional roles that we consider to be "the neighborhood" of $role. =cut sub neighborhood_of_role { my($self,$role) = @_; my($readC); my $file = "$FIG_Config::global/role.neighborhoods"; my $rdbH = $self->db_handle; my $roleQ = quotemeta $role; my $relational_db_response = $rdbH->SQL("SELECT seek, len FROM neigh_seeks WHERE role = \'$roleQ\' "); if (@$relational_db_response == 1) { my($seek,$ln) = @{$relational_db_response->[0]}; my $fh = $self->openF($file); seek($fh,$seek,0); my $readN = read($fh,$readC,$ln-1); ($readN == ($ln-1)) || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC"; return grep { $_ && ($_ !~ /^\/\//) } split(/\n/,$readC); } return (); } =head3 roles_of_function C<< my @roles = $fig->roles_of_function($func); >> Returns a list of the functional roles implemented by the specified function. This method parses the role data out of the function name, and does not require access to the database. =over 4 =item func Name of the function whose roles are to be parsed out. =item RETURN Returns a list of the roles performed by the specified function. =back =cut sub roles_of_function { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $func = (@_ == 1) ? $_[0] : $_[1]; $func =~ s/\!.*$//; my %roles = map { $_ => 1 } (split(/\s*;\s+|\s+[\@\/]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g),$func); return keys(%roles); } =head3 seqs_with_role usage: @pegs = $fig->seqs_with_role($role,$who) Returns a list of the pegs that implement $role. If $who is not given, it defaults to "master". The system returns all pegs with an assignment made by either "master" or $who (if it is different than the master) that implement $role. Note that this includes pegs for which the "master" annotation disagrees with that of $who, the master's implements $role, and $who's does not. =cut sub seqs_with_role { my($self,$role,$who,$genome) = @_; my($relational_db_response,$query); my $roleQ = quotemeta $role; $who = $who ? $who : "master"; my $rdbH = $self->db_handle; my $who_cond; if ($who eq "master") { $who_cond = "( made_by = \'master\' OR made_by = \'unknown\' )"; } else { $who_cond = "( made_by = \'master\' OR made_by = \'$who\' OR made_by = \'unknown\')"; } if (! $genome) { $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond )"; } else { $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond AND (org = \'$genome\'))"; } return (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) ? grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response : (); } =head3 seqs_with_roles_in_genomes usage: $result = $fig->seqs_with_roles_in_genomes($genomes,$roles,$made_by) This routine takes a pointer to a list of genomes ($genomes) and a pointer to a list of roles ($roles) and looks up all of the sequences that connect to those roles according to either the master assignments or those made by $made_by. Again, you will get assignments for which the "master" assignment connects, but the $made_by does not. A hash is returned. The keys to the hash are genome IDs for which at least one sequence was found. $result->{$genome} will itself be a hash, assuming that at least one sequence was found for $genome. $result->{$genome}->{$role} will be set to a pointer to a list of 2-tuples. Each 2-tuple will contain [$peg,$function], where $function is the one for $made_by (which may not be the one that connected). =cut sub seqs_with_roles_in_genomes { my($self,$genomes,$roles,$made_by) = @_; my($genome,$role,$roleQ,$role_cond,$made_by_cond,$query,$relational_db_response,$peg,$genome_cond,$hit); my $rdbH = $self->db_handle; my $result = {}; # foreach $genome ($self->genomes) { $result->{$genome} = {} } if (! $made_by) { $made_by = 'master' } if ((@$genomes > 0) && (@$roles > 0)) { $genome_cond = "(" . join(" OR ",map { "( org = \'$_\' )" } @$genomes) . ")"; $role_cond = "(" . join(" OR ",map { $roleQ = quotemeta $_; "( role = \'$roleQ\' )" } @$roles) . ")"; $made_by_cond = ($made_by eq 'master') ? "(made_by = 'master')" : "(made_by = 'master' OR made_by = '$made_by')"; $query = "SELECT distinct prot, role FROM roles WHERE ( $made_by_cond AND $genome_cond AND $role_cond )"; if (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) { foreach $hit (@$relational_db_response) { ($peg,$role) = @$hit; if (! $self->is_deleted_fid($peg)) { $genome = $self->genome_of($peg); push(@{ $result->{$genome}->{$role} },[$peg,scalar $self->function_of($peg,$made_by)]); } } } } return $result; } =head3 largest_clusters usage: @clusters = $fig->largest_clusters($roles,$user) This routine can be used to find the largest clusters containing some of the designated set of roles. A list of clusters is returned. Each cluster is a pointer to a list of pegs. =cut sub largest_clusters { my($self,$roles,$user,$sort_by_unique_functions) = @_; my($genome,$x,$role,$y,$peg,$loc,$contig,$beg,$end,%pegs,@pegs,$i,$j); my $ss = $self->seqs_with_roles_in_genomes([$self->genomes],$roles,$user); my @clusters = (); foreach $genome (keys(%$ss)) { my %pegs; $x = $ss->{$genome}; foreach $role (keys(%$x)) { $y = $x->{$role}; foreach $peg (map { $_->[0] } @$y) { if ($loc = $self->feature_location($peg)) { ($contig,$beg,$end) = &FIG::boundaries_of($loc); $pegs{$peg} = [$peg,$contig,int(($beg + $end) / 2)]; } } } @pegs = sort { ($pegs{$a}->[1] cmp $pegs{$b}->[1]) or ($pegs{$a}->[2] <=> $pegs{$b}->[2]) } keys(%pegs); $i = 0; while ($i < $#pegs) { for ($j=$i+1; ($j < @pegs) && &close_enough_locs($pegs{$pegs[$j-1]},$pegs{$pegs[$j]}); $j++) {} if ($j > ($i+1)) { push(@clusters,[@pegs[$i..$j-1]]); } $i = $j; } } if ($sort_by_unique_functions) { @clusters = sort { $self->unique_functions($b,$user) <=> $self->unique_functions($a,$user) } @clusters; } else { @clusters = sort { @$b <=> @$a } @clusters; } return @clusters; } sub unique_functions { my($self,$pegs,$user) = @_; my($peg,$func,%seen); foreach $peg (@$pegs) { if ($func = $self->function_of($peg,$user)) { $seen{$func} = 1; } } return scalar keys(%seen); } sub close_enough_locs { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($x,$y) = @_; return (($x->[1] eq $y->[1]) && (abs($x->[2] - $y->[2]) < 5000)); } sub candidates_for_role { my($self,$role,$genome,$cutoff,$user) = @_; my($peg); $user = $user ? $user : "master"; my @cand = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { $peg = $_; [$peg,$self->crude_estimate_of_distance($genome,&FIG::genome_of($peg))] } $self->seqs_with_role($role,$user); return $self->candidates_for_role_from_known($genome,$cutoff,\@cand); } sub candidates_for_role_from_known { my($self,$genome,$cutoff,$known) = @_; my($peg); my @cand = @$known; my $hits = {}; my $seen = {}; my $how_many = (@cand > 10) ? 9 : $#cand; &try_to_locate($self,$genome,$hits,[@cand[0..$how_many]],$seen,$cutoff); if (keys(%$hits) == 0) { splice(@cand,0,$how_many+1); &try_to_locate($self,$genome,$hits,\@cand,$seen,$cutoff); } return sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits); } sub try_to_locate { my($self,$genome,$hits,$to_try,$seen,$cutoff) = @_; my($prot,$id2,$psc,$id2a,$x,$sim); if (! $cutoff) { $cutoff = 1.0e-5 } foreach $prot (@$to_try) { if (! $seen->{$prot}) { if (($prot =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome)) { $hits->{$prot} = [0,$prot]; } else { foreach $sim ($self->sims($prot,1000,$cutoff,"fig")) { $id2 = $sim->id2; $psc = $sim->psc; if (($id2 =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome)) { $x = $hits->{$id2}; if ((! $x) || ($x->[0] > $psc)) { $hits->{$id2} = [$psc,$prot]; } } elsif (&neg_log($psc) > (2 * &neg_log($cutoff))) { $seen->{$id2} = 1; } } } } } } sub neg_log { my($x) = @_; if ($x == 0) { return 200; } else { return -log($x) / log(10); } } =head2 Bidirectional Best Hits =head3 best_bbh_candidates usage: @candidates = $fig->best_bbh_candidates($genome,$cutoff,$requested,$known) This routine returns a list of up to $requested candidates from $genome. A candidate is a BBH against one of the PEGs in genomes from the list given by@$known. Each entry in the list is a 3-tuple: [CandidatePEG,KnownBBH,Pscore] =cut sub best_bbh_candidates { my($self,$genome,$cutoff,$requested,$known,$frac_match) = @_; my($i,$j,$k,$sim,@sims,$peg,$id2,$genome2,$sim_back); my($bbh,%seen,%computed_sims,$genome1); $frac_match = defined($frac_match) ? $frac_match : 0.7; my @got = (); my @cand = $self->candidates_for_role_from_known($genome,$cutoff,$known); if (@cand > 0) { my %genomes = map { $genome1 = &FIG::genome_of($_); $genome1 => 1 } @$known; my %pegs = map { $_ => 1 } @$known; for ($i=0; (@got < $requested) && ($i < @cand); $i++) { $peg = $cand[$i]; undef %seen; @sims = grep { $genomes{&FIG::genome_of($_->id2)} } $self->sims($peg,1000,$cutoff,"fig"); $bbh = 0; for ($j=0; (! $bbh) && ($j < @sims); $j++) { $sim = $sims[$j]; $id2 = $sim->id2; $genome2 = &FIG::genome_of($id2); if (! $seen{$genome2}) { if ($pegs{$id2}) { if (! defined($sim_back = $computed_sims{$id2})) { my @sims_back = $self->sims($id2,1000,$cutoff,"fig"); for ($k=0; ($k < @sims_back) && (&FIG::genome_of($sims_back[$k]->id2) ne $genome); $k++) {} if ($k < @sims_back) { $sim_back = $computed_sims{$id2} = $sims_back[$k]; } else { $sim_back = $computed_sims{$id2} = 0; } } if ($sim_back) { if (($sim_back->id2 eq $peg) && $self->ok_match($sim_back,$frac_match)) { $bbh = [$id2,$sim_back->psc]; } } } $seen{$genome2} = 1; } } if ($bbh) { push(@got,[$peg,@$bbh]); } } } return @got; } sub ok_match { my($self,$sim,$frac_match) = @_; my $ln1 = $sim->ln1; my $ln2 = $sim->ln2; my $b1 = $sim->b1; my $e1 = $sim->e1; my $b2 = $sim->b2; my $e2 = $sim->e2; return (((($e1 - $b1) / $ln1) >= $frac_match) && ((($e2 - $b2) / $ln2) >= $frac_match)) } sub external_calls { my($self,$pegs) = @_; my($peg,$func); open(TMP,">/tmp/pegs.$$") || die "could not open /tmp/pegs.$$"; foreach $peg (@$pegs) { print TMP "$peg\n"; } close(TMP); open(TMP,">/tmp/parms.$$") || die "could not open /tmp/parms.$$"; print TMP "no_fig\t1\n"; close(TMP); my %call = map { chop; ($peg,$func) = split(/\t/,$_) } `$FIG_Config::bin/auto_assign /tmp/parms.$$ < /tmp/pegs.$$ 2> /dev/null | $FIG_Config::bin/make_calls`; unlink("/tmp/pegs.$$","/tmp/parms.$$"); return map { $call{$_} ? [$_,$call{$_}] : [$_,"hypothetical protein"] } @$pegs; } use SameFunc; sub same_func { my($self,$f1,$f2) = @_; return &SameFunc::same_func($f1,$f2); } ################################# DNA sequence Stuff #################################### =head2 DNA Sequences =head3 extract_seq usage: $seq = &FIG::extract_seq($contigs,$loc) This is just a little utility routine that I have found convenient. It assumes that $contigs is a hash that contains IDs as keys and sequences as values. $loc must be of the form Contig_Beg_End where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought subsequence. If Beg > End, it is assumed that you want the reverse complement of the subsequence. This routine plucks out the subsequence for you. =cut sub extract_seq { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($contigs,$loc) = @_; my($contig,$beg,$end,$contig_seq); my($plus,$minus); $plus = $minus = 0; my $strand = ""; my @loc = split(/,/,$loc); my @seq = (); foreach $loc (@loc) { if ($loc =~ /^\S+_(\d+)_(\d+)$/) { if ($1 < $2) { $plus++; } elsif ($2 < $1) { $minus++; } } } if ($plus > $minus) { $strand = "+"; } elsif ($plus < $minus) { $strand = "-"; } foreach $loc (@loc) { if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) { ($contig,$beg,$end) = ($1,$2,$3); my $len = length($contigs->{$contig}); if (!$len) { carp "Undefined or zero-length contig $contig"; return ""; } if (($beg > $len) || ($end > $len)) { carp "Region $loc out of bounds (contig len=$len)"; } else { if (($beg < $end) || (($beg == $end) && ($strand eq "+"))) { push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg))); } else { $strand = "-"; push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end)))); } } } } return join("",@seq); } =head3 all_contigs usage: @contig_ids = $fig->all_contigs($genome) Returns a list of all of the contigs occurring in the designated genome. =cut #: Return Type @; sub all_contigs { my($self,$genome) = @_; my($rdbH,$relational_db_response); $rdbH = $self->db_handle; if (defined($genome)) { if ($relational_db_response = $rdbH->SQL("SELECT DISTINCT contig FROM contig_lengths WHERE ( genome = \'$genome\' )")) { return map { $_->[0] } @$relational_db_response; } } return undef; } =head3 contig_ln usage: $n = $fig->contig_ln($genome,$contig) Returns the length of $contig from $genome. =cut sub contig_ln { my($self,$genome,$contig) = @_; my($rdbH,$relational_db_response); $rdbH = $self->db_handle; if (defined($genome) && defined($contig)) { if (($relational_db_response = $rdbH->SQL("SELECT len FROM contig_lengths WHERE ( genome = \'$genome\' ) and ( contig = \'$contig\' )")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } } return undef; } =head3 dna_seq usage: $seq = dna_seq($genome,@locations) Returns the concatenated subsequences described by the list of locations. Each location must be of the form Contig_Beg_End where Contig must be the ID of a contig for genome $genome. If Beg > End the location describes a stretch of the complementary strand. =cut #: Return Type $; sub dna_seq { my($self,$genome,@locations) = @_; my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH); @locations = map { split(/,/,$_) } @locations; @pieces = (); foreach $loc (@locations) { if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) { ($contig,$beg,$end) = ($1,$2,$3); $ln = $self->contig_ln($genome,$contig); if (! $ln) { print STDERR "$genome/$contig: could not get length\n"; return ""; } if (&between(1,$beg,$ln) && &between(1,$end,$ln)) { if ($beg < $end) { push(@pieces, $self->get_dna($genome,$contig,$beg,$end)); } else { push(@pieces, &reverse_comp($self->get_dna($genome,$contig,$end,$beg))); } } } } return lc(join("",@pieces)); } sub get_dna { my($self,$genome,$contig,$beg,$end) = @_; my $relational_db_response; my $rdbH = $self->db_handle; my $indexpt = int(($beg-1)/10000) * 10000; if (($relational_db_response = $rdbH->SQL("SELECT startN,fileno,seek FROM contig_seeks WHERE ( genome = \'$genome\' ) AND ( contig = \'$contig\' ) AND ( indexpt = $indexpt )")) && (@$relational_db_response == 1)) { my($startN,$fileN,$seek) = @{$relational_db_response->[0]}; my $fh = $self->openF($self->N2file($fileN)); if (seek($fh,$seek,0)) { my $chunk = ""; read($fh,$chunk,int(($end + 1 - $startN) * 1.03)); # print STDERR "genome=$genome contig=$contig beg=$beg end=$end startN=$startN chunk=$chunk\n"; $chunk =~ s/\s//g; my $ln = ($end - $beg) + 1; if (length($chunk) >= $ln) { return lc(substr($chunk,(($beg-1)-$startN),$ln)); } } } return undef; } ################################# Taxonomy #################################### =head2 Taxonomy =head3 taxonomy_of usage: $gs = $fig->taxonomy_of($genome_id) Returns the taxonomy of the specified genome. Gives the taxonomy down to genus and species. =cut sub taxonomy_of :Scalar { my($self,$genome) = @_; my($ans); my $taxonomy = $self->cached('_taxonomy'); if (! ($ans = $taxonomy->{$genome})) { my $rdbH = $self->db_handle; my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy FROM genome"); my $pair; foreach $pair (@$relational_db_response) { $taxonomy->{$pair->[0]} = $pair->[1]; } $ans = $taxonomy->{$genome}; } return $ans; } =head3 is_bacterial usage: $fig->is_bacterial($genome) Returns true iff the genome is bacterial. =cut sub is_bacterial :Scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Bacteria/) ? 1 : 0; } =head3 is_archaeal usage: $fig->is_archaeal($genome) Returns true iff the genome is archaeal. =cut sub is_archaeal :Scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Archaea/) ? 1 : 0; } =head3 is_prokaryotic usage: $fig->is_prokaryotic($genome) Returns true iff the genome is prokaryotic =cut sub is_prokaryotic :Scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/) ? 1 : 0; } =head3 is_eukaryotic usage: $fig->is_eukaryotic($genome) Returns true iff the genome is eukaryotic =cut sub is_eukaryotic :Scalar { my($self,$genome) = @_; return ($self->taxonomy_of($genome) =~ /^Eukaryota/) ? 1 : 0; } =head3 sort_genomes_by_taxonomy usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes) This routine is used to sort a list of genome IDs to put them into taxonomic order. =cut sub sort_genomes_by_taxonomy { my($self,@fids) = @_; return map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [$_,$self->taxonomy_of($_)] } @fids; } =head3 crude_estimate_of_distance usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2) There are a number of places where we need estimates of the distance between two genomes. This routine will return a value between 0 and 1, where a value of 0 means "the genomes are essentially identical" and a value of 1 means "the genomes are in different major groupings" (the groupings are archaea, bacteria, euks, and viruses). The measure is extremely crude. =cut sub crude_estimate_of_distance :Scalar { my($self,$genome1,$genome2) = @_; my($i,$v,$d,$dist); if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } return $self->crude_estimate_of_distance1($genome1,$genome2); } sub crude_estimate_of_distance1 :Scalar { my($self,$genome1,$genome2) = @_; my($i,$v,$d,$dist); if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } $dist = $self->cached('_dist'); if (! $dist->{"$genome1,$genome2"}) { my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1)); my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2)); $d = 1; for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2) { $d -= $v; } $dist->{"$genome1,$genome2"} = $d; } return $dist->{"$genome1,$genome2"}; } =head3 sort_fids_by_taxonomy usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids) Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features. =cut sub sort_fids_by_taxonomy { my($self,@fids) = @_; return map { $_->[0] } sort { $a->[1] cmp $b->[1] } map { [$_,$self->taxonomy_of(&genome_of($_))] } @fids; } sub build_tree_of_complete { my($self,$min_for_label) = @_; my(@last,@tax,$i,$prefix,$lev,$genome,$tax); $min_for_label = $min_for_label ? $min_for_label : 10; open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$"; print TMP "1. root\n"; @last = (); foreach $genome (grep { $_ !~ /^99999/ } $self->sort_genomes_by_taxonomy($self->genomes("complete"))) { $tax = $self->taxonomy_of($genome); @tax = split(/\s*;\s*/,$tax); push(@tax,$genome); for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {} while ($i < @tax) { $lev = $i+2; $prefix = " " x (4 * ($lev-1)); print TMP "$prefix$lev\. $tax[$i]\n"; $i++; } @last = @tax; } close(TMP); my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$"); $tree->[0] = 'All'; &limit_labels($tree,$min_for_label); unlink("/tmp/tree$$"); return ($tree,&tips_of_tree($tree)); } sub limit_labels { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($tree,$min_for_label) = @_; my($children) = &tree_utilities::node_pointers($tree); if (@$children == 1) { return 1; } else { my $n = 0; my $i; for ($i=1; ($i < @$children); $i++) { $n += &limit_labels($children->[$i],$min_for_label); } if ($n < $min_for_label) { $tree->[0] = ""; } return $n; } } sub taxonomic_groups_of_complete { my($self,$min_for_labels) = @_; my($tree,undef) = $self->build_tree_of_complete($min_for_labels); return &taxonomic_groups($tree); } sub taxonomic_groups { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($tree) = @_; my($groups,undef) = &taxonomic_groups_and_children($tree); return $groups; } sub taxonomic_groups_and_children { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my($tree) = @_; my($ids1,$i,$groupsC,$idsC); my $ptrs = &tree_utilities::node_pointers($tree); my $ids = []; my $groups = []; if (@$ptrs > 1) { $ids1 = []; for ($i=1; ($i < @$ptrs); $i++) { ($groupsC,$idsC) = &taxonomic_groups_and_children($ptrs->[$i]); if (@$groupsC > 0) { push(@$groups,@$groupsC); } push(@$ids1,@$idsC); } if ($tree->[0]) { push(@$groups,[$tree->[0],$ids1]); } push(@$ids,@$ids1); } elsif ($tree->[0]) { push(@$ids,$tree->[0]); } return ($groups,$ids); } ################################# Literature Stuff #################################### =head2 Literature Methods =cut sub get_titles_by_gi { my($self,$gi) = @_; &verify_existence_of_literature; $gi =~ s/^gi\|//; my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT pmid,title FROM literature_titles WHERE ( gi = '$gi' ) ")) && (@$relational_db_response > 0)) { return sort { $a->[1] cmp $b->[1] } @$relational_db_response; } else { return (); } } sub get_titles_by_peg { my($self,$peg) = @_; my $gi; &verify_existence_of_literature; my @gis = grep { $_ =~ /^gi\|/ } $self->feature_aliases($peg); if (@gis > 0) { my $relational_db_response; my $rdbH = $self->db_handle; my $constraint = join(" OR ", map { $gi = ($_ =~ /gi\|(\S+)/) ? $1 : $_; "( gi = '$gi' )" } @gis); if (($relational_db_response = $rdbH->SQL("SELECT pmid,title FROM literature_titles WHERE ( $constraint ) ")) && (@$relational_db_response > 0)) { return sort { $a->[1] cmp $b->[1] } @$relational_db_response; } else { return (); } } return (); } sub get_title_by_pmid { my($self,$pmid) = @_; &verify_existence_of_literature; $pmid =~ s/^.*\|//; my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT title FROM literature_titles WHERE ( pmid = '$pmid' ) ")) && (@$relational_db_response == 1)) { return $relational_db_response->[0]->[0]; } else { return ""; } } sub verify_existence_of_literature { if (! -d "$FIG_Config::global/Literature") { mkdir("$FIG_Config::global/Literature",0777); system "touch $FIG_Config::global/Literature/gi_pmid_title"; system "$FIG_Config::bin/load_literature"; } } ################################# Subsystems #################################### sub active_subsystems { my($self,$genome) = @_; my($active,$file,$variant); $active = {}; foreach $_ (`grep \"^$genome\" $FIG_Config::data/Subsystems/*/spreadsheet`) { if (($_ =~ /^(.*?)\/spreadsheet:$genome\t(\S+)/) && ($2 ne '0') && ($2 ne '-1')) { $file = $1; $variant = $2; if ($file =~ /^.*?([^\/]+)$/) { $active->{$1} = $variant; } } } return $active; } =head2 Subsystem Methods =cut sub exportable_subsystem { my($self,$ssa) = @_; my(%seqs,@genomes); my $spreadsheet = []; my $notes = []; $ssa =~ s/[ \/]/_/g; my $subsys_dir = "$FIG_Config::data/Subsystems/$ssa"; if (open(SSA,"<$subsys_dir/spreadsheet")) { # # Push the subsystem metadata. # my $version = $self->subsystem_version($ssa); my $exchangable = $self->is_exchangable_subsystem($ssa); push(@$spreadsheet,"$ssa\n$version\n$exchangable\n"); my @curation; if (-s "$FIG_Config::data/Subsystems/$ssa/curation.log") { @curation = `head -n 1 \"$FIG_Config::data/Subsystems/$ssa/curation.log\"`; } else { @curation = ("0000000000\tmaster:unknown\tstarted\n"); } push(@$spreadsheet,$curation[0],"//\n"); # # Roles # while (defined($_ = <SSA>) && ($_ !~ /^\/\//)) { push(@$spreadsheet,$_); } push(@$spreadsheet,"//\n"); # # Subsets # while (defined($_ = <SSA>) && ($_ !~ /^\/\//)) { push(@$spreadsheet,$_); } push(@$spreadsheet,"//\n"); # # The spreadsheet itself. # Collect the pegs referenced into %seqs. # while (defined($_ = <SSA>)) { push(@$spreadsheet,$_); chomp; my @flds = split(/\t/,$_); my $genome = $flds[0]; push(@genomes,$genome); my($i,$id); for ($i=2; ($i < @flds); $i++) { if ($flds[$i]) { my @entries = split(/,/,$flds[$i]); foreach $id (@entries) { $seqs{"fig\|$genome\.peg.$id"} = 1; } } } } push(@$spreadsheet,"//\n"); # # Assignments and aliases. # my $peg; foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs)) { my @aliases = grep { $_ =~ /^(sp\||gi\||pirnr\||kegg\||N[PGZ]_)/ } $self->feature_aliases($peg); my $alias_txt = join(",",@aliases); my $gs_txt = $self->genus_species($self->genome_of($peg)); my $func_txt = scalar $self->function_of($peg); push(@$spreadsheet, join("\t", ($peg, $alias_txt, $gs_txt, $func_txt)) . "\n"); } push(@$spreadsheet,"//\n"); # # Protein sequence data # foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs)) { my $aliases = $self->feature_aliases($peg); my $seq = $self->get_translation($peg); push(@$spreadsheet,">$peg $aliases\n"); my($i,$ln); $ln = length($seq); for ($i=0; ($i < $ln); $i += 60) { if (($ln - $i) > 60) { push(@$spreadsheet,substr($seq,$i,60) . "\n"); } else { push(@$spreadsheet,substr($seq,$i) . "\n"); } } } close(SSA); push(@$spreadsheet,"//\n"); # # Notes file # if (open(NOTES,"<$FIG_Config::data/Subsystems/$ssa/notes")) { while (defined($_ = <NOTES>)) { push(@$notes,$_); } close(NOTES); } if ($notes->[$#{$notes}] ne "\n") { push(@$notes,"\n") } push(@$notes,"//\n"); # # And tag the reactions onto the end. This is fudging the format a little bit, but # it should let older parsers handle the subsystems with extra sections. # if (open(REACTIONS, "<$FIG_Config::data/Subsystems/$ssa/reactions")) { while (<REACTIONS>) { push(@$notes, $_); } } # # And here we break compatibility. If we have diagrams, # save the diagram images. # if (opendir(D, "$subsys_dir/diagrams")) { my @ids = grep { not /^\./ and -d "$subsys_dir/diagrams/$_" } readdir(D); closedir(D); for my $id (@ids) { my $ddir = "$subsys_dir/diagrams/$id"; my $name = &FIG::file_head("$ddir/NAME", 1); chomp($name); if ($name) { push(@$notes, "//diagram:$id:name\t$name\n"); push(@$notes, "//end\n"); } # # Find the diagram image. # my @images = <$ddir/diagram.{png,gif,jpg,html}>; for my $img_file (@images) { if (open(DIAGRAM, "<$img_file")) { my $size = -s DIAGRAM; my $base = basename($img_file); push(@$notes, "//diagram:$id:diagram=$base\t$size\n"); my $buf; while (read(DIAGRAM, $buf, 60*57)) { my $enc = encode_base64($buf); # # Feh, escape the start of the lines. # $enc =~ s/^/B:/mg; push(@$notes, $enc); } close(DIAGRAM); push(@$notes, "//end\n"); } } } } } return ($spreadsheet,$notes); } sub is_exchangable_subsystem :Scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $ssa = (@_ == 1) ? $_[0] : $_[1]; $ssa =~ s/[ \/]/_/g; if (open(TMP,"<$FIG_Config::data/Subsystems/$ssa/EXCHANGABLE")) { my $line; $line = <TMP>; if ($line && ($line =~ /^(\S+)/) && $1) { return 1; } close(TMP); } return 0; } sub all_exchangable_subsystems { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my @exchangable = (); if (opendir(SUB,"$FIG_Config::data/Subsystems")) { push(@exchangable,grep { ($_ !~ /^\./) && &is_exchangable_subsystem($_) } readdir(SUB)); closedir(SUB); } return @exchangable; } =head3 all_subsystems Return a list of all subsystems. =cut sub all_subsystems { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my @subsystems = (); if (opendir(SUB,"$FIG_Config::data/Subsystems")) { push(@subsystems,grep { ($_ !~ /^\./) } readdir(SUB)); closedir(SUB); } return @subsystems; } =head3 index_subsystems Run indexing on one or more subsystems. If no subsystems are defined we will reindex the whole thing. Otherwise we will only index the defined subsystem. Note that this method just launches index_subsystems as a background job. Returns the job of the child process. $pid=$fig->index_subsystems("Alkanesulfonates Utilization"); # do only Alkanesulfonates Utilization $pid=$fig->index_subsystems(@ss); # do subsystems in @ss $pid=$fig->index_subsystems(); # do all subsystems =cut sub index_subsystems { my ($self, @ss)=@_; print STDERR "Trying $FIG_Config::bin/index_subsystems @ss\n"; return $self->run_in_background( sub { my $cmd="$FIG_Config::bin/index_subsystems @ss"; print "Will run '$cmd'\n"; &run($cmd); print "finished.\n"; } ); } =head3 all_constructs Hmmm... =cut sub all_constructs { my($self) = @_; my @subsystems = (); if (opendir(SUB,"$FIG_Config::data/Subsystems")) { push(@subsystems,grep { ($_ !~ /^\./) } readdir(SUB)); closedir(SUB); } my @c; for my $subname (@subsystems) { $subname =~ s/[ \/]/_/g; my $cfile = "$FIG_Config::data/Subsystems/$subname/constructs"; if (-f $cfile) { my $sub = $self->get_subsystem($subname); my @a = Construct::parse_constructs_file($cfile, $sub); my $l = []; for my $con (@a) { my($cname, $list) = @$con; my $nreqs = []; for my $req (@$list) { if ($req->[0] eq 'R') { push(@$nreqs, ['R', $req->[2]]); } else { push(@$nreqs, $req); } } push(@$l, [$cname, $nreqs]); } push(@c, [$subname, $l]); } } return @c; } =head3 subsystem_version my $version=subsystem_version($subsystem_name) returns the current version of the subsystem. =cut sub subsystem_version :Scalar { shift if UNIVERSAL::isa($_[0],__PACKAGE__); my $ssa = (@_ == 1) ? $_[0] : $_[1]; $ssa =~ s/[ \/]/_/g; if (open(VER,"<$FIG_Config::data/Subsystems/$ssa/VERSION")) { my $ver = <VER>; close(VER); if ($ver =~ /^(\S+)/) { return $1; } } return 0; } =head3 subsystem_classification Get or set the classification of the subsystem. Added by RAE in response to the changes made on seed wiki If a reference to an array is supplied it is saved as the new classification of the subsystem. Regardless, the current classification is returned as a reference to an array. There is no control over what the things are. Returns a reference to an empty array if a valid subsystem is not supplied, or if no classification is known The classification is stored as a \t separated list of things in $subsys/CLASSIFICATION. There is no control over what the things are. =cut sub subsystem_classification { my ($self, $ssa, $classification)=@_; $ssa =~ s/[ \/]/_/g; my $return=['', '']; if ($ssa && $classification->[0]) { return $return unless (-e "$FIG_Config::data/Subsystems/$ssa/"); if (open(SSA,">$FIG_Config::data/Subsystems/$ssa/CLASSIFICATION")) { print SSA join("\t", @$classification), "\n"; } close SSA; return $classification; } # using get_subsystem is really slow, and so we are going to cat the file and return that #return $subsys->get_classification; if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/CLASSIFICATION")) { my @line; while (<SSA>) { chomp; my @thisline=split /\t/; if ($thisline[0] || $thisline[1]) {@line=@thisline} } $line[0]='' unless (defined $line[0]); $line[1]='' unless (defined $line[1]); return [$line[0], $line[1]]; } else { return ['', '']; } } =head3 all_subsystem_classifications() usage: $classifications=$fig->all_subsystems_classifications(); print map {join "\t", @$_} @$classifications; Returns a array where each element is a reference to an array of the two classifications. =cut sub all_subsystem_classifications { my $self=shift; my %found; map { my $cl=join "\t", @{$self->subsystem_classification($_)}; $found{$cl}++; } ($self->all_subsystems); my @return; foreach my $c (keys %found) { my @a=split /\t/, $c; push @return, \@a; } return @return; } =head3 subsystem_curator usage: $curator = $fig->subsystem_curator($subsystem_name) Return the curator of a subsystem. =cut sub subsystem_curator :Scalar { my($self, $ssa) = @_; my($who) = ""; $ssa =~ s/[ \/]/_/g; if (open(DATA,"<$FIG_Config::data/Subsystems/$ssa/curation.log")) { while (defined($_ = <DATA>)) { if ($_ =~ /^\d+\t(\S+)\s+started/) { $who = $1; } } close(DATA); } return $who; } sub reset_subsystem_curator :Scalar { my($self, $ssa, $who) = @_; $ssa =~ s/[ \/]/_/g; if ($who && open(LOG,">>$FIG_Config::data/Subsystems/$ssa/curation.log")) { my $time = time; print LOG "$time\t$who\tstarted\n"; close(LOG); return 1; } return 0; } =head3 subsystem_info usage: ($version, $curator, $pedigree, $roles) = $fig->subsystem_info($subsystem_name) Return information about the given subsystem. $roles is a list of tuples (abbrev, name). =cut sub subsystem_info { my($self,$ssa) = @_; my($version, $curator, $pedigree, $roles);; $ssa =~ s/[ \/]/_/g; $roles = []; $version = $self->subsystem_version($ssa); $curator = $self->subsystem_curator($ssa); if (open(CUR, "<$FIG_Config::data/Subsystems/$ssa/curation.log")) { local($/); $pedigree = <CUR>; } if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet")) { # # The spreadsheet appears to be of the form # # role-abbr role name # ... # // # Something about subsets # // # genome-id spreadsheet-info # local $/ = "//"; my($chunk); if (defined($chunk = <SSA>)) { for $_ (split(/\n/, $chunk)) { chomp; if (/^(\S+)\s+(.*)\s*$/) { push(@$roles, [$1, $2]); } } } close(SSA); } return ($version, $curator, $pedigree, $roles); } =head3 subsystem_genomes usage: $genomes = $fig->subsystem_genomes($subsystem_name, $zero) Return the list of genomes in the subsystem. $genomes is a list of tuples (genome_id, name) unless ($zero) is set to true it will only return those genomes with a non-zero variant code =cut #: Return Type $@@; sub subsystem_genomes :Scalar { my($self,$ssa,$all) = @_; my($genomes); $ssa =~ s/[ \/]/_/g; $genomes = []; if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet")) { # # The spreadsheet appears to be of the form # # role-abbr role name # ... # // # Something about subsets # // # genome-id spreadsheet-info # local $/ = "//"; my($chunk); if (defined($chunk = <SSA>)) { } if (defined($chunk = <SSA>)) { } local $/ = "\n"; while (<SSA>) { chomp; s/^\s*//; s/\s*$//; next if $_ eq ""; if (($_ =~ /^(\d+\.\d+)\s+(\S+)/) && ($all || $2)) { my $genome = $1; my $name = $self->genus_species($genome); push(@$genomes, [$genome, $name]); } } close(SSA); } return $genomes; } # # @pegs = $fig->pegs_in_subsystem_cell($subsystem, $genome,$role) # @roles = $fig->subsystem_to_roles($subsystem) # @maps = $fig->role_to_maps($role) # @subsystems = $fig->peg_to_subsystems($peg); =head3 get_subsystem C<< my $subsysObject = $fig->get_subsystem($name, $force_load); >> Return a subsystem object for manipulation of the named subsystem. If the subsystem does not exist, an undefined value will be returned. =over 4 =item name Name of the desired subsystem. =item force_load TRUE to reload the subsystem from the data store even if it is already cached in memory, else FALSE. =item RETURN Returns a blessed object that allows access to subsystem data. =back =cut sub get_subsystem :Scalar { my($self, $subsystem, $force_load) = @_; my $sub; $subsystem =~ s/[ \/]/_/g; my $cache = $self->cached('_Subsystems'); if ($force_load || !($sub = $cache->{$subsystem})) { $sub = new Subsystem($subsystem, $self); $cache->{$subsystem} = $sub if $sub; } return $sub; } =head3 subsystem_to_roles C<< my @roles = $fig->subsystem_to_roles($subsysID); >> Return a list of the roles for the specified subsystem. =over 4 =item subsysID Name (ID) of the subsystem whose roles are to be listed. =item RETURN Returns a list of role IDs. =back =cut sub subsystem_to_roles { my($self, $subsystem) = @_; $subsystem =~ s/[ \/]/_/g; my $sub = $self->get_subsystem($subsystem); return () unless $sub; return $sub->get_roles(); } sub pegs_in_subsystem_cell { my($self, $subsystem, $genome, $role) = @_; $subsystem =~ s/[ \/]/_/g; my $sub = $self->get_subsystem($subsystem); return undef unless $sub; return grep { ! $self->is_deleted_fid($_) } $sub->get_pegs_from_cell($genome, $role); } sub get_clearinghouse :Scalar { my($self, $url) = @_; if (defined($self->{_clearinghouse})) { return $self->{_clearinghouse}; } if (!$ClearinghouseOK) { warn "Error: Clearinghouse code not available.\n"; return undef; } if ($url eq "") { $url = "http://www-unix.mcs.anl.gov/~olson/SEED/api.cgi"; } my $ch = new Clearinghouse($url); $self->{_clearinghouse} = $ch; return $ch; } sub publish_subsystem_to_clearinghouse { my ($self, $ssa, $url) = @_; my ($id, $token); $ssa =~ s/[ \/]/_/g; my $ch = $self->get_clearinghouse($url); if (!defined($ch)) { warn "Cannot publish: clearinghouse not available\n"; return undef; } my($version, $curator, $pedigree, $roles) = $self->subsystem_info($ssa); my $genomes = $self->subsystem_genomes($ssa); my @genome_names = (); for my $g (@$genomes) { push(@genome_names, $g->[1]); } my $seed_id = $self->get_seed_id(); my $time = int(time()); print "publishing: ss=$ssa version=$version time=$time curator=$curator seed_id=$seed_id\n"; my $ret = $ch->publish_subsystem($ssa, $version, $time, $curator, $pedigree, $seed_id, $roles, \@genome_names); ($id, $token, $url) = @$ret; print "Got id $id token $token url $url\n"; # # Retrieve the package # print "Packaging...\n"; my($spreadsheet, $notes) = $self->exportable_subsystem($ssa); my $package = join("", @$spreadsheet, @$notes); print "Sending...\n"; $ch->upload_subsystem_package($url, $package); return 1; } # # Return the list of subsystems this peg appears in. # Each entry is a pair [subsystem, role]. # =head3 subsystems_for_peg Return the list of subsystems and roles that this peg appears in. Returns an array. Each item in the array is a reference to a tuple of subsystem and role. =cut sub subsystems_for_peg { my($self, $peg) = @_; if ($self->is_deleted_fid($peg)) { return () } ($peg =~ /^fig\|\d+\.\d+\.peg\.\d+$/) or return; my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'"; if (my $relational_db_response = $rdbH->SQL($q)) { my %seen; my @in; my $pair; foreach $pair (@$relational_db_response) { $pair->[0] =~ s/ /_/g; my $key = join("\t",@$pair); if (! $seen{$key}) { push(@in,$pair); $seen{$key} = 1; } } return @in; } else { return (); } } =head3 subsystems_roles Return the list of subsystems and roles for every peg in subsystems Returns an array. Each item in the array is a reference to a three-ple of subsystem, role, and peg. =cut sub subsystems_roles { my($self) = @_; my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role, protein FROM subsystem_index"; if (my $relational_db_response = $rdbH->SQL($q)) { my %seen; my @in; my $pair; foreach $pair (@$relational_db_response) { my $key = join("\t",@$pair); if (! $seen{$key}) { push(@in,$pair); $seen{$key} = 1; } } return @in; } else { return (); } } =head3 subsystems_for_role Return a list of subsystems, roles, and proteins containing a given role Returns an array. Each item in the array is a reference to a three-ple of subsystem, role, and peg. =cut sub subsystems_for_role { my($self, $role) = @_; my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role, protein FROM subsystem_index WHERE role = \'$role\'"; if (my $relational_db_response = $rdbH->SQL($q)) { my %seen; my @in; my $pair; foreach $pair (@$relational_db_response) { my $key = join("\t",@$pair); if (! $seen{$key}) { push(@in,$pair); $seen{$key} = 1; } } return @in; } else { return (); } } =head3 assigned_pegs_in_subsystems Return list of [peg, function, ss, role in ss]. =cut sub assigned_pegs_in_subsystems { my($self, $genome) = @_; my @result = (); for my $peg ($self->pegs_of($genome)) { my $fn = $self->function_of($peg); next if $fn eq ""; next if $self->hypo($fn); my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'"; if (my $relational_db_response = $rdbH->SQL($q)) { my $pair; foreach $pair (@$relational_db_response) { my ($ss, $role) = @$pair; push(@result, [$peg, $fn, $ss, $role]); } } } return @result; } sub peg_to_roles_in_subsystems { my($self,$peg) = @_; my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'"; if (my $relational_db_response = $rdbH->SQL($q)) { return @$relational_db_response; } return (); } =head3 assigned_pegs_not_in_ss Return all pegs with non-hypothetical assignments that are not in ss. =cut sub assigned_pegs_not_in_ss { my($self, $genome) = @_; my @result = (); for my $peg ($self->pegs_of($genome)) { my $fn = $self->function_of($peg); next if $fn eq ""; next if $self->hypo($fn); my @subs = $self->subsystems_for_peg($peg); if (@subs < 1) { push(@result, [$peg, $fn, "No Subsytem", "No Role"]); } } return @result; } =head3 assigned_pegs Return list of [peg, function, ss, role in ss] for every non-hypo protein regardless of being in ss =cut sub assigned_pegs { my($self, $genome) = @_; my @result = (); for my $peg ($self->pegs_of($genome)) { my $fn = $self->function_of($peg); next if $fn eq ""; next if $self->hypo($fn); my $rdbH = $self->db_handle; my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'"; if (my $relational_db_response = $rdbH->SQL($q)) { my $pair; if(@$relational_db_response > 0) { foreach $pair (@$relational_db_response) { my ($ss, $role) = @$pair; push(@result, [$peg, $fn, $ss, $role]); } } else { push(@result, [$peg, $fn, "No Subsystem", "No Role"]); } } } return @result; } =head3 subsystem_roles Return a list of all roles present in locally-installed subsystems. The return is a hash keyed on role name with each value a list of subsystem names. =cut sub subsystem_roles { my($self) = @_; my $rdbH = $self->db_handle; my $q = "SELECT distinct subsystem, role FROM subsystem_index"; my $ret = {}; if (my $relational_db_response = $rdbH->SQL($q)) { foreach my $pair (@$relational_db_response) { my($subname, $role) = @$pair; push(@{$ret->{$role}}, $subname); } } return $ret; } # # Return just the list of subsystems the peg appears in. # sub peg_to_subsystems { my($self, $peg) = @_; if ($self->is_deleted_fid($peg)) { return () } my %in = map { $_->[0] =~ s/ /_/g; $_->[0] => 1 } $self->subsystems_for_peg($peg); return sort keys(%in); } sub write_subsystem_spreadsheet { my($self,$ssa,$roles,$genomes,$pegs_in_cells) = @_; my(@genomes,$genome,$role,@pegs,$pair,$gs); $ssa =~ s/[ \/]/_/g; &verify_dir("$FIG_Config::data/Subsystems/$ssa"); open(SSA,">$FIG_Config::data/Subsystems/$ssa/spreadsheet") || die "Cannot open $FIG_Config::data/Subsystems/$ssa/spreadsheet"; foreach $pair (@$roles) { print SSA join("\t",@$pair),"\n"; } print SSA "//\n"; print SSA "All\n\nAll\n//\n"; @genomes = map { $_->[1] } sort { ($a->[0] cmp $b->[0]) or ($a->[1] <=> $b->[1]) } map {$genome = $_; $gs = $self->genus_species($genome); [$gs,$genome] } @$genomes; foreach $genome (@genomes) { print SSA "$genome\t0"; foreach $role (@$roles) { $_ = $pegs_in_cells->{"$genome\t$role->[1]"}; @pegs = $_ ? sort { &by_fig_id($a,$b) } @{$_} : (); print SSA "\t",join(",",map { $_ =~ /^fig\|\d+\.\d+\.peg\.(\d+)/; $1 } @pegs); } print SSA "\n"; } close(SSA); chmod(0777,"$FIG_Config::data/Subsystems/$ssa"); } ################################# PEG Translation #################################### =head2 PEG Translations =cut sub translate_pegs { my($self,$pegs,$seq_of, $cb) = @_; my($seq,$aliases,$pegT,%to,%sought,@keys,$peg,$alias); $cb = sub {} unless ref($cb) eq "CODE"; my $tran_peg = {}; my $n = scalar keys (%$pegs); my $idx = 0; foreach $peg (keys(%$pegs)) { $idx++; &$cb("$idx of $n") if $idx % 100 == 0; # # First, see if the peg of the same name locally has the same # last 10 chars. # if (($seq = $self->get_translation($peg)) && (length($seq) > 10) && (length($seq_of->{$peg}) > 10) && (uc substr($seq,-10) eq substr($seq_of->{$peg},-10))) { $tran_peg->{$peg} = $peg; } else { # # Otherwise, search for a local peg that has the same alias # as this peg. (Canonicalize based on the original source) # ($aliases,undef,undef) = @{$pegs->{$peg}}; undef %to; foreach $alias (split(/,/,$aliases)) { if ($pegT = $self->by_alias($alias)) { $to{$pegT}++; } } # # If we have a unique answer, we are done. # Otherwise mark this one as needing more search. # if ((@keys = keys(%to)) == 1) { $tran_peg->{$peg} = $keys[0]; } else { $sought{$peg} = 1; } } } if ((scalar keys(%sought)) > 0) { &tough_search($self,$pegs,$seq_of,$tran_peg,\%sought); } return $tran_peg; } =head3 tough_search($pegs, $seq_of, $tran_peg, $sought) $pegs - not used $seq_of - hash from peg to peg sequence $tran_peg - hash into which translated pegs are placed $sought - hash keyed on the list of pegs we're looking for. =cut sub tough_search { my($self,$pegs,$seq_of,$tran_peg,$sought) = @_; my($peg,$seq,%needH,%needT,%poss,$id,$sub,$to,$x,$genome); # # Construct %needT, key is 50-bases from tail of sequence, values are pegs from # the list of pegs we're seeking. # # %needH is the same, but keyed on 50 bases from the head of the sequence. # foreach $peg (keys(%$sought)) { if (($seq = $seq_of->{$peg}) && (length($seq) > 50)) { my $sub = substr($seq,-50); push(@{$needT{$sub}},$peg); $sub = substr($seq,0,50); push(@{$needH{$sub}},$peg); } } # print STDERR &Dumper(\%needT,\%needH); open(NR,"<$FIG_Config::global/nr") || die "could not open $FIG_Config::global/nr"; $/ = "\n>"; while (defined($_ = <NR>)) { chomp; if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s) { $id = $1; $seq = $2; if (length($seq) >= 50) { $sub = uc substr($seq,-50); if ((($x = $needT{uc substr($seq,-50)}) && (@$x == 1)) || (($x = $needH{uc substr($seq,0,50)}) && (@$x == 1))) { $peg = $x->[0]; my @same = grep { $_ =~ /^fig/ } map { $_->[0] } $self->mapped_prot_ids($id); if (@same > 0) { push(@{$poss{$peg}},@same); } } } } } close(NR); $/ = "\n"; # print STDERR &Dumper(\%poss); foreach $peg (keys(%poss)) { # print STDERR "processing $peg\n"; $x = $poss{$peg}; if (@$x == 1) { $tran_peg->{$peg} = $x->[0]; delete $sought->{$peg}; } elsif ($genome = $self->probable_genome($self->genome_of($peg),$tran_peg)) { # print STDERR " mapped to genome $genome\n"; my $genomeQ = quotemeta $genome; my @tmp = grep { $_ =~ /^fig\|$genomeQ\./ } @$x; if (@tmp == 1) { $tran_peg->{$peg} = $tmp[0]; delete $sought->{$peg}; } else { # print STDERR &Dumper(["failed",$peg,$genome,\@tmp]); } } else { # print STDERR "could not place genome for $peg\n"; } } foreach $peg (keys(%$sought)) { print STDERR "failed to map $peg\n"; } } sub probable_genome { my($self,$genome,$tran_peg) = @_; my($peg,%poss,@poss); my $genomeQ = quotemeta $genome; foreach $peg (grep { $_ =~ /^fig\|$genomeQ\./ } keys(%$tran_peg)) { $poss{$self->genome_of($tran_peg->{$peg})}++; } @poss = sort { $poss{$b} <=> $poss{$a} } keys(%poss); if ((@poss == 1) || ((@poss > 1) && ($poss{$poss[0]} > $poss{$poss[1]}))) { return $poss[0]; } else { # print STDERR &Dumper(["could not resolve",\%poss,$genome]); return undef; } } =head3 find_genome_by_content Find a genome given the number of contigs, number of nucleotides, and checksum. We pass in a potential name for the genome as a quick starting check. =cut sub find_genome_by_content { my($self, $genome, $n_contigs, $n_nucs, $cksum) = @_; my(@genomes); my $gbase = (split(/\./, $genome))[0]; # # Construct the list of genomes so that we first check ones with the same # base-part as the one passed in. # for ($self->genomes()) { if (/^$gbase\./) { unshift(@genomes, $_); } else { push(@genomes, $_); } } for my $genome (@genomes) { if (open(my $cfh, "<$FIG_Config::organisms/$genome/COUNTS")) { if ($_ = <$cfh>) { my($cgenome, $cn_contigs, $cn_nucs, $ccksum) = split(/\t/); if ($cgenome eq $genome and $cn_contigs == $n_contigs and $cn_nucs == $n_nucs and $ccksum == $cksum) { return $genome; } } close($cfh); } } return undef; } ################################# Support for PEG Links #################################### =head2 Links =cut sub peg_links { my($self,$fid) = @_; return $self->fid_links($fid); } =head3 fid_links C<< my @links = $fig->fid_links($fid); >> Return a list of hyperlinks to web resources about a specified feature. =over 4 =item fid ID of the feature whose hyperlinks are desired. =item RETURN Returns a list of raw HTML strings representing hyperlinks to web pages relating to the specified feature. =back =cut #: Return Type @; sub fid_links { my($self,$fid) = @_; my($i,$got,$genome,$fidN); if ($self->is_deleted_fid($fid)) { return () } my @links = (); my @aliases = $self->feature_aliases($fid); my $link_file; for $link_file (("$FIG_Config::global/fid.links","$FIG_Config::global/peg.links")) { if (open(GLOBAL,"<$link_file")) { while (defined($_ = <GLOBAL>)) { chop; my($pat,$link) = split(/\t/,$_); for ($i=0,$got=0; (! $got) && ($i < @aliases); $i++) { if ($aliases[$i] =~ /$pat/) { push(@links,eval "\"$link\""); $got = 1; } } } close(GLOBAL); } } my $relational_db_response; my $rdbH = $self->db_handle; if (($relational_db_response = $rdbH->SQL("SELECT link FROM fid_links WHERE ( fid = \'$fid\' )")) && (@$relational_db_response > 0)) { push(@links, map { $_->[0] } @$relational_db_response); } return sort { $a =~ /\>([^\<]+)\<\/a\>/; my $l1 = $1; $b =~ /\>([^\<]+)\<\/a\>/; my $l2 = $1; $a cmp $b } @links; } # Each link is a 2-tuple [fid,link] sub add_peg_links { my($self,@links) = @_; return $self->add_fid_links(@links); } sub add_fid_links { my($self,@links) = @_; my($fid,$link,$pair,$i); my $relational_db_response; my $rdbH = $self->db_handle; foreach $pair (@links) { ($fid,$link) = @$pair; if (($relational_db_response = $rdbH->SQL("SELECT link FROM fid_links WHERE ( fid = \'$fid\' )")) && (@$relational_db_response > 0)) { for ($i=0; ($i < @$relational_db_response) && ($relational_db_response->[$i]->[0] ne $link); $i++) {} if ($i == @$relational_db_response) { &add_fid_link($self,$fid,$link); } } else { &add_fid_link($self,$fid,$link); } } } sub add_fid_link { my($self,$fid,$link) = @_; if ($self->is_deleted_fid($fid)) { return } my $rdbH = $self->db_handle; ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid"; my $type = $1; $rdbH->SQL("INSERT INTO fid_links ( fid,link ) VALUES ( \'$fid\',\'$link\' )"); if (($fid =~ /^fig\|(\d+\.\d+)\.fid\.\d+$/) && open(TMP,">>$FIG_Config::organisms/$1/Features/$type/links")) { print TMP "$fid\t$link\n"; close(TMP); chmod 0777,"$FIG_Config::organisms/$1/Features/$type/links"; } } sub delete_peg_link { my($self,$peg,$link) = @_; $self->delete_fid_link($peg,$link); } sub delete_fid_link { my($self,$fid,$link) = @_; my($i); if ($self->is_deleted_fid($fid)) { return } my $genome = $self->genome_of($fid); ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid"; my $type = $1; my $rdbH = $self->db_handle; $rdbH->SQL("DELETE FROM fid_links WHERE ( fid = \'$fid\' AND link = \'$link\' )"); my $file; foreach $file (("$FIG_Config::organisms/$genome/Features/$type/$type.links","$FIG_Config::organisms/$genome/Features/$type/links")) { if (-s $file) { my @links = `cat $file`; for ($i=0; ($i < @links) && (! (($links[$i] =~ /^(\S+)\t(\S.*\S)/) && ($1 eq $fid) && ($2 eq $link))); $i++) {} if (($i < @links) && open(TMP,">$file")) { splice(@links,$i,1); print TMP join("",@links); close(TMP); } } } } sub delete_all_peg_links { my($self,$peg) = @_; $self->delete_all_fid_links($peg); } sub delete_all_fid_links { my($self,$fid) = @_; my($i); if ($self->is_deleted_fid($fid)) { return } my $genome = $self->genome_of($fid); my $rdbH = $self->db_handle; $rdbH->SQL("DELETE FROM fid_links WHERE ( fid = \'$fid\' )"); ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid"; my $type = $1; my $file; foreach $file (("$FIG_Config::organisms/$genome/Features/$type/$type.links","$FIG_Config::organisms/$genome/Features/$type/links")) { if (-s $file) { my @links = `cat $file`; my @links1 = grep { ! (($_ =~ /^(\S+)/) && ($1 eq $fid)) } @links; if ((@links1 < @links) && open(TMP,">$file")) { print TMP join("",@links1); close(TMP); } } } } ########### # # =head2 Peg Searches and Similarities Some routines for dealing with peg search and similarities. This is code lifted from pom.cgi and reformatted for more general use. Find the given role in the given (via CGI params) organism. We do this by finding a list of pegs that are annotated to have this role in other organisms that are "close enough" to our organism We then find pegs in this organism that are similar to these pegs. =cut sub find_role_in_org { my($self,$role, $org, $user, $sims_cutoff) = @_; my($id2,$psc,$col_hdrs,$tab,$peg,$curr_func,$id2_func); my($seen,$peg); if (!$org) { return undef; } # # Create a list of candidates. # # These are the list of sequences that contain the given role, # sorted by the crude_estimate_of_distance from the given peg. # my @cand = map { $_->[0] } sort { $a->[1] <=> $b->[1] } map { $peg = $_; [$peg,$self->crude_estimate_of_distance($org,&FIG::genome_of($peg))] } $self->seqs_with_role($role,$user); my $hits = {}; $seen = {}; # # Pick the top 10 hits if there are more than 10. # my $how_many0 = ((@cand > 10) ? 10 : scalar @cand) - 1; $self->try_to_locate($org,$hits,[@cand[0..$how_many0]],$seen, $sims_cutoff); if (keys(%$hits) == 0) { splice(@cand,0,$how_many0+1); &try_to_locate($self,$org,$hits,\@cand,$seen, $sims_cutoff); } # # At this point %$hits contains the pegs in our organism that # may have the given role. The key is the peg, the value # is a pair [score, similar-peg] # # # We reformat this into a list of entries # [ $psc, $peg-in-this-org, $length, $current-fn, $matched-protein, $matched-len, $matched-fun] # $col_hdrs = ["P-Sc","PEG","Ln1","Current Function", "Protein Hit","Ln2","Function"]; my @ret; foreach $peg ( sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits)) { ($psc,$id2) = @{$hits->{$peg}}; $curr_func = $self->function_of($peg,$user); $id2_func = $self->function_of($id2,$user); push(@ret, [$psc, $peg, $self->translation_length($peg), $curr_func, $id2, $self->translation_length($id2),$id2_func]); } return @ret; } =head2 Utility Methods =head3 run_in_background Background job support. If one wants to turn a script into a background, invoke $fig->run_in_background($coderef). This will cause $coderef to be invoked as a background job. This means its output will be written to $FIG_Config::data/Global/background_jobs/<pid>, and that it shows up and is killable via the seed control panel. =cut sub run_in_background { my($self, $coderef, $close_fds) = @_; if (ref($coderef) ne "CODE") { warn "FIG::run_in_background() invoked without a code reference\n"; return; } my $job_dir = "$FIG_Config::data/Global/background_jobs"; verify_dir($job_dir); my $child = fork; if (!defined($child)) { die "run_in_background: fork failed: $!\n"; } if ($child == 0) { # # In the child process. # POSIX::setsid(); my $d = $self->db_handle(); if ($d) { my $dbh = $d->{_dbh}; $dbh->{InactiveDestroy} = 1; } if ($close_fds) { for (my $fd = 3; $fd < 32; $fd++) { POSIX::close($fd); } } my $my_job_dir = "$job_dir/$$"; verify_dir($my_job_dir); open(my $info, ">$my_job_dir/INFO"); my $now = asctime(localtime(time())); chomp($now); print $info "Background job $$ created from run_in_background by $> on $now\n"; close($info); # # Redirect stdout/stderr to the OUTPUT file. # close(STDOUT); close(STDERR); open STDOUT, ">$my_job_dir/OUTPUT"; open STDERR, ">&STDOUT"; select(STDERR); $| = 1; select(STDOUT); $| = 1; # # Make stdin be from nowhere. # open STDIN, "</dev/null"; # # Run the code. # open(my $stat, ">$my_job_dir/STATUS"); print $stat "Job started at $now\n"; close($stat); eval { &$coderef; }; open(my $stat, ">$my_job_dir/STATUS"); if ($@ eq "") { print $stat "Finished successfully\n"; } else { print $stat "Code had an error:\n$@\n"; } close($stat); # # We need to undef this, otherwise the DBrtns destructor # will do an explicit dbh->disconnect, which will undo any # effect of the InactiveDestroy set above. # my $d = $self->db_handle(); if ($d) { delete $d->{_dbh}; } exit; } return $child; } sub get_all_jobs :List { my($self) = @_; my $job_dir = "$FIG_Config::data/Global/background_jobs"; opendir(my $jfh, $job_dir); my @jobs = grep { $_ =~ /^\d+$/ } readdir($jfh); closedir($jfh); return @jobs; } sub get_job :Scalar { my($self, $job_id) = @_; my $job_dir = "$FIG_Config::data/Global/background_jobs/$job_id"; if (-d $job_dir) { return FIG::Job->new($job_id, $job_dir); } else { return undef; } } sub get_current_arch :Scalar { my $arch; open(FH, "<$FIG_Config::fig_disk/config/fig-user-env.sh"); while (<FH>) { chomp; if (/^RTARCH=\"(.*)\"/) { $arch = $1; last; } } return $arch; } ############################### Interfaces to Other Systems ###################################### # =head2 External Interface Methods This section contains the functionality introduced by the interface with GenDB. The initial two functions simply register when GenDB has a version of the genome (so we can set links to it when displaying PEGS: =head3 has_genome usage: has_genome("GenDB",$genome) Invoking this routine just records that GenDB has a copy of the genome designated by $genome. =cut sub has_genome { my($system,$genome) = @_; print STDERR "$system has $genome\n"; } =head3 dropped_genome usage: dropped_genome("GenDB",$genome) Invoking this routine just records that GenDB should no longer be viewed as having a copy of the genome designated by $genome. =cut sub dropped_genome { my($system,$genome) = @_; print STDERR "$system dropped $genome\n"; } =head3 link_to_system usage: $url = link_to_system("GenDB",$fid) # usually $fid is a peg, but it can be other types of features, as well This routine is used to get a URL that can be used to "flip" from one system to the other. If the feature is unknown to the system, undef should be returned. =cut sub link_to_system { my($system,$fid) = @_; return undef; } ############################### Adding, Deleting, Altering Features #################### =head2 Feature Update Methods The following routines support alteration of features =head3 delete_feature usage: $fig->delete_feature($fid) Invoking this routine deletes the feature designated by $fid. =cut sub delete_feature { my($self,$fid) = @_; open(TMP,">>$FIG_Config::global/deleted.features") || die "could not open $FIG_Config::global/deleted.features"; flock(TMP,LOCK_EX) || confess "cannot lock deleted.features"; print TMP "$fid\n"; close(TMP); chmod 0777, "$FIG_Config::global/deleted.features"; $self->{_deleted_fids} = undef; } =head3 add_feature C<< my $fid = $fig->add_feature($genome,$type,$location,$aliases,$translation,$fid); >> Invoking this routine adds the feature, returning a new (generated) $fid. It is also possible to specify the feature ID, which is recommended if the feature is to be permanent. (In order to do this the ID needs to be allocated from the clearinghouse machine.) The translation is optional and only applies to PEGs. =over 4 =item genome ID of the genome to which the feature belongs. =item type Type of the feature (C<peg>, C<rna>, etc.) =item location Location of the feature, in the form of a comma-delimited list of location specifiers. These are of the form I<contig>C<_>I<begin>C<_>I<end>, where I<contig> is the ID of a contig, and I<begin> and I<end> are the starting and stopping offsets of the location. These offsets are 1-based, and depending on the strand, the beginning offset could be larger than the ending offset. =item aliases A comma-delimited list of alias names for the feature. =item translation (optional) The protein translation of the feature, if it is a peg. =item fid (optional) The ID to give to the new feature. If this parameter is omitted, an ID will be generated automatically. =item RETURN Returns the new feature's ID if successful,or C<undef> if an error occurred. =back =cut sub add_feature { my($self,$genome,$type,$location,$aliases,$translation,$fid) = @_; my $dbh = $self->db_handle(); $aliases = $aliases ? $aliases : ""; my $aliasesT = $aliases; $aliasesT =~ s/,/\t/g; my @aliases = split(/\t/,$aliasesT); if (! defined $fid) { $fid = $self->next_fid($genome,$type); } &add_tbl_entry($fid,$location,$aliasesT); if (($type eq "peg") and $translation) { $self->add_translation($fid,$translation); } my @loc = split(/,/,$location); my($contig,$beg,$end); if (($loc[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($location =~ /(\d+)$/)) { $end = $1; if ($beg > $end) { ($beg,$end) = ($end,$beg) } $fid =~ /(\d+)$/; my $fidN = $1; if ((length($location) < 5000) && (length($contig) < 96) && (length($fid) < 32) && ($fid =~ /(\d+)$/)) { my $rv = $dbh->SQL("INSERT INTO features (id,idN,type,genome,location,contig,minloc,maxloc,aliases) VALUES ('$fid',$fidN,'$type','$genome','$location','$contig',$beg,$end,'$aliases')"); if (@aliases > 0) { my $alias; foreach $alias (@aliases) { if ($alias =~ /^(NP_|gi\||sp\|\tr\||kegg\||uni\|)/) { $dbh->SQL("INSERT INTO ext_alias (id,alias,genome) VALUES ('$fid','$alias','$genome')"); } } } return $fid; } } return undef; } sub next_fid { my($self,$genome,$type) = @_; my $dbh = $self->db_handle(); my $res = $dbh->SQL("select max(idN) from features where (genome = '$genome' and type = '$type')"); return undef unless $res; my $fidN = $res->[0]->[0] + 1; while ($self->is_deleted_fid("fig\|$genome\.$type\.$fidN")) { $fidN++; } return "fig\|$genome\.$type\.$fidN"; } sub is_deleted_fid { my($self,$fid) = @_; my($x,$y); if (! ($x = $self->{_deleted_fids})) { $self->{_deleted_fids} = {}; if (open(TMP,"<$FIG_Config::global/deleted.features")) { # # Feh. Try using this method inside a loop with $/ set and you'lll get bitten. # local $/ = "\n"; while ($y = <TMP>) { if ($y =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/) { $self->{_deleted_fids}->{$1} = 1; } } close(TMP); } $x = $self->{_deleted_fids}; } return $x->{$fid}; } sub fid_with_changed_location { my($self,$fid) = @_; my($x); if (! ($x = $self->{_changed_location_fids})) { $self->{_changed_location_fids} = {}; if (open(TMP,"<$FIG_Config::global/changed.location.features")) { while ($_ = <TMP>) { if ($_ =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/) { $self->{_changed_location_fids}->{$1}++; } } close(TMP); } $x = $self->{_changed_location_fids}; } return $x->{$fid}; } =head3 call_start usage: $fig->call_start($genome,$loc,$translation,$against) This routine can be invoked to produce an estimate of the correct start, given a location in a genome believed to be a protein-encoding gene, along with a set of PEGs that are believed to be orthologs. If called in a list context, it returns a list containing a string representing the estimated start location a confidence measure (better than 0.2 seems to be pretty solid) a new translation If called in a scalar context, it returns its best prediction of the start. =cut sub call_start { my($self,$genome,$loc,$tran,$against) = @_; my($peg); open(TMP,"| start_data_for_set_of_pegs use-close > $FIG_Config::temp/tmp.objects$$") || die "could not set up pipe to start_data_for_set_of_pegs"; print TMP "new|$genome\.peg\.1\t$loc\t$tran\n"; foreach $peg (@$against) { print TMP "$peg\tno_recall\n"; } close(TMP); &FIG::run("predict_starts $FIG_Config::temp/tmp.objects$$ $FIG_Config::temp/clear$$ $FIG_Config::temp/proposed$$ > /dev/null"); if (-s "$FIG_Config::temp/proposed$$") { my @changes = `changed_starts $FIG_Config::temp/proposed$$ /dev/null`; if ((@changes == 1) && ($changes[0] =~ /^\S+\t\S+\t(\S+)\t(\S+)/)) { my($new_loc,$conf) = ($1,$2); if (($ENV{FIG_VERBOSE}) && open(TMP,"<$FIG_Config::temp/proposed$$")) { while (defined($_ = <TMP>)) { print STDERR $_ } close(TMP); } my $proposed = wantarray ? join("",`cat $FIG_Config::temp/proposed$$`) : ""; $proposed =~ s/^ID=[^\n]+\n//s; unlink("$FIG_Config::temp/tmp.objects$$","$FIG_Config::temp/clear$$","$FIG_Config::temp/proposed$$"); return wantarray ? ($new_loc,$conf,$self->fixed_translation($tran,$genome,$loc,$new_loc),$proposed) : $new_loc; } } unlink("$FIG_Config::temp/tmp.objects$$","$FIG_Config::temp/clear$$","$FIG_Config::temp/proposed$$"); return wantarray ? ($loc,0,$tran,"") : $loc; } sub fixed_translation { my($self,$old_tran,$genome,$old_loc,$new_loc) = @_; my($extra,$trimmed,$new_tran); if ($old_loc =~ /^(\S+)_(\d+)_(\d+)$/) { my($contigO,$begO,$endO) = ($1,$2,$3); if ($new_loc =~ /^(\S+)_(\d+)_(\d+)$/) { my($contigN,$begN,$endN) = ($1,$2,$3); if ($begO < $endO) { if ($begO < $begN) { $trimmed = ($begN - $begO) / 3; $new_tran = &translate($self->dna_seq($genome,join("_",($contigO,$begN,$begN+2))),undef,"start") . substr($old_tran,$trimmed+1); } else { $extra = ($begO - $begN) / 3; $new_tran = &translate($self->dna_seq($genome,join("_",($contigO,$begN,$begO+2))),undef,"start") . substr($old_tran,1); } } else { if ($begO > $begN) { $trimmed = ($begO - $begN) / 3; $new_tran = &translate($self->dna_seq($genome,join("_",($contigO,$begN,$begN-2))),undef,"start") . substr($old_tran,$trimmed+1); } else { $extra = ($begN - $begO) / 3; $new_tran = &translate($self->dna_seq($genome,join("_",($contigO,$begN,$begO-2))),undef,"start") . substr($old_tran,1); } } return $new_tran; } } return $old_tran; } =head3 pick_gene_boundaries usage: $fig->pick_gene_boundaries($genome,$loc,$translation) This routine can be invoked to expand a region of similarity to potential gene boundaries. It does not try to find the best start, but only the one that is first after the beginning of the ORF. It returns a list containing the predicted location and the expanded translation. Thus, you might use ($new_loc,$new_tran) = $fig->pick_gene_boundaries($genome,$loc,$tran); $recalled = $fig->call_start($genome,$new_loc,$new_tran,\@others); to get the location of a recalled gene (in, for example, the process of correcting a frameshift). =cut sub pick_gene_boundaries { my($self,$genome,$loc,$tran) = @_; my($leftStop,$firstStart,$start,$end,$rightStop); my $full_loc = new FullLocation($self,$genome,$loc,$tran); $leftStop = $full_loc->Search("taa|tga|tag",$full_loc->PrevPoint,"-",9000); # attempt to get a stop to the left if ($leftStop) { if ($firstStart = $full_loc->Search("atg|gtg|ttg", $leftStop,"+",9000)) # if you succeed, attempt to get the first # start to the right of it { $start = $firstStart; # if you get the start, that is where the actual sequence/tran begin } else { return undef; # else this cannot be a gene } } else { $start = $full_loc->ExtremeCodon('first'); # If no stop was found, we start with the first codon } $rightStop = $full_loc->Search("taa|tga|tag",$full_loc->NextPoint,"+",9000); # loc for the first stop to the right if ($rightStop) # if you get it, adjust the position to the third base of the stop codon { $end = $full_loc->Adjusted($rightStop,+2); } else { $end = $full_loc->ExtremeCodon('last'); # else, adjust to the last base of the last codon to the right } $full_loc->Extend($start,$end,"trim"); my $tran = $full_loc->Translation; $tran =~ s/\*$//; # The location should contain the stop, but the translation should not return ($full_loc->SeedString,$tran); } =head3 change_location_of_feature usage: $fig->change_location_of_feature($fid,$location,$translation) Invoking this routine changes the location of the feature. The $translation argument is optional (and applies only to PEGs). The routine returns 1 on success and 0 on failure. =cut sub change_location_of_feature { my($self,$fid,$location,$translation) = @_; my($x); if ($self->is_deleted_fid($fid)) { return 0 } my $dbh = $self->db_handle(); my $genome = &genome_of($fid); my $type = &ftype($fid); my($got) = 0; my @loc = split(/,/,$location); my($contig,$beg,$end); if (($loc[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($location =~ /(\d+)$/)) { $end = $1; if ($beg > $end) { ($beg,$end) = ($end,$beg) } } else { return 0; } my @tmp = grep { ($_ =~ /^(\S+)/) && ($1 eq $fid) } `grep '$fid' $FIG_Config::organisms/$genome/Features/$type/tbl`; if (@tmp > 0) { $x = $tmp[$#tmp]; chop $x; my @flds = split(/\t/,$x); shift @flds; shift @flds; my $aliasesT = (@flds > 0) ? join("\t",@flds) : ""; &add_tbl_entry($fid,$location,$aliasesT); $dbh->SQL("UPDATE features SET location = '$location', contig = '$contig', minloc = $beg, maxloc = $end WHERE id = '$fid'"); if (my $locations = $self->cached('_location')) { $locations->{$fid} = $location; } open(TMP,">>$FIG_Config::global/changed.location.features") || die "could not open $FIG_Config::global/changed.location.features"; flock(TMP,LOCK_EX) || confess "cannot lock changed.location.features"; print TMP "$fid\n"; close(TMP); chmod 0777, "$FIG_Config::global/changed.location.features"; $self->{_changed_location_fids} = undef; if (($type eq "peg") && defined($translation)) { $self->add_translation($fid,$translation); } $got = 1 } return $got; } sub add_tbl_entry { my($fid,$location,$aliasesT) = @_; my $genome = &genome_of($fid); my $file = "$FIG_Config::organisms/$genome/Features/peg/tbl"; open(TMP,">>$file") || die "could not open $file"; flock(TMP,LOCK_EX) || confess "cannot lock $file"; print TMP "$fid\t$location\t$aliasesT\n"; close(TMP); chmod 0777, "$file"; } sub add_translation { my($self,$fid,$translation) = @_; my $genome = &genome_of($fid); my $file = "$FIG_Config::organisms/$genome/Features/peg/fasta"; if (open(TMP,">>$file")) { flock(TMP,LOCK_EX) || confess "cannot lock $file"; print TMP ">$fid\n"; my $seek = tell TMP; my $ln = length($translation); print TMP "$translation\n"; close(TMP); chmod 0777, $file; my $fileno = $self->file2N($file); my $dbh = $self->db_handle(); $dbh->SQL("DELETE FROM protein_sequence_seeks where id = '$fid'"); $dbh->SQL("INSERT INTO protein_sequence_seeks (id,fileno,seek,len,slen) VALUES ('$fid',$fileno,$seek,$ln+1,$ln)"); } } sub peg_in_gendb { my($self, $peg) = @_; my $genome = &genome_of($peg); return $self->genome_in_gendb($genome); } sub genome_in_gendb { my($self, $genome) = @_; return 0; } =head2 FIG::Job module =cut ### Begin FIG::Job module package FIG::Job; use FIGAttributes; use base 'FIGAttributes'; sub new { my($class, $job_id, $job_dir) = @_; my $self = { id => $job_id, dir => $job_dir, }; return bless $self, $class; } sub status :Scalar { my($self) = @_; return &FIG::file_read("$self->{dir}/STATUS"); } sub running :Scalar { my($self) = @_; my $rc; warn "running test on $self->{id}\n"; if (kill(0, $self->{id}) > 0) { $rc = 1; } else { $rc = 0; } warn "running returns $rc\n"; return $rc; } sub info :Scalar :List { my($self) = @_; return &FIG::file_read("$self->{dir}/INFO"); } sub output :Scalar :List { my($self) = @_; return &FIG::file_read("$self->{dir}/OUTPUT"); } ######### End FIG::Job ## package FIG; # # DAS support. # =head3 init_das Initialize a DAS data query object. =cut sub init_das { my($self, $url, $dsn) = @_; my $das_data_dir = "$FIG_Config::global/DAS"; if (-d $das_data_dir) { return new SeedDas($self,$das_data_dir, $url, $dsn); } else { return undef; } } package FIG::SimLock; # # Little package to implement a lock for sims work. # use strict; use Fcntl qw/:flock/; # import LOCK_* constants sub new { my($class) = @_; my $pool_dir = "$FIG_Config::global/sim_pools"; &FIG::verify_dir($pool_dir); # # Lock the pool directory. # open(my $lock, ">$pool_dir/lockfile"); flock($lock, LOCK_EX); my $self = { lock_fh => $lock, }; return bless($self, $class); } sub DESTROY { my($self) = @_; warn "$$ unlocking sims lock\n"; $self->{lock_fh}->close(); } package FIG; 1;
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |