[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.234 - (download) (as text) (annotate)
Sat Mar 5 04:29:59 2005 UTC (14 years, 9 months ago) by redwards
Branch: MAIN
Changes since 1.233: +10 -2 lines
added highlight by peg attribute to subsystem

package FIG;

use strict;

use Fcntl qw/:flock/;  # import LOCK_* constants

use POSIX;
use IPC::Open2;

use DBrtns;
use Sim;
use Blast;
use FIG_Config;
use tree_utilities;
use Subsystem;
use SeedDas;
use Construct;
use FIGRules;
use Tracer;

#
# Conditionally evaluate this in case its prerequisites are not available.
#

our $ClearinghouseOK = eval {
    require Clearinghouse;
};

use IO::Socket;

use FileHandle;

use Carp;
use Data::Dumper;
use Time::Local;
use File::Spec;
use File::Copy;
#
# Try to load the RPC stuff; it might fail on older versions of the software.
#
eval {
    require FIGrpc;
};

my $xmlrpc_available = 1;
if ($@ ne "")
{
    $xmlrpc_available = 0;
}


use FIGAttributes;
use base 'FIGAttributes';

use vars qw(%_FunctionAttributes);

use Data::Dumper;

#
# Force all new files to be all-writable.
#

umask 0;

=head1 FIG Genome Annotation System

=head2 Introduction

This is the main object for access to the SEED data store. The data store
itself is a combination of flat files and a database. The flat files can
be moved easily between systems and the database rebuilt as needed.

A reduced set of this object's functions are available via the B<SFXlate>
object. The SFXlate object uses a single database to represent all its
genomic information. It provides a much smaller capability for updating
the data, and eliminates all similarities except for bidirectional best
hits.

The key to making the FIG system work is proper configuration of the
C<FIG_Config.pm> file. This file contains names and URLs for the key
directories as well as the type and login information for the database.

=cut

#: Constructor FIG->new();

=head2 Public Methods

=head3 new

C<< my $fig = FIG->new(); >>

This is the constructor for a FIG object. It uses no parameters.

=cut

sub new {
    my($class) = @_;

    #
    # Check to see if we have a FIG_URL environment variable set.
    # If we do, don't actually create a FIG object, but rather
    # create a FIGrpc and return that as the return from this constructor.
    #

    if ($ENV{FIG_URL} ne "" && $xmlrpc_available) {
        Trace("Creating figrpc for '$ENV{FIG_URL}'") if T(0);
        my $figrpc = new FIGrpc($ENV{FIG_URL});
        return $figrpc;
    }

    my $rdbH = new DBrtns;
    bless {
        _dbf  => $rdbH,
    }, $class;
}


=head3 get_system_name

C<< my $name = $fig->get_system_name; >>

Returns C<seed>, indicating that this is object is using the SEED
database. The same method on an SFXlate object will return C<sprout>.

=cut
#: Return Type $;
sub get_system_name {
    return "seed";
}

# Destructor: releases the database handle.

sub DESTROY {
    my($self) = @_;
    my($rdbH);

    if ($rdbH = $self->db_handle) {
        $rdbH->DESTROY;
    }
}

=head3 delete_genomes

C<< $fig->delete_genomes(\@genomes); >>

Delete the specified genomes from the data store. This requires making
system calls to move and delete files.

=cut
#: Return Type ;
sub delete_genomes {
    my($self,$genomes) = @_;
    my $tmpD     = "$FIG_Config::temp/tmp.deleted.$$";
    my $tmp_Data = "$FIG_Config::temp/Data.$$";

    my %to_del = map { $_ => 1 } @$genomes;
    open(TMP,">$tmpD") || die "could not open $tmpD";

    my $genome;
    foreach $genome ($self->genomes)
    {
	if (! $to_del{$genome})
	{
	    print TMP "$genome\n";
	}
    }
    close(TMP);

    &run("extract_genomes $tmpD $FIG_Config::data $tmp_Data");

#   &run("mv $FIG_Config::data $FIG_Config::data.deleted; mv $tmp_Data $FIG_Config::data; fig load_all; rm -rf $FIG_Config::data.deleted");

    &run("mv $FIG_Config::data $FIG_Config::data.deleted");
    &run("mv $tmp_Data $FIG_Config::data");
    &run("fig load_all");
    &run("rm -rf $FIG_Config::data.deleted");
}

=head3 add_genome

C<< my $ok = $fig->add_genome($genomeF); >>

Add a new genome to the data store. A genome's data is kept in a directory
by itself, underneath the main organism directory.

=over 4

=item genomeF

Name of the directory containing the genome files.

=item RETURN

Returns TRUE if successful, else FALSE.

=back

=cut
#: Return Type $;
sub add_genome {
    my($self,$genomeF) = @_;

    my $rc = 0;

    my(undef, $path, $genome) = File::Spec->splitpath($genomeF);

    if ($genome !~ /^\d+\.\d+$/)
    {
	warn "Invalid genome filename $genomeF\n";
	return $rc;
    }

    if (-d $FIG_Config::organisms/$genome)
    {
	warn "Organism already exists for $genome\n";
	return $rc;
    }


    #
    # We're okay, it doesn't exist.
    #

    my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`;

    if (@errors)
    {
	warn "Errors found while verifying genome directory $genomeF:\n";
	print join("", @errors);
	return $rc;
    }

    &run("cp -r $genomeF $FIG_Config::organisms");
    &run("chmod -R 777 $FIG_Config::organisms/$genome");

    &run("index_contigs $genome");
    &run("compute_genome_counts $genome");
    &run("load_features $genome");

    $rc = 1;
    if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta")
    {
	&run("index_translations $genome");
	my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`;
	chomp @tmp;
	&run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr");
	&enqueue_similarities(\@tmp);
    }
    if ((-s "$FIG_Config::organisms/$genome/assigned_functions") ||
	(-d "$FIG_Config::organisms/$genome/UserModels"))
    {
	&run("add_assertions_of_function $genome");
    }

    return $rc;
}

=head3 enqueue_similarities

usage: enqueue_similarities(\@sims)

Queue the passed fids (a reference to a list) for similarity
computation.

=cut
#: Return Type ;
sub enqueue_similarities {
    my($fids) = @_;
    my $fid;

    my $sim_q = "$FIG_Config::global/queued_similarities";

    open(TMP,">>$sim_q")
	|| die "could not open $sim_q";

    #
    # We need to lock here so that if a computation is creating a snapshot of the
    # queue, we block until it's done.
    #

    flock(TMP, LOCK_EX) or die "Cannot lock $sim_q\n";

    foreach $fid (@$fids)
    {
	print TMP "$fid\n";
    }
    close(TMP);
}

=head3 create_sim_askfor_pool

usage: create_sim_askfor_pool()

Creates an askfor pool, a snapshot of the current NR and similarity
queue. Zeros out the old queue.

The askfor pool needs to keep track of which sequences need to be
calculated, which have been handed out, etc. To simplify this task we
chunk the sequences into fairly small numbers (10-20 sequences) and
allocate work on a per-chunk basis. We make use of the relational
database to keep track of chunk status as well as the seek locations
into the file of sequence data. The initial creation of the pool
involves indexing the sequence data with seek offsets and lengths and
populating the sim_askfor_index table with this information and with
initial status information.

=cut
#: Return Type $;
sub create_sim_askfor_pool
{
    my($self, $chunk_size) = @_;

    $chunk_size = 15 unless $chunk_size =~ /^\d+$/;

    my $pool_dir = "$FIG_Config::global/sim_pools";
    &verify_dir($pool_dir);

    #
    # Lock the pool directory.
    #
    open(my $lock, ">$pool_dir/lockfile");

    flock($lock, LOCK_EX);

    my $num = 0;
    if (open(my $toc, "<$pool_dir/TOC"))
    {
	while (<$toc>)
	{
	    chomp;
	    # print STDERR "Have toc entry  $_\n";
	    my ($idx, $time, $str) = split(/\s+/, $_, 3);

	    $num = max($num, $idx);
	}
	close($toc);
    }
    $num++;
    open(my $toc, ">>$pool_dir/TOC") or die "Cannot write $pool_dir/TOC: $!\n";

    print $toc "$num ", time(), " New toc entry\n";
    close($toc);

    my $cpool_id = sprintf "%04d", $num;
    my $cpool_dir = "$pool_dir/$cpool_id";

    #
    # All set, create the directory for this pool.
    #

    &verify_dir($cpool_dir);

    #
    # Now we can copy the nr and sim queue here.
    # Do this stuff inside an eval so we can clean up
    # the lockfile.
    #

    eval {
	my $sim_q = "$FIG_Config::global/queued_similarities";

	copy("$sim_q", "$cpool_dir/q");
	copy("$FIG_Config::data/Global/nr", "$cpool_dir/nr");

	open(F, ">$sim_q") or die "Cannot open $sim_q to truncate it: $!\n";
	close(F);
    };

    unlink("$pool_dir/lockfile");
    close($lock);

    #
    # We've created our pool; we can now run the formatdb and
    # extract the sequences for the blast run.
    #
    my $child_pid = $self->run_in_background(sub {
	#
	# Need to close db or there's all sorts of trouble.
	#

	my $cmd = "$FIG_Config::ext_bin/formatdb -i $cpool_dir/nr -p T -l $cpool_dir/formatdb.log";
	print "Will run '$cmd'\n";
 	&run($cmd);
	print "finished. Logfile:\n";
	print &FIG::file_read("$cpool_dir/formatdb.log");
 	unlink("$cpool_dir/formatdb.pid");
     });
    print "Running formatdb in background job $child_pid\n";
    open(FPID, ">$cpool_dir/formatdb.pid");
    print FPID "$child_pid\n";
    close(FPID);

    my $db = $self->db_handle();
    if (!$db->table_exists("sim_queue"))
    {
	$db->create_table(tbl => "sim_queue",
			  flds => "qid varchar(32), chunk_id INTEGER, seek INTEGER, len INTEGER, " .
			  "assigned BOOL, finished BOOL, output_file varchar(255), " .
			  "assignment_expires INTEGER, worker_info varchar(255)"
			 );
    }

    #
    # Write the fasta input file. Keep track of how many have been written,
    # and write seek info into the database as appropriate.
    #

    open(my $seq_fh, ">$cpool_dir/fasta.in");

    my($chunk_idx, $chunk_begin, $seq_idx);

    $chunk_idx = 0;
    $chunk_begin = 0;
    $seq_idx = 0;

    my(@seeks);

    open(my $q_fh, "<$cpool_dir/q");
    while (my $id = <$q_fh>)
    {
	chomp $id;

	my $seq = $self->get_translation($id);

	#
	# check if we're at the beginning of a chunk
	#

	print $seq_fh ">$id\n$seq\n";

	#
	# Check if we're at the end of a chunk
	#

	if ((($seq_idx + 1) % $chunk_size) == 0)
	{
	    my $chunk_end = tell($seq_fh);
	    my $chunk_len = $chunk_end - $chunk_begin;

	    push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]);
	    $chunk_idx++;
	    $chunk_begin = $chunk_end;
	}
	$seq_idx++;
    }

    if ((($seq_idx) % $chunk_size) != 0)
    {
	my $chunk_end = tell($seq_fh);
	my $chunk_len = $chunk_end - $chunk_begin;

	push(@seeks, [$cpool_id, $chunk_idx, $chunk_begin, $chunk_len]);

	$chunk_idx++;
	$chunk_begin = $chunk_end;
    }

    close($q_fh);
    close($seq_fh);

    print "Write seqs\n";

    for my $seek (@seeks)
    {
	my($cpool_id, $chunk_idx, $chunk_begin, $chunk_len) = @$seek;

	$db->SQL("insert into sim_queue (qid, chunk_id, seek, len, assigned, finished) " .
		 "values('$cpool_id', $chunk_idx, $chunk_begin, $chunk_len, FALSE, FALSE)");
    }

    return $cpool_id;
}

#=head3 get_sim_queue
#
#usage: get_sim_queue($pool_id, $all_sims)
#
#Returns the sims in the given pool. If $all_sims is true, return the entire queue. Otherwise,
#just return the sims awaiting processing.
#
#=cut

sub get_sim_queue
{
    my($self, $pool_id, $all_sims) = @_;
}

=head3 get_active_sim_pools

usage: get_active_sim_pools()

Return a list of the pool id's for the sim processing queues that have entries awaiting
computation.

=cut
#: Return Type @;
sub get_active_sim_pools
{
    my($self) = @_;

    my $dbh = $self->db_handle();

    my $res = $dbh->SQL("select distinct qid from sim_queue where not finished");
    return undef unless $res;

    return map { $_->[0] } @$res;
}

=head3 get_sim_pool_info

usage: get_sim_pool_info($pool_id)

Return information about the given sim pool. Return value
is a list ($total_entries, $n_finished, $n_assigned, $n_unassigned)

=cut
#: Return Type @;
sub get_sim_pool_info
{
    my($self, $pool_id) = @_;
    my($dbh, $res, $total_entries, $n_finished, $n_assigned, $n_unassigned);

    $dbh = $self->db_handle();

    $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id'");
    $total_entries = $res->[0]->[0];

    $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and finished");
    $n_finished = $res->[0]->[0];

    $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and assigned and not finished");
    $n_assigned = $res->[0]->[0];

    $res = $dbh->SQL("select count(chunk_id) from sim_queue where qid = '$pool_id' and not finished and not assigned");
    $n_unassigned = $res->[0]->[0];

    return ($total_entries, $n_finished, $n_assigned, $n_unassigned);
}

#=head3 get_sim_chunk
#
#usage: get_sim_chunk($n_seqs, $worker_id)
#
#Returns a chunk of $n_seqs of work.
#
#From Ross, about how sims are processed:
#
#Here is how I process them:
#
#
#    bash$ cd /Volumes/seed/olson/Sims/June22.out
#    bash$ for i in really*
#    > do
#    > cat  < $i >> /Volumes/laptop/new.sims
#    > done
#
#
#Then, I need to "reformat" them by adding to columns to each one
# and split the result into files of about 3M each This I do using
#
#reduce_sims /Volumes/laptop/NR/NewNR/peg.synonyms.june21  300 < /Volumes/laptop/new.sims |
#    reformat_sims /Volumes/laptop/NR/NewNR/checked.nr.june21   > /Volumes/laptop/reformated.sims
#rm /Volumes/laptop/new.sims
#split_sims /Volumes/laptop/NewSims sims.june24  reformated.sims
#rm reformatted.sims
#
#=cut

sub get_sim_chunk
{
    my($self, $n_seqs, $worker_id) = @_;


}

=head3 get_local_hostname

usage: my $result = $fig->get_local_hostname();

=cut
#: Return Type $;
sub get_local_hostname {

    #
    # See if there is a FIGdisk/config/hostname file. If there
    # is, force the hostname to be that.
    #

    my $hostfile = "$FIG_Config::fig_disk/config/hostname";
    if (-f $hostfile)
    {
	my $fh;
	if (open($fh, $hostfile))
	{
	    my $hostname = <$fh>;
	    chomp($hostname);
	    return $hostname;
	}
    }

    #
    # First check to see if we our hostname is correct.
    #
    # Map it to an IP address, and try to bind to that ip.
    #

    my $tcp = getprotobyname('tcp');

    my $hostname = `hostname`;
    chomp($hostname);

    my @hostent = gethostbyname($hostname);

    if (@hostent > 0)
    {
	my $sock;
	my $ip = $hostent[4];

	socket($sock, PF_INET, SOCK_STREAM, $tcp);
	if (bind($sock, sockaddr_in(0, $ip)))
	{
	    #
	    # It worked. Reverse-map back to a hopefully fqdn.
	    #

	    my @rev = gethostbyaddr($ip, AF_INET);
	    if (@rev > 0)
	    {
		my $host = $rev[0];
		#
		# Check to see if we have a FQDN.
		#

		if ($host =~ /\./)
		{
		    #
		    # Good.
		    #
		    return $host;
		}
		else
		{
		    #
		    # We didn't get a fqdn; bail and return the IP address.
		    #
		    return get_hostname_by_adapter()
		}
	    }
	    else
	    {
		return inet_ntoa($ip);
	    }
	}
	else
	{
	    #
	    # Our hostname must be wrong; we can't bind to the IP
	    # address it maps to.
	    # Return the name associated with the adapter.
	    #
	    return get_hostname_by_adapter()
	}
    }
    else
    {
	#
	# Our hostname isn't known to DNS. This isn't good.
	# Return the name associated with the adapter.
	#
	return get_hostname_by_adapter()
    }
}

=head3 get_hostname_by_adapter

usage: my $name = $fig->get_hostname_by_adapter();

=cut
#: Return Type $;
sub get_hostname_by_adapter {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    #
    # Attempt to determine our local hostname based on the
    # network environment.
    #
    # This implementation reads the routing table for the default route.
    # We then look at the interface config for the interface that holds the default.
    #
    #
    # Linux routing table:
    # [olson@yips 0.0.0]$ netstat -rn
    #     Kernel IP routing table
    #     Destination     Gateway         Genmask         Flags   MSS Window  irtt Iface
    #     140.221.34.32   0.0.0.0         255.255.255.224 U         0 0          0 eth0
    #     169.254.0.0     0.0.0.0         255.255.0.0     U         0 0          0 eth0
    #     127.0.0.0       0.0.0.0         255.0.0.0       U         0 0          0 lo
    #     0.0.0.0         140.221.34.61   0.0.0.0         UG        0 0          0 eth0
    #
    #     Mac routing table:
    #
    #     bash-2.05a$ netstat -rn
    #     Routing tables
    #
    #  Internet:
    #     Destination        Gateway            Flags    Refs      Use  Netif Expire
    #     default            140.221.11.253     UGSc       12      120    en0
    #     127.0.0.1          127.0.0.1          UH         16  8415486    lo0
    #     140.221.8/22       link#4             UCS        12        0    en0
    #     140.221.8.78       0:6:5b:f:51:c4     UHLW        0      183    en0    408
    #     140.221.8.191      0:3:93:84:ab:e8    UHLW        0       92    en0    622
    #     140.221.8.198      0:e0:98:8e:36:e2   UHLW        0        5    en0    691
    #     140.221.9.6        0:6:5b:f:51:d6     UHLW        1       63    en0   1197
    #     140.221.10.135     0:d0:59:34:26:34   UHLW        2     2134    en0   1199
    #     140.221.10.152     0:30:1b:b0:ec:dd   UHLW        1      137    en0   1122
    #     140.221.10.153     127.0.0.1          UHS         0        0    lo0
    #     140.221.11.37      0:9:6b:53:4e:4b    UHLW        1      624    en0   1136
    #     140.221.11.103     0:30:48:22:59:e6   UHLW        3      973    en0   1016
    #     140.221.11.224     0:a:95:6f:7:10     UHLW        1        1    en0    605
    #     140.221.11.237     0:1:30:b8:80:c0    UHLW        0        0    en0   1158
    #     140.221.11.250     0:1:30:3:1:0       UHLW        0        0    en0   1141
    #     140.221.11.253     0:d0:3:e:70:a      UHLW       13        0    en0   1199
    #     169.254            link#4             UCS         0        0    en0
    #
    #     Internet6:
    #     Destination                       Gateway                       Flags      Netif Expire
    #                                                                     UH          lo0
    #     fe80::%lo0/64                                                   Uc          lo0
    #                                       link#1                        UHL         lo0
    #     fe80::%en0/64                     link#4                        UC          en0
    #     0:a:95:a8:26:68               UHL         lo0
    #     ff01::/32                                                       U           lo0
    #     ff02::%lo0/32                                                   UC          lo0
    #     ff02::%en0/32                     link#4                        UC          en0

    my($fh);

    if (!open($fh, "netstat -rn |"))
    {
	warn "Cannot run netstat to determine local IP address\n";
	return "localhost";
    }

    my $interface_name;

    while (<$fh>)
    {
	my @cols = split();

	if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0")
	{
	    $interface_name = $cols[$#cols];
	}
    }
    close($fh);

    # print "Default route on $interface_name\n";

    #
    # Find ifconfig.
    #

    my $ifconfig;

    for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin"))
    {
	if (-x "$dir/ifconfig")
	{
	    $ifconfig = "$dir/ifconfig";
	    last;
	}
    }

    if ($ifconfig eq "")
    {
	warn "Ifconfig not found\n";
	return "localhost";
    }
    # print "Foudn $ifconfig\n";

    if (!open($fh, "$ifconfig $interface_name |"))
    {
	warn "Could not run $ifconfig: $!\n";
	return "localhost";
    }

    my $ip;
    while (<$fh>)
    {
	#
	# Mac:
	#         inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255
	# Linux:
	#           inet addr:140.221.34.37  Bcast:140.221.34.63  Mask:255.255.255.224
	#

	chomp;
	s/^\s*//;

	# print "Have '$_'\n";
	if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/)
	{
	    #
	    # Linux hit.
	    #
	    $ip = $1;
	    # print "Got linux $ip\n";
	    last;
	}
	elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/)
	{
	    #
	    # Mac hit.
	    #
	    $ip = $1;
	    # print "Got mac $ip\n";
	    last;
	}
    }
    close($fh);

    if ($ip eq "")
    {
	warn "Didn't find an IP\n";
	return "localhost";
    }

    return $ip;
}

=head3 get_seed_id

usage: my $id = $fig->get_seed_id();

=cut
#: Return type $;
sub get_seed_id {
    #
    # Retrieve the seed identifer from FIGdisk/config/seed_id.
    #
    # If it's not there, create one, and make it readonly.
    #

    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $id;
    my $id_file = "$FIG_Config::fig_disk/config/seed_id";
    if (! -f $id_file)
    {
	my $newid = `uuidgen`;
	if (!$newid)
	{
	    die "Cannot run uuidgen: $!";
	}

	chomp($newid);
	my $fh = new FileHandle(">$id_file");
	if (!$fh)
	{
	    die "error creating $id_file: $!";
	}
	print $fh "$newid\n";
	$fh->close();
	chmod(0444, $id_file);
    }
    my $fh = new FileHandle("<$id_file");
    $id = <$fh>;
    chomp($id);
    return $id;
}

=pod

=head1 get_release_info

Return the current data release information.  It is returned as the list
($name, $id, $inst, $email, $parent_id, $description).

The release info comes from the file FIG/Data/RELEASE. It is formatted as:

<release-name>
<unique id>
<institution>
<contact email>
<unique id of data release this release derived from>
<description>

For instance:
-----
SEED Data Release, 09/15/2004.
4148208C-1DF2-11D9-8417-000A95D52EF6
ANL/FIG
olson@mcs.anl.gov

Test release.
-----

If no RELEASE file exists, this routine will create one with a new unique ID. This
lets a peer optimize the data transfer by being able to cache ID translations
from this instance.

=cut
#: Return Type @;
sub get_release_info
{
    my($fig, $no_create) = @_;

    my $rel_file = "$FIG_Config::data/RELEASE";

    if (! -f $rel_file and !$no_create)
    {
	#
	# Create a new one.
	#

	my $newid = `uuidgen`;
	if (!$newid)
	{
	    die "Cannot run uuidgen: $!";
	}

	chomp($newid);

	my $relinfo = "Automatically generated release info " . localtime();
	my $inst = "Unknown";
	my $contact = "Unknown";
	my $parent = "";
	my( $a, $b, $e, $v, $env ) = $fig->genome_counts;
	my $description = "Automatically generated release info\n";
	$description .= "Contains $a archaeal, $b bacterial, $e eukaryal, $v viral and $env environmental genomes.\n";

	my $fh = new FileHandle(">$rel_file");
	if (!$fh)
	{
	    warn "error creating $rel_file: $!";
	    return undef;
	}
	print $fh "$relinfo\n";
	print $fh "$newid\n";
	print $fh "$inst\n";
	print $fh "$contact\n";
	print $fh "$parent\n";
	print $fh $description;
	$fh->close();
	chmod(0444, $rel_file);
    }

    if (open(my $fh, $rel_file))
    {
	my(@lines) = <$fh>;
	close($fh);

	chomp(@lines);

	my($info, $id, $inst, $contact, $parent, @desc) = @lines;

	return ($info, $id, $inst, $contact, $parent, join("\n", @desc));
    }

    return undef;
}

=pod

=head1 get_peer_last_update

usage: my $date = $fig->get_peer_last_update($peer_id);

Return the timestamp from the last successful peer-to-peer update with
the given peer.

We store this information in FIG/Data/Global/Peers/<peer-id>.

=cut
#: Return Type $;
sub get_peer_last_update
{
    my($self, $peer_id) = @_;

    my $dir = "$FIG_Config::data/Global/Peers";
    &verify_dir($dir);
    $dir .= "/$peer_id";
    &verify_dir($dir);

    my $update_file = "$dir/last_update";
    if (-f $update_file)
    {
	my $time = file_head($update_file, 1);
	chomp $time;
	return $time;
    }
    else
    {
	return undef;
    }
}

=pod

=head1 set_peer_last_update

usage: $fig->set_peer_last_update($peer_id, $time);

=cut
#: Return Type ;

sub set_peer_last_update
{
    my($self, $peer_id, $time) = @_;

    my $dir = "$FIG_Config::data/Global/Peers";
    &verify_dir($dir);
    $dir .= "/$peer_id";
    &verify_dir($dir);

    my $update_file = "$dir/last_update";
    open(F, ">$update_file");
    print F "$time\n";
    close(F);
}

=head3 cgi_url

usage: my $url = $fig->cgi_url();

=cut
#: Return Type $;
sub cgi_url {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    return &plug_url($FIG_Config::cgi_url);
}

=head3 temp_url

usage: my $url = $fig->temp_url();

=cut
#: Return Type $;
sub temp_url {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    return &plug_url($FIG_Config::temp_url);
}

=head3 plug_url

usage: my $url2 = $fig->plug_url($url);

=cut
#: Return Type $;
sub plug_url {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($url) = @_;

    my $name;

    #  Revised by GJO
    #  First try to get url from the current http request

    if (      defined( $ENV{ 'HTTP_HOST' } )   # This is where $cgi->url gets its value
         && ( $name =  $ENV{ 'HTTP_HOST' } )
         && ( $url  =~ s~^http://[^/]*~http://$name~ )  # ~ is delimiter
       ) {}

    #  Otherwise resort to alternative sources

    elsif ( ( $name = &get_local_hostname )
         && ( $url  =~ s~^http://[^/]*~http://$name~ )  # ~ is delimiter
       ) {}

    return $url;
}

=head3 file_read

usage: my $text = $fig->file_read($fileName);

usage: my @lines = $fig->file_read($fileName);

=cut
#: Return Type $;
#: Return Type @;
sub file_read
{
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($file) = @_;

    if (open(my $fh, "<$file"))
    {
	if (wantarray)
	{
	    my @ret = <$fh>;
	    return @ret;
	}
	else
	{
	    local $/;
	    my $text = <$fh>;
	    close($fh);
	    return $text;
	}
    }
}


=head3 file_head

usage: my $text = $fig->file_read($fileName, $count);

usage: my @lines = $fig->file_read($fileName, $count);

=cut
#: Return Type $;
#: Return Type @;
sub file_head
{
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($file, $n) = @_;

    if (!$n)
    {
	$n = 1;
    }

    if (open(my $fh, "<$file"))
    {
    	my(@ret, $i);

	$i = 0;
	while (<$fh>)
	{
	    push(@ret, $_);
	    $i++;
	    last if $i >= $n;
	}
	close($fh);

	if (wantarray)
	{
	    return @ret;
	}
	else
	{
	    return join("", @ret);
	}
    }
}


=pod

=head1 hiding/caching in a FIG object

We save the DB handle, cache taxonomies, and put a few other odds and ends in the
FIG object.  We expect users to invoke these services using the object $fig constructed
using:

    use FIG;
    my $fig = new FIG;

$fig is then used as the basic mechanism for accessing FIG services.  It is, of course,
just a hash that is used to retain/cache data.  The most commonly accessed item is the
DB filehandle, which is accessed via $self->db_handle.

We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated
between genomes, and a variety of other things.  I am not sure that using cached/2 was a
good idea, but I did it.

=cut

sub db_handle {
    my($self) = @_;

    return $self->{_dbf};
}

sub cached {
    my($self,$what) = @_;

    my $x = $self->{$what};
    if (! $x)
    {
	$x = $self->{$what} = {};
    }
    return $x;
}

################ Basic Routines [ existed since WIT ] ##########################


=pod

=head1 min

usage: $n = &FIG::min(@x)

Assumes @x contains numeric values.  Returns the minimum of the values.

=cut
#: Return Type $;
sub min {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my(@x) = @_;
    my($min,$i);

    (@x > 0) || return undef;
    $min = $x[0];
    for ($i=1; ($i < @x); $i++)
    {
	$min = ($min > $x[$i]) ? $x[$i] : $min;
    }
    return $min;
}

=pod

=head1 max

usage: $n = &FIG::max(@x)

Assumes @x contains numeric values.  Returns the maximum of the values.

=cut
#: Return Type $;
sub max {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my(@x) = @_;
    my($max,$i);

    (@x > 0) || return undef;
    $max = $x[0];
    for ($i=1; ($i < @x); $i++)
    {
	$max = ($max < $x[$i]) ? $x[$i] : $max;
    }
    return $max;
}

=pod

=head1 between

usage: &FIG::between($x,$y,$z)

Returns true iff $y is between $x and $z.

=cut
#: Return Type $;
sub between {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($x,$y,$z) = @_;

    if ($x < $z)
    {
	return (($x <= $y) && ($y <= $z));
    }
    else
    {
	return (($x >= $y) && ($y >= $z));
    }
}

=pod

=head1 standard_genetic_code

usage: $code = &FIG::standard_genetic_code()

Routines like "translate" can take a "genetic code" as an argument.  I implemented such
codes using hashes that assumed uppercase DNA triplets as keys.

=cut
#: Return Type $;
sub standard_genetic_code {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);

    my $code = {};

    $code->{"AAA"} = "K";
    $code->{"AAC"} = "N";
    $code->{"AAG"} = "K";
    $code->{"AAT"} = "N";
    $code->{"ACA"} = "T";
    $code->{"ACC"} = "T";
    $code->{"ACG"} = "T";
    $code->{"ACT"} = "T";
    $code->{"AGA"} = "R";
    $code->{"AGC"} = "S";
    $code->{"AGG"} = "R";
    $code->{"AGT"} = "S";
    $code->{"ATA"} = "I";
    $code->{"ATC"} = "I";
    $code->{"ATG"} = "M";
    $code->{"ATT"} = "I";
    $code->{"CAA"} = "Q";
    $code->{"CAC"} = "H";
    $code->{"CAG"} = "Q";
    $code->{"CAT"} = "H";
    $code->{"CCA"} = "P";
    $code->{"CCC"} = "P";
    $code->{"CCG"} = "P";
    $code->{"CCT"} = "P";
    $code->{"CGA"} = "R";
    $code->{"CGC"} = "R";
    $code->{"CGG"} = "R";
    $code->{"CGT"} = "R";
    $code->{"CTA"} = "L";
    $code->{"CTC"} = "L";
    $code->{"CTG"} = "L";
    $code->{"CTT"} = "L";
    $code->{"GAA"} = "E";
    $code->{"GAC"} = "D";
    $code->{"GAG"} = "E";
    $code->{"GAT"} = "D";
    $code->{"GCA"} = "A";
    $code->{"GCC"} = "A";
    $code->{"GCG"} = "A";
    $code->{"GCT"} = "A";
    $code->{"GGA"} = "G";
    $code->{"GGC"} = "G";
    $code->{"GGG"} = "G";
    $code->{"GGT"} = "G";
    $code->{"GTA"} = "V";
    $code->{"GTC"} = "V";
    $code->{"GTG"} = "V";
    $code->{"GTT"} = "V";
    $code->{"TAA"} = "*";
    $code->{"TAC"} = "Y";
    $code->{"TAG"} = "*";
    $code->{"TAT"} = "Y";
    $code->{"TCA"} = "S";
    $code->{"TCC"} = "S";
    $code->{"TCG"} = "S";
    $code->{"TCT"} = "S";
    $code->{"TGA"} = "*";
    $code->{"TGC"} = "C";
    $code->{"TGG"} = "W";
    $code->{"TGT"} = "C";
    $code->{"TTA"} = "L";
    $code->{"TTC"} = "F";
    $code->{"TTG"} = "L";
    $code->{"TTT"} = "F";

    return $code;
}

=pod

=head1 translate

usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start);

If $code is undefined, I use the standard genetic code.  If $fix_start is true, I
will translate initial TTG or GTG to 'M'.

=cut
#: Return Type $;
sub translate {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my( $dna,$code,$start) = @_;
    my( $i,$j,$ln );
    my( $x,$y );
    my( $prot );

    if (! defined($code))
    {
	$code = &FIG::standard_genetic_code;
    }
    $ln = length($dna);
    $prot = "X" x ($ln/3);
    $dna =~ tr/a-z/A-Z/;

    for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++)
    {
	$x = substr($dna,$i,3);
        if ($y = $code->{$x})
        {
	    substr($prot,$j,1) = $y;
        }
    }

    if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/))
    {
	substr($prot,0,1) = 'M';
    }
    return $prot;
}

=pod

=head1 reverse_comp

usage: $dnaR = &FIG::reverse_comp($dna) or
       $dnaRP = &FIG::rev_comp($seqP)

In WIT, we implemented reverse complement passing a pointer to a sequence and returning
a pointer to a sequence.  In most cases the pointers are a pain (although in a few they
are just what is needed).  Hence, I kept both versions of the function to allow you
to use whichever you like.  Use rev_comp only for long strings where passing pointers is a
reasonable effeciency issue.

=cut
#: Return Type $;
sub reverse_comp {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($seq) = @_;

    return ${&rev_comp(\$seq)};
}

=head1 rev_comp

=cut
#: Return Type $;
sub rev_comp {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my( $seqP ) = @_;
    my( $rev  );

    $rev =  reverse( $$seqP );
    $rev =~ tr/a-z/A-Z/;
    $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
    return \$rev;
}

=pod

=head1 verify_dir

usage: &FIG::verify_dir($dir)

Makes sure that $dir exists.  If it has to create it, it sets permissions to 0777.

=cut

sub verify_dir {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($dir) = @_;

    if (-d $dir) { return }
    if ($dir =~ /^(.*)\/[^\/]+$/)
    {
	&verify_dir($1);
    }
    mkdir($dir,0777) || die "could not make $dir: $!";
    # chmod 02777,$dir;
}

=pod

=head1 run

usage: &FIG::run($cmd)

Runs $cmd and fails (with trace) if the command fails.

=cut

sub run {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($cmd) = @_;

#   my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n";
    (system($cmd) == 0) || confess "FAILED: $cmd";
}



=pod

=head1 read_fasta_record(\*FILEHANDLE)

Usage:     ( $seq_id, $seq_pointer, $comment ) = &read_fasta_record(\*FILEHANDLE);

Function:  Reads a FASTA-formatted sequence file one record at a time.
    The input filehandle defaults to STDIN if not specified.
    Returns a sequence ID, a pointer to the sequence, and an optional
    record comment (NOTE: Record comments are deprecated, as some tools
    such as BLAST do not handle them gracefully). Returns an empty list
    if attempting to read a record results in an undefined value
    (e.g., due to reaching the EOF).

Author:    Gordon D. Pusch

Date:      2004-Feb-18

=cut
#: Return Type @;
sub read_fasta_record
{
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my ($file_handle) = @_;
    my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record );

    if (not defined($file_handle))  { $file_handle = \*STDIN; }

    $old_end_of_record = $/;
    $/ = "\n>";

    if (defined($fasta_record = <$file_handle>))
    {
	chomp $fasta_record;
	@lines  =  split( /\n/, $fasta_record );
	$head   =  shift @lines;
	$head   =~ s/^>?//;
	$head   =~ m/^(\S+)/;
	$seq_id = $1;

	if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }

	$sequence  =  join( "", @lines );

	@parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    }
    else
    {
	@parsed_fasta_record = ();
    }

    $/ = $old_end_of_record;

    return @parsed_fasta_record;
}


=pod

=head1 display_id_and_seq

usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh)

This command has always been used to put out fasta sequences.  Note that it
takes a pointer to the sequence.  $fh is optional and defalts to STDOUT.

=cut


sub display_id_and_seq {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my( $id, $seq, $fh ) = @_;

    if (! defined($fh) )  { $fh = \*STDOUT; }

    print $fh ">$id\n";
    &display_seq($seq, $fh);
}

sub display_seq {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my ( $seq, $fh ) = @_;
    my ( $i, $n, $ln );

    if (! defined($fh) )  { $fh = \*STDOUT; }

    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += 60)
    {
	if (($i + 60) <= $n)
	{
	    $ln = substr($$seq,$i,60);
	}
	else
	{
	    $ln = substr($$seq,$i,($n-$i));
	}
	print $fh "$ln\n";
    }
}

##########  I commented the pods on the following routines out, since they should not
##########  be part of the SOAP/WSTL interface
#=pod
#
#=head1 file2N
#
#usage: $n = $fig->file2N($file)
#
#In some of the databases I need to store filenames, which can waste a lot of
#space.  Hence, I maintain a database for converting filenames to/from integers.
#
#=cut
#
sub file2N :scalar {
    my($self,$file) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table "))  && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0]))
    {
	my $fileno = $relational_db_response->[0]->[0] + 1;
	if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )"))
	{
	    return $fileno;
	}
    }
    elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )"))
    {
	return 1;
    }
    return undef;
}

#=pod
#
#=head1 N2file
#
#usage: $filename = $fig->N2file($n)
#
#In some of the databases I need to store filenames, which can waste a lot of
#space.  Hence, I maintain a database for converting filenames to/from integers.
#
#=cut
#
sub N2file :scalar {
    my($self,$fileno) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return undef;
}


#=pod
#
#=head1 openF
#
#usage: $fig->openF($filename)
#
#Parts of the system rely on accessing numerous different files.  The most obvious case is
#the situation with similarities.  It is important that the system be able to run in cases in
#which an arbitrary number of files cannot be open simultaneously.  This routine (with closeF) is
#a hack to handle this.  I should probably just pitch them and insist that the OS handle several
#hundred open filehandles.
#
#=cut
#
sub openF {
    my($self,$file) = @_;
    my($fxs,$x,@fxs,$fh);

    $fxs = $self->cached('_openF');
    if ($x = $fxs->{$file})
    {
	$x->[1] = time();
	return $x->[0];
    }

    @fxs = keys(%$fxs);
    if (defined($fh = new FileHandle "<$file"))
    {
	if (@fxs >= 50)
	{
	    @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs;
	    $x = $fxs->{$fxs[0]};
	    undef $x->[0];
	    delete $fxs->{$fxs[0]};
	}
	$fxs->{$file} = [$fh,time()];
	return $fh;
    }
    return undef;
}

#=pod
#
#=head1 closeF
#
#usage: $fig->closeF($filename)
#
#Parts of the system rely on accessing numerous different files.  The most obvious case is
#the situation with similarities.  It is important that the system be able to run in cases in
#which an arbitrary number of files cannot be open simultaneously.  This routine (with openF) is
#a hack to handle this.  I should probably just pitch them and insist that the OS handle several
#hundred open filehandles.
#
#=cut
#
sub closeF {
    my($self,$file) = @_;
    my($fxs,$x);

    if (($fxs = $self->{_openF}) &&
	($x = $fxs->{$file}))
    {
	undef $x->[0];
	delete $fxs->{$file};
    }
}

=pod

=head1 ec_name

usage: $enzymatic_function = $fig->ec_name($ec)

Returns enzymatic name for EC.

=cut

sub ec_name {
    my($self,$ec) = @_;

    ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return "";
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )");

    return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : "";
    return "";
}

=pod

=head1 all_roles

usage: @roles = $fig->all_roles

Supposed to return all known roles.  For now, we get all ECs with "names".

=cut

sub all_roles {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names");

    return @$relational_db_response;
}

=pod

=head1 expand_ec

usage: $expanded_ec = $fig->expand_ec($ec)

Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that.

=cut

sub expand_ec {
    my($self,$ec) = @_;
    my($name);

    return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec;
}


=pod

=head1 clean_tmp

usage: &FIG::clean_tmp

We store temporary files in $FIG_Config::temp.  There are specific classes of files
that are created and should be saved for at least a few days.  This routine can be
invoked to clean out those that are over two days old.

=cut

sub clean_tmp {

    my($file);
    if (opendir(TMP,"$FIG_Config::temp"))
    {
#       change the pattern to pick up other files that need to be cleaned up
	my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP);
	foreach $file (@temp)
	{
	    if (-M "$FIG_Config::temp/$file" > 2)
	    {
		unlink("$FIG_Config::temp/$file");
	    }
	}
    }
}

################ Routines to process genomes and genome IDs  ##########################


=pod

=head1 genomes

usage: @genome_ids = $fig->genomes( $complete, $restrictions, $domain );

Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by
NCBI for the species (not the specific strain), and Y is a sequence digit assigned to
this particular genome (as one of a set with the same genus/species).  Genomes also
have versions, but that is a separate issue.

=cut

sub genomes  :remote :list {
    my( $self, $complete, $restrictions, $domain ) = @_;

    my $rdbH = $self->db_handle;

    my @where = ();
    if ($complete)
    {
	push(@where,"( complete = \'1\' )")
    }

    if ($restrictions)
    {
	push(@where,"( restrictions = \'1\' )")
    }

    if ($domain)
    {
	push( @where, "( maindomain = '$domain' )" )
    }

    my $relational_db_response;
    if (@where > 0)
    {
	my $where = join(" AND ",@where);
	$relational_db_response = $rdbH->SQL("SELECT genome  FROM genome where $where");
    }
    else
    {
	$relational_db_response = $rdbH->SQL("SELECT genome  FROM genome");
    }
    my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response;
    return @genomes;
}

sub is_complete {
    my($self,$genome) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT genome  FROM genome where (genome = '$genome') AND (complete = '1')");
    return (@$relational_db_response == 1)
    }

sub genome_counts {
    my($self,$complete) = @_;
    my($x,$relational_db_response);

    my $rdbH = $self->db_handle;

    if ($complete)
    {
	$relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'");
    }
    else
    {
	$relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome");
    }

    my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0);
    if (@$relational_db_response > 0)
    {
	foreach $x (@$relational_db_response)
	{
	    if    ($x->[1] =~ /^archaea/i)  { ++$arch }
	    elsif ($x->[1] =~ /^bacter/i)   { ++$bact }
	    elsif ($x->[1] =~ /^eukar/i)    { ++$euk }
	    elsif ($x->[1] =~ /^vir/i)      { ++$vir }
	    elsif ($x->[1] =~ /^env/i)      { ++$env }
	    else  { ++$unk }
	}
    }

    return ($arch, $bact, $euk, $vir, $env, $unk);
}


=pod

=head1 genome_domain

usage: $domain = $fig->genome_domain($genome_id);

Returns the domain of a genome ID, and 'undef' if it is not in the database.

=cut

sub genome_domain {
    my($self,$genome) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($genome)
    {
	if (($relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome WHERE ( genome = \'$genome\' )"))
	    && (@$relational_db_response == 1))
	{
	    # die Dumper($relational_db_response);
	    return $relational_db_response->[0]->[1];
	}
    }
    return undef;
}


=pod

=head1 genome_pegs

usage: $num_pegs = $fig->genome_pegs($genome_id);

Returns the number of protein-encoding genes (PEGs) in $genome_id if
"$genome_id" is indexed in the "genome" database, and 'undef' otherwise.

=cut

sub genome_pegs {
    my($self,$genome) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($genome)
    {
	if (($relational_db_response = $rdbH->SQL("SELECT pegs FROM genome WHERE ( genome = \'$genome\' )"))
	    && (@$relational_db_response == 1))
	{
	    return $relational_db_response->[0]->[0];
	}
    }
    return undef;
}


=pod

=head1 genome_rnas

usage: $num_rnas = $fig->genome_rnas($genome_id);

Returns the number of RNA-encoding genes (RNAs) in $genome_id if
"$genome_id" is indexed in the "genome" database, and 'undef' otherwise.

=cut

sub genome_rnas {
    my($self,$genome) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($genome)
    {
	if (($relational_db_response = $rdbH->SQL("SELECT rnas FROM genome WHERE ( genome = \'$genome\' )"))
	    && (@$relational_db_response == 1))
	{
	    return $relational_db_response->[0]->[0];
	}
    }
    return undef;
}


=pod

=head1 genome_szdna

usage: $szdna = $fig->genome_szdna($genome_id);

Returns the number of DNA base-pairs in the genome contigs file(s) of $genome_id
"$genome_id" is indexed in the "genome" database, and 'undef' otherwise.

=cut

sub genome_szdna {
    my($self,$genome) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($genome)
    {
	if (($relational_db_response = $rdbH->SQL("SELECT szdna FROM genome WHERE ( genome = \'$genome\' )"))
	    && (@$relational_db_response == 1))
	{
	    return $relational_db_response->[0]->[0];
	}
    }
    return undef;
}


=pod

=head1 genome_version

usage: $version = $fig->genome_version($genome_id);

Versions are incremented for major updates.  They are put in as major
updates of the form 1.0, 2.0, ...

Users may do local "editing" of the DNA for a genome, but when they do,
they increment the digits to the right of the decimal.  Two genomes remain
comparable only if the versions match identically.  Hence, minor updating should be
committed only by the person/group responsible for updating that genome.

We can, of course, identify which genes are identical between any two genomes (by matching
the DNA or amino acid sequences).  However, the basic intent of the system is to
support editing by the main group issuing periodic major updates.

=cut

sub genome_version :scalar {
    my($self,$genome) = @_;

    my(@tmp);
    if ((-s "$FIG_Config::organisms/$genome/VERSION") &&
	(@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) &&
	($tmp[0] =~ /^(\S+)$/))
    {
	return $1;
    }
    return undef;
}

=pod

=head1 genus_species

usage: $gs = $fig->genus_species($genome_id)

Returns the genus and species (and strain if that has been properly recorded)
in a printable form.

=cut

sub genus_species :scalar {
    my ($self,$genome) = @_;
    my $ans;

    my $genus_species = $self->cached('_genus_species');
    if (! ($ans = $genus_species->{$genome}))
    {
	my $rdbH = $self->db_handle;
	my $relational_db_response = $rdbH->SQL("SELECT genome,gname  FROM genome");
	my $pair;
	foreach $pair (@$relational_db_response)
	{
	    $genus_species->{$pair->[0]} = $pair->[1];
	}
	$ans = $genus_species->{$genome};
    }
    return $ans;
}

=pod

=head1 org_of

usage: $org = $fig->org_of($prot_id)

In the case of external proteins, we can usually determine an organism, but not
anything more precise than genus/species (and often not that).  This routine takes
a protein ID (which may be a feature ID) and returns "the organism".

=cut

sub org_of {
    my($self,$prot_id) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($prot_id =~ /^fig\|/)
    {
	#
	#  Trying to guess what Ross wanted (there was a servere bug):
	#
	#  deleted -> undefined
	#  failed lookup -> ""
	#
	return  $self->is_deleted_fid( $prot_id) ? undef
	      : $self->genus_species( $self->genome_of( $prot_id ) ) || "";
    }

    if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&
	(@$relational_db_response >= 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

#
#  Support for colorizing organisms by domain
#  -- GJO
#
=pod

=head1 genus_species_domain

usage: ($gs, $domain) = $fig->genus_species_domain($genome_id)

Returns the genus and species (and strain if that has been properly recorded)
in a printable form, and domain.

=cut

sub genus_species_domain {
    my ($self, $genome) = @_;

    my $genus_species_domain = $self->cached('_genus_species_domain');
    if ( ! $genus_species_domain->{ $genome } )
    {
	my $rdbH = $self->db_handle;
	my $relational_db_response = $rdbH->SQL("SELECT genome,gname,maindomain FROM genome");
	my $triple;
	foreach $triple ( @$relational_db_response )
	{
	    $genus_species_domain->{ $triple->[0] } = [ $triple->[1], $triple->[2] ];
	}
    }
    my $gsdref = $genus_species_domain->{ $genome };
    return $gsdref ? @$gsdref : ( "", "" );
}


my %domain_color = ( AR => "#DDFFFF", BA => "#FFDDFF", EU => "#FFFFDD",
                     VI => "#DDDDDD", EN => "#BBBBBB" );

sub domain_color {
    my ($domain) = @_;
    defined $domain || return "#FFFFFF";
    return $domain_color{ uc substr($domain, 0, 2) } || "#FFFFFF";
}


=pod

=head1 org_and_color_of

usage: ($org, $color) = $fig->org_and_domain_of($prot_id)

Return the best guess organism and domain html color string of an organism.
In the case of external proteins, we can usually determine an organism, but not
anything more precise than genus/species (and often not that).  This routine takes
a protein ID (which may be a feature ID) and returns "the organism".

=cut

sub org_and_color_of {
    my($self,$prot_id) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($prot_id =~ /^fig\|/)
    {
	my( $gs, $domain ) = $self->genus_species_domain($self->genome_of($prot_id));
	return ( $gs, domain_color( $domain ) );
    }

    if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&
	(@$relational_db_response >= 1))
    {
	return ($relational_db_response->[0]->[0], "#FFFFFF");
    }
    return ("", "#FFFFFF");
}

#
#  End of support for colorizing organisms by domain
#  -- GJO
#

=pod

=head1 abbrev

usage: $abbreviated_name = $fig->abbrev($genome_name)

For alignments and such, it is very useful to be able to produce an abbreviation of genus/species.
That's what this does.  Note that multiple genus/species might reduce to the same abbreviation, so
be careful (disambiguate them, if you must).

=cut

sub abbrev :scalar {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($genome_name) = @_;

    $genome_name =~ s/^(\S{3})\S+/$1./;
    $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
    if (length($genome_name) > 13)
    {
	$genome_name = substr($genome_name,0,13);
    }
    return $genome_name;
}

################ Routines to process Features and Feature IDs  ##########################

=pod

=head1 ftype

usage: $type = &FIG::ftype($fid)

Returns the type of a feature, given the feature ID.  This just amounts
to lifting it out of the feature ID, since features have IDs of tghe form

    fig|x.y.f.n

where
        x.y is the genome ID
        f   is the type pf feature
        n   is an integer that is unique within the genome/type

=cut

sub ftype {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($feature_id) = @_;

    if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/)
    {
	return $1;
    }
    return undef;
}

=pod

=head1 genome_of

usage: $genome_id = $fig->genome_of($fid)

This just extracts the genome ID from a feature ID.

=cut


sub genome_of :scalar {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $prot_id = (@_ == 1) ? $_[0] : $_[1];

    if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; }
    return undef;
}

=head1 genome_and_peg_of

usage: ($genome_id, $peg_number = $fig->genome_and_peg_of($fid)

This just extracts the genome ID and peg number from a feature ID.

=cut


sub genome_and_peg_of {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $prot_id = (@_ == 1) ? $_[0] : $_[1];

    if ($prot_id =~ /^fig\|(\d+\.\d+)\.peg\.(\d+)/)
    {
	return ($1, $2);
    }
    return undef;
}

=pod

=head1 by_fig_id

usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids

This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works.

=cut

sub by_fig_id {
    my($a,$b) = @_;
    my($g1,$g2,$t1,$t2,$n1,$n2);
    if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
	($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3)))
    {
	($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
    }
    else
    {
	$a cmp $b;
    }
}

=pod

=head1 genes_in_region

usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end)

It is often important to be able to find the genes that occur in a specific region on
a chromosome.  This routine is designed to provide this information.  It returns all genes
that overlap the region ($genome,$contig,$beg,$end).  $beg1 is set to the minimum coordinate of
the returned genes (which may be before the given region), and $end1 the maximum coordinate.

The routine assumes that genes are not more than 10000 bases long, which is certainly not true
in eukaryotes.  Hence, in euks you may well miss genes that overlap the boundaries of the specified
region (sorry).

=cut
#: Return Type @;
sub genes_in_region {
    my($self,$genome,$contig,$beg,$end) = @_;
    my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u);

    my $pad = 10000;
    my $rdbH = $self->db_handle;

    my $minV = $beg - $pad;
    my $maxV = $end + $pad;
    if (($relational_db_response = $rdbH->SQL("SELECT id FROM features
                                         WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND ( maxloc < $maxV) AND
                                               ( genome = \'$genome\' )  AND ( contig = \'$contig\' );")) &&
	 (@$relational_db_response >= 1))
    {
	@tmp =  sort { ($a->[1] cmp $b->[1]) or
		       (($a->[2]+$a->[3]) <=> ($b->[2]+$b->[3]))
                     }
	        map  { $feature_id = $_->[0];
                       $x = $self->feature_location($feature_id);
                       $x ? [$feature_id,&boundaries_of($x)] : ()
                     } @$relational_db_response;


	($l,$u) = (10000000000,0);
	foreach $x (@tmp)
	{
	    ($feature_id,undef,$b1,$e1) = @$x;
	    if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1)))
	    {
		if (! $self->is_deleted_fid($feature_id))
		{
		    push(@feat,$feature_id);
		    $l = &min($l,&min($b1,$e1));
		    $u = &max($u,&max($b1,$e1));
		}
	    }
	}
	(@feat <= 0) || return ([@feat],$l,$u);
    }
    return ([],$l,$u);
}


#=============================================================================
#  Using the following version is better, but it brings out a very annoying
#  issue with some genomes. It already exists in the current code (above)
#  for some genes in some genomes.  For example, visit fig|70601.1.peg.1.
#  This is true for any genome that has a feature that crosses the origin.
#  The root of the problem lies in boundaries_of.  I am working on a fix that
#  replaces boundaries_of with a more sophisticated function.  When it is
#  all done, genes_in_retion should behave as desired.  -- GJO, Aug. 22, 2004
#=============================================================================
#
#  =pod
#
#  =head1 genes_in_region
#
#  usage: ( $features_in_region, $min_coord, $max_coord )
#           = $fig->genes_in_region( $genome, $contig, $beg, $end )
#
#  It is often important to be able to find the genes that occur in a specific
#  region on a chromosome.  This routine is designed to provide this information.
#  It returns all genes that overlap the region ( $genome, $contig, $beg, $end ).
#  $min_coord is set to the minimum coordinate of the returned genes (which may
#  preceed the given region), and $max_coord the maximum coordinate.  Because
#  the database is indexed by the leftmost and rightmost coordinates of each
#  feature, the function makes no assumption about the length of the feature, but
#  it can (and probably will) miss features spanning multiple contigs.
#
#  =cut
#
#
#  sub genes_in_region {
#      my ( $self, $genome, $contig, $beg, $end ) = @_;
#      my ( $x, $db_response, $feature_id, $b1, $e1, @tmp, @bounds );
#      my ( $min_coord, $max_coord );
#
#      my @features = ();
#      my $rdbH = $self->db_handle;
#
#      if ( ( $db_response = $rdbH->SQL( "SELECT id
#                                           FROM features
#                                          WHERE ( contig = '$contig' )
#                                            AND ( genome = '$genome' )
#                                            AND ( minloc <= $end )
#                                            AND ( maxloc >= $beg );"
#                                      )
#           )
#  	 && ( @$db_response > 0 )
#         )
#      {
#          #  The sort is unnecessary, but provides a consistent ordering
#
#  	@tmp = sort { (  $a->[1]             cmp   $b->[1] )             # contig
#  	           || ( ($a->[2] + $a->[3] ) <=> ( $b->[2] + $b->[3] ) ) # midpoint
#                      }
#  	       map  { $feature_id = $_->[0];
#  	              ( ( ! $self->is_deleted_fid( $feature_id ) )       # not deleted
#                       && ( $x = $self->feature_location( $feature_id ) )  # and has location
#                       && ( ( @bounds = boundaries_of( $x ) ) == 3 )       # and has bounds
#                        ) ? [ $feature_id, @bounds ] : ()
#                      } @$db_response;
#
#  	( $min_coord, $max_coord ) = ( 10000000000, 0 );
#
#  	foreach $x ( @tmp )
#  	{
#  	    ( $feature_id, undef, $b1, $e1 ) = @$x;
#  	    push @features, $feature_id;
#  	    my ( $min, $max ) = ( $b1 <= $e1 ) ? ( $b1, $e1 ) : ( $e1, $b1 );
#  	    ( $min_coord <= $min ) || ( $min_coord = $min );
#  	    ( $max_coord >= $max ) || ( $max_coord = $max );
#  	}
#      }
#
#      return ( @features ) ? ( [ @features ], $min_coord, $max_coord )
#                           : ( [], undef, undef );
#  }

#  These will be part of the fix to genes_in_region.  -- GJO

=pod

=head1 regions_spanned

usage: ( [ $contig, $beg, $end ], ... ) = $fig->regions_spanned( $loc )

The location of a feature in a scalar context is

    contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each segment]

This routine takes as input a fig location and reduces it to one or more
regions spanned by the gene.  Unlike boundaries_of, regions_spanned handles
wrapping through the orgin, features split over contigs and exons that are
not ordered nicely along the chromosome (ugly but true).

=cut

sub regions_spanned {
    shift if UNIVERSAL::isa( $_[0], __PACKAGE__ );
    my( $location ) = ( @_ == 1 ) ? $_[0] : $_[1];
    defined( $location ) || return undef;

    my @regions = ();

    my ( $cur_contig, $cur_beg, $cur_end, $cur_dir );
    my ( $contig, $beg, $end, $dir );
    my @segs = split( /\s*,\s*/, $location );  # should not have space, but ...
    @segs || return undef;

    #  Process the first segment

    my $seg = shift @segs;
    ( ( $cur_contig, $cur_beg, $cur_end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) )
       || return undef;
    $cur_dir = ( $cur_end >= $cur_beg ) ? 1 : -1;

    foreach $seg ( @segs ) {
	( ( $contig, $beg, $end ) = ( $seg =~ /^(\S+)_(\d+)_\d+$/ ) ) || next;
	$dir = ( $end >= $beg ) ? 1 : -1;

	#  Is this a continuation?  Update end

	if ( ( $contig eq $cur_contig )
	  && ( $dir == $cur_dir )
	  && ( ( ( $dir > 0 ) && ( $end > $cur_end ) )
	    || ( ( $dir < 0 ) && ( $end < $cur_end ) ) )
	   )
	{
	    $cur_end = $end;
	}

	#  Not a continuation.  Report previous and update current.

	else
	{
	    push @regions, [ $cur_contig, $cur_beg, $cur_end ];
	    ( $cur_contig, $cur_beg, $cur_end, $cur_dir )
	       = ( $contig, $beg, $end, $dir );
	}
    }

    #  There should alwasy be a valid, unreported region.

    push @regions, [ $cur_contig, $cur_beg, $cur_end ];

    return wantarray ? @regions : \@regions;
}


=pod

=head1 filter_regions

usage:  @regions = filter_regions( $contig, $min, $max,  @regions )
       \@regions = filter_regions( $contig, $min, $max,  @regions )
        @regions = filter_regions( $contig, $min, $max, \@regions )
       \@regions = filter_regions( $contig, $min, $max, \@regions )

This function provides a simple filter for extracting a list of genome regions
for those that overlap a particular interval.  Region definitions correspond
to those produced by regions_spanned.  That is, [ contig, beg, end ].  In the
function call, either $contig or $min and $max can be undefined (permitting
anything).

=cut

sub filter_regions {
    my ( $contig, $min, $max, @regions ) = @_;

    @regions || return ();
    ( ref( $regions[0] ) eq "ARRAY" ) || return undef;

    #  Is it a region list, or a reference to a region list?

    if ( ref( $regions[0]->[0] ) eq "ARRAY" ) { @regions = @{ $regions[0] } }

    if ( ! defined( $contig ) )
    {
	( defined( $min ) && defined( $max ) ) || return undef;
    }
    else       # with a defined contig name, allow undefined range
    {
	defined( $min ) || ( $min =          1 );
	defined( $max ) || ( $max = 1000000000 );
    }
    ( $min <= $max ) || return ();

    my ( $c, $b, $e );
    my @filtered = grep { ( @$_ >= 3 )               #  Allow extra fields?
                       && ( ( $c, $b, $e ) = @$_ )
                       && ( ( ! defined( $contig ) ) || ( $c eq $contig ) )
                       && ( ( $e >= $b ) || ( ( $b, $e ) = ( $e, $b ) ) )
                       && ( ( $b <= $max ) && ( $e >= $min ) )
                        } @regions;

    return wantarray ? @filtered : \@filtered;
}


sub close_genes {
    my($self,$fid,$dist) = @_;

#   warn "In close_genes, self=$self, fid=$fid";

    my $loc = $self->feature_location($fid);
    if ($loc)
    {
	my($contig,$beg,$end) = &FIG::boundaries_of($loc);
	if ($contig && $beg && $end)
	{
	    my $min = &min($beg,$end) - $dist;
	    my $max = &max($beg,$end) + $dist;
	    my $feat;
	    ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max);
	    return @$feat;
	}
    }
    return ();
}

sub adjacent_genes
{
    my ($self, $fid, $dist) = @_;
    my (@close, $strand, $i);

#   warn "In adjacent_genes, self=$self, fid=$fid";


    $strand = $self->strand_of($fid);

    $dist   = $dist || 2000;
    @close  = $self->close_genes($fid, $dist);
    for ($i=0; $i < @close; ++$i) { last if ($close[$i] eq $fid); }

    # RAE note that if $self->strand_of($close[$i-1]) ne $strand then left/right neighbors
    # were never set! oops!

    # I think the concept of Left and right is confused here. In my mind, left and right
    # are independent of strand ?? E.g. take a look at PEG fig|196600.1.peg.1806
    # this is something like
    #
    #     ---> <--1805--- --1806-->   <--1807--  <----
    #
    # 1805 is always the left neighbor, no?

    my ($left_neighbor, $right_neighbor) = ($close[$i-1], $close[$i+1]);

    if (0) # this was if ($i > 0) I just skip this whole section!
    {
	if ($self->strand_of($close[$i-1]) eq $strand) { $left_neighbor  = $close[$i-1]; }
	# else {$left_neighbor=$close[$i+1]} # RAE: this is the alternative that is needed if you do it by strand
    }

    if ($i < $#close)
    {
	if ($self->strand_of($close[$i+1]) eq $strand) { $right_neighbor = $close[$i+1]; }
	# else {$right_neighbor = $close[$i-1]} # RAE: this is the alternative that is needed if you do it by strand
    }

    # ...return genes in transcription order...
    if ($strand eq '-')
    {
	($left_neighbor, $right_neighbor) = ($right_neighbor, $left_neighbor);
    }

    return ($left_neighbor, $right_neighbor) ;
}


=pod

=head1 feature_location

usage: $loc = $fig->feature_location($fid)   OR
       @loc = $fig->feature_location($fid)

The location of a feature in a scalar context is

    contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each exon]

In a list context it is

    (contig_b1_e1,contig_b2_e2,...)

=cut

sub feature_location :scalar :list {
    my($self,$feature_id) = @_;
    my($relational_db_response,$locations,$location);

#   warn "In feature_location, self=$self, fid=$feature_id";

    if ($self->is_deleted_fid($feature_id)) { return undef }

    $locations = $self->cached('_location');
    if (! ($location = $locations->{$feature_id}))
    {
	my $rdbH = $self->db_handle;
	if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) &&
	    (@$relational_db_response == 1))
	{
            $locations->{$feature_id} = $location = $relational_db_response->[0]->[0];
	}
    }

    if ($location)
    {
	return wantarray() ? split(/,/,$location) : $location;
    }
    return undef;
}

sub contig_of
{
    my ($self, $locus) = @_;

    $locus =~ m/^([^,]+)_\d+_\d+/;

    return $1;
}

sub beg_of
{
    my ($self, $locus) = @_;

    $locus =~ m/^[^,]+_(\d+)_\d+/;

    return $1;
}

sub end_of
{
    my ($self, $locus) = @_;

    $locus =~ m/\S+_\d+_(\d+)$/;

    return $1;
}

sub strand_of
{
    my ($self, $fid) = @_;
    my ($beg, $end);

#   warn "In strand_of, self=$self, fid=$fid";

    (undef, $beg, $end) = $self->boundaries_of($self->feature_location($fid));

    if ($beg < $end) { return '+'; } else { return '-'; }
}

=pod

=head1 find_contig_with_checksum

Find a contig in the given genome with the given checksum.


=cut

sub find_contig_with_checksum
{
    my($self, $genome, $checksum) = @_;

    #
    # This implementation scans all the contig files for the organism; when
    # we have contig checksums in the database this will simplify
    # significantly.
    #
    # For some efficiency, we cache the checksums we compute along the way since
    # it's probably likely we'll poke at other contigs for this organism.
    #

    my $gdir = "$FIG_Config::organisms/$genome";

    my $cached_cksum = $self->cached('_contig_checksum');

    if (opendir(my $dh, $gdir))
    {
	for my $file (map { "$gdir/$_" } grep { $_ =~ /^contigs\d*$/ } readdir($dh))
	{
	    local $/ = "\n>";

	    if (open(my $fh, "<$file"))
	    {
		while (<$fh>)
		{
		    chomp;

		    #
		    # Pull the contig identifier from the first line.
		    # We need the >? to handle the first line in the file;
		    # the others are removed by the chomp above because
		    # $/ is set to "\n>";
		    #

		    if (s/^>?\s*(\S+)([^\n]*)\n//)
		    {
			my $ident = $1;
			my $contig_txt = $_;

			$contig_txt =~ s/\s//sg;
			$contig_txt = uc($contig_txt);

			#
			# See if we have a cached value.
			#

			my $this_checksum;
			$this_checksum = $cached_cksum->{$genome, $ident};
			if (!$this_checksum)
			{

			    my($rd, $wr, $pid);

			    if (!($pid = open2($rd, $wr, "cksum")))
			    {
				die "Cannot run open2 cksum: $!";
			    }

			    $wr->write($contig_txt, length($contig_txt));

			    close($wr);

			    $_ = <$rd>;
			    close($rd);
			    waitpid $pid, 0;
			    chomp;

			    my @vals = split(/\s+/, $_);
			    $this_checksum = $vals[0];
			    $cached_cksum->{$genome, $ident} = $this_checksum;
			}
			if ($this_checksum == $checksum)
			{
			    return $ident;
			}
		    }
		}
	    }
	}
    }
}

sub contig_checksum
{
    my($self, $genome, $contig) = @_;

    my $contig_txt = $self->read_contig($genome, $contig);

    $contig_txt =~ s/\s//sg;
    $contig_txt = uc($contig_txt);

    my($rd, $wr, $pid);

    if (!($pid = open2($rd, $wr, "cksum")))
    {
	die "Cannot run open2 cksum: $!";
    }

    $wr->write($contig_txt, length($contig_txt));

    close($wr);

    $_ = <$rd>;
    close($rd);
    waitpid $pid, 0;

    chomp;
    my @vals = split(/\s+/, $_);
    if (wantarray)
    {
	return @vals;
    }
    else
    {
	return $vals[0];
    }
}

=pod

=head1 read_contig

Read a single contig from the contigs file.

=cut
sub read_contig
{
    my($self, $genome, $contig) = @_;

    #
    # Read the contig. The database has it in a set of chunks, but we
    # just use the seek to the starting point, and read up to the next "\n>".
    #

    my $ret = $self->db_handle->SQL(qq!SELECT fileno, seek FROM contig_seeks
				       WHERE genome = '$genome' and contig = '$contig' and
				       startn = 0!);
    if (!$ret or @$ret != 1)
    {
	return undef;
    }

    my($fileno, $seek) = @{$ret->[0]};
    my $file = $self->N2file($fileno);

    my($fh, $contig_txt);

    if (!open($fh, "<$file"))
    {
	warn "contig_checksum: could not open $file: $!\n";
	return undef;
    }

    seek($fh, $seek, 0);

    {
	local $/ = "\n>";

	$contig_txt = <$fh>;
	chomp($contig_txt);
    }

    return $contig_txt;
}

=pod

=head1 boundaries_of

usage: ($contig,$beg,$end) = $fig->boundaries_of($loc)

The location of a feature in a scalar context is

    contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each exon]

This routine takes as input such a location and reduces it to a single
description of the entire region containing the gene.

=cut

sub boundaries_of {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($location) = (@_ == 1) ? $_[0] : $_[1];
    my($contigQ);

    if (defined($location))
    {
	my @exons = split(/,/,$location);
	my($contig,$beg,$end);

	if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) &&
	    (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) &&
	    ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) &&
	    ($end = $1))
	{
	    return ($contig,$beg,$end);
	}
    }
    return undef;
}

=pod

=head1 all_features_detailed

usage: $fig->all_features_detailed($genome)

Returns a list of all feature IDs, location, aliases, type in the designated genome.
Used in gendb import to speed up the process
=cut

sub all_features_detailed {
    my($self,$genome) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT id, location, aliases, type FROM features WHERE  (genome = \'$genome\')");
        my @features;
    foreach my $tuple (@$relational_db_response)
    {
        push @features, $tuple unless ($self->is_deleted_fid($tuple->[0]));
    }
    return \@features;
}


=pod

=head1 all_features

usage: $fig->all_features($genome,$type)

Returns a list of all feature IDs of a specified type in the designated genome.  You would
usually use just

    $fig->pegs_of($genome) or
    $fig->rnas_of($genome)

which simply invoke this routine.

=cut

sub all_features {
    my($self,$genome,$type) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE  (genome = \'$genome\' AND (type = \'$type\'))");

    if (@$relational_db_response > 0)
    {
	return grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response;
    }
    return ();
}


=pod

=head1 pegs_of

usage: $fig->pegs_of($genome)

Returns a list of all PEGs in the specified genome.  Note that order is not
specified.

=cut

sub pegs_of {
    my($self,$genome) = @_;

    return $self->all_features($genome,"peg");
}


=pod

=head1 rnas_of

usage: $fig->rnas_of($genome)

Returns a list of all RNAs for the given genome.

=cut

sub rnas_of {
    my($self,$genome) = @_;

    return $self->all_features($genome,"rna");
}

=pod

=head1 feature_aliases

usage: @aliases = $fig->feature_aliases($fid)  OR
       $aliases = $fig->feature_aliases($fid)

Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature.
These must come from the tbl files, so add them there if you want to see them here.

In a scalar context, the aliases come back with commas separating them.

=cut

sub feature_aliases {
    my($self,$feature_id) = @_;
    my($rdbH,$relational_db_response,@aliases,$aliases,%aliases,$x);

    if ($self->is_deleted_fid($feature_id)) { return undef }

    $rdbH = $self->db_handle;
    @aliases = ();
    if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) &&
	    (@$relational_db_response == 1))
    {
	$aliases = $relational_db_response->[0]->[0];
	%aliases = map { $_ => 1 } split(/,/,$aliases);
    }

    if (($relational_db_response = $rdbH->SQL("SELECT alias FROM ext_alias WHERE ( id = \'$feature_id\' )")) &&
	(@$relational_db_response > 0))
    {
	foreach $x (@$relational_db_response)
	{
	    $aliases{$x->[0]} = 1;
	}
    }

    @aliases = sort keys(%aliases);

    return wantarray() ? @aliases : join(",",@aliases);
}

=pod

=head1 by_alias

usage: $peg = $fig->by_alias($alias)

Returns a FIG id if the alias can be converted.  Right now we convert aliases
of the form NP_* (RefSeq IDs), gi|* (GenBank IDs), sp|* (Swiss Prot), uni|* (UniProt),
kegg|* (KEGG) and maybe a few more

=cut

sub by_alias {
    my($self,$alias,$genome) = @_;
    my($rdbH,$relational_db_response,$peg);
    my ($peg, $flag) = FIGRules::NormalizeAlias($alias);
	if ($flag) {
		return $peg;
	} else {
    	my $genomeQ = $genome ? quotemeta $genome : "";
		$rdbH = $self->db_handle;

    	if (($relational_db_response = $rdbH->SQL("SELECT id FROM ext_alias WHERE ( alias = ? )", undef, $peg)) &&
			(@$relational_db_response > 0)) {

	    if (@$relational_db_response == 1) {
		$peg = $relational_db_response->[0]->[0];
		return wantarray() ? ($peg) : $peg;
	    } elsif (wantarray()) {
		return map { $_->[0] } @$relational_db_response;
	    }
	}
	return wantarray() ? () : "";
    }
}

sub to_alias {
    my($self,$fid,$type) = @_;

    my @aliases = grep { $_ =~ /^$type\|/ } $self->feature_aliases($fid);

    if (wantarray())
    {
	return @aliases;
    }
    elsif (@aliases > 0)
    {
	return $aliases[0];
    }
    else
    {
	return "";
    }
}

=pod

=head1 possibly_truncated

usage: $fig->possibly_truncated($fid)

Returns true iff the feature occurs near the end of a contig.

=cut

sub possibly_truncated {
    my($self,$feature_id) = @_;
    my($loc);

    if ($loc = $self->feature_location($feature_id))
    {
	my $genome = $self->genome_of($feature_id);
	my ($contig,$beg,$end) = $self->boundaries_of($loc);
	if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end)))
	{
	    return 0;
	}
    }
    return 1;
}

sub near_end {
    my($self,$genome,$contig,$x) = @_;

    return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300)));
}

sub is_real_feature {
    my($self,$fid) = @_;
    my($relational_db_response);

    if ($self->is_deleted_fid($fid)) { return 0 }

    my $rdbH = $self->db_handle;
    return (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( id = \'$fid\' )")) &&
	    (@$relational_db_response == 1)) ? 1 : 0;
}

################ Routines to process functional coupling for PEGs  ##########################

=pod

=head1 coupling_and_evidence

usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record)

A computation of couplings and evidence starts with a given peg and produces a list of
3-tuples.  Each 3-tuple is of the form

    [Score,CoupledToFID,Evidence]

Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing
a "pair of close homologs" of [$peg,CoupledToFID]).  The maximum score for a single
PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs.

If $keep_record is true, the system records the information, asserting coupling for each
of the pairs in the set of evidence, and asserting a pin from the given $fd through all
of the PCH entries used in forming the score.

=cut

sub coupling_and_evidence {
    my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_;
    my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1);

    if ($self->is_deleted_fid($feature_id)) { return undef }

    if ($feature_id =~ /^fig\|(\d+\.\d+)/)
    {
	$genome1 = $1;
    }
    else
    {
	return undef;
    }
	my $locations = $self->feature_location($feature_id);
    my($contig,$beg,$end) = $self->boundaries_of($locations);
    if (! $contig) { return () }

    ($neighbors,undef,undef) = $self->genes_in_region($self->genome_of($feature_id),
						      $contig,
						      &min($beg,$end) - $bound,
						      &max($beg,$end) + $bound);
    if (@$neighbors == 0) { return () }
    $similar1 = $self->acceptably_close($feature_id,$sim_cutoff);
    @hits = ();

    foreach $neigh (grep { $_ =~ /peg/ } @$neighbors)
    {
	next if ($neigh eq $feature_id);
	$similar2 = $self->acceptably_close($neigh,$sim_cutoff);
	($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound);
	if ($sc >= $coupling_cutoff)
	{
	    push(@hits,[$sc,$neigh,$ev]);
	}
    }
    if ($keep_record)
    {
	$self->add_chr_clusters_and_pins($feature_id,\@hits);
    }
    return sort { $b->[0] <=> $a->[0] } @hits;
}

sub fast_coupling {
    my($self,$peg,$bound,$coupling_cutoff) = @_;
    my($genome,$genome1,$genome2,$peg1,$peg2,$peg3,%maps,$loc,$loc1,$loc2,$loc3);
    my($pairs,$sc,%ev);

    if ($self->is_deleted_fid($peg)) { return undef }

    my @ans = ();

    $genome = &genome_of($peg);
    foreach $peg1 ($self->in_pch_pin_with($peg))
    {
	$peg1 =~ s/,.*$//;
	if ($peg ne $peg1)
	{
	    $genome1 = &genome_of($peg1);
	    $maps{$peg}->{$genome1} = $peg1;
	}
    }

    $loc = [&boundaries_of(scalar $self->feature_location($peg))];
    foreach $peg1 ($self->in_cluster_with($peg))
    {
	if ($peg ne $peg1)
	{
#	    print STDERR "peg1=$peg1\n";
	    $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))];
	    if (&close_enough($loc,$loc1,$bound))
	    {
		foreach $peg2 ($self->in_pch_pin_with($peg1))
		{
		    $genome2 = &genome_of($peg2);
		    if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3))
		    {
			$loc2 = [&boundaries_of(scalar $self->feature_location($peg2))];
			$loc3 = [&boundaries_of(scalar $self->feature_location($peg3))];
			if (&close_enough($loc2,$loc3,$bound))
			{
			    push(@{$ev{$peg1}},[$peg3,$peg2]);
			}
		    }
		}
	    }
	}
    }
    foreach $peg1 (keys(%ev))
    {
	$pairs = $ev{$peg1};
	$sc = $self->score([$peg,map { $_->[0] } @$pairs]);
	if ($sc >= $coupling_cutoff)
	{
	    push(@ans,[$sc,$peg1]);
	}
    }
    return sort { $b->[0] <=> $a->[0] } @ans;
}


sub score {
    my($self,$pegs) = @_;
    my(@ids);

    if ($self->{_no9s_scoring})
    {
	@ids = map { $self->maps_to_id($_) } grep { $_ !~ /^fig\|999999/ } @$pegs;
    }
    else
    {
	@ids = map { $self->maps_to_id($_) } @$pegs;
    }
    return &score1($self,\@ids) - 1;
}

sub score1 {
    my($self,$pegs) = @_;
    my($sim);

    my($iden_cutoff) = 97;
    my($iden_cutoff_gap) = 100 - $iden_cutoff;

    my($first,@rest) = @$pegs;
    my $count = 1;
    my %hits = map { $_ => 1 } @rest;
    my @ordered = sort { $b->[0] <=> $a->[0] }
                  map { $sim = $_; [$sim->iden,$sim->id2] }
                  grep { $hits{$_->id2} }
                  $self->sims($first,1000,1,"raw");
    my %ordered = map { $_->[1] => 1 } @ordered;
    foreach $_ (@rest)
    {
	if (! $ordered{$_})
	{
	    push(@ordered,[0,$_]);
	}
    }

    while ((@ordered > 0) && ($ordered[0]->[0] >= $iden_cutoff))
    {
	shift @ordered ;
    }
    while (@ordered > 0)
    {
	my $start = $ordered[0]->[0];
	$_ = shift @ordered;
	my @sub = ( $_->[1] );
	while ((@ordered > 0) && ($ordered[0]->[0] > ($start-$iden_cutoff_gap)))
	{
	    $_ = shift @ordered;
	    push(@sub, $_->[1]);
	}

	if (@sub == 1)
	{
	    $count++;
	}
	else
	{
	    $count += &score1($self,\@sub);
	}
    }
    return $count;
}

=pod

=head1 add_chr_clusters_and_pins

usage: $fig->add_chr_clusters_and_pins($peg,$hits)

The system supports retaining data relating to functional coupling.  If a user
computes evidence once and then saves it with this routine, data relating to
both "the pin" and the "clusters" (in all of the organisms supporting the
functional coupling) will be saved.

$hits must be a pointer to a list of 3-tuples of the sort returned by
$fig->coupling_and_evidence.

=cut

sub add_chr_clusters_and_pins {
    my($self,$peg,$hits) = @_;
    my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection);
    my($genome,$cluster,$pin,$peg2);

    if (@$hits > 0)
    {
	@clusters = ();
	@pins = ();
	push(@clusters,[$peg,map { $_->[1] } @$hits]);
	foreach $x (@$hits)
	{
	    ($sc,$neigh,$pairs) = @$x;
	    push(@pins,[$neigh,map { $_->[1] } @$pairs]);
	    foreach $y (@$pairs)
	    {
		$peg2 = $y->[0];
		if ($peg2 =~ /^fig\|(\d+\.\d+)/)
		{
		    $projection{$1}->{$peg2} = 1;
		}
	    }
	}
	@corr = ();
	@orgs = keys(%projection);
	if (@orgs > 0)
	{
	    foreach $genome (sort { $a <=> $b } @orgs)
	    {
		push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}}));
	    }
	    push(@pins,[$peg,@corr]);
	}

	foreach $cluster (@clusters)
	{
	    $self->add_chromosomal_cluster($cluster);
	}

	foreach $pin (@pins)
	{
	    $self->add_pch_pin($pin);
	}
    }
}

sub coupling_ev {
    my($self,$genome1,$sim1,$sim2,$bound) = @_;
    my($ev,$sc,$i,$j);

    $ev = [];

    $i = 0;
    $j = 0;
    while (($i < @$sim1) && ($j < @$sim2))
    {
	if ($sim1->[$i]->[0] < $sim2->[$j]->[0])
	{
	    $i++;
	}
	elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0])
	{
	    $j++;
	}
	else
	{
	    $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev);
	    $i++;
	    $j++;
	}
    }
    return ($self->score([map { $_->[0] } @$ev]),$ev);
}

sub accumulate_ev {
    my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_;
    my($genome2,@locs1,@locs2,$i,$j,$x);

    if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 }

    $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/;
    $genome2 = $1;
    @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1;
    @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2;

    for ($i=0; ($i < @$feature_ids1); $i++)
    {
	for ($j=0; ($j < @$feature_ids2); $j++)
	{
	    if (($feature_ids1->[$i] ne $feature_ids2->[$j]) &&
		&close_enough($locs1[$i],$locs2[$j],$bound))
	    {
		push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]);
	    }
	}
    }
}

sub close_enough {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($locs1,$locs2,$bound) = @_;

#   print STDERR &Dumper(["close enough",$locs1,$locs2]);
    return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound));
}

sub acceptably_close {
    my($self,$feature_id,$sim_cutoff) = @_;
    my(%by_org,$id2,$genome,$sim);

    my($ans) = [];

    foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig"))
    {
	$id2 = $sim->id2;
	if ($id2 =~ /^fig\|(\d+\.\d+)/)
	{
	    my $genome = $1;
	    if (! $self->is_eukaryotic($genome))
	    {
		push(@{$by_org{$genome}},$id2);
	    }
	}
    }
    foreach $genome (sort { $a <=> $b } keys(%by_org))
    {
	push(@$ans,[$genome,$by_org{$genome}]);
    }
    return $ans;
}

################ Translations of PEGsand External Protein Sequences  ##########################


=pod

=head1 translatable

usage: $fig->translatable($prot_id)

The system takes any number of sources of protein sequences as input (and builds an nr
for the purpose of computing similarities).  For each of these input fasta files, it saves
(in the DB) a filename, seek address and length so that it can go get the translation if
needed.  This routine simply returns true iff info on the translation exists.

=cut

sub translatable {
    my($self,$prot) = @_;

    return &translation_length($self,$prot) ? 1 : 0;
}


=pod

=head1 translation_length

usage: $len = $fig->translation_length($prot_id)

The system takes any number of sources of protein sequences as input (and builds an nr
for the purpose of computing similarities).  For each of these input fasta files, it saves
(in the DB) a filename, seek address and length so that it can go get the translation if
needed.  This routine returns the length of a translation.  This does not require actually
retrieving the translation.

=cut

sub translation_length {
    my($self,$prot) = @_;

    if ($self->is_deleted_fid($prot)) { return undef }

    $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT slen,seek FROM protein_sequence_seeks
                                             WHERE  id = \'$prot\' ");

    my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response;
    return (@vals > 0) ? $vals[0]->[0] : undef;
}


=pod

=head1 get_translation

usage: $translation = $fig->get_translation($prot_id)

The system takes any number of sources of protein sequences as input (and builds an nr
for the purpose of computing similarities).  For each of these input fasta files, it saves
(in the DB) a filename, seek address and length so that it can go get the translation if
needed.  This routine returns a protein sequence.

=cut

sub get_translation {
    my($self,$id) = @_;
    my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran);

    if ($self->is_deleted_fid($id)) { return '' }

    $rdbH = $self->db_handle;
    $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;

    $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE  id = \'$id\' ");

    if ($relational_db_response && @$relational_db_response > 0)
    {
	my @vals = sort { $b->[1] <=> $a->[1] } @$relational_db_response;
	($fileN,$seek,$ln) = @{$vals[0]};
	if (($fh = $self->openF($self->N2file($fileN))) &&
	     ($ln > 10))
	{
	    seek($fh,$seek,0);
	    read($fh,$tran,$ln-1);
	    $tran =~ s/\s//g;
	    return $tran;
	}
    }
    return '';
}

=pod

=head1 mapped_prot_ids

usage: @mapped = $fig->mapped_prot_ids($prot)

This routine is at the heart of maintaining synonyms for protein sequences.  The system
determines which protein sequences are "essentially the same".  These may differ in length
(presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended).
Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted
by length.

=cut

sub mapped_prot_ids {
    my($self,$id) = @_;

    if ($self->is_deleted_fid($id)) { return () }

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE  syn_id = \'$id\' ");
    if ($relational_db_response && (@$relational_db_response == 1))
    {
	$id = $relational_db_response->[0]->[0];
    }

    $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' ");
    if ($relational_db_response && (@$relational_db_response > 0))
    {
	return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response);
    }
    else
    {
	return ([$id,$self->translation_length($id)]);
    }
}

sub maps_to_id {
    my($self,$id) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE  syn_id = \'$id\' ");
    return ($relational_db_response && (@$relational_db_response == 1)) ? $relational_db_response->[0]->[0] : $id;
}

################ Assignments of Function to PEGs  ##########################

# set to undef to unset user
#
sub set_user {
    my($self,$user) = @_;

    $self->{_user} = $user;
}

sub get_user {
    my($self) = @_;

    return $self->{_user};
}

=pod

=head1 function_of

usage: @functions = $fig->function_of($peg)  OR
       $function  = $fig->function_of($peg,$user)

In a list context, you get back a list of 2-tuples.  Each 2-tuple is of the
form [MadeBy,Function].

In a scalar context,

    1. user is "master" if not specified
    2. function returned is the user's, if one exists; otherwise, master's, if one exists

In a scalar context, you get just the function.

=cut

# Note that we do not return confidence.  I propose a separate function to get both
# function and confidence
#
sub function_of {
    my($self,$id,$user) = @_;
    my($relational_db_response,@tmp,$entry,$i);
    my $wantarray = wantarray();
    my $rdbH = $self->db_handle;

    if ($self->is_deleted_fid($id)) { return $wantarray ? () : "" }

    if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user))
    {
	if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) &&
	    (@$relational_db_response >= 1))
	{
	    @tmp = sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/^\s//; $_->[1] =~ s/(\t\S)?\s*$//; [$_->[0],$_->[1]] } @$relational_db_response;
	    for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {}
	    if ($i < @tmp)
	    {
		$entry = splice(@tmp,$i,1);
		unshift @tmp, ($entry);
	    }

	    my $val;
	    if     ($wantarray)                                         { return @tmp }
	    elsif  ($user && ($val  = &extract_by_who(\@tmp,$user)))    { return $val }
            elsif  ($user && ($val  = &extract_by_who(\@tmp,"master"))) { return $val }
            else                                                        { return ""   }
	}
    }
    else
    {
	if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) &&
	    (@$relational_db_response >= 1))
	{
	    $relational_db_response->[0]->[0]  =~ s/^\s//; $relational_db_response->[0]->[0] =~ s/(\t\S)?\s*$//;
	    return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0];
	}
    }

    return $wantarray ? () : "";
}

=pod

=head1 translated_function_of

usage: $function  = $fig->translated_function_of($peg,$user)

You get just the translated function.

=cut

sub translated_function_of {
    my($self,$id,$user) = @_;

    my $func = $self->function_of($id,$user);
    if ($func)
    {
	$func = $self->translate_function($func);
    }
    return $func;
}


sub extract_by_who {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($xL,$who) = @_;
    my($i);

    for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {}
    return ($i < @$xL) ? $xL->[$i]->[1] : "";
}


=pod

=head1 translate_function

usage: $translated_func = $fig->translate_function($func)

Translates a function based on the function.synonyms table.

=cut

sub translate_function {
    my($self,$function) = @_;

    my ($tran,$from,$to,$line);
    if (! ($tran = $self->{_function_translation}))
    {
	$tran = {};
	if (open(TMP,"<$FIG_Config::global/function.synonyms"))
	{
	    while (defined($line = <TMP>))
	    {
		chomp $line;
		($from,$to) = split(/\t/,$line);
		$tran->{$from} = $to;
	    }
	    close(TMP);
	}
	foreach $from (keys(%$tran))
	{
	    $to = $tran->{$from};
	    if ($tran->{$to})
	    {
		delete $tran->{$from};
	    }
	}
	$self->{_function_translation} = $tran;
    }

    while ($to = $tran->{$function})
    {
        $function = $to;
    }
    return $function;
}

=pod

=head1 assign_function

usage: $fig->assign_function($peg,$user,$function,$confidence)

Assigns a function.  Note that confidence can (and should be if unusual) included.
Note that no annotation is written.  This should normally be done in a separate
call of the form



=cut

sub assign_function {
    my($self,$peg,$user,$function,$confidence) = @_;
    my($role,$roleQ,$kvs,$kv,$k,$v);

    if (! $self->is_real_feature($peg)) { return 0 }

    my $genome = $self->genome_of($peg);

    $function =~ s/\s+/ /sg;
    $function =~ s/^\s+//;
    $function =~ s/\s+$//;
    if ($function =~ /^(.*?)\!(.*)/)
    {
	($function,$kvs) = ($1,$2);
	if ($kvs)
	{
	    $kvs =~ s/^\s+//;
	    $kvs =~ s/\s+$//;
	    foreach $kv (split(/\s+\!\s+/,$kvs))
	    {
		if ($kv =~ /^([A-Za-z0-9._\-\+\%]+)\s+\^\s+(.*)$/)
		{
		    ($k,$v) = ($1,$2);
		    if ($v !~ /\S/)
		    {
			&replace_peg_key_value($self,$peg,$k,"");
		    }
		    else
		    {
			&replace_peg_key_value($self,$peg,$k,$v);
		    }
		}
		elsif ($kv =~ /^([A-Za-z0-9._\-\+\%]+)$/)
		{
		    &replace_peg_key_value($self,$peg,$1,1);
		}
	    }
	}
    }

    my $rdbH = $self->db_handle;
    $confidence = $confidence ? $confidence : "";

    $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )");

    my $funcQ = quotemeta $function;
    $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )");
    $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )");

    foreach $role (&roles_of_function($function))
    {
	$roleQ = quotemeta $role;
	$rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\',  \'$genome\' )");
    }

    &verify_dir("$FIG_Config::organisms/$genome/UserModels");
    if ($user ne "master")
    {
	&verify_dir("$FIG_Config::organisms/$genome/UserModels/$user");
    }

    my $file;
    if ((($user eq "master") && ($file = "$FIG_Config::organisms/$genome/assigned_functions") && open(TMP,">>$file")) ||
	(($user ne "master") && ($file = "$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions") && open(TMP,">>$file")))
    {
	flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions";
	seek(TMP,0,2)      || confess "failed to seek to the end of the file";
	print TMP "$peg\t$function\t$confidence\n";
	close(TMP);
        chmod(0777,$file);
	return 1;
    }
    else
    {
	print STDERR "FAILED ASSIGNMENT: $peg\t$function\t$confidence\n";
    }
    return 0;
}

sub hypo {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $x = (@_ == 1) ? $_[0] : $_[1];

    if (! $x)                             { return 1 }
    if ($x =~ /hypoth/i)                  { return 1 }
    if ($x =~ /conserved protein/i)       { return 1 }
    if ($x =~ /gene product/i)            { return 1 }
    if ($x =~ /interpro/i)                { return 1 }
    if ($x =~ /B[sl][lr]\d/i)             { return 1 }
    if ($x =~ /^U\d/)                     { return 1 }
    if ($x =~ /^orf/i)                    { return 1 }
    if ($x =~ /uncharacterized/i)         { return 1 }
    if ($x =~ /psedogene/i)               { return 1 }
    if ($x =~ /^predicted/i)              { return 1 }
    if ($x =~ /AGR_/)                     { return 1 }
    if ($x =~ /similar to/i)              { return 1 }
    if ($x =~ /similarity/i)              { return 1 }
    if ($x =~ /glimmer/i)                 { return 1 }
    if ($x =~ /unknown/i)                 { return 1 }
    if (($x =~ /domain/i) ||
	($x =~ /^y[a-z]{2,4}\b/i) ||
        ($x =~ /complete/i) ||
        ($x =~ /predicted by Psort/) ||
        ($x =~ /^bh\d+/i) ||
        ($x =~ /cds_/i) ||
	($x =~ /^[a-z]{2,3}\d+/i) ||
	($x =~ /similar to/i) ||
	($x =~ / identi/i) ||
        ($x =~ /structural feature/i))    { return 1 }
    return 0;
}

############################  Similarities ###############################

=pod

=head1 sims

usage: @sims = $fig->sims($peg,$maxN,$maxP,$select)

Returns a list of similarities for $peg such that

    there will be at most $maxN similarities,

    each similarity will have a P-score <= $maxP, and

    $select gives processing instructions:

        "raw" means that the similarities will not be expanded (by far fastest option)
        "fig" means return only similarities to fig genes
        "all" means that you want all the expanded similarities.

By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant
protein collection, and converting it to a set of similarities (one for each of the
proteins that are essentially identical to the representative in the nr).

Each entry in @sims is a refence to an array. These are the values in each array position:

0.   The query peg
1.   The similar peg
2.   The percent id
3.   Alignment length
4.   Mismatches
5.   Gap openings
6.   The start of the match in the query peg
7.   The end of the match in the query peg
8.   The start of the match in the similar peg
9.   The end of the match in the similar peg
10.  E value
11.  Bit score
12.  Length of query peg
13.  Length of similar peg
14.  Method

=cut

sub sims {
    my ($self,$id,$maxN,$maxP,$select,$max_expand) = @_;
    my($sim);
    $max_expand = defined($max_expand) ? $max_expand : $maxN;

    if ($self->is_deleted_fid($id)) { return () }

    my @sims = ();
    my @maps_to = $self->mapped_prot_ids($id);
    if (@maps_to > 0)
    {
	my $rep_id   = $maps_to[0]->[0];
	my @entry = grep { $_->[0] eq $id } @maps_to;
	if ((@entry == 1) && defined($entry[0]->[1]))
	{
	    if ((! defined($maps_to[0]->[1])) ||
		(! defined($entry[0]->[1])))
	    {
		print STDERR &Dumper(\@maps_to,\@entry);
		confess "bad";
	    }
	    my $delta = $maps_to[0]->[1] - $entry[0]->[1];
	    my @raw_sims = &get_raw_sims($self,$rep_id,$maxN,$maxP);
	    if ($id ne $rep_id)
	    {
		foreach $sim (@raw_sims)
		{

		    $sim->[0] = $id;
		    $sim->[6] -= $delta;
		    $sim->[7] -= $delta;
		}
	    }
	    if (($max_expand > 0) && ($select ne "raw"))
	    {
		unshift(@raw_sims,bless([$id,
					 $rep_id,
					 100.00,
					 $entry[0]->[1],
					 0,
					 0,
					 1,$entry[0]->[1],
					 $delta+1,$maps_to[0]->[1],
					 0.0,
					 2 * $entry[0]->[1],
					 $entry[0]->[1],
					 $maps_to[0]->[1],
					 "blastp",
					 undef,
					 undef
					],'Sim'));
		$max_expand++;
	    }
	    @sims = grep { $_->id1 ne $_->id2 } &expand_raw_sims($self,\@raw_sims,$maxP,$select,0,$max_expand);
	}
    }
    return grep { ! $self->is_deleted_fid($_->id2) } @sims;
}

sub expand_raw_sims {
    my($self,$raw_sims,$maxP,$select,$dups,$max_expand) = @_;
    my($sim,$id2,%others,$x);

    my @sims = ();
    foreach $sim (@$raw_sims)
    {
	next if ($sim->psc > $maxP);
	$id2 = $sim->id2;
	next if ($others{$id2} && (! $dups));

	$others{$id2} = 1;
	if (($select && ($select eq "raw")) || ($max_expand <= 0))
	{
	    push(@sims,$sim);
	}
	else
	{
	    my @relevant;
	    $max_expand--;

	    my @maps_to = $self->mapped_prot_ids($id2);
	    if ((! $select) || ($select eq "fig"))
	    {
		@relevant = grep { $_->[0] =~ /^fig/ }  @maps_to;
	    }
	    elsif ($select && ($select =~ /^ext/i))
	    {
		@relevant = grep { $_->[0] !~ /^fig/ } @maps_to;
	    }
	    else
	    {
		@relevant = @maps_to;
	    }

	    foreach $x (@relevant)
	    {
		my $sim1 = [@$sim];
		my($x_id,$x_ln) = @$x;
		defined($x_ln) || confess "x_ln id2=$id2 x_id=$x_id";
		defined($maps_to[0]->[1]) || confess "maps_to";
		my $delta2 = $maps_to[0]->[1] - $x_ln;
		$sim1->[1] = $x_id;
		$sim1->[8] -= $delta2;
		$sim1->[9] -= $delta2;
		bless($sim1,"Sim");
		push(@sims,$sim1);
	    }
	}
    }
    return @sims;
}

sub get_raw_sims {
    my($self,$rep_id,$maxN,$maxP) = @_;
    my(@sims,$seek,$fileN,$ln,$fh,$file,$readC,@lines,$i,$sim);
    my($sim_chunk,$psc,$id2);

    $maxN = $maxN ? $maxN : 500;

    @sims = ();
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' ");
    foreach $sim_chunk (@$relational_db_response)
    {
	($seek,$fileN,$ln) = @$sim_chunk;
	$file = $self->N2file($fileN);
	$fh   = $self->openF($file);
	if (! $fh)
	{
	    confess "could not open sims for $file";
	}
	$readC = &read_block($fh,$seek,$ln-1);
	@lines = grep    {
	                     (@$_ == 15) &&
			     ($_->[12] =~ /^\d+$/) &&
			     ($_->[13] =~ /^\d+$/) &&
			     ($_->[6] =~ /^\d+$/) &&
			     ($_->[7] =~ /^\d+$/) &&
			     ($_->[8] =~ /^\d+$/) &&
			     ($_->[9] =~ /^\d+$/) &&
			     ($_->[2] =~ /^[0-9.]+$/) &&
			     ($_->[10] =~ /^[0-9.e-]+$/)
			 }
		         map  { [split(/\t/,$_),"blastp"] }
	                 @$readC;

	@lines = sort { $b->[11] <=> $a->[11] } @lines;

	for ($i=0; ($i < @lines); $i++)
	{
	    $psc    = $lines[$i]->[10];
	    $id2    = $lines[$i]->[1];
	    if ($maxP >= $psc)
	    {
		$sim = $lines[$i];
		bless($sim,"Sim");
		push(@sims,$sim);
		if (@sims == $maxN) { return @sims }
	    }
	}
    }
    return @sims;
}

sub read_block {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($fh,$seek,$ln) = @_;
    my($piece,$readN);

    seek($fh,$seek,0);
    my @lines = ();
    my $leftover = "";
    while ($ln > 0)
    {
	my $ln1 = ($ln <= 10000) ? $ln : 10000;
	$readN = read($fh,$piece,$ln1);
	($readN == $ln1)
	    || confess "could not read the block of sims at $seek for $ln1 characters; $readN actually read";
	my @tmp = split(/\n/,$piece);
	if ($leftover)
	{
	    $tmp[0] = $leftover . $tmp[0];
	}

	if (substr($piece,-1) eq "\n")
	{
	    $leftover = "";
	}
	else
	{
	    $leftover = pop @tmp;
	}
	push(@lines,@tmp);
	$ln -= 10000;
    }
    if ($leftover) { push(@lines,$leftover) }
    return \@lines;
}


sub bbhs {
    my($self,$peg,$cutoff,$frac_match) = @_;
    my($sim,$peg2,$genome2,$i,@sims2,%seen);

    if ($self->is_deleted_fid($peg)) { return () }

    $frac_match = defined($frac_match) ? $frac_match : 0;

    $cutoff = defined($cutoff) ? $cutoff : 1.0e-10;
    my @bbhs = ();
    my @precomputed = ();
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT seek, others FROM bbhs WHERE peg = \'$peg\' ");
    if (@$relational_db_response == 1)
    {
	my($seek_set,$others) = @{$relational_db_response->[0]};
	if (open(CORES,"<$FIG_Config::global/bbh.cores"))
	{
	    my $seek;
	    foreach $seek (split(/,/,$seek_set))
	    {
		seek(CORES,$seek,0);
		$_ = <CORES>;
		chop;
		push(@precomputed,split(/,/,$_));
	    }
	    close(CORES);
	}
	push(@precomputed,split(/,/,$others));
    }
    my %bbhs = map { $_ => 1 } @precomputed;

    foreach $sim ($self->sims($peg,10000,$cutoff,"fig"))
    {
	$peg2 = $sim->id2;
	my $frac = &FIG::min(($sim->e1+1 - $sim->b1) / $sim->ln1, ($sim->e2+1 - $sim->b2) / $sim->ln2);
	if ($bbhs{$peg2} && ($frac >= $frac_match))
	{
	    push(@bbhs,[$peg2,$sim->psc]);
	}
    }
    return @bbhs;
}

#
# Modeled after the Sprout call of the same name.
#


sub bbh_list
{
    my($self, $genome, $features) = @_;

    my $cutoff = 1.0e-10;

    my $out = {};
    for my $feature (@$features)
    {
	my @bbhs = $self->bbhs($feature, $cutoff);

	$out->{$feature} = [grep { /fig\|$genome\.peg/ } map { $_->[0] } @bbhs];
    }
    return $out;
}

=pod

=head1 dsims

usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select)

Returns a list of similarities for $peg such that

    there will be at most $maxN similarities,

    each similarity will have a P-score <= $maxP, and

    $select gives processing instructions:

        "raw" means that the similarities will not be expanded (by far fastest option)
        "fig" means return only similarities to fig genes
        "all" means that you want all the expanded similarities.

By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant
protein collection, and converting it to a set of similarities (one for each of the
proteins that are essentially identical to the representative in the nr).

The "dsims" or "dynamic sims" are not precomputed.  They are computed using a heuristic which
is much faster than blast, but misses some similarities.  Essentially, you have an "index" or
representative sequences, a quick blast is done against it, and if there are any hits these are
used to indicate which sub-databases to blast against.

=cut

sub dsims {
    my($self,$id,$seq,$maxN,$maxP,$select) = @_;
    my($sim,$sub_dir,$db,$hit,@hits,%in);

    my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3);
    foreach $sim (@index)
    {
	if ($sim->id2 =~ /_(\d+)$/)
	{
	    $in{$1}++;
	}
    }

    @hits = ();
    foreach $db (keys(%in))
    {
	$sub_dir = $db % 1000;
	push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP));

    }

    if (@hits == 0)
    {
	push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP));
    }

    @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits;
    if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 }
    return &expand_raw_sims($self,\@hits,$maxP,$select);
}

sub blastit {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($id,$seq,$db,$maxP) = @_;

    if (! $maxP) { $maxP = 1.0e-5 }
    my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP");
    my $tmp1 = $tmp->{$id};
    if ($tmp1)
    {
	return @$tmp1;
    }
    return ();
}

sub related_by_func_sim {
    my($self,$peg,$user) = @_;
    my($func,$sim,$id2,%related);

    if ($self->is_deleted_fid($peg)) { return () }

    if (($func = $self->function_of($peg,$user)) && (! &FIG::hypo($func)))
    {
	foreach $sim ($self->sims($peg,500,1,"fig",500))
	{
	    $id2 = $sim->id2;
	    if ($func eq $self->function_of($id2,$user))
	    {
		$related{$id2} = 1;
	    }
	}
    }
    return keys(%related);
}

################################# chromosomal clusters ####################################

=pod

=head1 in_cluster_with

usage: @pegs = $fig->in_cluster_with($peg)

Returns the set of pegs that are thought to be clustered with $peg (on the
chromosome).

=cut

sub in_cluster_with {
    my($self,$peg) = @_;
    my($set,$id,%in);

    return $self->in_set_with($peg,"chromosomal_clusters","cluster_id");
}

=pod

=head1 add_chromosomal_clusters

usage: $fig->add_chromosomal_clusters($file)

The given file is supposed to contain one predicted chromosomal cluster per line (either
comma or tab separated pegs).  These will be added (to the extent they are new) to those
already in $FIG_Config::global/chromosomal_clusters.

=cut


sub add_chromosomal_clusters {
    my($self,$file) = @_;
    my($set,$added);

    open(TMPCLUST,"<$file")
	|| die "aborted";
    while (defined($set = <TMPCLUST>))
    {
	print STDERR ".";
	chomp $set;
	$added += $self->add_chromosomal_cluster([split(/[\t,]+/,$set)]);
    }
    close(TMPCLUST);

    if ($added)
    {
	my $rdbH = $self->db_handle;
	$self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters");
	return 1;
    }
    return 0;
}

#=pod
#
#=head1 export_chromosomal_clusters
#
#usage: $fig->export_chromosomal_clusters
#
#Invoking this routine writes the set of chromosomal clusters as known in the
#relational DB back to $FIG_Config::global/chromosomal_clusters.
#
#=cut
#
sub export_chromosomal_clusters {
    my($self) = @_;

    $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters");
}

sub add_chromosomal_cluster {
    my($self,$ids) = @_;
    my($id,$set,%existing,%in,$new,$existing,$new_id);

#   print STDERR "adding cluster ",join(",",@$ids),"\n";
    foreach $id (@$ids)
    {
	foreach $set ($self->in_sets($id,"chromosomal_clusters","cluster_id"))
	{
	    $existing{$set} = 1;
	    foreach $id ($self->ids_in_set($set,"chromosomal_clusters","cluster_id"))
	    {
		$in{$id} = 1;
	    }
	}
    }
#   print &Dumper(\%existing,\%in);

    $new = 0;
    foreach $id (@$ids)
    {
	if (! $in{$id})
	{
	    $in{$id} = 1;
	    $new++;
	}
    }
#   print STDERR "$new new ids\n";
    if ($new)
    {
	foreach $existing (keys(%existing))
	{
	    $self->delete_set($existing,"chromosomal_clusters","cluster_id");
	}
	$new_id = $self->next_set("chromosomal_clusters","cluster_id");
#	print STDERR "adding new cluster $new_id\n";
	$self->insert_set($new_id,[keys(%in)],"chromosomal_clusters","cluster_id");
	return 1;
    }
    return 0;
}

################################# PCH pins  ####################################

=pod

=head1 in_pch_pin_with

usage: $fig->in_pch_pin_with($peg)

Returns the set of pegs that are believed to be "pinned" to $peg (in the
sense that PCHs occur containing these pegs over significant phylogenetic
distances).

=cut

sub in_pch_pin_with {
    my($self,$peg) = @_;
    my($set,$id,%in);

    return $self->in_set_with($peg,"pch_pins","pin");
}

=pod

=head1 add_pch_pins

usage: $fig->add_pch_pins($file)

The given file is supposed to contain one set of pinned pegs per line (either
comma or tab seprated pegs).  These will be added (to the extent they are new) to those
already in $FIG_Config::global/pch_pins.

=cut

sub add_pch_pins {
    my($self,$file) = @_;
    my($set,$added);

    open(TMPCLUST,"<$file")
	|| die "aborted";
    while (defined($set = <TMPCLUST>))
    {
	print STDERR ".";
	chomp $set;
	my @tmp = split(/[\t,]+/,$set);
	if (@tmp < 200)
	{
	    $added += $self->add_pch_pin([@tmp]);
	}
    }
    close(TMPCLUST);

    if ($added)
    {
	my $rdbH = $self->db_handle;
	$self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins");
	return 1;
    }
    return 0;
}

sub export_pch_pins {
    my($self) = @_;

    $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins");
}

sub add_pch_pin {
    my($self,$ids) = @_;
    my($id,$set,%existing,%in,$new,$existing,$new_id);

#   print STDERR "adding cluster ",join(",",@$ids),"\n";
    foreach $id (@$ids)
    {
	foreach $set ($self->in_sets($id,"pch_pins","pin"))
	{
	    $existing{$set} = 1;
	    foreach $id ($self->ids_in_set($set,"pch_pins","pin"))
	    {
		$in{$id} = 1;
	    }
	}
    }
#   print &Dumper(\%existing,\%in);

    $new = 0;
    foreach $id (@$ids)
    {
	if (! $in{$id})
	{
	    $in{$id} = 1;
	    $new++;
	}
    }

    if ($new)
    {
	if (keys(%in) < 300)
	{
	    foreach $existing (keys(%existing))
	    {
		$self->delete_set($existing,"pch_pins","pin");
	    }
	    $new_id = $self->next_set("pch_pins","pin");
#	    print STDERR "adding new pin $new_id\n";
	    $self->insert_set($new_id,[keys(%in)],"pch_pins","pin");
	}
	else
	{
	    $new_id = $self->next_set("pch_pins","pin");
#	    print STDERR "adding new pin $new_id\n";
	    $self->insert_set($new_id,$ids,"pch_pins","pin");
	}
	return 1;
    }
    return 0;
}

################################# Annotations  ####################################

=pod

=head1 add_annotation

usage: $fig->add_annotation($fid,$user,$annotation)

$annotation is added as a time-stamped annotation to $peg showing $user as the
individual who added the annotation.

=cut

sub add_annotation {
    my($self,$feature_id,$user,$annotation) = @_;
    my($genome);

    if ($self->is_deleted_fid($feature_id)) { return 0 }

#   print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n";
    if ($genome = $self->genome_of($feature_id))
    {
	my $file = "$FIG_Config::organisms/$genome/annotations";
	my $fileno = $self->file2N($file);
	my $time_made = time;
	my $ma   = ($annotation =~ /^Set master function to/);


	if (open(TMP,">>$file"))
	{
	    flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions";
	    seek(TMP,0,2)      || confess "failed to seek to the end of the file";

	    my $seek1 = tell TMP;
	    print TMP "$feature_id\n$time_made\n$user\n$annotation", (substr($annotation,-1) eq "\n") ? "" : "\n","//\n";
	    my $seek2 = tell TMP;
	    close(TMP);
	    chmod 0777, $file;
	    my $ln = ($seek2 - $seek1) - 3;
	    my $rdbH = $self->db_handle;
	    if ($rdbH->SQL("INSERT INTO annotation_seeks ( fid, dateof, who, ma, fileno, seek, len ) VALUES ( \'$feature_id\', $time_made, \'$user\', \'$ma\', $fileno, $seek1, $ln )"))
	    {
		return 1;
	    }
	}
    }
    return 0;
}

=pod

=head1 merged_related_annotations

usage: @annotations = $fig->merged_related_annotations($fids)

The set of annotations of a set of PEGs ($fids) is returned as a list of 4-tuples.
Each entry in the list is of the form [$fid,$timestamp,$user,$annotation].

=cut

sub merged_related_annotations {
    my($self,$fids) = @_;
    my($fid);
    my(@ann) = ();

    foreach $fid (@$fids)
    {
	push(@ann,$self->feature_annotations1($fid));
    }
    return map { $_->[1] = localtime($_->[1]); $_ } sort { $a->[1] <=> $b->[1] } @ann;
}

=pod

=head1 feature_annotations

usage: @annotations = $fig->feature_annotations($fid)

The set of annotations of $fid is returned as a list of 4-tuples.  Each entry in the list
is of the form [$fid,$timestamp,$user,$annotation].

=cut


sub feature_annotations {
    my($self,$feature_id,$rawtime) = @_;

    if ($self->is_deleted_fid($feature_id)) { return () }

    if ($rawtime)
    {
	return $self->feature_annotations1($feature_id);
    }
    else
    {
	return map {  $_->[1] = localtime($_->[1]); $_ } $self->feature_annotations1($feature_id);
    }
}

sub feature_annotations1 {
    my($self,$feature_id) = @_;
    my($tuple,$fileN,$seek,$ln,$annotation,$feature_idQ);
    my($file,$fh);

    if ($self->is_deleted_fid($feature_id)) { return () }

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len  FROM annotation_seeks WHERE  fid = \'$feature_id\' ");
    my @annotations = ();

    foreach $tuple (@$relational_db_response)
    {
	($fileN,$seek,$ln) = @$tuple;
	$annotation = $self->read_annotation($fileN,$seek,$ln);
	$feature_idQ = quotemeta $feature_id;
	if ($annotation =~ /^$feature_idQ\n(\d+)\n([^\n]+)\n(.*)/s)
	{
	    push(@annotations,[$feature_id,$1,$2,$3]);
	}
	else
	{
	    print STDERR "malformed annotation\n$annotation\n";
	}
    }
    return sort { $a->[1] <=> $b->[1] } @annotations;
}

sub read_annotation {
    my($self,$fileN,$seek,$ln) = @_;
    my($readN,$readC);

    my $file = $self->N2file($fileN);
    my $fh   = $self->openF($file);
    if (! $fh)
    {
	confess "could not open annotations for $file";
    }
    seek($fh,$seek,0);
    $readN = read($fh,$readC,$ln);
    ($readN == $ln)
	|| confess "could not read the block of annotations at $seek for $ln characters; $readN actually read from $file\n$readC";
    return $readC;
}

sub epoch_to_readable {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($epoch) = @_;

    my($sec,$min,$hr,$dd,$mm,$yr) = localtime($epoch);
    $mm++;
    $yr += 1900;
    return "$mm-$dd-$yr:$hr:$min:$sec";
}

#
# This now calls assignments_made_full and remaps the output.
#
sub assignments_made
{
    my($self,$genomes,$who,$date) = @_;

    my @a = $self->assignments_made_full($genomes, $who, $date);

    return map { [ @{$_}[0,1]] } @a;
}

#
# Looks up and returns assignments made; return is a list of
# tuples [peg, assignment, date, who]
#

sub assignments_made_full {
    my($self,$genomes,$who,$date) = @_;
    my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann);
    my($epoch_date,$when,%sofar,$x);

    if (! defined($genomes)) { $genomes = [$self->genomes] }

    my %genomes = map { $_ => 1 } @$genomes;
    if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/)
    {
	my($mm,$dd,$yyyy) = ($1,$2,$3);
	$epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0);
    }
    elsif ($date =~ /^\d+$/)
    {
	$epoch_date = $date;
    }
    else
    {
	$epoch_date = 0;
    }
    $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0;
    my @assignments = ();
    my $rdbH = $self->db_handle;
    if ($who eq "master")
    {
	$relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len  FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))");
    }
    else
    {
	$relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len  FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))");
    }

    if ($relational_db_response && (@$relational_db_response > 0))
    {
	foreach $entry (@$relational_db_response)
	{
  	    ($fid,$when,$fileno,$seek,$len) = @$entry;
	    if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid)))
	    {
		if ($len < 4)
		{
		    print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n";
		    next;
		}
		$ann = $self->read_annotation($fileno,$seek,$len);

 		if (($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) &&
 		    (($who eq $3) || (($4 eq "master ") && ($who eq "master"))) &&
 		    ($2 >= $epoch_date))
 		{
 		    if ((! $sofar{$1}) || (($x = $sofar{$1}) && ($when > $x->[0])))
 		    {
 			$sofar{$1} = [$when, $5, $3];
 		    }
		}
	    }
	}
    }
    @assignments = map { $x = $sofar{$_}; [$_,$x->[1], $x->[0], $x->[2]] } keys(%sofar);
    return @assignments;
}

sub assignments_made_for_protein {
    my($self, $fid) = @_;
    my($relational_db_response,$entry,$fileno,$seek,$len,$ann);
    my($epoch_date,$when,%sofar,$x);

    if ($self->is_deleted_fid($fid)) { return () }

    my @assignments = ();
    my $rdbH = $self->db_handle;

    $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len  FROM annotation_seeks WHERE (fid = '$fid')");

    if ($relational_db_response && (@$relational_db_response > 0))
    {
	foreach $entry (@$relational_db_response)
	{
  	    ($fid,$when,$fileno,$seek,$len) = @$entry;
	    if ($len < 4)
	    {
		print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n";
		next;
	    }
	    $ann = $self->read_annotation($fileno,$seek,$len);

	    if (my ($peg, $when, $who, $what, $func) =
		$ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s)
	    {
		push(@assignments, [$peg, $when, $who, $what, $func]);
	    }
	}
    }
    return @assignments;
}

=pod

=head1 annotations_made

usage: @annotations = $fig->annotations_made($genomes, $who, $date)

Return the list of annotations on the genomes in @$genomes  made by $who
after $date.

Each returned annotation is of the form [$fid,$timestamp,$user,$annotation].

=cut

sub annotations_made {
    my($self,$genomes,$who,$date) = @_;
    my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann);
    my($epoch_date,$when,@annotations);

    if (! defined($genomes)) { $genomes = [$self->genomes] }

    my %genomes = map { $_ => 1 } @$genomes;
    if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/)
    {
	my($mm,$dd,$yyyy) = ($1,$2,$3);
	$epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0);
    }
    elsif ($date =~ /^\d+$/)
    {
	$epoch_date = $date;
    }
    else
    {
	$epoch_date = 0;
    }
    $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0;
    @annotations = ();
    my $rdbH = $self->db_handle;
    if ($who eq "master")
    {
	$relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len  FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))");
    }
    else
    {
	$relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len  FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))");
    }

    if ($relational_db_response && (@$relational_db_response > 0))
    {
	foreach $entry (@$relational_db_response)
	{
  	    ($fid,$when,$fileno,$seek,$len) = @$entry;
	    if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1} && (! $self->is_deleted_fid($fid)))
	    {
		$ann = $self->read_annotation($fileno,$seek,$len);

		if ($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\n(.*\S)/s)
		{
		    push(@annotations,[$1,$2,$3,$4]);
		}
	    }
	}
    }
    return @annotations;
}

################################### ATTRIBUTES

=head1 Attributes

The attributes methods have now been rewritten for handling all kinds of attributes. The tag/value pairs can be associated with a feature like a peg, rna, or prophage, or a genome.

There are several base attributes:
 get_attributes
 add_attribute
 delete_attribute
 change_attribute

There are also methods for more complex things:
 get_tags
 get_values
 guess_value_format

By default all tags are treated as uppercase, and all tags have leading and trailing white space removed.

=head1 get_attributes

With one argument (the id):

Get an array of attributes for a feature. Each attribute is a tag, a value, and (optionally) a URL This method will return an array where each element is a tag, a val, and it's url.

 eg. my @attributes=$fig->get_attributes($peg);

Note that attributes are not just for pegs, hence you can get attributes for genomes:
 
 my @attributes=$fig->get_attributes($genome)


With two arguments (the id and the tag):

Will return the value and url for just that attribute

e.g. my ($value, $url)=$fig->get_attributes($genome, 'motility');


=cut

sub feature_attributes {
    my $self=shift;
    # RAE. Since I was in a changing mood, I renamed this from feature_attributes to get_attributes (we have genomes, now too).
    # however I left this in here so as not to break things.
    return $self->get_attributes(@_);
}

sub get_attributes {
    # Esoteric question: get_attribute or get_attributes. Everything below (add/change/delete) is singular, but this does make
    # more sense in the plural. Oh well, I could do both.

    my($self,$fid, $tag) = @_;
    my($rdbH,$relational_db_response);

    $tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag);

    $rdbH = $self->db_handle;

    if ($tag && ($relational_db_response = $rdbH->SQL("SELECT val,url FROM attribute WHERE ( fid = \'$fid\' and tag = \'$tag\' )")) &&
	    (@$relational_db_response > 0))
    {
        my @arr=@$relational_db_response;
	my $lastarray=$arr[$#arr]; # if we have several values for this tag, we just want the last one!
        return @$lastarray;
    }
    elsif (!$tag && ($relational_db_response = $rdbH->SQL("SELECT tag,val,url FROM attribute WHERE ( fid = \'$fid\' )")) &&
            (@$relational_db_response > 0))
    {
        return @$relational_db_response;
    }
    else 
    {
        return undef;
    }
}


# RAE I moved this here as it made no sense where it was. Keeping all the stuff together. 
# but then I went and renamed it :)

sub replace_peg_key_value {
    my $self=shift;
    # RAE I deprecated this and replaced it with change_attributes. This should have no effect on existing code,
    # but the name is more consistent with the other routines that I am adding (add, change, delete)
    # we should have some verbose switch like $self->{'verbose'} to allow warning of things like this that users shouldn't see

    #if ($self->{'verbose'}) {print STDERR "replace_peg_key_value has been deprecated and changed to change_attributes\n"}
    return $self->change_attribute(@_);
}


# RAE Style: There are two (or more?) ways that I could have coded this. I though about making another subroutine with the SQL and file calls,
# and could do so easily, but I opted for the copy paste method which makes the code longer but probably (?) clearer.

=head1 add_attribute

Add a new tag/value pair to something. Something can be a genome id, a peg, an rna, prophage, whatever.

Arguments:
 	feature id, this can be a peg, genome, etc,
	tag name to replace
	value to replace it with
	optional URL to add
	optional file to store the attributes in.

A note on files, at the moment Attributes is a directory, and we are using load attributes to load everything. I think we should switch to this method which is a lot cleaner, so I added the option of defining the file here. The default is to use assigned_attributes and load_attributes loads this file last so anything in that file will replace anthing in the other files.

=cut

sub add_attribute {
    my($self,$peg,$k,$v, $url, $file) = @_;
    return unless ($peg && $k); # we must have at least a peg and a tag to add (though other things can be undef)
    $k =~ s/^\s+//; $k =~ s/\s+$//; $k=uc($k);
    unless ($file) {$file="assigned_attributes"}
    my $rdbH = $self->db_handle;
    $rdbH->SQL("INSERT INTO attribute ( fid,tag,val,url ) VALUES ( '$peg','$k','$v','$url')");
    my $location=$self->attribute_location($peg);
    &verify_dir("$location");
    if (open(TMPATTR,">>$location/$file"))
    {
	print TMPATTR "$peg\t$k\t$v\t$url\n";
	close(TMPATTR);
    }
}

=head1 delete_attribute
 
Remove a tag from a feature. 

 Arguments:
 	feature id, this can be a peg, genome, etc,
	tag name to delete
=cut

sub delete_attribute {
    my($self,$peg,$k) = @_;
    return $self->change_attribute($peg, $k, undef, undef, undef);
}

=head1 change_attribute

 Change the value of a tag/value pair (and optionally its url).
 
 Arguments: 
 	feature id, this can be a peg, genome, etc,
	tag name whose value to replace
	value to replace it with
	optional URL to add
	optional file to store the changes in. 

See the note in add_attributes about files. Almost always you should not include this so that the default (assigned_attributes) is used as it is loaded last. However, this allows you to change the file if you wish.

Returns 0 on error and 1 on success.

=cut

sub change_attribute {
    my($self,$peg,$k,$v, $url, $file) = @_;
    unless ($file) {$file="assigned_attributes"}
    return (0) unless ($peg && $k); # we must have at least a peg and a key.
    $k =~ s/^\s+//; $k =~ s/\s+$//; $k=uc($k);
    my $rdbH = $self->db_handle;
    $rdbH->SQL("DELETE FROM attribute WHERE fid = \'$peg\' and tag = \'$k\'"); # note that peg can have more than one value.
    if (defined $v) {
     # now we can use this to delete things too
     $rdbH->SQL("INSERT INTO attribute ( fid,tag,val,url ) VALUES ( '$peg','$k','$v', '$url')"); 
    }
    my $location=$self->attribute_location($peg);
    &verify_dir("$location");
    if (open(TMPATTR,">>$location/$file"))
    {
	print TMPATTR "$peg\t$k\t$v\t$url\n";
	close(TMPATTR);
    }
    return 1;
}


=head1 get_tags
 
Get all the tags that we know about. 

Without any arguments:

Returns a reference to a hash, where the key is the type of feature (peg, genome, rna, prophage, etc), and the value is a reference to a hash where the key is the tag name and the value is the number of occurences

e.g. print "There are  " , {$fig->get_tags}->{'peg'}->{'PIRSF'}, " PIRSF tags in the database\n";

With an argument (that should be a recognized type like peg, rna, genome, etc):

Returns a reference to a hash where the key is the tag name and the value is the count. This should use less memory than above.
The argument should be (currently) peg, rna, pp, genome, or any other recognized feature type (generally defined as the .peg. in the fid). The default is to return all tags, and this can also be specified with all

=cut

sub get_tags {
 my($self, $want)=@_;
 unless ($want) {$want = "all"}
 my $rdbH = $self->db_handle;
 my $relational_db_response=$rdbH->SQL("SELECT fid,tag from attribute");
 my $tags;
 foreach my $res (@$relational_db_response) {
  my ($fid, $tag)=@$res;
  $tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag);
  my $type=$self->ftype($fid);
  if ($type && ($want eq $type || $want eq "all")) {
   $tags->{$type}->{$tag}++;
  } elsif (($fid =~ /^\d+\.\d+$/) && (lc($want) eq "genome" || $want eq "all")) {
   $tags->{'genome'}->{$tag}++;
  }
 }
 if ($want eq "all") {return $tags} else {return $tags->{$want}}
}

=head1 get_values
 
Get all the values that we know about

Without any arguments:

Returns a reference to a hash, where the key is the type of feature (peg, genome, rna, prophage, etc), and the value is a reference to a hash where the key is the value and the value is the number of occurences

e.g. print "There are  " , {$fig->get_values}->{'peg'}->{'100'}, " tags with the value 100 in  the database\n";

With a single argument:

The argument is assumed to be the type (rna, peg, genome, etc).

With two arguments:

The first argument is the type (rna, peg, genome, etc), and the second argument is the tag.

In each case it will return a reference to a hash.

E.g. 

	$fig->get_values(); # will get all values

	$fig->get_values('peg'); # will get all values for pegs

	$fig->get_values('peg', 'pirsf'); # will get all values for pegs with attribute pirsf

	$fig->get_values(undef, 'pirsf'); # will get all values for anything with that attribute

=cut

sub get_values {
 my ($self, $want, $tag)=@_;
 unless ($want) {$want="all"}
 my $rdbH = $self->db_handle;
 $tag =~ s/^\s+//; $tag =~ s/\s+$//; $tag=uc($tag);
 
 my $sql="SELECT fid,val from attribute";
 if ($tag) {$sql .= " where tag = \'$tag\'"}
  
 my $relational_db_response=$rdbH->SQL($sql);
 
 my $tags;
 foreach my $res (@$relational_db_response) {
  my ($fid, $val)=@$res;
  my $type=$self->ftype($fid);
  if ($type && ($want eq $type || $want eq "all")) {
   $tags->{$type}->{$val}++;
  } elsif (($fid =~ /^\d+\.\d+$/) && (lc($want) eq "genome" || $want eq "all")) {
   $tags->{'genome'}->{$val}++;
  }
 }
 if ($want eq "all") {return $tags} else {return $tags->{$want}}
}
 
 
=head1 guess_value_format

There are occassions where I want to know what a value is for a tag. I have three scenarios right now:

 1. strings
 2. numbers
 3. percentiles ( a type of number, I know)


In these cases, I may want to know something about them and do something interesting with them. This will try and guess what the values are for a given tag so that you can try and limit what people add. At the moment this is pure guess work, although I suppose we could put some restrictions on t/v pairs I don't feel like.

This method will return a reference to an array. If the element is a string there will only be one element in that array, the word "string". If the value is a number, there will be three elements, the word "float" in position 0, and then the minimum and maximum values. You can figure out if it is a percent :-)

=cut

sub guess_value_format {
 my ($self, $tag)=@_;
 return unless ($tag);

 # I am using self->{'value_format'} to save the format so if this is called multiple times it is not recalculated each time
 return $self->{'value_format'}->{$tag} if (defined $self->{'value_format'}->{$tag});
 
 my $hash = $self->get_values(undef, $tag);
 return if (!$hash || !scalar keys %$hash); # don't carry on if there is nothing to look at
 
 my ($min, $max)=(100000000, 0);
 foreach my $type (keys %$hash) {
  foreach my $val (keys %{$hash->{$type}}) {
    print STDERR "GOT |$val| for $tag\n";
  
    # this code is taken from the perl cookbook pg 44
    # it should detect for all nummbers
    if ($val !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/) {
        undef $min;
        undef $max;
        last;
    }
    else {
      if ($_ > $max) {$max=$_}
      if ($_ < $min) {$min=$_}
    }
   }
 }
 # if $min and $max are defined then the value is a number
 # if not, then it is a string;
 
 if (defined $min && defined $max) {$self->{'value_format'}->{$tag} = ["float", $min, $max]} else {$self->{'value_format'}->{$tag}=["string"]}
 return $self->{'value_format'}->{$tag};
}

=head1 attribute_location
 
This is just an internal method to find the appropriate location of the attributes file depending on whether it is a peg, an rna, or a genome or whatever.

=cut

sub attribute_location {
    my ($self, $peg)=@_;
    return unless ($peg);
    my $type=$self->ftype($peg); # we need to know whether it is a peg, prophage, rna, etc
    
    my $location;
    if ($type) 
    {
     my $genome = &genome_of($peg);
     $location="$FIG_Config::organisms/$genome/Features/$type/Attributes/";
    }
    elsif ($peg =~ /^\d+\.\d+$/ && (-e "$FIG_Config::organisms/$peg")) 
    {
     # $peg is a genome number and we know about it
     $location="$FIG_Config::organisms/$peg/Attributes/";
    }
    else {
     die "Can't figure out what $peg is. It is neither a known feature or a genome id";
    }
    
    return $location;
}




################################# Indexing Features and Functional Roles  ####################################

=pod

=head1 search_index

usage: ($pegs,$roles) = $fig->search_pattern($pattern)

All pegs that "match" $pattern are put into a list, and $pegs will be a
pointer to that list.

All roles that "match" $pattern are put into a list, and $roles will be a
pointer to that list.

The notion of "match $pattern" is intentionally left undefined.  For now, you
will probably get only entries in which each word id $pattern occurs exactly,
but that is not a long term commitment.

=cut

sub search_index {
    my($self,$pattern) = @_;
    my($patternQ,@raw,@pegs,@roles);

    &clean_tmp;
    $patternQ = $pattern;
    $patternQ =~ s/\s+/;/g;
    $patternQ =~ s/\./\\./g;

#   print STDERR "pattern=$pattern patternQ=$patternQ\n";
    @raw = `$FIG_Config::ext_bin/glimpse -y -H $FIG_Config::data/Indexes -i -w \'$patternQ\'`;
    @pegs  = grep { ! $self->is_deleted_fid($_->[0]) }
	     sort { &FIG::by_fig_id($a->[0],$b->[0]) }
             map { $_ =~ s/^\S+:\s+//; [split(/\t/,$_)] }
             grep { $_ =~ /^\S+peg.index/ } @raw;
    my %roles = map { $_ =~ s/^\S+:\s+//; $_ => 1} grep { $_ =~ /^\S+role.index/ } @raw;
    @roles = sort keys(%roles);

    return ([@pegs],[@roles]);
}

################################# Loading Databases  ####################################


#=pod
#
#=head1 load_all
#
#usage: load_all
#
#This function is supposed to reload all entries into the database and do
#whatever is required to properly support indexing of pegs and roles.
#
#=cut

sub load_all {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);

    print STDERR "\nLoading SEED data\n\n";

    my @packages = qw(load_peg_mapping
		      index_contigs
		      compute_genome_counts
		      load_features
		      index_sims
		      index_translations
		      add_assertions_of_function
		      load_protein_families
		      load_external_orgs
		      load_chromosomal_clusters
		      load_pch_pins
		      index_neighborhoods
		      index_annotations
		      load_ec_names
		      init_maps
		      load_kegg
		      load_distances
		      make_indexes
		      format_peg_dbs
		      load_links
		      index_subsystems
		      load_attributes
		      load_bbhs
		      load_literature
		   );

    my $pn = @packages;
    for my $i (0..@packages - 1)
    {
	my $i1 = $i + 1;
	my $pkg = $packages[$i];

	my $date = `date`;
	chomp $date;
	print "$date:  Running $pkg ($i1 of $pn)\n";

	&run($pkg);
    }
    print "\n\nLoad complete.\n\n";
}

################################# Automated Assignments  ####################################

=pod

=head1 auto_assign

usage: $assignment = &FIG::auto_assign($peg,$seq)

This returns an automated assignment for $peg.  $seq is optional; if it is not
present, then it is assumed that similarities already exist for $peg.  $assignment is set
to either

    Function
or
    Function\tW

if it is felt that the assertion is pretty weak.

=cut

sub auto_assign {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($peg,$seq) = @_;

    my $cmd = $seq ? "echo \"$peg\t$seq\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls" : "echo \"$peg\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls";
#   print STDERR $cmd;
    my(@tmp) = `$cmd`;
    if ((@tmp == 1) && ($tmp[0] =~ /^\S+\t(\S.*\S)/))
    {
	return $1;
    }
    else
    {
	return "hypothetical protein";
    }
}

################################# Protein Families ####################################

=pod

=head1 all_protein_families

usage: @all = $fig->all_protein_families

Returns a list of the ids of all of the protein families currently defined.

=cut

sub all_protein_families {
    my($self) = @_;

    return $self->all_sets("protein_families","family");
}

=pod

=head1 ids_in_family

usage: @pegs = $fig->ids_in_family($family)

Returns a list of the pegs in $family.

=cut

sub ids_in_family {
    my($self,$family) = @_;

    return $self->ids_in_set($family,"protein_families","family");
}

=pod

=head1 family_function

usage: $func = $fig->family_function($family)

Returns the putative function of all of the pegs in $family.  Remember, we
are defining "protein family" as a set of homologous proteins that have the
same function.

=cut

sub family_function {
    my($self,$family) = @_;
    my($relational_db_response);
    my $rdbH = $self->db_handle;

    defined($family) || confess "family is missing";
    if (($relational_db_response = $rdbH->SQL("SELECT function FROM family_function WHERE ( family = $family)")) &&
	(@$relational_db_response >= 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 sz_family

usage: $n = $fig->sz_family($family)

Returns the number of pegs in $family.

=cut

sub sz_family {
    my($self,$family) = @_;

    return $self->sz_set($family,"protein_families","family");
}

=pod

=head1 in_family

usage: @pegs = $fig->in_family($family)

Returns the pegs in $family.

=cut

sub in_family {
    my($self,$id) = @_;

    my @in = $self->in_sets($id,"protein_families","family");
    return (@in > 0) ? $in[0] : "";
}

################################# Abstract Set Routines  ####################################

sub all_sets {
    my($self,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT $set_name FROM $relation")) &&
	(@$relational_db_response >= 1))
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

sub next_set {
    my($self,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT MAX($set_name) FROM $relation")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0] + 1;
    }
}

sub ids_in_set {
    my($self,$which,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;
    if (defined($which) && ($which =~ /^\d+$/))
    {
	if (($relational_db_response = $rdbH->SQL("SELECT id FROM $relation WHERE ( $set_name = $which)")) &&
	    (@$relational_db_response >= 1))
	{
	    return grep { ! $self->is_deleted_fid($_) }
                   sort { by_fig_id($a,$b) }
                   map { $_->[0] } @$relational_db_response;
	}
    }
    return ();
}

sub in_sets {
    my($self,$id,$relation,$set_name) = @_;
    my($relational_db_response);

    if ($self->is_deleted_fid($id)) { return () }

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT $set_name FROM $relation WHERE ( id = \'$id\' )")) &&
	(@$relational_db_response >= 1))
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

sub sz_set {
    my($self,$which,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT COUNT(*) FROM $relation WHERE ( $set_name = $which)")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return 0;
}

sub delete_set {
    my($self,$set,$relation,$set_name) = @_;

#   print STDERR "deleting set $set\n";
    my $rdbH = $self->db_handle;

    return $rdbH->SQL("DELETE FROM $relation WHERE ( $set_name = $set )");
}

sub insert_set {
    my($self,$set,$ids,$relation,$set_name) = @_;
    my($id);

#   print STDERR "inserting set $set containing ",join(",",@$ids),"\n";
    my $rdbH = $self->db_handle;

    my @ids = grep { length($_) < 255 } @$ids;
    if (@ids < 2) { return 0 }

    my $rc = 1;
    foreach $id (@ids)
    {
	next if ($self->is_deleted_fid($id));
	if (! $rdbH->SQL("INSERT INTO $relation ( $set_name,id ) VALUES ( $set,\'$id\' )"))
	{
	    $rc = 0;
	}
    }
#   print STDERR "    rc=$rc\n";
    return $rc;
}

sub in_set_with {
    my($self,$peg,$relation,$set_name) = @_;
    my($set,$id,%in);

    foreach $set ($self->in_sets($peg,$relation,$set_name))
    {
	foreach $id ($self->ids_in_set($set,$relation,$set_name))
	{
	    $in{$id} = 1;
	}
    }
    return sort { &by_fig_id($a,$b) } keys(%in);
}


sub export_set {
    my($self,$relation,$set_name,$file) = @_;
    my($pair);

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT $set_name, id FROM $relation");

    open(TMPSET,">$file")
	|| die "could not open $file";
    flock(TMPSET,LOCK_EX) || confess "cannot lock $file";
    seek(TMPSET,0,2)      || confess "failed to seek to the end of the file";

    foreach $pair (sort { ($a->[0] <=> $b->[0]) or &by_fig_id($a->[1],$b->[1]) } @$relational_db_response)
    {
	if (! $self->is_deleted_fid($pair->[1]))
	{
	    print TMPSET join("\t",@$pair),"\n";
	}
    }
    close(TMPSET);
    return 1;
}

################################# KEGG Stuff  ####################################


=pod

=head1 all_compounds

usage: @compounds = $fig->all_compounds

Returns a list containing all of the KEGG compounds.

=cut

sub all_compounds {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM comp_name");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 names_of_compound

usage: @names = $fig->names_of_compound

Returns a list containing all of the names assigned to the KEGG compounds.  The list
will be ordered as given by KEGG.

=cut

sub names_of_compound {
    my($self,$cid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT pos,name FROM comp_name where cid = \'$cid\'");
    if (@$relational_db_response > 0)
    {
	return map { $_->[1] } sort { $a->[0] <=> $b->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 comp2react


usage: @rids = $fig->comp2react($cid)

Returns a list containing all of the reaction IDs for reactions that take $cid
as either a substrate or a product.

=cut

sub comp2react {
    my($self,$cid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound where cid = \'$cid\'");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 cas

usage: $cas = $fig->cas($cid)

Returns the CAS ID for the compound, if known.

=cut

sub cas {
    my($self,$cid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT cas FROM comp_cas where cid = \'$cid\'");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 cas_to_cid

usage: $cid = $fig->cas_to_cid($cas)

Returns the compound id (cid), given the CAS ID.

=cut

sub cas_to_cid {
    my($self,$cas) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT cid FROM comp_cas where cas = \'$cas\'");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 all_reactions

usage: @rids = $fig->all_reactions

Returns a list containing all of the KEGG reaction IDs.

=cut

sub all_reactions {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT DISTINCT rid FROM reaction_to_compound");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 reversible

usage: $rev = $fig->reversible($rid)

Returns true iff the reactions had a "main direction" designated as "<=>";

=cut

sub reversible {
    my($self,$rid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT reversible FROM reversible where rid = \'$rid\'");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return 1;
}

=pod

=head1 reaction2comp

usage: @tuples = $fig->reaction2comp($rid,$which)

Returns the "substrates" iff $which == 0.  In any event (i.e., whether you ask for substrates
or products), you get back a list of 3-tuples.  Each 3-tuple will contain

    [$cid,$stoich,$main]

Stoichiometry is normally numeric, but can be things like "n" or "(n+1)".
$main is 1 iff the compound is considered "main" or "connectable".

=cut

sub reaction2comp {
    my($self,$rid,$which) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT cid,stoich,main FROM reaction_to_compound where rid = \'$rid\' and setn = \'$which\'");
    if (@$relational_db_response > 0)
    {
	return sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/\s+//g; $_ } @$relational_db_response;
    }
    return ();
}

=pod

=head1 catalyzed_by

usage: @ecs = $fig->catalyzed_by($rid)

Returns the ECs that are reputed to catalyze the reaction.  Note that we are currently
just returning the ECs that KEGG gives.  We need to handle the incompletely specified forms
(e.g., 1.1.1.-), but we do not do it yet.

=cut

sub catalyzed_by {
    my($self,$rid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT role FROM reaction_to_enzyme where rid = \'$rid\'");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 catalyzes

usage: @ecs = $fig->catalyzes($role)

Returns the rids of the reactions catalyzed by the "role" (normally an EC).

=cut

sub catalyzes {
    my($self,$role) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_enzyme where role = \'$role\'");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}


=pod

=head1 displayable_reaction

usage: $display_format = $fig->displayable_reaction($rid)

Returns a string giving the displayable version of a reaction.

=cut

sub displayable_reaction {
    my($self,$rid) = @_;

    my @tmp = `grep $rid $FIG_Config::data/KEGG/reaction_name.lst`;
    if (@tmp > 0)
    {
	chomp $tmp[0];
	return $tmp[0];
    }
    return $rid;
}

=pod

=head1 all_maps

usage: @maps = $fig->all_maps

Returns a list containing all of the KEGG maps that the system knows about (the
maps need to be periodically updated).

=cut

sub all_maps {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT DISTINCT map FROM ec_map ");
    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 ec_to_maps

usage: @maps = $fig->ec_to_maps($ec)

Returns the set of maps that contain $ec as a functional role.  $ec is usually an EC number,
but in the more general case, it can be a functional role.

=cut

sub ec_to_maps {
    my($self,$ec) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT map FROM ec_map WHERE ( ec = \'$ec\' )");
    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

sub role_to_maps
{
    my($self, $role) = @_;

    return $self->ec_to_maps($role);
}

=pod

=head1 map_to_ecs

usage: @ecs = $fig->map_to_ecs($map)

Returns the set of functional roles (usually ECs)  that are contained in the functionality
depicted by $map.

=cut

sub map_to_ecs {
    my($self,$map) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT ec FROM ec_map WHERE ( map = \'$map\' )");
    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 map_name

usage: $name = $fig->map_name($map)

Returns the descriptive name covering the functionality depicted by $map.

=cut

sub map_name {
    my($self,$map) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT mapname FROM map_name WHERE ( map = \'$map\' )");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

################################# Functional Roles  ####################################

=pod

=head1 neighborhood_of_role

usage: @roles = $fig->neighborhood_of_role($role)

Returns a list of functional roles that we consider to be "the neighborhood" of $role.

=cut

sub neighborhood_of_role {
    my($self,$role) = @_;
    my($readC);

    my $file = "$FIG_Config::global/role.neighborhoods";
    my $rdbH = $self->db_handle;
    my $roleQ = quotemeta $role;
    my $relational_db_response = $rdbH->SQL("SELECT seek, len FROM neigh_seeks WHERE role = \'$roleQ\' ");
    if (@$relational_db_response == 1)
    {
	my($seek,$ln) = @{$relational_db_response->[0]};
	my $fh   = $self->openF($file);
	seek($fh,$seek,0);
	my $readN = read($fh,$readC,$ln-1);
	($readN == ($ln-1))
	    || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC";
	return grep { $_ && ($_ !~ /^\/\//) } split(/\n/,$readC);
    }
    return ();
}

=pod

=head1 roles_of_function

usage: @roles = $fig->roles_of_function($func)

Returns a list of the functional roles implemented by $func.

=cut

sub roles_of_function {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $func = (@_ == 1) ? $_[0] : $_[1];

    $func =~ s/\!.*$//;
    return (split(/\s*;\s+|\s+[\@\/]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g));
}

=pod

=head1 seqs_with_role

usage: @pegs = $fig->seqs_with_role($role,$who)

Returns a list of the pegs that implement $role.  If $who is not given, it
defaults to "master".  The system returns all pegs with an assignment made by
either "master" or $who (if it is different than the master) that implement $role.
Note that this includes pegs for which the "master" annotation disagrees with that
of $who, the master's implements $role, and $who's does not.

=cut

sub seqs_with_role {
    my($self,$role,$who,$genome) = @_;
    my($relational_db_response,$query);

    my $roleQ = quotemeta $role;

    $who = $who ? $who : "master";
    my $rdbH = $self->db_handle;

    my $who_cond;
    if ($who eq "master")
    {
	$who_cond = "( made_by = \'master\' OR made_by = \'unknown\' )";
    }
    else
    {
	$who_cond = "( made_by = \'master\' OR made_by = \'$who\' OR made_by = \'unknown\')";
    }

    if (! $genome)
    {
	$query = "SELECT distinct prot FROM roles  WHERE (( role = \'$roleQ\' ) AND $who_cond )";
    }
    else
    {
	$query = "SELECT distinct prot FROM roles  WHERE (( role = \'$roleQ\' ) AND $who_cond AND (org = \'$genome\'))";
    }
    return (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) ?
	grep { ! $self->is_deleted_fid($_) } map { $_->[0] } @$relational_db_response : ();
}

=pod

=head1 seqs_with_roles_in_genomes

usage: $result = $fig->seqs_with_roles_in_genomes($genomes,$roles,$made_by)

This routine takes a pointer to a list of genomes ($genomes) and a pointer to a list of
roles ($roles) and looks up all of the sequences that connect to those roles according
to either the master assignments or those made by $made_by.  Again, you will get assignments
for which the "master" assignment connects, but the $made_by does not.

A hash is returned.  The keys to the hash are genome IDs for which at least one sequence
was found.  $result->{$genome} will itself be a hash, assuming that at least one sequence
was found for $genome.  $result->{$genome}->{$role} will be set to a pointer to a list of
2-tuples.  Each 2-tuple will contain [$peg,$function], where $function is the one for
$made_by (which may not be the one that connected).

=cut

sub seqs_with_roles_in_genomes {
    my($self,$genomes,$roles,$made_by) = @_;
    my($genome,$role,$roleQ,$role_cond,$made_by_cond,$query,$relational_db_response,$peg,$genome_cond,$hit);
    my $rdbH = $self->db_handle;
    my $result = {}; # foreach $genome ($self->genomes) { $result->{$genome} = {} }
    if (! $made_by) { $made_by = 'master' }
    if ((@$genomes > 0) && (@$roles > 0))
    {
	$genome_cond = "(" . join(" OR ",map { "( org = \'$_\' )" } @$genomes) . ")";
	$role_cond   = "(" . join(" OR ",map { $roleQ = quotemeta $_; "( role = \'$roleQ\' )" } @$roles) . ")";
	$made_by_cond = ($made_by eq 'master') ? "(made_by = 'master')" : "(made_by = 'master' OR made_by = '$made_by')";
	$query = "SELECT distinct prot, role FROM roles  WHERE ( $made_by_cond AND $genome_cond AND $role_cond )";
	if (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1))
	{
	    foreach $hit (@$relational_db_response)
	    {
		($peg,$role) = @$hit;
		if (! $self->is_deleted_fid($peg))
		{
		    $genome = $self->genome_of($peg);
		    push(@{ $result->{$genome}->{$role} },[$peg,scalar $self->function_of($peg,$made_by)]);
		}
	    }
	}
    }
    return $result;
}

=pod

=head1 largest_clusters

usage: @clusters = $fig->largest_clusters($roles,$user)

This routine can be used to find the largest clusters containing some of the
designated set of roles.  A list of clusters is returned.  Each cluster is a pointer to
a list of pegs.

=cut

sub largest_clusters {
    my($self,$roles,$user,$sort_by_unique_functions) = @_;
    my($genome,$x,$role,$y,$peg,$loc,$contig,$beg,$end,%pegs,@pegs,$i,$j);

    my $ss = $self->seqs_with_roles_in_genomes([$self->genomes],$roles,$user);
    my @clusters = ();

    foreach $genome (keys(%$ss))
    {
	my %pegs;
	$x = $ss->{$genome};
	foreach $role (keys(%$x))
	{
	    $y = $x->{$role};
	    foreach $peg (map { $_->[0] } @$y)
	    {
		if ($loc = $self->feature_location($peg))
		{
		    ($contig,$beg,$end) = &FIG::boundaries_of($loc);
		    $pegs{$peg} = [$peg,$contig,int(($beg + $end) / 2)];
		}
	    }
	}

	@pegs = sort { ($pegs{$a}->[1] cmp $pegs{$b}->[1]) or ($pegs{$a}->[2] <=> $pegs{$b}->[2]) } keys(%pegs);
	$i = 0;
	while ($i < $#pegs)
	{
	    for ($j=$i+1; ($j < @pegs) && &close_enough_locs($pegs{$pegs[$j-1]},$pegs{$pegs[$j]}); $j++) {}
	    if ($j > ($i+1))
	    {
		push(@clusters,[@pegs[$i..$j-1]]);
	    }
	    $i = $j;
	}
    }
    if ($sort_by_unique_functions)
    {
	@clusters = sort { $self->unique_functions($b,$user) <=> $self->unique_functions($a,$user) } @clusters;
    }
    else
    {
	@clusters = sort { @$b <=> @$a } @clusters;
    }
    return @clusters;
}

sub unique_functions {
    my($self,$pegs,$user) = @_;
    my($peg,$func,%seen);

    foreach $peg (@$pegs)
    {
	if ($func = $self->function_of($peg,$user))
	{
	    $seen{$func} = 1;
	}
    }
    return scalar keys(%seen);
}

sub close_enough_locs {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($x,$y) = @_;

    return (($x->[1] eq $y->[1]) && (abs($x->[2] - $y->[2]) < 5000));
}

sub candidates_for_role {
    my($self,$role,$genome,$cutoff,$user) = @_;
    my($peg);

    $user = $user ? $user : "master";

    my @cand = map { $_->[0] }
               sort { $a->[1] <=> $b->[1] }
               map { $peg = $_; [$peg,$self->crude_estimate_of_distance($genome,&FIG::genome_of($peg))] }
               $self->seqs_with_role($role,$user);

    return $self->candidates_for_role_from_known($genome,$cutoff,\@cand);
}

sub candidates_for_role_from_known {
    my($self,$genome,$cutoff,$known) = @_;
    my($peg);

    my @cand = @$known;
    my $hits = {};
    my $seen = {};
    my $how_many = (@cand > 10) ? 9 : $#cand;
    &try_to_locate($self,$genome,$hits,[@cand[0..$how_many]],$seen,$cutoff);
    if (keys(%$hits) == 0)
    {
	splice(@cand,0,$how_many+1);
	&try_to_locate($self,$genome,$hits,\@cand,$seen,$cutoff);
    }
    return sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits);
}

sub try_to_locate {
    my($self,$genome,$hits,$to_try,$seen,$cutoff) = @_;
    my($prot,$id2,$psc,$id2a,$x,$sim);
    if (! $cutoff) { $cutoff = 1.0e-5 }

    foreach $prot (@$to_try)
    {
	if (! $seen->{$prot})
	{
	    if (($prot =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome))
	    {
		$hits->{$prot} = [0,$prot];
	    }
	    else
	    {
		foreach $sim ($self->sims($prot,1000,$cutoff,"fig"))
		{
		    $id2 = $sim->id2;
		    $psc = $sim->psc;
		    if (($id2 =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome))
		    {
			$x = $hits->{$id2};
			if ((! $x) || ($x->[0] > $psc))
			{
			    $hits->{$id2} = [$psc,$prot];
			}
		    }
		    elsif (&neg_log($psc) > (2 * &neg_log($cutoff)))
		    {
			$seen->{$id2} = 1;
		    }
		}
	    }
	}
    }
}

sub neg_log {
    my($x) = @_;

    if ($x == 0)
    {
	return 200;
    }
    else
    {
	return -log($x) / log(10);
    }
}



=pod

=head1 best_bbh_candidates

usage: @candidates = $fig->best_bbh_candidates($genome,$cutoff,$requested,$known)

This routine returns a list of up to $requested candidates from $genome.  A candidate is a BBH
against one of the PEGs in genomes from the list given by@$known.
Each entry in the list is a 3-tuple:

    [CandidatePEG,KnownBBH,Pscore]

=cut

sub best_bbh_candidates {
    my($self,$genome,$cutoff,$requested,$known,$frac_match) = @_;
    my($i,$j,$k,$sim,@sims,$peg,$id2,$genome2,$sim_back);
    my($bbh,%seen,%computed_sims,$genome1);

    $frac_match = defined($frac_match) ? $frac_match : 0.7;
    my @got = ();
    my @cand = $self->candidates_for_role_from_known($genome,$cutoff,$known);
    if (@cand > 0)
    {
	my %genomes = map { $genome1 = &FIG::genome_of($_); $genome1 => 1 } @$known;
	my %pegs    = map { $_ => 1 } @$known;
	for ($i=0; (@got < $requested) && ($i < @cand); $i++)
	{
	    $peg = $cand[$i];
	    undef %seen;
	    @sims = grep { $genomes{&FIG::genome_of($_->id2)} } $self->sims($peg,1000,$cutoff,"fig");
	    $bbh = 0;
	    for ($j=0; (! $bbh) && ($j < @sims); $j++)
	    {
		$sim = $sims[$j];
		$id2 = $sim->id2;
		$genome2 = &FIG::genome_of($id2);
		if (! $seen{$genome2})
		{
		    if ($pegs{$id2})
		    {
			if (! defined($sim_back = $computed_sims{$id2}))
			{
			    my @sims_back = $self->sims($id2,1000,$cutoff,"fig");
			    for ($k=0; ($k < @sims_back) && (&FIG::genome_of($sims_back[$k]->id2) ne $genome); $k++) {}
			    if ($k < @sims_back)
			    {
				$sim_back = $computed_sims{$id2} = $sims_back[$k];
			    }
			    else
			    {
				$sim_back = $computed_sims{$id2} = 0;
			    }
			}
			if ($sim_back)
			{
			    if ($self->ok_match($sim_back,$frac_match))
			    {
				$bbh = [$id2,$sim_back->psc];
			    }
			}
		    }
		    $seen{$genome2} = 1;
		}
	    }

	    if ($bbh)
	    {
		push(@got,[$peg,@$bbh]);
	    }
	}
    }
    return @got;
}


sub ok_match {
    my($self,$sim,$frac_match) = @_;

    my $ln1 = $sim->ln1;
    my $ln2 = $sim->ln2;
    my $b1  = $sim->b1;
    my $e1  = $sim->e1;
    my $b2  = $sim->b2;
    my $e2  = $sim->e2;

    return (((($e1 - $b1) / $ln1) >= $frac_match) &&
	    ((($e2 - $b2) / $ln2) >= $frac_match))
}

sub external_calls {
    my($self,$pegs) = @_;
    my($peg,$func);

    open(TMP,">/tmp/pegs.$$") || die "could not open /tmp/pegs.$$";
    foreach $peg (@$pegs)
    {
	print TMP "$peg\n";
    }
    close(TMP);
    open(TMP,">/tmp/parms.$$") || die "could not open /tmp/parms.$$";
    print TMP "no_fig\t1\n";
    close(TMP);

    my %call = map { chop; ($peg,$func) = split(/\t/,$_) }
                `$FIG_Config::bin/auto_assign /tmp/parms.$$ < /tmp/pegs.$$ 2> /dev/null | $FIG_Config::bin/make_calls`;
    unlink("/tmp/pegs.$$","/tmp/parms.$$");
    return map { $call{$_} ? [$_,$call{$_}] : [$_,"hypothetical protein"] } @$pegs;
}

use SameFunc;

sub same_func {
    my($self,$f1,$f2) = @_;

    return &SameFunc::same_func($f1,$f2);
}

#################################   DNA sequence Stuff ####################################

=pod

=head1 extract_seq

usage: $seq = &FIG::extract_seq($contigs,$loc)

This is just a little utility routine that I have found convenient.  It assumes that
$contigs is a hash that contains IDs as keys and sequences as values.  $loc must be of the
form
       Contig_Beg_End

where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought
subsequence.  If Beg > End, it is assumed that you want the reverse complement of the subsequence.
This routine plucks out the subsequence for you.

=cut

sub extract_seq {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($contigs,$loc) = @_;
    my($contig,$beg,$end,$contig_seq);
    my($plus,$minus);

    $plus = $minus = 0;
    my $strand = "";
    my @loc = split(/,/,$loc);
    my @seq = ();
    foreach $loc (@loc)
    {
	if ($loc =~ /^\S+_(\d+)_(\d+)$/)
	{
	    if ($1 < $2)
	    {
		$plus++;
	    }
	    elsif ($2 < $1)
	    {
		$minus++;
	    }
	}
    }
    if ($plus > $minus)
    {
	$strand = "+";
    }
    elsif ($plus < $minus)
    {
	$strand = "-";
    }

    foreach $loc (@loc)
    {
	if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    ($contig,$beg,$end) = ($1,$2,$3);

	    my $len = length($contigs->{$contig});
	    if (!$len)
	    {
		carp "Undefined or zero-length contig $contig";
		return "";
	    }

	    if (($beg > $len) || ($end > $len))
	    {
		carp "Region $loc out of bounds (contig len=$len)";
	    }
	    else
	    {
		if (($beg < $end) || (($beg == $end) && ($strand eq "+")))
		{
		    push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg)));
		}
		else
		{
		    $strand = "-";
		    push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end))));
		}
	    }
	}
    }
    return join("",@seq);
}

=pod

=head1 all_contigs

usage: @contig_ids = $fig->all_contigs($genome)

Returns a list of all of the contigs occurring in the designated genome.

=cut

sub all_contigs {
    my($self,$genome) = @_;
    my($rdbH,$relational_db_response);

    $rdbH = $self->db_handle;
    if (defined($genome))
    {
	if ($relational_db_response = $rdbH->SQL("SELECT DISTINCT contig FROM contig_lengths WHERE ( genome = \'$genome\' )"))
	{
	    return map { $_->[0] } @$relational_db_response;
	}
    }
    return undef;
}

=pod

=head1 contig_ln

usage: $n = $fig->contig_ln($genome,$contig)

Returns the length of $contig from $genome.

=cut

sub contig_ln {
    my($self,$genome,$contig) = @_;
    my($rdbH,$relational_db_response);

    $rdbH = $self->db_handle;
    if (defined($genome) && defined($contig))
    {
	if (($relational_db_response = $rdbH->SQL("SELECT len FROM contig_lengths WHERE ( genome = \'$genome\' ) and ( contig = \'$contig\' )")) &&

	    (@$relational_db_response == 1))
	{
	    return $relational_db_response->[0]->[0];
	}
    }
    return undef;
}

=pod

=head1 dna_seq

usage: $seq = dna_seq($genome,@locations)

Returns the concatenated subsequences described by the list of locations.  Each location
must be of the form

    Contig_Beg_End

where Contig must be the ID of a contig for genome $genome.  If Beg > End the location
describes a stretch of the complementary strand.

=cut

sub dna_seq {
    my($self,$genome,@locations) = @_;
    my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH);

    @locations = map { split(/,/,$_) } @locations;
    @pieces = ();
    foreach $loc (@locations)
    {
	if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    ($contig,$beg,$end) = ($1,$2,$3);
	    $ln = $self->contig_ln($genome,$contig);

	    if (! $ln) {
		print STDERR "$genome/$contig: could not get length\n";
		return "";
	    }

	    if (&between(1,$beg,$ln) && &between(1,$end,$ln))
	    {
		if ($beg < $end)
		{
		    push(@pieces, $self->get_dna($genome,$contig,$beg,$end));
		}
		else
		{
		    push(@pieces, &reverse_comp($self->get_dna($genome,$contig,$end,$beg)));
		}
	    }
	}
    }
    return join("",@pieces);
}

sub get_dna {
    my($self,$genome,$contig,$beg,$end) = @_;
    my $relational_db_response;

    my $rdbH = $self->db_handle;
    my $indexpt = int(($beg-1)/10000) * 10000;
    if (($relational_db_response = $rdbH->SQL("SELECT startN,fileno,seek FROM contig_seeks WHERE ( genome = \'$genome\' ) AND ( contig = \'$contig\' ) AND ( indexpt = $indexpt )")) &&
	(@$relational_db_response == 1))
    {
	my($startN,$fileN,$seek) = @{$relational_db_response->[0]};
	my $fh = $self->openF($self->N2file($fileN));
	if (seek($fh,$seek,0))
	{
	    my $chunk = "";
	    read($fh,$chunk,int(($end + 1 - $startN) * 1.03));
#	    print STDERR "genome=$genome contig=$contig beg=$beg end=$end startN=$startN chunk=$chunk\n";
	    $chunk =~ s/\s//g;
	    my $ln = ($end - $beg) + 1;
	    if (length($chunk) >= $ln)
	    {
		return substr($chunk,(($beg-1)-$startN),$ln);
	    }
	}
    }
    return undef;
}

#################################   Taxonomy  ####################################

=pod

=head1 taxonomy_of

usage: $gs = $fig->taxonomy_of($genome_id)

Returns the taxonomy of the specified genome.  Gives the taxonomy down to
genus and species.

=cut

sub taxonomy_of :scalar {
    my($self,$genome) = @_;
    my($ans);
    my $taxonomy = $self->cached('_taxonomy');

    if (! ($ans = $taxonomy->{$genome}))
    {
	my $rdbH = $self->db_handle;
	my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy  FROM genome");
	my $pair;
	foreach $pair (@$relational_db_response)
	{
	    $taxonomy->{$pair->[0]} = $pair->[1];
	}
	$ans = $taxonomy->{$genome};
    }
    return $ans;
}

=pod

=head1 is_bacterial

usage: $fig->is_bacterial($genome)

Returns true iff the genome is bacterial.

=cut

sub is_bacterial :scalar {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^Bacteria/) ? 1 : 0;
}


=pod

=head1 is_archaeal

usage: $fig->is_archaeal($genome)

Returns true iff the genome is archaeal.

=cut

sub is_archaeal :scalar {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^Archaea/) ? 1 : 0;
}


=pod

=head1 is_prokaryotic

usage: $fig->is_prokaryotic($genome)

Returns true iff the genome is prokaryotic

=cut

sub is_prokaryotic :scalar {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/) ? 1 : 0;
}


=pod

=head1 is_eukaryotic

usage: $fig->is_eukaryotic($genome)

Returns true iff the genome is eukaryotic

=cut

sub is_eukaryotic :scalar {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^Eukaryota/) ? 1 : 0;
}

=pod

=head1 sort_genomes_by_taxonomy

usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes)

This routine is used to sort a list of genome IDs to put them
into taxonomic order.

=cut

sub sort_genomes_by_taxonomy {
    my($self,@fids) = @_;

    return map     { $_->[0] }
           sort    { $a->[1] cmp $b->[1] }
           map     { [$_,$self->taxonomy_of($_)] }
           @fids;
}

=pod

=head1 crude_estimate_of_distance

usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2)

There are a number of places where we need estimates of the distance between
two genomes.  This routine will return a value between 0 and 1, where a value of 0
means "the genomes are essentially identical" and a value of 1 means
"the genomes are in different major groupings" (the groupings are archaea, bacteria,
euks, and viruses).  The measure is extremely crude.

=cut

sub crude_estimate_of_distance :scalar {
    my($self,$genome1,$genome2) = @_;
    my($i,$v,$d,$dist);

    if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }

    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return $self->crude_estimate_of_distance1($genome1,$genome2);
}

sub crude_estimate_of_distance1 :scalar {
    my($self,$genome1,$genome2) = @_;
    my($i,$v,$d,$dist);

    if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }
    $dist = $self->cached('_dist');
    if (! $dist->{"$genome1,$genome2"})
    {
	my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1));
	my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2));

	$d = 1;
	for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2)
	{
	    $d -= $v;
	}
	$dist->{"$genome1,$genome2"} = $d;
    }
    return $dist->{"$genome1,$genome2"};
}

=pod

=head1 sort_fids_by_taxonomy

usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids)

Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features.

=cut

sub sort_fids_by_taxonomy {
    my($self,@fids) = @_;

    return map     { $_->[0] }
           sort    { $a->[1] cmp $b->[1] }
           map     { [$_,$self->taxonomy_of(&genome_of($_))] }
           @fids;
}

sub build_tree_of_complete {
    my($self,$min_for_label) = @_;
    my(@last,@tax,$i,$prefix,$lev,$genome,$tax);

    $min_for_label = $min_for_label ? $min_for_label : 10;
    open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$";
    print TMP "1. root\n";

    @last = ();


    foreach $genome (grep { $_ !~ /^99999/ } $self->sort_genomes_by_taxonomy($self->genomes("complete")))
    {
	$tax = $self->taxonomy_of($genome);
	@tax = split(/\s*;\s*/,$tax);
	push(@tax,$genome);
	for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {}
	while ($i < @tax)
	{
	    $lev = $i+2;
	    $prefix = " " x (4 * ($lev-1));
	    print TMP "$prefix$lev\. $tax[$i]\n";
	    $i++;
	}
	@last = @tax;
    }
    close(TMP);
    my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$");
    $tree->[0] = 'All';
    &limit_labels($tree,$min_for_label);
    unlink("/tmp/tree$$");
    return ($tree,&tips_of_tree($tree));
}

sub limit_labels {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($tree,$min_for_label) = @_;

    my($children) = &tree_utilities::node_pointers($tree);
    if (@$children == 1)
    {
	return 1;
    }
    else
    {
	my $n = 0;
	my $i;
	for ($i=1; ($i < @$children); $i++)
	{
	    $n += &limit_labels($children->[$i],$min_for_label);
	}
	if ($n < $min_for_label)
	{
	    $tree->[0] = "";
	}
	return $n;
    }
}

sub taxonomic_groups_of_complete {
    my($self,$min_for_labels) = @_;

    my($tree,undef) = $self->build_tree_of_complete($min_for_labels);
    return &taxonomic_groups($tree);
}

sub taxonomic_groups {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($tree) = @_;

    my($groups,undef) = &taxonomic_groups_and_children($tree);
    return $groups;
}

sub taxonomic_groups_and_children {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my($tree) = @_;
    my($ids1,$i,$groupsC,$idsC);

    my $ptrs   = &tree_utilities::node_pointers($tree);
    my $ids    = [];
    my $groups = [];

    if (@$ptrs > 1)
    {
	$ids1 = [];
	for ($i=1; ($i < @$ptrs); $i++)
	{
	    ($groupsC,$idsC) = &taxonomic_groups_and_children($ptrs->[$i]);
	    if (@$groupsC > 0)
	    {
		push(@$groups,@$groupsC);
	    }
	    push(@$ids1,@$idsC);
	}

	if ($tree->[0])
	{
	    push(@$groups,[$tree->[0],$ids1]);
	}
	push(@$ids,@$ids1);
    }
    elsif ($tree->[0])
    {
	push(@$ids,$tree->[0]);
    }

    return ($groups,$ids);
}

################################# Literature Stuff  ####################################

sub get_titles_by_gi {
    my($self,$gi) = @_;

    &verify_existence_of_literature;

    $gi =~ s/^gi\|//;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT pmid,title FROM literature_titles WHERE ( gi = '$gi' ) ")) &&
	(@$relational_db_response > 0))
    {
	return sort { $a->[1] cmp $b->[1] } @$relational_db_response;
    }
    else
    {
	return ();
    }
}

sub get_titles_by_peg {
    my($self,$peg) = @_;
    my $gi;

    &verify_existence_of_literature;

    my @gis = grep { $_ =~ /^gi\|/ } $self->feature_aliases($peg);
    if (@gis > 0)
    {
	my $relational_db_response;
	my $rdbH = $self->db_handle;
	my $constraint = join(" OR ", map { $gi = ($_ =~ /gi\|(\S+)/) ? $1 : $_;  "( gi = '$gi' )" } @gis);
	if (($relational_db_response = $rdbH->SQL("SELECT pmid,title FROM literature_titles WHERE ( $constraint ) ")) &&
	    (@$relational_db_response > 0))
	{
	    return sort { $a->[1] cmp $b->[1] } @$relational_db_response;
	}
	else
	{
	    return ();
	}
    }
    return ();
}

sub get_title_by_pmid {
    my($self,$pmid) = @_;

    &verify_existence_of_literature;

    $pmid =~ s/^.*\|//;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT title FROM literature_titles WHERE ( pmid = '$pmid' ) ")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    else
    {
	return "";
    }
}

sub verify_existence_of_literature {

    if (! -d "$FIG_Config::global/Literature")
    {
	mkdir("$FIG_Config::global/Literature",0777);
	system "touch $FIG_Config::global/Literature/gi_pmid_title";
	system "$FIG_Config::bin/load_literature";
    }
}

################################# Subsystems  ####################################



sub exportable_subsystem {
    my($self,$ssa) = @_;
    my(%seqs,@genomes);

    my $spreadsheet = [];
    my $notes = [];

    $ssa =~ s/[ \/]/_/g;
    if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet"))
    {
	my $version = $self->subsystem_version($ssa);
	my $exchangable = $self->is_exchangable_subsystem($ssa);
	push(@$spreadsheet,"$ssa\n$version\n$exchangable\n");
	my @curation;
	if (-s "$FIG_Config::data/Subsystems/$ssa/curation.log")
	{
	    @curation = `head -n 1 \"$FIG_Config::data/Subsystems/$ssa/curation.log\"`;
	}
	else
	{
	    @curation = ("0000000000\tmaster:unknown\tstarted\n");
	}
	push(@$spreadsheet,$curation[0],"//\n");

	while (defined($_ = <SSA>) && ($_ !~ /^\/\//))
	{
	    push(@$spreadsheet,$_);
	}
	push(@$spreadsheet,"//\n");

	while (defined($_ = <SSA>) && ($_ !~ /^\/\//))
	{
	    push(@$spreadsheet,$_);
	}
	push(@$spreadsheet,"//\n");

	while (defined($_ = <SSA>))
	{
	    push(@$spreadsheet,$_);
	    chomp;
	    my @flds = split(/\t/,$_);
	    my $genome = $flds[0];
	    push(@genomes,$genome);
	    my($i,$id);
	    for ($i=2; ($i < @flds); $i++)
	    {
		if ($flds[$i])
		{
		    my @entries = split(/,/,$flds[$i]);
		    foreach $id (@entries)
		    {
			$seqs{"fig\|$genome\.peg.$id"} = 1;
		    }
		}
	    }
	}
	push(@$spreadsheet,"//\n");

	my $peg;
	foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs))
	{
	    my @aliases = grep { $_ =~ /^(sp\||gi\||pirnr\||kegg\||N[PGZ]_)/ } $self->feature_aliases($peg);

	    my $alias_txt = join(",",@aliases);
	    my $gs_txt = $self->genus_species($self->genome_of($peg));
	    my $func_txt = scalar $self->function_of($peg);

	    push(@$spreadsheet, join("\t", ($peg,
					    $alias_txt,
					    $gs_txt,
					    $func_txt)) . "\n");
	}
	push(@$spreadsheet,"//\n");

	foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs))
	{
	    my $aliases = $self->feature_aliases($peg);
	    my $seq = $self->get_translation($peg);
	    push(@$spreadsheet,">$peg $aliases\n");
	    my($i,$ln);
	    $ln = length($seq);
	    for ($i=0; ($i < $ln); $i += 60)
	    {
		if (($ln - $i) > 60)
		{
		    push(@$spreadsheet,substr($seq,$i,60) . "\n");
		}
		else
		{
		    push(@$spreadsheet,substr($seq,$i) . "\n");
		}
	    }
	}
	close(SSA);
	push(@$spreadsheet,"//\n");

	if (open(NOTES,"<$FIG_Config::data/Subsystems/$ssa/notes"))
	{
	    while (defined($_ = <NOTES>))
	    {
		push(@$notes,$_);
	    }
	    close(NOTES);
	}

	if ($notes->[$#{$notes}] ne "\n") { push(@$notes,"\n") }
	push(@$notes,"//\n");
    }
    return ($spreadsheet,$notes);
}

sub is_exchangable_subsystem :scalar {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $ssa = (@_ == 1) ? $_[0] : $_[1];
    $ssa =~ s/[ \/]/_/g;
    if (open(TMP,"<$FIG_Config::data/Subsystems/$ssa/EXCHANGABLE"))
    {
	my $line;
	$line = <TMP>;
	if ($line && ($line =~ /^(\S+)/) && $1)
	{
	    return 1;
	}
	close(TMP);
    }
    return 0;
}

sub all_exchangable_subsystems {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);

    my @exchangable = ();
    if (opendir(SUB,"$FIG_Config::data/Subsystems"))
    {
	push(@exchangable,grep { ($_ !~ /^\./) && &is_exchangable_subsystem($_) } readdir(SUB));
	closedir(SUB);
    }
    return @exchangable;
}

=pod

=head1 all_subsystems 

Return a list of all subsystems.

=cut

sub all_subsystems {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);

    my @subsystems = ();
    if (opendir(SUB,"$FIG_Config::data/Subsystems"))
    {
	push(@subsystems,grep { ($_ !~ /^\./) } readdir(SUB));
	closedir(SUB);
    }
    return @subsystems;
}

=pod

=head1 all_constructs

Hmmm...

=cut

sub all_constructs {
    my($self) = @_;

    my @subsystems = ();
    if (opendir(SUB,"$FIG_Config::data/Subsystems"))
    {
	push(@subsystems,grep { ($_ !~ /^\./) } readdir(SUB));
	closedir(SUB);
    }

    my @c;
    for my $subname (@subsystems)
    {
	$subname =~ s/[ \/]/_/g;
	my $cfile = "$FIG_Config::data/Subsystems/$subname/constructs";
	if (-f $cfile)
	{
	    my $sub = $self->get_subsystem($subname);
	    my @a = Construct::parse_constructs_file($cfile, $sub);
	    my $l = [];

	    for my $con (@a)
	    {
		my($cname, $list) = @$con;
		my $nreqs = [];

		for my $req (@$list)
		{
		    if ($req->[0] eq 'R')
		    {
			push(@$nreqs, ['R', $req->[2]]);
		    }
		    else
		    {
			push(@$nreqs, $req);
		    }
		}
		push(@$l, [$cname, $nreqs]);
	    }
	    push(@c, [$subname, $l]);
	}
    }
    return @c;
}

=pod

=head1 subsystem_version

 my $version=subsystem_version($subsystem_name)

 returns the current version of the subsystem.

=cut

sub subsystem_version :scalar {
    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my $ssa = (@_ == 1) ? $_[0] : $_[1];
    $ssa =~ s/[ \/]/_/g;

    if (open(VER,"<$FIG_Config::data/Subsystems/$ssa/VERSION"))
    {
	my $ver = <VER>;
	close(VER);
	if ($ver =~ /^(\S+)/)
	{
	    return $1;
	}
    }
    return 0;
}

=pod

=head1 subsystem_classification

 Get or set the classification of the subsystem. Added by RAE in response to the changes made on seed wiki
 If a reference to an array is supplied it is saved as the new classification of the subsystem.
 Regardless, the current classification is returned as a reference to an array. There is no control over what the things are.
 Returns a reference to an empty array if a valid subsystem is not supplied, or if no classification is known

 The classification is stored as a \t separated list of things in $subsys/CLASSIFICATION. There is no control over what the things are. 


=cut

sub subsystem_classification {
 my ($self, $ssa, $classification)=@_;
 $ssa =~ s/[ \/]/_/g;
 
 my $return=['', '', ''];

 my $subsys=$self->get_subsystem($ssa);

 return $return unless ($subsys);
 
 $subsys->set_classification($classification) if ($classification); # this wil probably not be written, at the moment. hmmm.
 
 return $subsys->get_classification;

}

=pod

=head1 subsystem_curator

usage: $curator = $fig->subsystem_curator($subsystem_name)

Return the curator of a subsystem.


=cut

sub subsystem_curator :scalar {
    my($self, $ssa) = @_;
    my($who) = "";

    $ssa =~ s/[ \/]/_/g;

    if (open(DATA,"<$FIG_Config::data/Subsystems/$ssa/curation.log"))
    {
	while (defined($_  = <DATA>))
	{
	    if ($_ =~ /^\d+\t(\S+)\s+started/)
	    {
		$who = $1;
	    }
	}
	close(DATA);
    }
    return $who;
}

sub reset_subsystem_curator :scalar {
    my($self, $ssa, $who) = @_;

    $ssa =~ s/[ \/]/_/g;

    if ($who && open(LOG,">>$FIG_Config::data/Subsystems/$ssa/curation.log"))
    {
	my $time = time;
	print LOG "$time\t$who\tstarted\n";
	close(LOG);
	return 1;
    }
    return 0;
}

=pod

=head1 subsystem_info

usage: ($version, $curator, $pedigree, $roles) = $fig->subsystem_info($subsystem_name)

Return information about the given subsystem.

$roles is a list of tuples (abbrev, name).


=cut

sub subsystem_info {
    my($self,$ssa) = @_;
    my($version, $curator, $pedigree, $roles);;

    $ssa =~ s/[ \/]/_/g;

    $roles = [];

    $version = $self->subsystem_version($ssa);
    $curator = $self->subsystem_curator($ssa);

    if (open(CUR, "<$FIG_Config::data/Subsystems/$ssa/curation.log"))
    {
	local($/);
	$pedigree = <CUR>;
    }

    if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet"))
    {
	#
	# The spreadsheet appears to be of the form
	#
	#   role-abbr role name
	#   ...
	#   //
	#   Something about subsets
	#   //
	#   genome-id spreadsheet-info
	#

	local $/ = "//";

	my($chunk);

	if (defined($chunk = <SSA>))
	{
	    for $_ (split(/\n/, $chunk))
	    {
		chomp;
		if (/^(\S+)\s+(.*)\s*$/)
		{
		    push(@$roles, [$1, $2]);
		}
	    }
	}
	close(SSA);
    }

    return ($version, $curator, $pedigree, $roles);
}

=pod

=head1 subsystem_genomes

usage: $genomes = $fig->subsystem_genomes($subsystem_name, $zero)

Return the list of genomes in the subsystem.

$genomes is a list of tuples (genome_id, name)

unless ($zero) is set to true it will only return those genomes with a non-zero variant code

=cut

sub subsystem_genomes :scalar {
    my($self,$ssa,$all) = @_;
    my($genomes);

    $ssa =~ s/[ \/]/_/g;
    $genomes = [];

    if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet"))
    {
	#
	# The spreadsheet appears to be of the form
	#
	#   role-abbr role name
	#   ...
	#   //
	#   Something about subsets
	#   //
	#   genome-id spreadsheet-info
	#

	local $/ = "//";

	my($chunk);

	if (defined($chunk = <SSA>))
	{
	}
	if (defined($chunk = <SSA>))
	{
	}
	local $/ = "\n";

	while (<SSA>)
	{
	    chomp;
	    s/^\s*//;
	    s/\s*$//;
	    next if $_ eq "";
            if (($_ =~ /^(\d+\.\d+)\s+(\S+)/) && ($all || $2))
            {
                my $genome = $1;
                my $name = $self->genus_species($genome);
                push(@$genomes, [$genome, $name]);
            }
	}
	close(SSA);
    }

    return $genomes;
}

#
#    @pegs              = $fig->pegs_in_subsystem_cell($subsystem, $genome,$role)
#    @roles              = $fig->subsystem_to_roles($subsystem)
#    @maps             = $fig->role_to_maps($role)
#    @subsystems = $fig->peg_to_subsystems($peg);

sub get_subsystem :scalar
{
    my($self, $subsystem, $force_load) = @_;
    my $sub;

    $subsystem =~ s/[ \/]/_/g;
    my $cache = $self->cached('_Subsystems');
    if ($force_load || !($sub = $cache->{$subsystem}))
    {
	$sub = new Subsystem($subsystem, $self);
	$cache->{$subsystem} = $sub if $sub;
    }
    return $sub;
}

sub subsystem_to_roles
{
    my($self, $subsystem) = @_;
    $subsystem =~ s/[ \/]/_/g;

    my $sub = $self->get_subsystem($subsystem);

    return () unless $sub;

    return $sub->get_roles();
}

sub pegs_in_subsystem_cell
{
    my($self, $subsystem, $genome, $role) = @_;
    $subsystem =~ s/[ \/]/_/g;

    my $sub = $self->get_subsystem($subsystem);

    return undef unless $sub;
    return grep { ! $self->is_deleted_fid($_) } $sub->get_pegs_from_cell($genome, $role);
}

sub get_clearinghouse :scalar
{
    my($self, $url) = @_;

    if (defined($self->{_clearinghouse}))
    {
	return $self->{_clearinghouse};
    }

    if (!$ClearinghouseOK)
    {
	warn "Error: Clearinghouse code not available.\n";
	return undef;
    }

    if ($url eq "")
    {
	$url = "http://www-unix.mcs.anl.gov/~olson/SEED/api.cgi";
    }

    my $ch = new Clearinghouse($url);
    $self->{_clearinghouse} = $ch;

    return $ch;
}

sub publish_subsystem_to_clearinghouse
{
    my ($self, $ssa, $url) = @_;
    my ($id, $token);

    $ssa =~ s/[ \/]/_/g;

    my $ch = $self->get_clearinghouse($url);

    if (!defined($ch))
    {
	warn "Cannot publish: clearinghouse not available\n";
	return undef;
    }

    my($version, $curator, $pedigree, $roles) = $self->subsystem_info($ssa);

    my $genomes = $self->subsystem_genomes($ssa);

    my @genome_names = ();
    for my $g (@$genomes)
    {
	push(@genome_names, $g->[1]);
    }

    my $seed_id = $self->get_seed_id();

    my $time = int(time());

    print "publishing: ss=$ssa version=$version time=$time curator=$curator seed_id=$seed_id\n";
    my $ret = $ch->publish_subsystem($ssa, $version, $time, $curator, $pedigree, $seed_id,
				     $roles, \@genome_names);

    ($id, $token, $url) = @$ret;
    print "Got id  $id token $token url $url\n";

    #
    # Retrieve the package
    #

    print "Packaging...\n";

    my($spreadsheet, $notes) = $self->exportable_subsystem($ssa);
    my $package = join("", @$spreadsheet, @$notes);
    print "Sending...\n";
    $ch->upload_subsystem_package($url, $package);

    return 1;
}

#
# Return the list of subsystems this peg appears in.
# Each entry is a pair [subsystem, role].
#
=head1 subsystems_for_peg

 Return the list of subsystems and roles that this peg appears in.
 Returns an array. Each item in the array is
 a reference to a tuple of subsystem and role.

=cut

sub subsystems_for_peg
{
    my($self, $peg) = @_;

    if ($self->is_deleted_fid($peg)) { return () }

    ($peg =~ /^fig\|\d+\.\d+\.peg\.\d+$/) or return;

    my $rdbH = $self->db_handle;

    my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'";

    if (my $relational_db_response = $rdbH->SQL($q))
    {
	my %seen;
	my @in;
	my $pair;
	foreach $pair (@$relational_db_response)
	{
	    my $key = join("\t",@$pair);
	    if (! $seen{$key})
	    {
		push(@in,$pair);
		$seen{$key} = 1;
	    }
	}
	return @in;
    }
    else
    {
	return ();
    }
}


=head1 assigned_pegs_in_subsystems

Return list of [peg, function, ss, role in ss].

=cut
    
sub assigned_pegs_in_subsystems
{
    my($self, $genome) = @_;

    my @result = ();
    for my $peg ($self->pegs_of($genome))
    {
	my $fn = $self->function_of($peg);
	next if $fn eq "";
	next if $self->hypo($fn);
              
        my $rdbH = $self->db_handle;

        my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'";

        if (my $relational_db_response = $rdbH->SQL($q))
        {
	    my $pair;

 	    foreach $pair (@$relational_db_response)
            {
                  my ($ss, $role) = @$pair;

                  push(@result, [$peg, $fn, $ss, $role]);
	     }
         
	 }
    }
    return @result;
}
    

=head1 assigned_pegs_not_in_ss

Return all pegs with non-hypothetical assignments that are not in ss.

=cut
    
sub assigned_pegs_not_in_ss
{
    my($self, $genome) = @_;

    my @result = ();
    for my $peg ($self->pegs_of($genome))
    {
	my $fn = $self->function_of($peg);
	next if $fn eq "";
	next if $self->hypo($fn);

	my @subs = $self->subsystems_for_peg($peg);
	if (@subs < 1)
	{
	    push(@result, [$peg, $fn, "No Subsytem", "No Role"]);
	}
    }
    return @result;
}

=head1 assigned_pegs

Return list of [peg, function, ss, role in ss] for every non-hypo protein regardless of being in ss

=cut
    
sub assigned_pegs
{
    my($self, $genome) = @_;

    my @result = ();
    for my $peg ($self->pegs_of($genome))
    {
	my $fn = $self->function_of($peg);
	next if $fn eq "";
	next if $self->hypo($fn);
              
        my $rdbH = $self->db_handle;

        my $q = "SELECT subsystem, role FROM subsystem_index WHERE protein = '$peg'";

        if (my $relational_db_response = $rdbH->SQL($q))
        {
	    my $pair;
       
            if(@$relational_db_response > 0)
            {  

 	       foreach $pair (@$relational_db_response)
               {
                  my ($ss, $role) = @$pair;

                  push(@result, [$peg, $fn, $ss, $role]);
	       }
	    }
         	 
            else
            {
             push(@result, [$peg, $fn, "No Subsystem", "No Role"]);
            }  

        }
    }
    return @result;
}
    

=head1 subsystem_roles

Return a list of all roles present in locally-installed subsystems.
The return is a hash keyed on role name with each value a list
of subsystem names.

=cut

sub subsystem_roles
{
    my($self) = @_;

    my $rdbH = $self->db_handle;

    my $q = "SELECT distinct subsystem, role FROM subsystem_index";

    my $ret = {};

    if (my $relational_db_response = $rdbH->SQL($q))
    {
	foreach my $pair (@$relational_db_response)
	{
	    my($subname, $role) = @$pair;
	    push(@{$ret->{$role}}, $subname);
	}
    }

    return $ret;
}

#
# Return just the list of subsystems the peg appears in.
#

sub peg_to_subsystems
{
    my($self, $peg) = @_;

    if ($self->is_deleted_fid($peg)) { return () }
    my %in = map { $_->[0] => 1 } $self->subsystems_for_peg($peg);
    return sort keys(%in);
}

sub write_subsystem_spreadsheet {
    my($self,$ssa,$roles,$genomes,$pegs_in_cells) = @_;
    my(@genomes,$genome,$role,@pegs,$pair,$gs);

    $ssa =~ s/[ \/]/_/g;
    &verify_dir("$FIG_Config::data/Subsystems/$ssa");
    open(SSA,">$FIG_Config::data/Subsystems/$ssa/spreadsheet") || die "Cannot open $FIG_Config::data/Subsystems/$ssa/spreadsheet";
    foreach $pair (@$roles)
    {
	print SSA join("\t",@$pair),"\n";
    }
    print SSA "//\n";
    print SSA "All\n\nAll\n//\n";
    @genomes = map { $_->[1] }
               sort { ($a->[0] cmp $b->[0]) or ($a->[1] <=> $b->[1]) }
               map {$genome = $_; $gs = $self->genus_species($genome); [$gs,$genome] }
               @$genomes;
    foreach $genome (@genomes)
    {
	print SSA "$genome\t0";
	foreach $role (@$roles)
	{
	    $_ = $pegs_in_cells->{"$genome\t$role->[1]"};
	    @pegs = $_ ? sort { &by_fig_id($a,$b) } @{$_} : ();
	    print SSA "\t",join(",",map { $_ =~ /^fig\|\d+\.\d+\.peg\.(\d+)/; $1 } @pegs);
	}
	print SSA "\n";
    }
    close(SSA);
    chmod(0777,"$FIG_Config::data/Subsystems/$ssa");
}

################################# PEG Translation  ####################################

sub translate_pegs {
    my($self,$pegs,$seq_of, $cb) = @_;
    my($seq,$aliases,$pegT,%to,%sought,@keys,$peg,$alias);

    $cb = sub {} unless ref($cb) eq "CODE";

    my $tran_peg = {};

    my $n = scalar keys (%$pegs);
    my $idx = 0;

    foreach $peg (keys(%$pegs))
    {
	$idx++;
	&$cb("$idx of $n") if $idx % 100 == 0;
	#
	# First, see if the peg of the same name locally has the same
	# last 10 chars.
	#
	if (($seq = $self->get_translation($peg)) &&
	    (length($seq) > 10) && (length($seq_of->{$peg}) > 10) &&
	    (uc substr($seq,-10) eq substr($seq_of->{$peg},-10)))
	{
	    $tran_peg->{$peg} = $peg;
	}
	else
	{
	    #
	    # Otherwise, search for a local peg that has the same alias
	    # as this peg. (Canonicalize based on the original source)
	    #
	    ($aliases,undef,undef) = @{$pegs->{$peg}};
	    undef %to;
	    foreach $alias (split(/,/,$aliases))
	    {
		if ($pegT = $self->by_alias($alias))
		{
		    $to{$pegT}++;
		}
	    }

	    #
	    # If we have a unique answer, we are done.
	    # Otherwise mark this one as needing more search.
	    #
	    if ((@keys = keys(%to)) == 1)
	    {
		$tran_peg->{$peg} = $keys[0];
	    }
	    else
	    {
		$sought{$peg} = 1;
	    }
	}
    }

    if ((scalar keys(%sought)) > 0)
    {
	&tough_search($self,$pegs,$seq_of,$tran_peg,\%sought);
    }
    return $tran_peg;
}

=pod

=head1 tough_search($pegs, $seq_of, $tran_peg, $sought)

$pegs - not used
$seq_of - hash from peg to peg sequence
$tran_peg - hash into which translated pegs are placed
$sought - hash keyed on the list of pegs we're looking for.

=cut

sub tough_search {
    my($self,$pegs,$seq_of,$tran_peg,$sought) = @_;
    my($peg,$seq,%needH,%needT,%poss,$id,$sub,$to,$x,$genome);

    #
    # Construct %needT, key is 50-bases from tail of sequence, values are pegs from
    # the list of pegs we're seeking.
    #
    # %needH is the same, but keyed on 50 bases from the head of the sequence.
    #
    foreach $peg (keys(%$sought))
    {
	if (($seq = $seq_of->{$peg}) && (length($seq) > 50))
	{
	    my $sub = substr($seq,-50);
	    push(@{$needT{$sub}},$peg);
	    $sub = substr($seq,0,50);
	    push(@{$needH{$sub}},$peg);
	}
    }
#   print STDERR &Dumper(\%needT,\%needH);
    open(NR,"<$FIG_Config::global/nr")
	|| die "could not open $FIG_Config::global/nr";
    $/ = "\n>";
    while (defined($_ = <NR>))
    {
	chomp;
	if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	{
	    $id  =  $1;
	    $seq =  $2;
	    if (length($seq) >= 50)
	    {
		$sub = uc substr($seq,-50);
		if ((($x = $needT{uc substr($seq,-50)}) && (@$x == 1)) ||
		    (($x = $needH{uc substr($seq,0,50)}) && (@$x == 1)))
		{
		    $peg = $x->[0];
		    my @same = grep { $_ =~ /^fig/ } map { $_->[0] } $self->mapped_prot_ids($id);
		    if (@same > 0)
		    {
			push(@{$poss{$peg}},@same);
		    }
		}
	    }
	}
    }
    close(NR);
    $/ = "\n";

#   print STDERR &Dumper(\%poss);
    foreach $peg (keys(%poss))
    {
#	print STDERR "processing $peg\n";
	$x = $poss{$peg};
	if (@$x == 1)
	{
	    $tran_peg->{$peg} = $x->[0];
	    delete $sought->{$peg};
	}
	elsif ($genome = $self->probable_genome($self->genome_of($peg),$tran_peg))
	{
#	    print STDERR "    mapped to genome $genome\n";
	    my $genomeQ = quotemeta $genome;
	    my @tmp = grep { $_ =~ /^fig\|$genomeQ\./ } @$x;
	    if (@tmp == 1)
	    {
		$tran_peg->{$peg} = $tmp[0];
		delete $sought->{$peg};
	    }
	    else
	    {
#		print STDERR &Dumper(["failed",$peg,$genome,\@tmp]);
	    }
	}
	else
	{
#	    print STDERR "could not place genome for $peg\n";
	}
    }

    foreach $peg (keys(%$sought))
    {
	print STDERR "failed to map $peg\n";
    }
}

sub probable_genome {
    my($self,$genome,$tran_peg) = @_;
    my($peg,%poss,@poss);

    my $genomeQ = quotemeta $genome;
    foreach $peg (grep { $_ =~ /^fig\|$genomeQ\./ } keys(%$tran_peg))
    {
	$poss{$self->genome_of($tran_peg->{$peg})}++;
    }
    @poss = sort { $poss{$b} <=> $poss{$a} } keys(%poss);
    if ((@poss == 1) || ((@poss > 1) && ($poss{$poss[0]} > $poss{$poss[1]})))
    {
	return $poss[0];
    }
    else
    {
#	print STDERR &Dumper(["could not resolve",\%poss,$genome]);
	return undef;
    }
}

=pod

=head1 find_genome_by_content

Find a genome given the number of contigs, number of nucleotides, and
checksum. We pass in a potential name for the genome as a quick
starting check.

=cut

sub find_genome_by_content
{
    my($self, $genome, $n_contigs, $n_nucs, $cksum) = @_;

    my(@genomes);

    my $gbase = (split(/\./, $genome))[0];

    #
    # Construct the list of genomes so that we first check ones with the same
    # base-part as the one passed in.
    #
    for ($self->genomes())
    {
	if (/^$gbase\./)
	{
	    unshift(@genomes, $_);
	}
	else
	{
	    push(@genomes, $_);
	}
    }


    for my $genome (@genomes)
    {
	if (open(my $cfh, "<$FIG_Config::organisms/$genome/COUNTS"))
	{
	    if ($_ = <$cfh>)
	    {
		my($cgenome, $cn_contigs, $cn_nucs, $ccksum) = split(/\t/);

		if ($cgenome eq $genome and
		    $cn_contigs == $n_contigs and
		    $cn_nucs == $n_nucs and
		    $ccksum == $cksum)
		{
		    return $genome;
		}
	    }
	    close($cfh);
	}
    }

    return undef;
}

################################# Support for PEG Links  ####################################


sub peg_links {
    my($self,$fid) = @_;

    return $self->fid_links($fid);
}

sub fid_links {
    my($self,$fid) = @_;
    my($i,$got,$genome,$fidN);

    if ($self->is_deleted_fid($fid)) { return () }
    my @links = ();
    my @aliases = $self->feature_aliases($fid);

    my $link_file;
    for $link_file (("$FIG_Config::global/fid.links","$FIG_Config::global/peg.links"))
    {
	if (open(GLOBAL,"<$link_file"))
	{
	    while (defined($_ = <GLOBAL>))
	    {
		chop;
		my($pat,$link) = split(/\t/,$_);
		for ($i=0,$got=0; (! $got) && ($i < @aliases); $i++)
		{
		    if ($aliases[$i] =~ /$pat/)
		    {
			push(@links,eval "\"$link\"");
			$got = 1;
		    }
		}
	    }
	    close(GLOBAL);
	}
    }

    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT link FROM fid_links WHERE ( fid = \'$fid\' )")) &&
	(@$relational_db_response > 0))
    {
	push(@links, map { $_->[0] } @$relational_db_response);
    }
    return sort { $a =~ /\>([^\<]+)\<\/a\>/; my $l1 = $1;
		  $b =~ /\>([^\<]+)\<\/a\>/; my $l2 = $1;
		  $a cmp $b } @links;
}

# Each link is a 2-tuple [fid,link]

sub add_peg_links {
    my($self,@links) = @_;

    return $self->add_fid_links(@links);
}

sub add_fid_links {
    my($self,@links) = @_;
    my($fid,$link,$pair,$i);

    my $relational_db_response;
    my $rdbH = $self->db_handle;

    foreach $pair (@links)
    {
	($fid,$link) = @$pair;

	if (($relational_db_response = $rdbH->SQL("SELECT link FROM fid_links WHERE ( fid = \'$fid\' )")) &&
	    (@$relational_db_response > 0))
	{
	    for ($i=0; ($i < @$relational_db_response) && ($relational_db_response->[$i]->[0] ne $link); $i++) {}
	    if ($i == @$relational_db_response)
	    {
		&add_fid_link($self,$fid,$link);
	    }
	}
	else
	{
	    &add_fid_link($self,$fid,$link);
	}
    }
}

sub add_fid_link {
    my($self,$fid,$link) = @_;

    if ($self->is_deleted_fid($fid)) { return }
    my $rdbH = $self->db_handle;

    ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid";
    my $type = $1;

    $rdbH->SQL("INSERT INTO fid_links ( fid,link ) VALUES ( \'$fid\',\'$link\' )");
    if (($fid =~ /^fig\|(\d+\.\d+)\.fid\.\d+$/) && open(TMP,">>$FIG_Config::organisms/$1/Features/$type/links"))
    {
	print TMP "$fid\t$link\n";
	close(TMP);
	chmod 0777,"$FIG_Config::organisms/$1/Features/$type/links";
    }
}

sub delete_peg_link {
    my($self,$peg,$link) = @_;

    $self->delete_fid_link($peg,$link);
}

sub delete_fid_link {
    my($self,$fid,$link) = @_;
    my($i);

    if ($self->is_deleted_fid($fid)) { return }
    my $genome = $self->genome_of($fid);

    ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid";
    my $type = $1;

    my $rdbH = $self->db_handle;
    $rdbH->SQL("DELETE FROM fid_links WHERE ( fid = \'$fid\' AND link = \'$link\' )");

    my $file;
    foreach $file (("$FIG_Config::organisms/$genome/Features/$type/$type.links","$FIG_Config::organisms/$genome/Features/$type/links"))
    {
	if (-s $file)
	{
	    my @links = `cat $file`;
	    for ($i=0; ($i < @links) && (! (($links[$i] =~ /^(\S+)\t(\S.*\S)/) && ($1 eq $fid) && ($2 eq $link))); $i++) {}
	    if (($i < @links) && open(TMP,">$file"))
	    {
		splice(@links,$i,1);
		print TMP join("",@links);
		close(TMP);
	    }
	}
    }
}

sub delete_all_peg_links {
    my($self,$peg) = @_;

    $self->delete_all_fid_links($peg);
}

sub delete_all_fid_links {
    my($self,$fid) = @_;
    my($i);

    if ($self->is_deleted_fid($fid)) { return }
    my $genome = $self->genome_of($fid);

    my $rdbH = $self->db_handle;
    $rdbH->SQL("DELETE FROM fid_links WHERE ( fid = \'$fid\' )");

    ($fid =~ /^fig\|\d+\.\d+\.([^.]+)\.\d+$/) || confess "bad fid $fid";
    my $type = $1;

    my $file;
    foreach $file (("$FIG_Config::organisms/$genome/Features/$type/$type.links","$FIG_Config::organisms/$genome/Features/$type/links"))
    {
	if (-s $file)
	{
	    my @links = `cat $file`;
	    my @links1 = grep { ! (($_ =~ /^(\S+)/) && ($1 eq $fid)) } @links;
	    if ((@links1 < @links) && open(TMP,">$file"))
	    {
		print TMP join("",@links1);
		close(TMP);
	    }
	}
    }
}


###########
#
# Some routines for dealing with peg search and similarities.
#
# This is code lifted from pom.cgi and reformatted for more general use.
#
#
# Find the given role in the given (via CGI params) organism.
#
# We do this by finding a list of pegs that are annotated to have
# this role in other organisms that are "close enough" to our organism
#
# We then find pegs in this organism that are similar to
# these pegs.
#

sub find_role_in_org
{
    my($self,$role, $org, $user, $sims_cutoff) = @_;

    my($id2,$psc,$col_hdrs,$tab,$peg,$curr_func,$id2_func);
    my($seen,$peg);

    if (!$org)
    {
	return undef;
    }

    #
    # Create a list of candidates.
    #
    # These are the list of sequences that contain the given role,
    # sorted by the crude_estimate_of_distance from the given peg.
    #

    my @cand = map { $_->[0] }
               sort { $a->[1] <=> $b->[1] }
               map {
	              $peg = $_;
	              [$peg,$self->crude_estimate_of_distance($org,&FIG::genome_of($peg))]
	           }
               $self->seqs_with_role($role,$user);

    my $hits = {};
    $seen = {};

    #
    # Pick the top 10 hits if there are more than 10.
    #
    my $how_many0 = ((@cand > 10) ? 10 : scalar @cand) - 1;

    $self->try_to_locate($org,$hits,[@cand[0..$how_many0]],$seen, $sims_cutoff);

    if (keys(%$hits) == 0)
    {
	splice(@cand,0,$how_many0+1);
	&try_to_locate($self,$org,$hits,\@cand,$seen, $sims_cutoff);
    }

    #
    # At this point %$hits contains the pegs in our organism that
    # may have the given role. The key is the peg, the value
    # is a pair [score, similar-peg]
    #
    #
    # We reformat this into a list of entries
    # [ $psc, $peg-in-this-org, $length, $current-fn, $matched-protein, $matched-len, $matched-fun]
    #


    $col_hdrs = ["P-Sc","PEG","Ln1","Current Function", "Protein Hit","Ln2","Function"];

    my @ret;

    foreach $peg ( sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits))
    {
	($psc,$id2) = @{$hits->{$peg}};
	$curr_func = $self->function_of($peg,$user);
	$id2_func  = $self->function_of($id2,$user);

	push(@ret, [$psc, $peg, $self->translation_length($peg),
		    $curr_func, $id2, $self->translation_length($id2),$id2_func]);
    }
    return @ret;
}


#
# Background job support.
#
# If one wants to turn a script into a background, invoke
# $fig->run_in_background($coderef). This will cause $coderef to be invoked
# as a background job. This means its output will be written to
# $FIG_Config::data/Global/background_jobs/<pid>, and that
# it shows up and is killable via the seed control panel.
#

sub run_in_background
{
    my($self, $coderef, $close_fds) = @_;

    if (ref($coderef) ne "CODE")
    {
	warn "FIG::run_in_background() invoked without a code reference\n";
	return;
    }

    my $job_dir = "$FIG_Config::data/Global/background_jobs";
    verify_dir($job_dir);

    my $child = fork;

    if (!defined($child))
    {
	die "run_in_background: fork failed: $!\n";
    }

    if ($child == 0)
    {
	#
	# In the child process.
	#

	POSIX::setsid();

	my $d = $self->db_handle();
	if ($d)
	{
	    my $dbh = $d->{_dbh};
	    $dbh->{InactiveDestroy} = 1;
	}

	if ($close_fds)
	{
	    for (my $fd = 3; $fd < 32; $fd++)
	    {
		POSIX::close($fd);
	    }
	}

	my $my_job_dir = "$job_dir/$$";
	verify_dir($my_job_dir);

	open(my $info, ">$my_job_dir/INFO");
	my $now = asctime(localtime(time()));
	chomp($now);
	print $info "Background job $$ created from run_in_background by $> on $now\n";
	close($info);

	#
	# Redirect stdout/stderr to the OUTPUT file.
	#

	close(STDOUT);
	close(STDERR);

	open STDOUT, ">$my_job_dir/OUTPUT";
	open STDERR, ">&STDOUT";

	select(STDERR);
	$| = 1;
	select(STDOUT);
	$| = 1;

	#
	# Make stdin be from nowhere.
	#

	open STDIN, "</dev/null";

	#
	# Run the code.
	#

	open(my $stat, ">$my_job_dir/STATUS");
	print $stat "Job started at $now\n";
	close($stat);

	eval { &$coderef; };

	open(my $stat, ">$my_job_dir/STATUS");

	if ($@ eq "")
	{
	    print $stat "Finished successfully\n";
	}
	else
	{
	    print $stat "Code had an error:\n$@\n";
	}
	close($stat);

	#
	# We need to undef this, otherwise the DBrtns destructor
	# will do an explicit dbh->disconnect, which will undo any
	# effect of the InactiveDestroy set above.
	#

	my $d = $self->db_handle();
	if ($d)
	{
	    delete $d->{_dbh};
	}

	exit;
    }

    return $child;
}

sub get_all_jobs :list
{
    my($self) = @_;

    my $job_dir = "$FIG_Config::data/Global/background_jobs";

    opendir(my $jfh, $job_dir);
    my @jobs = grep { $_ =~ /^\d+$/ } readdir($jfh);
    closedir($jfh);

    return @jobs;
}


sub get_job :scalar
{
    my($self, $job_id) = @_;

    my $job_dir = "$FIG_Config::data/Global/background_jobs/$job_id";

    if (-d $job_dir)
    {
	return FIG::Job->new($job_id, $job_dir);
    }
    else
    {
	return undef;
    }
}

sub get_current_arch :scalar
{
    my $arch;

    open(FH, "<$FIG_Config::fig_disk/config/fig-user-env.sh");
    while (<FH>)
    {
	chomp;
	if (/^RTARCH=\"(.*)\"/)
	{
	    $arch = $1;
	    last;
	}
    }
    return $arch;
}


############################### Interfaces to Other Systems ######################################
# This section contains the functionality introduced by the interface with GenDB.  The initial
# two functions simply register when GenDB has a version of the genome (so we can set links
# to it when displaying PEGS:
#

=pod

=head1 has_genome(System,Genome)

usage: has_genome("GenDB",$genome)

Invoking this routine just records that GenDB has a copy of the genome designated by
$genome.

=cut

sub has_genome {
    my($system,$genome) = @_;

    print STDERR "$system has $genome\n";
}

=pod

=head1 dropped_genome(System,Genome)

usage: dropped_genome("GenDB",$genome)

Invoking this routine just records that GenDB should no longer be viewed as
having a copy of the genome designated by $genome.

=cut

sub dropped_genome {
    my($system,$genome) = @_;

    print STDERR "$system dropped $genome\n";
}

=pod

=head1 link_to_system(System,FID)

usage: $url = link_to_system("GenDB",$fid) # usually $fid is a peg, but it can be other types of features, as well

This routine is used to get a URL that can be used to "flip" from one system to the other.  If
the feature is unknown to the system, undef should be returned.

=cut

sub link_to_system {
    my($system,$fid) = @_;

    return undef;
}

############################### Adding, Deleting, Altering Features  ####################


# The following routines support alteration of features

=pod

=head1 delete_feature(FeatureID)

usage: $fig->delete_feature($fid)

Invoking this routine deletes the feature designated by $fid.

=cut

sub delete_feature {
    my($self,$fid) = @_;

    open(TMP,">>$FIG_Config::global/deleted.features")
	|| die "could not open $FIG_Config::global/deleted.features";

    flock(TMP,LOCK_EX) || confess "cannot lock deleted.features";
    print TMP "$fid\n";
    close(TMP);
    chmod 0777, "$FIG_Config::global/deleted.features";
    $self->{_deleted_fids} = undef;
}

=pod

=head1 add_feature(Genome,Type,Location,Aliases,Translation)

usage: $fid = $fig->add_feature($genome,$type,$location,$aliases,$translation)

Invoking this routine adds the feature, returning the new (generated) $fid.
$translation is optionally and only applies to PEGs.

	$genome is the genome ID
	$type is the feature type.  For now, "peg" or "rna"
	$location is of the form "Loc1,Loc2,...LocN" where each Loci (often just 1)
		if of the form Contig_Beg_End
	$aliases is of the form "Alias1,Alias2,...AliasN"
	$translation is just a string of amino acids

Upon success the new feature ID is returned.  On failure, undef is returned.

=cut

sub add_feature {
    my($self,$genome,$type,$location,$aliases,$translation) = @_;

    my $dbh = $self->db_handle();

    $aliases = $aliases ? $aliases : "";
    my $aliasesT = $aliases;
    $aliasesT =~ s/,/\t/g;
    my @aliases = split(/\t/,$aliasesT);

    my $fid = $self->next_fid($genome,$type);
    &add_tbl_entry($fid,$location,$aliasesT);

    if (($type eq "peg") and $translation)
    {
	$self->add_translation($fid,$translation);
    }

    my @loc = split(/,/,$location);
    my($contig,$beg,$end);
    if (($loc[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($location =~ /(\d+)$/))
    {
	$end = $1;
	if ($beg > $end)  { ($beg,$end) = ($end,$beg) }
	$fid =~ /(\d+)$/;
	my $fidN = $1;
	if ((length($location) < 5000) && (length($contig) < 96) && (length($fid) < 32) && ($fid =~ /(\d+)$/))
	{
	    $dbh->SQL("INSERT INTO features (id,idN,type,genome,location,contig,minloc,maxloc,aliases)
                              VALUES ('$fid',$fidN,'$type','$genome','$location','$contig',$beg,$end,'$aliases')");

	    if (@aliases > 0)
	    {
		my $alias;
		foreach $alias (@aliases)
		{
		    if ($alias =~ /^(NP_|gi\||sp\|\tr\||kegg\||uni\|)/)
		    {

			$dbh->SQL("INSERT INTO ext_alias (id,alias,genome)
                                   VALUES ('$fid','$alias','$genome')");
		    }
		}
	    }
	    return $fid;
	}
    }
    return undef;
}

sub next_fid {
    my($self,$genome,$type) = @_;

    my $dbh = $self->db_handle();
    my $res = $dbh->SQL("select max(idN) from features where (genome = '$genome' and type = '$type')");
    return undef unless $res;

    my $fidN = $res->[0]->[0] + 1;
    while ($self->is_deleted_fid("fig\|$genome\.$type\.$fidN"))
    {
	$fidN++;
    }
    return "fig\|$genome\.$type\.$fidN";
}

sub is_deleted_fid {
    my($self,$fid) = @_;
    my($x,$y);

    if (! ($x = $self->{_deleted_fids}))
    {
	$self->{_deleted_fids} = {};
	if (open(TMP,"<$FIG_Config::global/deleted.features"))
	{
	    while ($y = <TMP>)
	    {
		if ($y =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/)
		{
		    $self->{_deleted_fids}->{$1} = 1;
		}
	    }
	    close(TMP);
	}
	$x = $self->{_deleted_fids};
    }
    return $x->{$fid};
}

sub fid_with_changed_location {
    my($self,$fid) = @_;
    my($x);

    if (! ($x = $self->{_changed_location_fids}))
    {
	$self->{_changed_location_fids} = {};
	if (open(TMP,"<$FIG_Config::global/changed.location.features"))
	{
	    while ($_ = <TMP>)
	    {
		if ($_ =~ /^(fig\|\d+\.\d+\.[a-zA-Z]+\.\d+)/)
		{
		    $self->{_changed_location_fids}->{$1}++;
		}
	    }
	    close(TMP);
	}
	$x = $self->{_changed_location_fids};
    }
    return $x->{$fid};
}

=pod

=head1 change_location_of_feature(FeatureID,Location,Translation)

usage: $fig->change_location_of_feature($fid,$location,$translation)

Invoking this routine changes the location of the feature.  The $translation argument
is optional (and applies only to PEGs).

The routine returns 1 on success and 0 on failure.

=cut

sub change_location_of_feature {
    my($self,$fid,$location,$translation) = @_;
    my($x);
    if ($self->is_deleted_fid($fid)) { return 0 }

    my $dbh = $self->db_handle();

    my $genome = &genome_of($fid);
    my $type   = &ftype($fid);

    my($got) = 0;
    my @loc = split(/,/,$location);
    my($contig,$beg,$end);
    if (($loc[0] =~ /^(\S+)_(\d+)_\d+$/) && (($contig,$beg) = ($1,$2)) && ($location =~ /(\d+)$/))
    {
	$end = $1;
	if ($beg > $end)  { ($beg,$end) = ($end,$beg) }
    }
    else
    {
	return 0;
    }

    my @tmp = grep { ($_ =~ /^(\S+)/) && ($1 eq $fid) } `grep '$fid' $FIG_Config::organisms/$genome/Features/$type/tbl`;
    if (@tmp > 0)
    {
	$x = $tmp[$#tmp];
	chop $x;
	my @flds = split(/\t/,$x);
	shift @flds;
	shift @flds;
	my $aliasesT =  (@flds > 0) ? join("\t",@flds) : "";
	&add_tbl_entry($fid,$location,$aliasesT);

	$dbh->SQL("UPDATE features SET location = '$location',
                                       contig = '$contig',
                                       minloc = $beg,
                                       maxloc = $end
                                   WHERE id = '$fid'");

	if (my $locations = $self->cached('_location'))
	{
	    $locations->{$fid} = $location;
	}

	open(TMP,">>$FIG_Config::global/changed.location.features")
	    || die "could not open $FIG_Config::global/changed.location.features";

	flock(TMP,LOCK_EX) || confess "cannot lock changed.location.features";
	print TMP "$fid\n";
	close(TMP);
	chmod 0777, "$FIG_Config::global/changed.location.features";
	$self->{_changed_location_fids} = undef;

	if (($type eq "peg") && defined($translation))
	{
	    $self->add_translation($fid,$translation);
	}
	$got = 1
    }
    return $got;
}

sub add_tbl_entry {
    my($fid,$location,$aliasesT) = @_;

    my $genome = &genome_of($fid);
    my $file   = "$FIG_Config::organisms/$genome/Features/peg/tbl";
    open(TMP,">>$file")
	|| die "could not open $file";
    flock(TMP,LOCK_EX) || confess "cannot lock $file";
    print TMP "$fid\t$location\t$aliasesT\n";
    close(TMP);
    chmod 0777, "$file";
}


sub add_translation {
    my($self,$fid,$translation) = @_;

    my $genome = &genome_of($fid);
    my $file   = "$FIG_Config::organisms/$genome/Features/peg/fasta";
    if (open(TMP,">>$file"))
    {
	flock(TMP,LOCK_EX) || confess "cannot lock $file";
	print TMP ">$fid\n";
	my $seek = tell TMP;
	my $ln   = length($translation);
	print TMP "$translation\n";
	close(TMP);
	chmod 0777, $file;
	my $fileno = $self->file2N($file);

	my $dbh = $self->db_handle();
	$dbh->SQL("DELETE FROM protein_sequence_seeks where id = '$fid'");
	$dbh->SQL("INSERT INTO protein_sequence_seeks (id,fileno,seek,len,slen)
                          VALUES ('$fid',$fileno,$seek,$ln+1,$ln)");
    }
}

sub peg_in_gendb
{
    my($self, $peg) = @_;
    my $genome = &genome_of($peg);

    return $self->genome_in_gendb($genome);
}

sub genome_in_gendb
{
    my($self, $genome) = @_;
    return 0;
}


### Begin FIG::Job module

package FIG::Job;

use FIGAttributes;
use base 'FIGAttributes';

sub new
{
    my($class, $job_id, $job_dir) = @_;

    my $self = {
	id => $job_id,
	dir => $job_dir,
    };
    return bless $self, $class;
}

sub status :scalar
{
    my($self) = @_;

    return &FIG::file_read("$self->{dir}/STATUS");
}

sub running :scalar
{
    my($self) = @_;
    my $rc;
    warn "running test on $self->{id}\n";
    if (kill(0, $self->{id}) > 0)
    {
	$rc = 1;
    }
    else
    {
	$rc = 0;
    }
    warn "running returns $rc\n";

    return $rc;
}

sub info :scalar :list
{
    my($self) = @_;
    return &FIG::file_read("$self->{dir}/INFO");
}

sub output :scalar :list
{
    my($self) = @_;
    return &FIG::file_read("$self->{dir}/OUTPUT");
}

######### End FIG::Job ##

package FIG;

#
# DAS support.
#

=pod

=head1 init_das

Initialize a DAS data query object.

=cut

sub init_das
{
    my($self, $url, $dsn) = @_;

    my $das_data_dir = "$FIG_Config::global/DAS";

    if (-d $das_data_dir)
    {
	return new SeedDas($self,$das_data_dir, $url, $dsn);
    }
    else
    {
	return undef;
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3