[Bio] / FigKernelPackages / FIG.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FIG.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Thu Dec 4 21:49:02 2003 UTC (16 years ago) by overbeek
Branch: MAIN
Changes since 1.3: +6 -2 lines
fix bug in function translation

package FIG;

use DBrtns;
use Sim;
use Blast;
use FIG_Config;

use FileHandle;

use Carp;
use Data::Dumper;

use strict;
use Fcntl qw/:flock/;  # import LOCK_* constants

sub new {
    my($class) = @_;

    my $rdbH = new DBrtns;
    bless {
    _dbf  => $rdbH,
	}, $class;
}

sub DESTROY {
    my($self) = @_;
    my($rdbH);

    if ($rdbH = $self->db_handle)
    {
	$rdbH->DESTROY;
    }
}

sub add_genome {
    my($self,$genomeF) = @_;

    my $rc = 0;
    if ($genomeF =~ /((.*\/)?(\d+\.\d+))$/)
    {
	my $genome = $3;
	my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`;
	if (@errors == 0)
	{
	    &run("cp -r $genomeF $FIG_Config::organisms");
	    chmod 0777, "$FIG_Config::organisms/$genomeF";
	    &run("load_features $genome");
	    &run("index_contigs $genome");
	    $rc = 1;
	    if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta")
	    {
		&run("index_translations $genome");
		my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`;
		chop @tmp;
#		&run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr");
#		&make_similarities(\@tmp);
	    }
	    if ((-s "$FIG_Config::organisms/$genome/assigned_functions") ||
		(-d "$FIG_Config::organisms/$genome/UserModels"))
	    {
		&run("add_assertions_of_function $genome");
	    }
	}
    }
    return $rc;
}

sub make_similarities {
    my($fids) = @_;
    my $fid;

    open(TMP,">>$FIG_Config::global/queued_similarities")
	|| die "could not open $FIG_Config::global/queued_similarities";
    foreach $fid (@$fids)
    {
	print TMP "$fid\n";
    }
    close(TMP);
}

sub set_urls {

    my $name = defined($FIG_Config::hostname) ? $FIG_Config::hostname : `hostname`;
    if ($name)
    {
	chop $name;
	my @tmp = `cat $FIG_Config::fig/Packages/FIG_Config.pm`;
	open(TMP,">$FIG_Config::fig/Packages/FIG_Config.pm") || die "could not open $FIG_Config::fig/Packages/FIG_Config.pm";
	my $line;
	foreach $line (@tmp)
	{
	    $line =~ s/((cgi|temp)_url.*\/\/)[^\/]+\/FIG/$1$name\/FIG/;
	    print TMP $line;
	}
	close(TMP);
    }
}

sub cgi_url {
    return &plug_url($FIG_Config::cgi_url);
}
    
sub temp_url {
    return &plug_url($FIG_Config::temp_url);
}
    
sub plug_url {
    my($url) = @_;

    my $name = defined($FIG_Config::hostname) ? $FIG_Config::hostname : `hostname`;
    if ($name && ($url =~ /^http:\/\/[^\/]+(.*)/))
    {
	$url = "http://$name$1";
    }
    return $url;
}

=pod

=head1 hiding/caching in a FIG object

We save the DB handle, cache taxonomies, and put a few other odds and ends in the
FIG object.  We expect users to invoke these services using the object $fig constructed
using:

    use FIG;
    my $fig = new FIG;

$fig is then used as the basic mechanism for accessing FIG services.  It is, of course,
just a hash that is used to retain/cache data.  The most commonly accessed item is the
DB filehandle, which is accessed via $self->db_handle.

We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated
between genomes, and a variety of other things.  I am not sure that using cached/2 was a 
good idea, but I did it.

=cut

sub db_handle {
    my($self) = @_;

    return $self->{_dbf};
}

sub cached {
    my($self,$what) = @_;

    my $x = $self->{$what};
    if (! $x)
    {
	$x = $self->{$what} = {};
    }
    return $x;
}

################ Basic Routines [ existed since WIT ] ##########################


=pod

=head1 min

usage: $n = &FIG::min(@x)

Assumes @x contains numeric values.  Returns the minimum of the values.

=cut

sub min {
    my(@x) = @_;
    my($min,$i);

    (@x > 0) || return undef;
    $min = $x[0];
    for ($i=1; ($i < @x); $i++)
    {
	$min = ($min > $x[$i]) ? $x[$i] : $min;
    }
    return $min;
}

=pod

=head1 max

usage: $n = &FIG::max(@x)

Assumes @x contains numeric values.  Returns the maximum of the values.

=cut

sub max {
    my(@x) = @_;
    my($max,$i);

    (@x > 0) || return undef;
    $max = $x[0];
    for ($i=1; ($i < @x); $i++)
    {
	$max = ($max < $x[$i]) ? $x[$i] : $max;
    }
    return $max;
}

=pod

=head1 between

usage: &FIG::between($x,$y,$z)

Returns true iff $y is between $x and $z.

=cut

sub between {
    my($x,$y,$z) = @_;

    if ($x < $z)
    {
	return (($x <= $y) && ($y <= $z));
    }
    else
    {
	return (($x >= $y) && ($y >= $z));
    }
}

=pod

=head1 standard_genetic_code

usage: $code = &FIG::standard_genetic_code()

Routines like "translate" can take a "genetic code" as an argument.  I implemented such
codes using hashes that assumed uppercase DNA triplets as keys.

=cut

sub standard_genetic_code {
    
    my $code = {};

    $code->{"AAA"} = "K";
    $code->{"AAC"} = "N";
    $code->{"AAG"} = "K";
    $code->{"AAT"} = "N";
    $code->{"ACA"} = "T";
    $code->{"ACC"} = "T";
    $code->{"ACG"} = "T";
    $code->{"ACT"} = "T";
    $code->{"AGA"} = "R";
    $code->{"AGC"} = "S";
    $code->{"AGG"} = "R";
    $code->{"AGT"} = "S";
    $code->{"ATA"} = "I";
    $code->{"ATC"} = "I";
    $code->{"ATG"} = "M";
    $code->{"ATT"} = "I";
    $code->{"CAA"} = "Q";
    $code->{"CAC"} = "H";
    $code->{"CAG"} = "Q";
    $code->{"CAT"} = "H";
    $code->{"CCA"} = "P";
    $code->{"CCC"} = "P";
    $code->{"CCG"} = "P";
    $code->{"CCT"} = "P";
    $code->{"CGA"} = "R";
    $code->{"CGC"} = "R";
    $code->{"CGG"} = "R";
    $code->{"CGT"} = "R";
    $code->{"CTA"} = "L";
    $code->{"CTC"} = "L";
    $code->{"CTG"} = "L";
    $code->{"CTT"} = "L";
    $code->{"GAA"} = "E";
    $code->{"GAC"} = "D";
    $code->{"GAG"} = "E";
    $code->{"GAT"} = "D";
    $code->{"GCA"} = "A";
    $code->{"GCC"} = "A";
    $code->{"GCG"} = "A";
    $code->{"GCT"} = "A";
    $code->{"GGA"} = "G";
    $code->{"GGC"} = "G";
    $code->{"GGG"} = "G";
    $code->{"GGT"} = "G";
    $code->{"GTA"} = "V";
    $code->{"GTC"} = "V";
    $code->{"GTG"} = "V";
    $code->{"GTT"} = "V";
    $code->{"TAA"} = "*";
    $code->{"TAC"} = "Y";
    $code->{"TAG"} = "*";
    $code->{"TAT"} = "Y";
    $code->{"TCA"} = "S";
    $code->{"TCC"} = "S";
    $code->{"TCG"} = "S";
    $code->{"TCT"} = "S";
    $code->{"TGA"} = "*";
    $code->{"TGC"} = "C";
    $code->{"TGG"} = "W";
    $code->{"TGT"} = "C";
    $code->{"TTA"} = "L";
    $code->{"TTC"} = "F";
    $code->{"TTG"} = "L";
    $code->{"TTT"} = "F";
    
    return $code;
}

=pod

=head1 translate

usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start);

If $code is undefined, I use the standard genetic code.  If $fix_start is true, I
will translate initial TTG or GTG to 'M'.

=cut

sub translate {
    my( $dna,$code,$start) = @_;
    my( $i,$j,$ln );
    my( $x,$y );
    my( $prot );

    if (! defined($code))
    {
	$code = &FIG::standard_genetic_code;
    }
    $ln = length($dna);
    $prot = "X" x ($ln/3);
    $dna =~ tr/a-z/A-Z/;

    for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++)
    {
	$x = substr($dna,$i,3);
        if ($y = $code->{$x})
        {
	    substr($prot,$j,1) = $y;
        }
    }
    
    if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/))
    {
	substr($prot,0,1) = 'M';
    }
    return $prot;
}

=pod

=head1 reverse_comp and rev_comp

usage: $dnaR = &FIG::reverse_comp($dna) or
       $dnaRP = &FIG::rev_comp($seqP)

In WIT, we implemented reverse complement passing a pointer to a sequence and returning
a pointer to a sequence.  In most cases the pointers are a pain (although in a few they
are just what is needed).  Hence, I kept both versions of the function to allow you
to use whichever you like.  Use rev_comp only for long strings where passing pointers is a
reasonable effeciency issue.

=cut

sub reverse_comp {
    my($seq) = @_;

    return ${&rev_comp(\$seq)};
}

sub rev_comp {
    my( $seqP ) = @_;
    my( $rev  );

    $rev =  reverse( $$seqP );
    $rev =~ tr/a-z/A-Z/;
    $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
    return \$rev;
}

=pod

=head1 verify_dir

usage: &FIG::verify_dir($dir)

Makes sure that $dir exists.  If it has to create it, it sets permissions to 0777.

=cut

sub verify_dir {
    my($dir) = @_;

    if (-d $dir) { return }
    if ($dir =~ /^(.*)\/[^\/]+$/)
    {
	&verify_dir($1);
    }
    mkdir($dir,0777) || die "could not make $dir";
    chmod 0777,$dir;
}

=pod

=head1 run

usage: &FIG::run($cmd)

Runs $cmd and fails (with trace) if the command fails.

=cut

sub run {
    my($cmd) = @_;

#   my @tmp = `date`; chop @tmp; print STDERR "$tmp[0]: running $cmd\n";
    (system($cmd) == 0) || confess "FAILED: $cmd";
}

=pod

=head1 display_id_and_seq

usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh)

This command has always been used to put out fasta sequences.  Note that it
takes a pointer to the sequence.  $fh is optional and defalts to STDOUT.

=cut

sub display_id_and_seq {
    my( $id, $seq, $fh ) = @_;
    
    if (! defined($fh) )  { $fh = \*STDOUT; }
    
    print $fh ">$id\n";
    &display_seq($seq, $fh);
}

sub display_seq {
    my ( $seq, $fh ) = @_;
    my ( $i, $n, $ln );
    
    if (! defined($fh) )  { $fh = \*STDOUT; }

    $n = length($$seq);
#   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
    for ($i=0; ($i < $n); $i += 60)
    {
	if (($i + 60) <= $n)
	{
	    $ln = substr($$seq,$i,60);
	}
	else
	{
	    $ln = substr($$seq,$i,($n-$i));
	}
	print $fh "$ln\n";
    }
}

##########  I commented the pods on the following routines out, since they should not
##########  be part of the SOAP/WSTL interface
#=pod
#
#=head1 file2N
#
#usage: $n = $fig->file2N($file)
#
#In some of the databases I need to store filenames, which can waste a lot of
#space.  Hence, I maintain a database for converting filenames to/from integers.
#
#=cut
#
sub file2N {
    my($self,$file) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table "))  && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0]))
    {
	my $fileno = $relational_db_response->[0]->[0] + 1;
	if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )"))
	{
	    return $fileno;
	}
    }
    elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )"))
    {
	return 1;
    }
    return undef;
}

#=pod
#
#=head1 N2file
#
#usage: $filename = $fig->N2file($n)
#
#In some of the databases I need to store filenames, which can waste a lot of
#space.  Hence, I maintain a database for converting filenames to/from integers.
#
#=cut
#
sub N2file {
    my($self,$fileno) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return undef;
}


#=pod
#
#=head1 openF
#
#usage: $fig->openF($filename)
#
#Parts of the system rely on accessing numerous different files.  The most obvious case is
#the situation with similarities.  It is important that the system be able to run in cases in
#which an arbitrary number of files cannot be open simultaneously.  This routine (with closeF) is
#a hack to handle this.  I should probably just pitch them and insist that the OS handle several
#hundred open filehandles.
#
#=cut
#
sub openF {
    my($self,$file) = @_;
    my($fxs,$x,@fxs,$fh);

    $fxs = $self->cached('_openF');
    if ($x = $fxs->{$file})
    {
	$x->[1] = time();
	return $x->[0];
    }
    
    @fxs = keys(%$fxs);
    if (defined($fh = new FileHandle "<$file"))
    {
	if (@fxs >= 200)
	{
	    @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs;
	    $x = $fxs->{$fxs[0]};
	    undef $x->[0];
	    delete $fxs->{$fxs[0]};
	}
	$fxs->{$file} = [$fh,time()];
	return $fh;
    }
    return undef;
}

#=pod
#
#=head1 closeF
#
#usage: $fig->closeF($filename)
#
#Parts of the system rely on accessing numerous different files.  The most obvious case is
#the situation with similarities.  It is important that the system be able to run in cases in
#which an arbitrary number of files cannot be open simultaneously.  This routine (with openF) is
#a hack to handle this.  I should probably just pitch them and insist that the OS handle several
#hundred open filehandles.
#
#=cut
#
sub closeF {
    my($self,$file) = @_;
    my($fxs,$x);

    if (($fxs = $self->{_openF}) && 
	($x = $fxs->{$file}))
    {
	undef $x->[0];
	delete $fxs->{$file};
    }
}

=pod

=head1 ec_name

usage: $enzymatic_function = $fig->ec_name($ec)

Returns enzymatic name for EC.

=cut

sub ec_name {
    my($self,$ec) = @_;

    ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return "";
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )");

    return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : "";
    return "";
}

=pod

=head1 all_roles

usage: @roles = $fig->all_roles

Supposed to return all known roles.  For now, we ghet all ECs with "names".

=cut

sub all_roles {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names");

    return @$relational_db_response;
}

=pod

=head1 expand_ec

usage: $expanded_ec = $fig->expand_ec($ec)

Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that.

=cut

sub expand_ec {
    my($self,$ec) = @_;
    my($name);

    return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec;
}


=pod

=head1 clean_tmp

usage: &FIG::clean_tmp

We store temporary files in $FIG_Config::temp.  There are specific classes of files
that are created and should be saved for at least a few days.  This routine can be
invoked to clean out those that are over two days old.

=cut

sub clean_tmp {

    my($file);
    if (opendir(TMP,"$FIG_Config::temp"))
    {
#       change the pattern to pick up other files that need to be cleaned up
	my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP);
	foreach $file (@temp)
	{
	    if (-M "$FIG_Config::temp/$file" > 2)
	    {
		unlink("$FIG_Config::temp/$file");
	    }
	}
    }
}

################ Routines to process genomes and genome IDs  ##########################


=pod

=head1 genomes

usage: @genome_ids = $fig->genomes;

Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by
NCBI for the species (not the specific strain), and Y is a sequence digit assigned to
this particular genome (as one of a set with the same genus/species).  Genomes also
have versions, but that is a separate issue.

=cut

sub genomes {
    opendir(GENOMES,"$FIG_Config::fig/Data/Organisms")
	|| die "could not open $FIG_Config::fig/Data/Organisms";
    my @genomes = sort { $a <=> $b } grep { $_ =~ /^\d/ } readdir(GENOMES);
    return @genomes;
}

sub genome_counts {
    my($x);
    my($a,$b,$e,$v) = (0,0,0,0);

    if (open(TMP,"cat $FIG_Config::organisms/*/TAXONOMY |"))
    {
	while (defined($x = <TMP>))
	{
	    if     ($x =~ /^Eukaryota/)          { $e++ }
	    elsif  ($x =~ /^Bacteria/)           { $b++ }
	    elsif  ($x =~ /^Archaea/)            { $a++ }
	    elsif  ($x =~ /^Vir/)                { $v++ }
	}
	close(TMP);
    }
    return ($a,$b,$e,$v);
}

=pod

=head1 genome_version

usage: $version = $fig->genome_version($genome_id);

Versions are incremented for major updates.  They are put in as major
updates of the form 1.0, 2.0, ...

Users may do local "editing" of the DNA for a genome, but when they do,
they increment the digits to the right of the decimal.  Two genomes remain
comparable only if the versions match identically.  Hence, minor updating should be 
committed only by the person/group responsible for updating that genome.

We can, of course, identify which genes are identical between any two genomes (by matching
the DNA or amino acid sequences).  However, the basic intent of the system is to
support editing by the main group issuing periodic major updates.

=cut

sub genome_version {
    my($self,$genome) = @_;

    my(@tmp);
    if ((-s "$FIG_Config::organisms/$genome/VERSION") &&
	(@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) &&
	($tmp[0] =~ /^(\d+(\.\d+)?)$/))
    {
	return $1;
    }
    return undef;
}

=pod

=head1 genus_species

usage: $gs = $fig->genus_species($genome_id)

Returns the genus and species (and strain if that has been properly recorded) 
in a printable form.

=cut

sub genus_species {
    my ($self,$genome) = @_;

    my $ans;
    my $genus_species = $self->cached('_genus_species');
    if (! ($ans = $genus_species->{$genome}))
    {
	if (open(TMP,"<$FIG_Config::organisms/$genome/GENOME"))
	{
	    $ans = <TMP>;
	    chop $ans;
	    close(TMP);
	    $genus_species->{$genome} = $ans;
	}
    }
    return $ans;
}

=pod

=head1 taxonomy_of

usage: $gs = $fig->taxonomy_of($genome_id)

Returns the taxonomy of the specified genome.  Gives the taxonomy down to
genus and species.

=cut

sub taxonomy_of {
    my($self,$genome) = @_;
    my($tax);
    my $taxonomy = $self->cached('_taxonomy');

    if (! $taxonomy->{$genome})
    {
	foreach $genome ($self->genomes)
	{
	    if (open(TMP,"<$FIG_Config::organisms/$genome/TAXONOMY"))
	    {
		$tax = <TMP>;
		chop $tax;
		$self->{_taxonomy}->{$genome} = $tax;
		close(TMP);
	    }
	}
	$taxonomy = $self->{_taxonomy};
    }
    return $taxonomy->{$genome};
}

=pod

=head1 is_bacterial

usage: $fig->is_bacterial($genome)

Returns true iff the genome is bacterial.

=cut

sub is_bacterial {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^Bacteria/);
}


=pod

=head1 is_archaeal

usage: $fig->is_archaeal($genome)

Returns true iff the genome is archaeal.

=cut

sub is_archaeal {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^Archaea/);
}


=pod

=head1 is_prokaryotic

usage: $fig->is_prokaryotic($genome)

Returns true iff the genome is prokaryotic

=cut

sub is_prokaryotic {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/);
}


=pod

=head1 is_eukaryotic

usage: $fig->is_eukaryotic($genome)

Returns true iff the genome is eukaryotic

=cut

sub is_eukaryotic {
    my($self,$genome) = @_;

    return ($self->taxonomy_of($genome) =~ /^Eukarota/);
}

=pod

=head1 sort_genomes_by_taxonomy

usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes)

This routine is used to sort a list of genome IDs to put them
into taxonomic order.

=cut

sub sort_genomes_by_taxonomy {
    my($self,@fids) = @_;

    return map     { $_->[0] }
           sort    { $a->[1] cmp $b->[1] }
           map     { [$_,$self->taxonomy_of($_)] }
           @fids;
}

=pod

=head1 crude_estimate_of_distance

usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2)

There are a number of places where we need estimates of the distance between
two genomes.  This routine will return a value between 0 and 1, where a value of 0 
means "the genomes are essentially identical" and a value of 1 means
"the genomes are in different major groupings" (the groupings are archaea, bacteria, 
euks, and viruses).  The measure is extremely crude.

=cut

sub crude_estimate_of_distance {
    my($self,$genome1,$genome2) = @_;
    my($i,$v,$d,$dist);

    if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) }
    $dist = $self->cached('_dist');
    if (! $dist->{"$genome1,$genome2"})
    {
	my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1));
	my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2));

	$d = 1;
	for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2)
	{
	    $d -= $v;
	}
	$dist->{"$genome1,$genome2"} = $d;
    }
    return $dist->{"$genome1,$genome2"};
}

=pod

=head1 org_of

usage: $org = $fig->org_of($prot_id)

In the case of external proteins, we can usually determine an organism, but not
anything more precise than genus/species (and often not that).  This routine takes 
a protein ID (which may be a feature ID) and returns "the organism".

=cut

sub org_of {
    my($self,$prot_id) = @_;
    my $relational_db_response;
    my $rdbH = $self->db_handle;

    if ($prot_id =~ /^fig\|/)
    {
	return $self->genus_species($self->genome_of($prot_id));
    }

    if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) &&
	(@$relational_db_response >= 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 abbrev

usage: $abbreviated_name = $fig->abbrev($genome_name)

For alignments and such, it is very useful to be able to produce an abbreviation of genus/species.
That's what this does.  Note that multiple genus/species might reduce to the same abbreviation, so
be careful (disambiguate them, if you must).

=cut

sub abbrev {
    my($genome_name) = @_;

    $genome_name =~ s/^(\S{3})\S+/$1./;
    $genome_name =~ s/^(\S+\s+\S{4})\S+/$1./;
    if (length($genome_name) > 13)
    {
	$genome_name = substr($genome_name,0,13);
    }
    return $genome_name;
}

################ Routines to process Features and Feature IDs  ##########################

=pod

=head1 ftype

usage: $type = &FIG::ftype($fid)

Returns the type of a feature, given the feature ID.  This just amounts
to lifting it out of the feature ID, since features have IDs of tghe form

    fig|x.y.f.n

where
        x.y is the genome ID
        f   is the type pf feature
        n   is an integer that is unique within the genome/type

=cut

sub ftype {
    my($feature_id) = @_;

    if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/)
    {
	return $1;
    }
    return undef;
}

=pod

=head1 genome_of

usage: $genome_id = $fig->genome_of($fid)

This just extracts the genome ID from a feature ID.

=cut


sub genome_of {
    my $prot_id = (@_ == 1) ? $_[0] : $_[1];    

    if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; }
    return undef;
}

=pod

=head1 by_fig_id

usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids

This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works.

=cut

sub by_fig_id {
    my($a,$b) = @_;
    my($g1,$g2,$t1,$t2,$n1,$n2);
    if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
	($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3)))
    {
	($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
    }
    else
    {
	$a cmp $b;
    }
}

=pod

=head1 sort_fids_by_taxonomy

usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids)

Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features.

=cut

sub sort_fids_by_taxonomy {
    my($self,@fids) = @_;

    return map     { $_->[0] }
           sort    { $a->[1] cmp $b->[1] }
           map     { [$_,$self->taxonomy_of(&genome_of($_))] }
           @fids;
}

=pod

=head1 genes_in_region

usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end)

It is often important to be able to find the genes that occur in a specific region on
a chromosome.  This routine is designed to provide this information.  It returns all genes
that overlap the region ($genome,$contig,$beg,$end).  $beg1 is set to the minimum coordinate of
the returned genes (which may be before the given region), and $end1 the maximum coordinate.

The routine assumes that genes are not more than 10000 bases long, which is certainly not true
in eukaryotes.  Hence, in euks you may well miss genes that overlap the boundaries of the specified
region (sorry).

=cut


sub genes_in_region {
    my($self,$genome,$contig,$beg,$end) = @_;
    my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u);

    my $pad = 10000;
    my $rdbH = $self->db_handle;

    my $minV = $beg - $pad;  
    my $maxV = $end + $pad;
    if (($relational_db_response = $rdbH->SQL("SELECT id FROM features 
                                         WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND (maxloc < $maxV) AND
                                               ( genome = \'$genome\' )  AND ( contig = \'$contig\' );")) &&
	 (@$relational_db_response >= 1))
    {
	@tmp =  sort { ($a->[1] cmp $b->[1]) or 
		       ($a->[2] <=> $b->[2]) or 
                       ($a->[3] <=> $b->[3]) 
                     }
	        map  { $feature_id = $_->[0]; 
                       $x = $self->feature_location($feature_id); 
                       $x ? [$feature_id,&boundaries_of($x)] : () 
                     } @$relational_db_response;


	($l,$u) = (10000000000,0);
	foreach $x (@tmp)
	{
	    ($feature_id,undef,$b1,$e1) = @$x;
	    if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1)))
	    {
		push(@feat,$feature_id);
		$l = &min($l,&min($b1,$e1));
		$u = &max($u,&max($b1,$e1));
	    }
	}
	(@feat <= 0) || return ([@feat],$l,$u);
    }
    return ([],$l,$u);
}

sub close_genes {
    my($self,$fid,$dist) = @_;

    my $loc = $self->feature_location($fid);
    if ($loc)
    {
	my($contig,$beg,$end) = &FIG::boundaries_of($loc);
	if ($contig && $beg && $end)
	{
	    my $min = &min($beg,$end) - $dist;
	    my $max = &max($beg,$end) + $dist;
	    my $feat;
	    ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max);
	    return @$feat;
	}
    }
    return ();
}


=pod

=head1 feature_location

usage: $loc = $fig->feature_location($fid)   OR
       @loc = $fig->feature_location($fid)

The location of a feature in a scalar context is

    contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each exon]

In a list context it is

    (contig_b1_e1,contig_b2_e2,...)

=cut

sub feature_location {
    my($self,$feature_id) = @_;
    my($relational_db_response,$locations,$location);

    $locations = $self->cached('_location');
    if (! ($location = $locations->{$feature_id}))
    {
	my $rdbH = $self->db_handle;
	if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) &&
	    (@$relational_db_response == 1))
	{
            $locations->{$feature_id} = $location = $relational_db_response->[0]->[0];
	}
    }

    if ($location)
    {
	return wantarray() ? split(/,/,$location) : $location;
    }
    return undef;
}

=pod

=head1 boundaries_of

usage: ($contig,$beg,$end) = $fig->boundaries_of($loc)

The location of a feature in a scalar context is

    contig_b1_e1,contig_b2_e2,...   [one contig_b_e for each exon]

This routine takes as input such a location and reduces it to a single
description of the entire region containing the gene.

=cut

sub boundaries_of {
    my($location) = (@_ == 1) ? $_[0] : $_[1];
    my($contigQ);

    if (defined($location))
    {
	my @exons = split(/,/,$location);
	my($contig,$beg,$end);
	if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) && 
	    (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) &&
	    ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) && 
	    ($end = $1))
	{
	    return ($contig,$beg,$end);
	}
    }
    return undef;
}


=pod

=head1 all_features

usage: $fig->all_features($genome,$type)

Returns a list of all feature IDs of a specified type in the designated genome.  You would
usually use just 

    $fig->pegs_of($genome) or 
    $fig->rnas_of($genome)

which simply invoke this routine.

=cut

sub all_features {
    my($self,$genome,$type) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE  (genome = \'$genome\' AND (type = \'$type\'))");

    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}


=pod

=head1 all_pegs_of

usage: $fig->all_pegs_of($genome)

Returns a list of all PEGs in the specified genome.  Note that order is not
specified.

=cut

sub pegs_of {
    my($self,$genome) = @_;
    
    return $self->all_features($genome,"peg");
}


=pod

=head1 all_rnas_of

usage: $fig->all_rnas($genome)

Returns a list of all RNAs for the given genome.

=cut

sub rnas_of {
    my($self,$genome) = @_;
    
    return $self->all_features($genome,"rna");
}

=pod

=head1 feature_aliases

usage: @aliases = $fig->feature_aliases($fid)  OR
       $aliases = $fig->feature_aliases($fid) 

Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature.
These must come from the tbl files, so add them there if you want to see them here.

In a scalar context, the aliases come back with commas separating them.

=cut

sub feature_aliases {
    my($self,$feature_id) = @_;
    my($rdbH,$relational_db_response,$aliases);

    $rdbH = $self->db_handle;
    if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) &&
	    (@$relational_db_response == 1))
    {
	$aliases = $relational_db_response->[0]->[0];
    }
    return $aliases ? (wantarray ? split(/,/,$aliases) : $aliases) : undef;
}

=pod

=head1 possibly_truncated

usage: $fig->possibly_truncated($fid)

Returns true iff the feature occurs near the end of a contig.

=cut

sub possibly_truncated {
    my($self,$feature_id) = @_;
    my($loc);

    if ($loc = $self->feature_location($feature_id))
    {
	my $genome = &genome_of($feature_id);
	my ($contig,$beg,$end) = &boundaries_of($loc);
	if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end)))
	{
	    return 0;
	}
    }
    return 1;
}

sub near_end {
    my($self,$genome,$contig,$x) = @_;

    return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300)));
}

################ Routines to process functional coupling for PEGs  ##########################

=pod

=head1 coupling_and_evidence

usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record)

A computation of couplings and evidence starts with a given peg and produces a list of
3-tuples.  Each 3-tuple is of the form

    [Score,CoupledToFID,Evidence]

Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing
a "pair of close homologs" of [$peg,CoupledToFID]).  The maximum score for a single
PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs.

If $keep_record is true, the system records the information, asserting coupling for each 
of the pairs in the set of evidence, and asserting a pin from the given $fd through all
of the PCH entries used in forming the score.

=cut

sub coupling_and_evidence {
    my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_;
    my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1);

    if ($feature_id =~ /^fig\|(\d+\.\d+)/)
    {
	$genome1 = $1;
    }

    my($contig,$beg,$end) = &FIG::boundaries_of($self->feature_location($feature_id));
    if (! $contig) { return () }

    ($neighbors,undef,undef) = $self->genes_in_region(&genome_of($feature_id),
						      $contig,
						      &min($beg,$end) - $bound, 
						      &max($beg,$end) + $bound);
    if (@$neighbors == 0) { return () }
    $similar1 = $self->acceptably_close($feature_id,$sim_cutoff);
    @hits = ();

    foreach $neigh (grep { $_ =~ /peg/ } @$neighbors)
    {
	next if ($neigh eq $feature_id);
	$similar2 = $self->acceptably_close($neigh,$sim_cutoff);
	($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound);
	if ($sc >= $coupling_cutoff)
	{
	    push(@hits,[$sc,$neigh,$ev]);
	}
    }
    if ($keep_record)
    {
	$self->add_chr_clusters_and_pins($feature_id,\@hits);
    }
    return sort { $b->[0] <=> $a->[0] } @hits;
}


=pod

=head1 add_chr_clusters_and_pins

usage: $fig->add_chr_clusters_and_pins($peg,$hits)

The system supports retaining data relating to functional coupling.  If a user
computes evidence once and then saves it with this routine, data relating to
both "the pin" and the "clusters" (in all of the organisms supporting the
functional coupling) will be saved.

$hits must be a pointer to a list of 3-tuples of the sort returned by
$fig->coupling_and_evidence.

=cut

sub add_chr_clusters_and_pins {
    my($self,$peg,$hits) = @_;
    my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection);
    my($genome,$cluster,$pin,$peg2);

    if (@$hits > 0)
    {
	@clusters = ();
	@pins = ();
	push(@clusters,[$peg,map { $_->[1] } @$hits]);
	foreach $x (@$hits)
	{
	    ($sc,$neigh,$pairs) = @$x;
	    push(@pins,[$neigh,map { $_->[1] } @$pairs]);
	    foreach $y (@$pairs)
	    {
		$peg2 = $y->[0];
		if ($peg2 =~ /^fig\|(\d+\.\d+)/)
		{
		    $projection{$1}->{$peg2} = 1;
		}
	    }
	}
	@corr = ();
	@orgs = keys(%projection);
	if (@orgs > 0)
	{
	    foreach $genome (sort { $a <=> $b } @orgs)
	    {
		push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}}));
	    }
	    push(@pins,[$peg,@corr]);
	}

	foreach $cluster (@clusters)
	{
	    $self->add_chromosomal_cluster($cluster);
	}

	foreach $pin (@pins)
	{
	    $self->add_pch_pin($pin);
	}
    }
}

sub coupling_ev {
    my($self,$genome1,$sim1,$sim2,$bound) = @_;
    my($ev,$sc,$i,$j);

    $ev = [];
    $sc = 0;

    $i = 0;
    $j = 0;
    while (($i < @$sim1) && ($j < @$sim2))
    {
	if ($sim1->[$i]->[0] < $sim2->[$j]->[0])
	{
	    $i++;
	}
	elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0])
	{
	    $j++;
	}
	else
	{
	    $sc += $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev);
	    $i++;
	    $j++;
	}
    }
    return ($sc,$ev);
}

sub accumulate_ev {
    my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_;
    my($genome2,@locs1,@locs2,$i,$j,$sc,$x);

    if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 }

    $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/;
    $genome2 = $1;
    $sc = 0;
    @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1;
    @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2;

    for ($i=0; ($i < @$feature_ids1); $i++)
    {
	for ($j=0; ($j < @$feature_ids2); $j++)
	{
	    if (($feature_ids1->[$i] ne $feature_ids2->[$j]) && 
		&close_enough($locs1[$i],$locs2[$j],$bound))
	    {
		$sc += $self->crude_estimate_of_distance($genome1,$genome2);
		push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]);
	    }
	}
    }
    return $sc;
}

sub close_enough {
    my($locs1,$locs2,$bound) = @_;

#   print STDERR &Dumper(["close enough",$locs1,$locs2]);
    return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound));
}

sub acceptably_close {
    my($self,$feature_id,$sim_cutoff) = @_;
    my(%by_org,$id2,$genome,$sim);

    my($ans) = [];

    foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig",0))
    {
	$id2 = $sim->id2;
	if ($id2 =~ /^fig\|(\d+\.\d+)/)
	{
	    my $genome = $1;
	    if ($self->taxonomy_of($genome) !~ /^Euk/)
	    {
		push(@{$by_org{$genome}},$id2);
	    }
	}
    }
    foreach $genome (sort { $a <=> $b } keys(%by_org))
    {
	push(@$ans,[$genome,$by_org{$genome}]);
    }
    return $ans;
}

################ Translations of PEGsand External Protein Sequences  ##########################


=pod

=head1 translatable

usage: $fig->translatable($prot_id)

The system takes any number of sources of protein sequences as input (and builds an nr
for the purpose of computing similarities).  For each of these input fasta files, it saves
(in the DB) a filename, seek address and length so that it can go get the translation if
needed.  This routine simply returns true iff info on the translation exists.

=cut


sub translatable {
    my($self,$prot) = @_;

    return &translation_length($self,$prot) ? 1 : 0;
}
    

=pod

=head1 translation_length

usage: $len = $fig->translation_length($prot_id)

The system takes any number of sources of protein sequences as input (and builds an nr
for the purpose of computing similarities).  For each of these input fasta files, it saves
(in the DB) a filename, seek address and length so that it can go get the translation if
needed.  This routine returns the length of a translation.  This does not require actually
retrieving the translation.

=cut

sub translation_length {
    my($self,$prot) = @_;

    $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT slen FROM protein_sequence_seeks 
                                             WHERE  id = \'$prot\' ");

    return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : undef;
}


=pod

=head1 get_translation

usage: $translation = $fig->get_translation($prot_id)

The system takes any number of sources of protein sequences as input (and builds an nr
for the purpose of computing similarities).  For each of these input fasta files, it saves
(in the DB) a filename, seek address and length so that it can go get the translation if
needed.  This routine returns a protein sequence.

=cut

sub get_translation {
    my($self,$id) = @_;
    my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran);

    $rdbH = $self->db_handle;
    $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/;

    $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE  id = \'$id\' ");

    if ($relational_db_response && @$relational_db_response == 1)
    {
	($fileN,$seek,$ln) = @{$relational_db_response->[0]};
	if (($fh = $self->openF($self->N2file($fileN))) &&
	     ($ln > 10))
	{
	    seek($fh,$seek,0);
	    read($fh,$tran,$ln-1);
	    $tran =~ s/\s//g;
	    return $tran;
	}
    }
    return '';
}

=pod

=head1 mapped_prot_ids

usage: @mapped = $fig->mapped_prot_ids($prot)

This routine is at the heart of maintaining synonyms for protein sequences.  The system
determines which protein sequences are "essentially the same".  These may differ in length
(presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended).
Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted
by length.  

=cut

sub mapped_prot_ids {
    my($self,$id) = @_;
    
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE  syn_id = \'$id\' ");
    if ($relational_db_response && (@$relational_db_response == 1))
    {
	$id = $relational_db_response->[0]->[0];
    }

    $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' ");
    if ($relational_db_response && (@$relational_db_response > 0))
    {
	return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response);
    }
    else
    {
	return ([$id,$self->translation_length($id)]);
    }
}

################ Assignments of Function to PEGs  ##########################

=pod

=head1 function_of

usage: @functions = $fig->function_of($peg)  OR
       $function  = $fig->function_of($peg,$user)
    
In a list context, you get back a list of 2-tuples.  Each 2-tuple is of the
form [MadeBy,Function].

In a scalar context,

    1. user is "master" if not specified
    2. function returned is the user's, if one exists; otherwise, master's, if one exists

In a scalar context, you get just the function.

=cut

# Note that we do not return confidence.  I propose a separate function to get both
# function and confidence
#
sub function_of {
    my($self,$id,$user) = @_;
    my($relational_db_response,@tmp,$entry,$i);
    my $wantarray = wantarray();
    my $rdbH = $self->db_handle;

    if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user))
    {
	if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) &&
	    (@$relational_db_response >= 1))
	{
	    @tmp = sort { $a->[0] cmp $b->[0] } map { [$_->[0],$_->[1]] } @$relational_db_response;
	    for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {}
	    if ($i < @tmp)
	    {
		$entry = splice(@tmp,$i,1);
		unshift @tmp, ($entry);
	    }

	    my $val;
	    if     ($wantarray)                                         { return @tmp }
	    elsif  ($user && ($val  = &extract_by_who(\@tmp,$user)))    { return $val }
            elsif  ($user && ($val  = &extract_by_who(\@tmp,"master"))) { return $val }
            else                                                        { return ""   }
	}
    }
    else
    {
	if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) &&
	    (@$relational_db_response >= 1))
	{
	    return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0];
	}
    }
    
    return $wantarray ? () : "";
}

=pod

=head1 translated_function_of

usage: $function  = $fig->translated_function_of($peg,$user)
    
You get just the translated function.

=cut

sub translated_function_of {
    my($self,$id,$user) = @_;

    my $func = $self->function_of($id,$user);
    if ($func)
    {
	$func = $self->translate_function($func);
    }
    return $func;
}
    

sub extract_by_who {
    my($xL,$who) = @_;
    my($i);

    for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {}
    return ($i < @$xL) ? $xL->[$i]->[1] : "";
}


=pod

=head1 translate_function

usage: $translated_func = $fig->translate_function($func)

Translates a function based on the function.synonyms table.    

=cut

sub translate_function {
    my($self,$function) = @_;

    my ($tran,$from,$to,$line);
    if (! ($tran = $self->{_function_translation}))
    {
	$tran = {};
	if (open(TMP,"<$FIG_Config::global/function.synonyms"))
	{
	    while (defined($line = <TMP>))
	    {
		chop $line;
		($from,$to) = split(/\t/,$line);
		$tran->{$from} = $to;
	    }
	    close(TMP);
	}
	$self->{_function_translation} = $tran;
    }

    while ($to = $tran->{$function})
    {
        $function = $to;
    }
    return $function;
}
    
=pod

=head1 assign_function

usage: $fig->assign_function($peg,$user,$function,$confidence)

Assigns a function.  Note that confidence can (and should be if unusual) included.
Note that no annotation is written.  This should normally be done in a separate
call of the form

    

=cut

sub assign_function {
    my($self,$peg,$user,$function,$confidence) = @_;
    my($role,$roleQ);

    my $rdbH = $self->db_handle;
    $confidence = $confidence ? $confidence : "";
    my $genome = $self->genome_of($peg);

    $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )");

    my $funcQ = quotemeta $function;
    $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )");
    $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )");

    foreach $role (&roles_of_function($function))
    {
	$roleQ = quotemeta $role;
	$rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\',  \'$genome\' )");
    }

    &verify_dir("$FIG_Config::organisms/$genome/UserModels");
    if ($user ne "master") 
    {
	&verify_dir("$FIG_Config::organisms/$genome/UserModels/$user");
    }

    if ((($user eq "master") && open(TMP,">>$FIG_Config::organisms/$genome/assigned_functions")) ||
	(($user ne "master") && open(TMP,">>$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions")))
    {
	flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions";
	seek(TMP,0,2)      || confess "failed to seek to the end of the file";
	print TMP "$peg\t$function\t$confidence\n";
	close(TMP);
	return 1;
    }
    return 0;
}

sub hypo {
    my $x = (@_ == 1) ? $_[0] : $_[1];

    return ((! $x) ||
	    ($x =~ /hypoth/i) || 
	    ($x =~ /,.*genes/i) ||
	    ($x =~ /identical/i) ||
	    ($x =~ /\bregion\b/i) ||
	    ($x =~ /\bcomplete cds\b/i) ||
	    ($x =~ /\breading frame\b/i) ||
	    ($x =~ /\bsimilar to hypo\b/i) ||
	    ($x =~ /cl\.41\b/i) ||
	    ($x =~ /HD-GYP domain/i) ||
	    ($x =~ /SI:bY1/i) ||
	    ($x =~ /defext in/i) ||
	    ($x =~ /^(expressed|conserved) protein$/i) ||
	    ($x =~ /gene \d/i) ||
	    ($x =~ /^[a-zA-Z]{2,4}\d{2,8}/) ||
	    ($x =~ /\d{3}.pep/i) ||
	    ($x =~ /\bFROM\b/i) ||
	    ($x =~ /\bA\.L/i) ||
	    ($x =~ /\bA\d\d/i) ||
	    ($x =~ /^C$/i) ||
	    ($x =~ /^\([A-Z]+\d+\)$/) || 
	    ($x =~ /dna fragment/i) || 
	    ($x =~ /Rv\d+[a-z](-like)?\b/i) || 
	    ($x =~ /\bORF_/i) || 
#	    ($x =~ /conserved protein\b/) || 
	    ($x =~ /^[XY]\d\S+/i) || 
	    ($x =~ /^[Yy][a-z]{2}[A-Z]/) || 
	    ($x =~ /^[Yy][A-Z]{3}\b/) || 
	    ($x =~ /weak similarity/i) || 
	    ($x =~ /similar to/i) || 
	    ($x =~ /gene product/i) || 
	    ($x =~ /ORF_/) || 
	    ($x =~ /NO SWISS-PROT/) || 
	    ($x =~ /predicted coding/i) || 
	    ($x =~ /predicted protein/i) || 
	    ($x =~ /predicted by/i) || 
	    ($x =~ /pct identical/i) || 
	    ($x =~ /\borf\d+/i) || 
	    ($x =~ /\bcosmid\d+\b/i) || 
	    ($x =~ /^[a-zA-Z0-9]+\d+[a-z]?$/i) || 
	    ($x =~ /^[a-zA-Z0-9]+[\.-]\d+[a-z]?$/i) || 
	    ($x =~ /^[a-zA-Z0-9]+[\.-]\d+[a-z]?\s+PROTEIN$/i) || 
	    ($x =~ /^cosmid\s+\S+$/i) || 
	    ($x =~ /^\([A-Z0-9]+\) [A-Z][a-z]{2}[a-zA-Z] \[\S+ \S+\]\s*$/) ||
	    ($x =~ /region orf/i) ||
	    ($x =~ /unnamed protein product/i) ||
	    ($x =~ /^[A-Z][0-9\.]{3,10}\S+ protein/i) ||
	    ($x =~ /HYDROPHOBIC PROTEIN/) ||
	    ($x =~ /\bORF\b/i) ||
	    ($x =~ /\b[a-zB-Z]\d{3,10}\b/i) ||
	    ($x =~ /protein similarity/) ||
	    ($x =~ /Uncharacterized/) ||
	    ($x =~ /UNIDENTIFIED/) ||
	    ($x =~ /belongs to the family/) ||
	    ($x =~ /predicted protein/) ||
	    ($x =~ /1-EVIDENCE=PREDICTED BY MATCH/) ||
	    ($x =~ /INTERGENIC REGION/) ||
	    ($x =~ /NO SWISS-PROT SIMILARITIES/) ||
	    ($x =~ /no known similarities/) ||
	    ($x =~ /alternate gene name/) ||
	    ($x =~ /alternate open reading frame/) ||
	    ($x =~ /similar to GenBank Accession Number/) ||
	    ($x =~ /family with/) ||
	    ($x =~ /No definition/) ||
	    ($x =~ /id:/i) ||
	    ($x =~ /cDNA/) ||
	    ($x =~ /SP:/) ||
	    ($x =~ /COMPLETE CDS/) ||
	    ($x =~ /GENE CLUSTER/) ||
	    ($x =~ /\dp,Lp/) ||
	    ($x =~ /3\' END/) ||
	    ($x =~ /START CODON/) ||
	    ($x =~ /_\S+_/) ||
	    ($x =~ /GTG START/i) ||
	    ($x =~ /TTG START/i) ||
	    ($x =~ /chain length determinant/i) ||
	    ($x =~ /f135/i) ||
	    ($x =~ /KDA PROTEIN/i) ||
	    ($x =~ /yole/i) ||
	    ($x =~ /\bMAP\b/) ||
	    ($x =~ /\(\d+-\d+\)/i) ||
	    ($x =~ /D9719.36p/i) ||
	    ($x =~ /THYMOCYTE PROTEIN CTHY28KD/i) ||
	    ($x =~ /PHAC1, PHAC2 AND PHAD GENES/i) ||
	    ($x =~ /OR23peptide/i) ||
	    ($x =~ /\(AE/i) ||
	    ($x =~ /Bem3p,Lph12p/i) ||
	    ($x =~ /Rlm1p,Lpg19p/i) ||
	    ($x =~ /unnamed/i) ||
	    ($x =~ /\b\d{3,20}/i) ||
	    ($x =~ /orf\d{2,20}/i) ||
	    ($x =~ /\d{3,20}\b/i) ||
	    ($x =~ /Intergenic-region/i) ||
	    ($x =~ /and \d+ orf/i) ||
	    ($x =~ /domain protein/i) ||
	    ($x =~ /protein \d{2}[A-Z]{1,3}\d+/i) ||
	    ($x =~ /\bTll\d{3,5}/i) ||
	    ($x =~ /unknown/i));
}

############################  Similarities ###############################

=pod

=head1 sims

usage: @sims = $fig->sims($peg,$maxN,$maxP,$select)

Returns a list of similarities for $peg such that

    there will be at most $maxN similarities,

    each similarity will have a P-score <= $maxP, and

    $select gives processing instructions:

        "raw" means that the similarities will not be expanded (by far fastest option)
        "fig" means return only similarities to fig genes
        "all" means that you want all the expanded similarities.

By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant
protein collection, and converting it to a set of similarities (one for each of the 
proteins that are essentially identical to the representative in the nr).

=cut
    
sub sims {
    my ($self,$id,$maxN,$maxP,$select) = @_;
    my($sim);

    my @sims = ();
    my @maps_to = $self->mapped_prot_ids($id);
    if (@maps_to > 0)
    {
	my $rep_id   = $maps_to[0]->[0];
	my @entry = grep { $_->[0] eq $id } @maps_to;
	if ((@entry == 1) && defined($entry[0]->[1]))
	{
	    if ((! defined($maps_to[0]->[1])) ||
		(! defined($entry[0]->[1])))
	    {
		print STDERR &Dumper(\@maps_to,\@entry);
		confess "bad";
	    }
	    my $delta = $maps_to[0]->[1] - $entry[0]->[1];
	    my @raw_sims = &get_raw_sims($self,$rep_id,$maxN,$maxP);

	    if ($id ne $rep_id)
	    {
		foreach $sim (@raw_sims)
		{

		    $sim->[0] = $id;
		    $sim->[6] -= $delta;
		    $sim->[7] -= $delta;
		}
	    }
	    unshift(@raw_sims,bless([$id,$rep_id,100.00,undef,undef,undef,1,$entry[0]->[1],$delta+1,$maps_to[0]->[1],0.0,,undef,$entry[0]->[1],$maps_to[0]->[1],"blastp",0,0],'Sim'));
	    @sims = grep { $_->id1 ne $_->id2 } &expand_raw_sims($self,\@raw_sims,$maxP,$select,0);
	}
    }
    return @sims;
}

sub expand_raw_sims {
    my($self,$raw_sims,$maxP,$select,$dups) = @_;
    my($sim,$id2,%others,$x);

    my @sims = ();
    foreach $sim (@$raw_sims)
    {
	next if ($sim->psc > $maxP);
	$id2 = $sim->id2;
	next if ($others{$id2} && (! $dups));
	$others{$id2} = 1;

	if ($select && ($select eq "raw"))
	{
	    push(@sims,$sim);
	}
	else
	{
	    my @relevant;
	    my @maps_to = $self->mapped_prot_ids($id2);
	    if ((! $select) || ($select eq "fig"))
	    {
		@relevant = grep { $_->[0] =~ /^fig/ }  @maps_to;
	    }
	    elsif ($select && ($select =~ /^ext/i))
	    {
		@relevant = grep { $_->[0] !~ /^fig/ } @maps_to;
	    }
	    else
	    {
		@relevant = @maps_to;
	    }

	    foreach $x (@relevant)
	    {
		my $sim1 = [@$sim];
		my($x_id,$x_ln) = @$x;
		defined($x_ln) || confess "x_ln id2=$id2 x_id=$x_id";
		defined($maps_to[0]->[1]) || confess "maps_to";
		my $delta2 = $maps_to[0]->[1] - $x_ln;
		$sim1->[1] = $x_id;
		$sim1->[8] -= $delta2;
		$sim1->[9] -= $delta2;
		bless($sim1,"Sim");
		push(@sims,$sim1);
	    }
	}
    }
    return @sims;
}

sub get_raw_sims {
    my($self,$rep_id,$maxN,$maxP) = @_;
    my(@sims,$seek,$fileN,$ln,$fh,$file,$readN,$readC,@lines,$i,$sim);
    my($sim_chunk,$psc,$id2);

    $maxN = $maxN ? $maxN : 500;

    @sims = ();
    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' ");
    foreach $sim_chunk (@$relational_db_response)
    {
	($seek,$fileN,$ln) = @$sim_chunk;
	$file = $self->N2file($fileN);
	$fh   = $self->openF($file);
	if (! $fh)
	{
	    confess "could not open sims for $file";
	}
	seek($fh,$seek,0);
	$readN = read($fh,$readC,$ln-1);
	($readN == ($ln-1)) 
	    || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC";
	@lines = grep    { 
	                     (@$_ == 15) && 
			     ($_->[12] =~ /^\d+$/) &&
			     ($_->[13] =~ /^\d+$/) &&
			     ($_->[6] =~ /^\d+$/) &&
			     ($_->[7] =~ /^\d+$/) &&
			     ($_->[8] =~ /^\d+$/) &&
			     ($_->[9] =~ /^\d+$/) &&
			     ($_->[2] =~ /^[0-9.]+$/) &&
			     ($_->[10] =~ /^[0-9.e-]+$/) 
			 }
		         map  { [split(/\t/,$_),"blastp"] } 
                         split(/\n/,$readC);
		                   
	@lines = sort { $a->[10] <=> $b->[10] } @lines;

	for ($i=0; ($i < @lines); $i++)
	{
	    $psc    = $lines[$i]->[10];
	    $id2    = $lines[$i]->[1];
	    if ($maxP >= $psc)
	    {
		$sim = $lines[$i];
		bless($sim,"Sim");
		push(@sims,$sim);
		if (@sims == $maxN) { return @sims }
	    }
	}
    }
    return @sims;
}

=pod

=head1 dsims

usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select)

Returns a list of similarities for $peg such that

    there will be at most $maxN similarities,

    each similarity will have a P-score <= $maxP, and

    $select gives processing instructions:

        "raw" means that the similarities will not be expanded (by far fastest option)
        "fig" means return only similarities to fig genes
        "all" means that you want all the expanded similarities.

By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant
protein collection, and converting it to a set of similarities (one for each of the 
proteins that are essentially identical to the representative in the nr).

The "dsims" or "dynamic sims" are not precomputed.  They are computed using a heuristic which
is much faster than blast, but misses some similarities.  Essentially, you have an "index" or
representative sequences, a quick blast is done against it, and if there are any hits these are
used to indicate which sub-databases to blast against.

=cut

sub dsims {
    my($self,$id,$seq,$maxN,$maxP,$select) = @_;
    my($sim,$sub_dir,$db,$hit,@hits,%in);

    my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3);
    foreach $sim (@index)
    {
	if ($sim->id2 =~ /_(\d+)$/)
	{
	    $in{$1}++;
	}
    }

    @hits = ();
    foreach $db (keys(%in))
    {
	$sub_dir = $db % 1000;
	push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP));

    }
 
    if (@hits == 0)
    {
	push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP));
    }

    @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits;
    if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 }
    return &expand_raw_sims($self,\@hits,$maxP,$select,0);
}

sub blastit {
    my($id,$seq,$db,$maxP) = @_;

    if (! $maxP) { $maxP = 1.0e-5 }
    my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP");
    my $tmp1 = $tmp->{$id};
    if ($tmp1)
    {
	return @$tmp1;
    }
    return ();
}
    
################################# chromosomal clusters ####################################

=pod

=head1 in_cluster_with

usage: @pegs = $fig->in_cluster_with($peg)

Returns the set of pegs that are thought to be clustered with $peg (on the
chromosome).

=cut

sub in_cluster_with {
    my($self,$peg) = @_;
    my($set,$id,%in);

    return $self->in_set_with($peg,"chromosomal_clusters","cluster_id");
}

=pod

=head1 add_chromosomal_clusters

usage: $fig->add_chromosomal_clusters($file)

The given file is supposed to contain one predicted chromosomal cluster per line (either
comma or tab separated pegs).  These will be added (to the extent they are new) to those
already in $FIG_Config::global/chromosomal_clusters.

=cut


sub add_chromosomal_clusters {
    my($self,$file) = @_;
    my($set,$added);

    open(TMPCLUST,"<$file") 
	|| die "aborted";
    while (defined($set = <TMPCLUST>))
    {
	print STDERR ".";
	chop $set;
	$added += $self->add_chromosomal_cluster([split(/[\t,]+/,$set)]);
    }
    close(TMPCLUST);

    if ($added)
    {
	my $rdbH = $self->db_handle;
	$self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters");
	return 1;
    }
    return 0;
}

#=pod
#
#=head1 export_chromosomal_clusters
#
#usage: $fig->export_chromosomal_clusters
#
#Invoking this routine writes the set of chromosomal clusters as known in the
#relational DB back to $FIG_Config::global/chromosomal_clusters.
#
#=cut
#
sub export_chromosomal_clusters {
    my($self) = @_;

    $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters");
}

sub add_chromosomal_cluster {
    my($self,$ids) = @_;
    my($id,$set,%existing,%in,$new,$existing,$new_id);

#   print STDERR "adding cluster ",join(",",@$ids),"\n";
    foreach $id (@$ids)
    {
	foreach $set ($self->in_sets($id,"chromosomal_clusters","cluster_id"))
	{
	    $existing{$set} = 1;
	    foreach $id ($self->ids_in_set($set,"chromosomal_clusters","cluster_id"))
	    {
		$in{$id} = 1;
	    }
	}
    }
#   print &Dumper(\%existing,\%in);

    $new = 0;
    foreach $id (@$ids)
    {
	if (! $in{$id})
	{
	    $in{$id} = 1;
	    $new++;
	}
    }
#   print STDERR "$new new ids\n";
    if ($new)
    {
	foreach $existing (keys(%existing))
	{
	    $self->delete_set($existing,"chromosomal_clusters","cluster_id");
	}
	$new_id = $self->next_set("chromosomal_clusters","cluster_id");
#	print STDERR "adding new cluster $new_id\n";
	$self->insert_set($new_id,[keys(%in)],"chromosomal_clusters","cluster_id");
	return 1;
    }
    return 0;
}

################################# PCH pins  ####################################

=pod

=head1 in_pch_pin_with

usage: $fig->in_pch_pin_with($peg)

Returns the set of pegs that are believed to be "pinned" to $peg (in the
sense that PCHs occur containing these pegs over significant phylogenetic
distances).

=cut

sub in_pch_pin_with {
    my($self,$peg) = @_;
    my($set,$id,%in);

    return $self->in_set_with($peg,"pch_pins","pin");
}

=pod

=head1 add_pch_pins

usage: $fig->add_pch_pins($file)

The given file is supposed to contain one set of pinned pegs per line (either
comma or tab seprated pegs).  These will be added (to the extent they are new) to those
already in $FIG_Config::global/pch_pins.

=cut

sub add_pch_pins {
    my($self,$file) = @_;
    my($set,$added);

    open(TMPCLUST,"<$file") 
	|| die "aborted";
    while (defined($set = <TMPCLUST>))
    {
	print STDERR ".";
	chop $set;
	my @tmp = split(/[\t,]+/,$set);
	if (@tmp < 200)
	{
	    $added += $self->add_pch_pin([@tmp]);
	}
    }
    close(TMPCLUST);

    if ($added)
    {
	my $rdbH = $self->db_handle;
	$self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins");
	return 1;
    }
    return 0;
}

sub export_pch_pins {
    my($self) = @_;

    $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins");
}

sub add_pch_pin {
    my($self,$ids) = @_;
    my($id,$set,%existing,%in,$new,$existing,$new_id);

#   print STDERR "adding cluster ",join(",",@$ids),"\n";
    foreach $id (@$ids)
    {
	foreach $set ($self->in_sets($id,"pch_pins","pin"))
	{
	    $existing{$set} = 1;
	    foreach $id ($self->ids_in_set($set,"pch_pins","pin"))
	    {
		$in{$id} = 1;
	    }
	}
    }
#   print &Dumper(\%existing,\%in);

    $new = 0;
    foreach $id (@$ids)
    {
	if (! $in{$id})
	{
	    $in{$id} = 1;
	    $new++;
	}
    }

    if ($new)
    {
	foreach $existing (keys(%existing))
	{
	    $self->delete_set($existing,"pch_pins","pin");
	}
	$new_id = $self->next_set("pch_pins","pin");
#	print STDERR "adding new pin $new_id\n";
	$self->insert_set($new_id,[keys(%in)],"pch_pins","pin");
	return 1;
    }
    return 0;
}

################################# Annotations  ####################################

=pod

=head1 add_annotation

usage: $fig->add_annotation($fid,$user,$annotation)

$annotation is added as a time-stamped annotation to $peg showing $user as the
individual who added the annotation.

=cut

sub add_annotation {
    my($self,$feature_id,$user,$annotation) = @_;
    my($genome);

#   print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n";
    if ($genome = $self->genome_of($feature_id))
    {
	my $file = "$FIG_Config::organisms/$genome/annotations";
	my $fileno = $self->file2N($file);
	my $time_made = time;

	if (open(TMP,">>$file"))
	{
	    flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions";
	    seek(TMP,0,2)      || confess "failed to seek to the end of the file";

	    my $seek1 = tell TMP;
	    print TMP "$feature_id\n$time_made\n$user\n$annotation", (substr($annotation,-1) eq "\n") ? "" : "\n","//\n";
	    my $seek2 = tell TMP;
	    close(TMP);
	    chmod 0777, $file;
	    my $ln = $seek2 - $seek1;
	    my $rdbH = $self->db_handle;
	    if ($rdbH->SQL("INSERT INTO annotation_seeks ( fid, fileno, seek, len ) VALUES ( \'$feature_id\', $fileno, $seek1, $ln )"))
	    {
		return 1;
	    }
	}
    }
    return 0;
}
    
=pod

=head1 feature_annotations

usage: @annotations = $fig->feature_annotations($fid)

The set of annotations of $fid is returned as a list of 4-tuples.  Each entry in the list
is of the form [$fid,$timestamp,$user,$annotation].

=cut


sub feature_annotations {
    my($self,$feature_id) = @_;
    my($tuple,$fileN,$seek,$ln,$readN,$readC,$annotation,$feature_idQ);
    my($file,$fh);

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len  FROM annotation_seeks WHERE  fid = \'$feature_id\' ");
    my @annotations = ();

    foreach $tuple (@$relational_db_response)
    {
	($fileN,$seek,$ln) = @$tuple;
	$file = $self->N2file($fileN);
	$fh   = $self->openF($file);
	if (! $fh)
	{
	    confess "could not open annotations for $file";
	}
	seek($fh,$seek,0);
	$readN = read($fh,$readC,$ln);
	($readN == $ln) 
	    || confess "could not read the block of annotations at $seek for $ln characters; $readN actually read from $file\n$readC";
	$feature_idQ = quotemeta $feature_id;
	foreach $annotation (split(/\n\/\/\n/, $readC))
	{
	    if ($annotation =~ /^$feature_idQ\n(\d+)\n([^\n]+)\n(.*)/s)
	    {
		push(@annotations,[$feature_id,$1,$2,$3]);
	    }
	    else
	    {
		print STDERR "malformed annotation\n$annotation\n";
	    }
	}
    }
    return map { $_->[1] = localtime($_->[1]); $_ } sort { $a->[1] <=> $b->[1] } @annotations;
}

################################# Indexing Features and Functional Roles  ####################################

=pod

=head1 search_index

usage: ($pegs,$roles) = $fig->search_pattern($pattern)

All pegs that "match" $pattern are put into a list, and $pegs will be a
pointer to that list.

All roles that "match" $pattern are put into a list, and $roles will be a
pointer to that list.

The notion of "match $pattern" is intentionally left undefined.  For now, you
will probably get only entries in which each word id $pattern occurs exactly,
but that is not a long term commitment.

=cut

sub search_index {
    my($self,$pattern) = @_;
    my($patternQ,@raw,@pegs,@roles);

    &clean_tmp;
    $patternQ = $pattern;
    $patternQ =~ s/\s+/;/g;
    $patternQ =~ s/\./\\./g;

#   print STDERR "pattern=$pattern patternQ=$patternQ\n";
    @raw = `$FIG_Config::ext_bin/glimpse -y -H $FIG_Config::data/Indexes -i -w \'$patternQ\'`;
    @pegs  = sort { &FIG::by_fig_id($a->[0],$b->[0]) } 
             map { $_ =~ s/^\S+:\s+//; [split(/\t/,$_)] } 
             grep { $_ =~ /^\S+peg.index/ } @raw;
    my %roles = map { $_ =~ s/^\S+:\s+//; $_ => 1} grep { $_ =~ /^\S+role.index/ } @raw;
    @roles = sort keys(%roles);

    return ([@pegs],[@roles]);
}

################################# Loading Databases  ####################################


#=pod
#
#=head1 load_all
#
#usage: load_all
#
#This function is supposed to reload all entries into the database and do
#whatever is required to properly support indexing of pegs and roles.
#
#=cut

sub load_all {
    
    &run("load_features");
    &run("index_sims");
    &run("load_peg_mapping");
    &run("index_translations");
    &run("add_assertions_of_function");
    &run("load_protein_families");
    &run("load_external_orgs");
    &run("load_chromosomal_clusters");
    &run("load_pch_pins");
    &run("index_neighborhoods");
    &run("index_annotations");
    &run("load_ec_names");
    &run("load_kegg");
    &run("index_contigs");
    &run("make_indexes");
}

################################# Automated Assignments  ####################################

=pod

=head1 auto_assign

usage: $assignment = &FIG::auto_assign($peg,$seq)

This returns an automated assignment for $peg.  $seq is optional; if it is not
present, then it is assumed that similarities already exist for $peg.  $assignment is set
to either

    Function
or
    Function\tW

if it is felt that the assertion is pretty weak.

=cut

sub auto_assign {
    my($peg,$seq) = @_;

    my $cmd = $seq ? "echo \"$peg\t$seq\" | auto_assign | make_calls" : "echo \"$peg\" | auto_assign | make_calls";
#   print STDERR $cmd;
    my(@tmp) = `$cmd`;
    if ((@tmp == 1) && ($tmp[0] =~ /^\S+\t(\S.*\S)/))
    {
	return $1;
    }
    else
    {
	return "hypothetical protein";
    }
}

################################# Protein Families ####################################

=pod

=head1 all_protein_families

usage: @all = $fig->all_protein_families

Returns a list of the ids of all of the protein families currently defined.

=cut

sub all_protein_families {
    my($self) = @_;

    return $self->all_sets("protein_families","family");
}

=pod

=head1 ids_in_family

usage: @pegs = $fig->ids_in_family($family)

Returns a list of the pegs in $family.

=cut

sub ids_in_family {
    my($self,$family) = @_;

    return $self->ids_in_set($family,"protein_families","family");
}

=pod

=head1 family_function

usage: $func = $fig->family_function($family)

Returns the putative function of all of the pegs in $family.  Remember, we
are defining "protein family" as a set of homologous proteins that have the
same function.

=cut

sub family_function {
    my($self,$family) = @_;
    my($relational_db_response);
    my $rdbH = $self->db_handle;

    defined($family) || confess "family is missing";
    if (($relational_db_response = $rdbH->SQL("SELECT function FROM family_function WHERE ( family = $family)")) &&
	(@$relational_db_response >= 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 sz_family

usage: $n = $fig->sz_family($family)

Returns the number of pegs in $family.

=cut

sub sz_family {
    my($self,$family) = @_;

    return $self->sz_set($family,"protein_families","family");
}
    
=pod

=head1 in_family

usage: @pegs = $fig->in_family($family)

Returns the pegs in $family.

=cut

sub in_family {
    my($self,$id) = @_;

    my @in = $self->in_sets($id,"protein_families","family");
    return (@in > 0) ? $in[0] : "";
}

################################# Abstract Set Routines  ####################################

sub all_sets {
    my($self,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT $set_name FROM $relation")) &&
	(@$relational_db_response >= 1))
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

sub next_set {
    my($self,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT MAX($set_name) FROM $relation")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0] + 1;
    }
}

sub ids_in_set {
    my($self,$which,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;
    if (defined($which) && ($which =~ /^\d+$/))
    {
	if (($relational_db_response = $rdbH->SQL("SELECT id FROM $relation WHERE ( $set_name = $which)")) &&
	    (@$relational_db_response >= 1))
	{
	    return sort { by_fig_id($a,$b) }  map { $_->[0] } @$relational_db_response;
	}
    }
    return ();
}

sub in_sets {
    my($self,$id,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT $set_name FROM $relation WHERE ( id = \'$id\' )")) &&
	(@$relational_db_response >= 1))
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

sub sz_set {
    my($self,$which,$relation,$set_name) = @_;
    my($relational_db_response);

    my $rdbH = $self->db_handle;

    if (($relational_db_response = $rdbH->SQL("SELECT COUNT(*) FROM $relation WHERE ( $set_name = $which)")) &&
	(@$relational_db_response == 1))
    {
	return $relational_db_response->[0]->[0];
    }
    return 0;
}

sub delete_set {
    my($self,$set,$relation,$set_name) = @_;
    
#   print STDERR "deleting set $set\n";
    my $rdbH = $self->db_handle;

    return $rdbH->SQL("DELETE FROM $relation WHERE ( $set_name = $set )");
}

sub insert_set {
    my($self,$set,$ids,$relation,$set_name) = @_;
    my($id);

#   print STDERR "inserting set $set containing ",join(",",@$ids),"\n";
    my $rdbH = $self->db_handle;

    my $rc = 1;
    foreach $id (@$ids)
    {
	if (! $rdbH->SQL("INSERT INTO $relation ( $set_name,id ) VALUES ( $set,\'$id\' )"))
	{
	    $rc = 0;
	}
    }
#   print STDERR "    rc=$rc\n";
    return $rc;
}
    
sub in_set_with {
    my($self,$peg,$relation,$set_name) = @_;
    my($set,$id,%in);

    foreach $set ($self->in_sets($peg,$relation,$set_name))
    {
	foreach $id ($self->ids_in_set($set,$relation,$set_name))
	{
	    $in{$id} = 1;
	}
    }
    return sort { &by_fig_id($a,$b) } keys(%in);
}


sub export_set {
    my($self,$relation,$set_name,$file) = @_;
    my($pair);

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT $set_name, id FROM $relation");

    open(TMP,">$file")
	|| die "could not open $file";
    flock(TMP,LOCK_EX) || confess "cannot lock $file";
    seek(TMP,0,2)      || confess "failed to seek to the end of the file";

    foreach $pair (sort { ($a->[0] <=> $b->[0]) or &by_fig_id($a->[1],$b->[1]) } @$relational_db_response)
    {
	print TMP join("\t",@$pair),"\n";
    }
    close(TMP);
    return 1;
}

################################# KEGG Stuff  ####################################


=pod

=head1 all_compounds

usage: @compounds = $fig->all_compounds

Returns a list containing all of the KEGG compounds.

=cut

sub all_compounds {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM comp_name");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 names_of_compound

usage: @names = $fig->names_of_compound

Returns a list containing all of the names assigned to the KEGG compounds.  The list
will be ordered as given by KEGG.

=cut

sub names_of_compound {
    my($self,$cid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT pos,name FROM comp_name where cid = \'$cid\'");
    if (@$relational_db_response > 0)
    {
	return map { $_->[1] } sort { $a->[0] <=> $b->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 comp2react


usage: @rids = $fig->comp2react($cid)

Returns a list containing all of the reaction IDs for reactions that take $cid
as either a substrate or a product.

=cut

sub comp2react {
    my($self,$cid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound where cid = \'$cid\'");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 cas

usage: $cas = $fig->cas($cid)

Returns the CAS ID for the compound, if known.

=cut

sub cas {
    my($self,$cid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT cas FROM comp_cas where cid = \'$cid\'");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 cas_to_cid

usage: $cid = $fig->cas_to_cid($cas)

Returns the compound id (cid), given the CAS ID.

=cut

sub cas_to_cid {
    my($self,$cas) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT cid FROM comp_cas where cas = \'$cas\'");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

=pod

=head1 all_reactions

usage: @rids = $fig->all_reactions

Returns a list containing all of the KEGG reaction IDs.

=cut

sub all_reactions {
    my($self) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT DISTINCT rid FROM reaction_to_compound");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 reversible

usage: $rev = $fig->reversible($rid)

Returns true iff the reactions had a "main direction" designated as "<=>";

=cut

sub reversible {
    my($self,$rid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT reversible FROM reversible where rid = \'$rid\'");
    if (@$relational_db_response == 1) 
    {
	return $relational_db_response->[0]->[0];
    }
    return 1;
}

=pod

=head1 reaction2comp

usage: @tuples = $fig->reaction2comp($rid,$which)

Returns the "substrates" iff $which == 0.  In any event (i.e., whether you ask for substrates
or products), you get back a list of 3-tuples.  Each 3-tuple will contain

    [$cid,$stoich,$main]

Stoichiometry is normally numeric, but can be things like "n" or "(n+1)".
$main is 1 iff the compound is considered "main" or "connectable".

=cut

sub reaction2comp {
    my($self,$rid,$which) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT cid,stoich,main FROM reaction_to_compound where rid = \'$rid\' and setn = \'$which\'");
    if (@$relational_db_response > 0)
    {
	return sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/\s+//g; $_ } @$relational_db_response;
    }
    return ();
}

=pod

=head1 catalyzed_by

usage: @ecs = $fig->catalyzed_by($rid)

Returns the ECs that are reputed to catalyze the reaction.  Note that we are currently
just returning the ECs that KEGG gives.  We need to handle the incompletely specified forms
(e.g., 1.1.1.-), but we do not do it yet.

=cut

sub catalyzed_by {
    my($self,$rid) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT role FROM reaction_to_enzyme where rid = \'$rid\'");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 catalyzes

usage: @ecs = $fig->catalyzes($role)

Returns the rids of the reactions catalyzed by the "role" (normally an EC).

=cut

sub catalyzes {
    my($self,$role) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_enzyme where role = \'$role\'");
    if (@$relational_db_response > 0)
    {
	return sort map { $_->[0] } @$relational_db_response;
    }
    return ();
}


=pod

=head1 displayable_reaction

usage: $display_format = $fig->displayable_reaction($rid)

Returns a string giving the displayable version of a reaction.

=cut

sub displayable_reaction {
    my($self,$rid) = @_;

    my @tmp = `grep $rid $FIG_Config::data/KEGG/reaction_name.lst`;
    if (@tmp > 0)
    {
	chop $tmp[0];
	return $tmp[0];
    }
    return $rid;
}

=pod

=head1 all_maps

usage: @maps = $fig->all_maps

Returns a list containing all of the KEGG maps that the system knows about (the
maps need to be periodically updated).

=cut

sub all_maps {
    my($self,$ec) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT DISTINCT map FROM ec_map ");
    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 ec_to_maps

usage: @maps = $fig->ec_to_maps($ec)

Returns the set of maps that contain $ec as a functional role.  $ec is usually an EC number,
but in the more general case, it can be a functional role.

=cut

sub ec_to_maps {
    my($self,$ec) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT map FROM ec_map WHERE ( ec = \'$ec\' )");
    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}


=pod

=head1 map_to_ecs

usage: @ecs = $fig->map_to_ecs($map)

Returns the set of functional roles (usually ECs)  that are contained in the functionality
depicted by $map.

=cut

sub map_to_ecs {
    my($self,$map) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT ec FROM ec_map WHERE ( map = \'$map\' )");
    if (@$relational_db_response > 0)
    {
	return map { $_->[0] } @$relational_db_response;
    }
    return ();
}

=pod

=head1 map_name

usage: $name = $fig->map_name($map)

Returns the descriptive name covering the functionality depicted by $map.

=cut

sub map_name {
    my($self,$map) = @_;

    my $rdbH = $self->db_handle;
    my $relational_db_response = $rdbH->SQL("SELECT mapname FROM map_name WHERE ( map = \'$map\' )");
    if (@$relational_db_response == 1)
    {
	return $relational_db_response->[0]->[0];
    }
    return "";
}

################################# Functional Roles  ####################################

=pod

=head1 neighborhood_of_role

usage: @roles = $fig->neighborhood_of_role($role)

Returns a list of functional roles that we consider to be "the neighborhood" of $role.

=cut

sub neighborhood_of_role {
    my($self,$role) = @_;
    my($readC);

    my $file = "$FIG_Config::global/role.neighborhoods";
    my $rdbH = $self->db_handle;
    my $roleQ = quotemeta $role;
    my $relational_db_response = $rdbH->SQL("SELECT seek, len FROM neigh_seeks WHERE role = \'$roleQ\' ");
    if (@$relational_db_response == 1)
    {
	my($seek,$ln) = @{$relational_db_response->[0]};
	my $fh   = $self->openF($file);
	seek($fh,$seek,0);
	my $readN = read($fh,$readC,$ln-1);
	($readN == ($ln-1)) 
	    || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC";
	return grep { $_ && ($_ !~ /^\/\//) } split(/\n/,$readC);
    }
    return ();
}

=pod

=head1 roles_of_function

usage: @roles = $fig->roles_of_function($func)

Returns a list of the functional roles implemented by $func.

=cut

sub roles_of_function {
    my($func) = @_;

    return (split(/\s*[\/;]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g));
}

=pod

=head1 seqs_with_role

usage: @pegs = $fig->seqs_with_role($role,$who)

Returns a list of the pegs that implement $role.  If $who is not given, it
defaults to "master".  The system returns all pegs with an assignment made by
either "master" or $who (if it is different than the master) that implement $role.
Note that this includes pegs for which the "master" annotation disagrees with that
of $who, the master's implements $role, and $who's does not.

=cut

sub seqs_with_role {
    my($self,$role,$who) = @_;
    my $relational_db_response;

    $who = $who ? $who : "master";
    my $rdbH = $self->db_handle;

    my $who_cond;
    if ($who eq "master")
    {
	$who_cond = "( made_by = \'master\' OR made_by = \'unknown\' )";
    }
    else
    {
	$who_cond = "( made_by = \'master\' OR made_by = \'$who\' OR made_by = \'unknown\')";
    }
    my $query = "SELECT distinct prot FROM roles  WHERE (( role = \'$role\' ) AND $who_cond )";
    return (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) ?
	map { $_->[0] } @$relational_db_response : ();
}

=pod

=head1 seqs_with_roles_in_genomes

usage: $result = $fig->seqs_with_roles_in_genomes($genomes,$roles,$made_by)

This routine takes a pointer to a list of genomes ($genomes) and a pointer to a list of
roles ($roles) and looks up all of the sequences that connect to those roles according
to either the master assignments or those made by $made_by.  Again, you will get assignments
for which the "master" assignment connects, but the $made_by does not.

A hash is returned.  The keys to the hash are genome IDs for which at least one sequence
was found.  $result->{$genome} will itself be a hash, assuming that at least one sequence
was found for $genome.  $result->{$genome}->{$role} will be set to a pointer to a list of
2-tuples.  Each 2-tuple will contain [$peg,$function], where $function is the one for
$made_by (which may not be the one that connected).

=cut

sub seqs_with_roles_in_genomes {
    my($self,$genomes,$roles,$made_by) = @_;
    my($genome,$role,$roleQ,$role_cond,$made_by_cond,$query,$relational_db_response,$peg,$genome_cond,$hit);
    my $rdbH = $self->db_handle;
    my $result = {}; # foreach $genome ($self->genomes) { $result->{$genome} = {} }
    if (! $made_by) { $made_by = 'master' }
    if ((@$genomes > 0) && (@$roles > 0))
    {
	$genome_cond = "(" . join(" OR ",map { "( org = \'$_\' )" } @$genomes) . ")";
	$role_cond   = "(" . join(" OR ",map { $roleQ = quotemeta $_; "( role = \'$roleQ\' )" } @$roles) . ")";
	$made_by_cond = ($made_by eq 'master') ? "(made_by = 'master')" : "(made_by = 'master' OR made_by = '$made_by')";
	$query = "SELECT distinct prot, role FROM roles  WHERE ( $made_by_cond AND $genome_cond AND $role_cond )";
	if (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1))
	{
	    foreach $hit (@$relational_db_response)
	    {
		($peg,$role) = @$hit;
		$genome = $self->genome_of($peg);
		push(@{ $result->{$genome}->{$role} },[$peg,scalar $self->function_of($peg,$made_by)]);
	    }
	}
    }
    return $result;
}

=pod

=head1 largest_clusters

usage: @clusters = $fig->largest_clusters($roles,$user)

This routine can be used to find the largest clusters containing the some of the
designated set of roles.  A list of clusters is returned.  Each cluster is a pointer to
a list of pegs.

=cut

sub largest_clusters {
    my($self,$roles,$user,$sort_by_unique_functions) = @_;
    my($genome,$x,$role,$y,$peg,$loc,$contig,$beg,$end,%pegs,@pegs,$i,$j);

    my $ss = $self->seqs_with_roles_in_genomes([$self->genomes],$roles,$user);
    my @clusters = ();

    foreach $genome (keys(%$ss))
    {
	my %pegs;
	$x = $ss->{$genome};
	foreach $role (keys(%$x))
	{
	    $y = $x->{$role};
	    foreach $peg (map { $_->[0] } @$y)
	    {
		if ($loc = $self->feature_location($peg))
		{
		    ($contig,$beg,$end) = &FIG::boundaries_of($loc);
		    $pegs{$peg} = [$peg,$contig,int(($beg + $end) / 2)];
		}
	    }
	}
	
	@pegs = sort { ($pegs{$a}->[1] cmp $pegs{$b}->[1]) or ($pegs{$a}->[2] <=> $pegs{$b}->[2]) } keys(%pegs);
	$i = 0;
	while ($i < $#pegs)
	{
	    for ($j=$i+1; ($j < @pegs) && &close_enough_locs($pegs{$pegs[$j-1]},$pegs{$pegs[$j]}); $j++) {}
	    if ($j > ($i+1))
	    {
		push(@clusters,[@pegs[$i..$j-1]]);
	    }
	    $i = $j;
	}
    }
    if ($sort_by_unique_functions)
    {
	@clusters = sort { $self->unique_functions($b,$user) <=> $self->unique_functions($a,$user) } @clusters;
    }
    else
    {
	@clusters = sort { @$b <=> @$a } @clusters;
    }
    return @clusters;
}

sub unique_functions {
    my($self,$pegs,$user) = @_;
    my($peg,$func,%seen);

    foreach $peg (@$pegs)
    {
	if ($func = $self->function_of($peg,$user))
	{
	    $seen{$func} = 1;
	}
    }
    return scalar keys(%seen);
}
		
sub close_enough_locs {
    my($x,$y) = @_;

    return (($x->[1] eq $y->[1]) && (abs($x->[2] - $y->[2]) < 5000));
}

#################################   DNA sequence Stuff ####################################

=pod

=head1 extract_seq

usage: $seq = &FIG::extract_seq($contigs,$loc)

This is just a little utility routine that I have found convenient.  It assumes that
$contigs is a hash that contains IDs as keys and sequences as values.  $loc must be of the
form
       Contig_Beg_End

where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought
subsequence.  If Beg > End, it is assumed that you want the reverse complement of the subsequence.
This routine plucks out the subsequence for you.

=cut

sub extract_seq {
    my($contigs,$loc) = @_;
    my($contig,$beg,$end,$contig_seq);
    my($plus,$minus);

    $plus = $minus = 0;
    my $strand = "";
    my @loc = split(/,/,$loc);
    my @seq = ();
    foreach $loc (@loc)
    {
	if ($loc =~ /^\S+_(\d+)_(\d+)$/)
	{
	    if ($1 < $2)
	    {
		$plus++;
	    }
	    elsif ($2 < $1)
	    {
		$minus++;
	    }
	}
    }
    if ($plus > $minus)
    {
	$strand = "+";
    }
    elsif ($plus < $minus)
    {
	$strand = "-";
    }

    foreach $loc (@loc)
    {
	if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    ($contig,$beg,$end) = ($1,$2,$3);
	    if (($beg < $end) || (($beg == $end) && ($strand eq "+")))
	    {
		$strand = "+";
		push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg)));
	    }
	    else
	    {
		$strand = "-";
		push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end))));
	    }
	}
    }
    return join("",@seq);
}

=pod

=head1 contig_ln

usage: $n = $fig->contig_ln($genome,$contig)

Returns the length of $contig from $genome.

=cut

sub contig_ln {
    my($self,$genome,$contig) = @_;
    my($rdbH,$relational_db_response);

    $rdbH = $self->db_handle;
    if (defined($genome) && defined($contig))
    {
	if (($relational_db_response = $rdbH->SQL("SELECT len FROM contig_lengths WHERE ( genome = \'$genome\' ) and ( contig = \'$contig\' )")) &&
	    
	    (@$relational_db_response == 1))
	{
	    return $relational_db_response->[0]->[0];
	}
    }
    return undef;
}
	    
=pod

=head1 dna_seq

usage: $seq = dna_seq($genome,@locations)

Returns the concatenated subsequences described by the list of locations.  Each location
must be of the form

    Contig_Beg_End

where Contig must be the ID of a contig for genome $genome.  If Beg > End the location
describes a stretch of the complementary strand.

=cut

sub dna_seq {
    my($self,$genome,@locations) = @_;
    my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH);

    @pieces = ();
    foreach $loc (@locations)
    {
	if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
	{
	    ($contig,$beg,$end) = ($1,$2,$3);
	    $ln = $self->contig_ln($genome,$contig);

	    if (! $ln) {
		print STDERR "$genome/$contig: could not get length\n";
		return "";
	    }

	    if (&between(1,$beg,$ln) && &between(1,$end,$ln))
	    {
		if ($beg < $end)
		{
		    push(@pieces, $self->get_dna($genome,$contig,$beg,$end));
		}
		else
		{
		    push(@pieces, &reverse_comp($self->get_dna($genome,$contig,$end,$beg)));
		}
	    }
	}
    }
    return join("",@pieces);
}

sub get_dna {
    my($self,$genome,$contig,$beg,$end) = @_;
    my $relational_db_response;

    my $rdbH = $self->db_handle;
    my $indexpt = int(($beg-1)/10000) * 10000;
    if (($relational_db_response = $rdbH->SQL("SELECT startN,fileno,seek FROM contig_seeks WHERE ( genome = \'$genome\' ) AND ( contig = \'$contig\' ) AND ( indexpt = $indexpt )")) &&
	(@$relational_db_response == 1))
    {
	my($startN,$fileN,$seek) = @{$relational_db_response->[0]};
	my $fh = $self->openF($self->N2file($fileN));
	if (seek($fh,$seek,0))
	{
	    my $chunk = "";
	    read($fh,$chunk,int(($end + 1 - $startN) * 1.03));
	    $chunk =~ s/\s//g;
	    my $ln = ($end - $beg) + 1;
	    if (length($chunk) >= $ln)
	    {
		return substr($chunk,(($beg-1)-$startN),$ln);
	    }
	}
    }
    return undef;
}

1

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3