[Bio] / FigKernelPackages / FIGV.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FIGV.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.56 - (download) (as text) (annotate)
Thu Jun 26 21:11:11 2008 UTC (11 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2008_07_21
Changes since 1.55: +192 -16 lines
Add annotation support to FIGV. Add FIGM.

# -*- perl -*-
#########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#########################################################################

package FIGV;

use Carp;
use strict;
use FIG;
use FIG_Config;
use SFXlate;
use SproutFIG;
use Tracer;
use Data::Dumper;
use vars qw($AUTOLOAD);
use DB_File;
use File::Basename;
use FileHandle;
use Fcntl qw/:flock/;  # import LOCK_* constants

sub new {
    my($class,$org_dir,$low_level, $fig) = @_;
    print STDERR "FIGV::new => (", join(qq(, ), ($class, $org_dir, $low_level)), ")\n"
	if $ENV{FIG_VERBOSE};
    
    if (!ref($fig))
    {
	if ($low_level && ($low_level =~ /sprout/i))
	{
	    $fig = new SproutFIG($FIG_Config::sproutDB, $FIG_Config::sproutData);
	}
	elsif (! $org_dir)
	{
	    $fig = new FIG;
	    return $fig;
	}
	else
	{
	    $fig = new FIG;
	}
    }

    my $self         = {};
    $self->{_fig}    = $fig;
    $self->{_orgdir} = $org_dir;

    #
    # Determine genome-id. Use the GENOME_ID file if it exists; otherwise
    # we assume the directory name of the org_dir is the genome id.
    #
    my $genome_id;
    if (open(my $fh, "<", "$org_dir/GENOME_ID"))
    {
	my $genome_id = <$fh>;
	chomp $genome_id;
	if ($genome_id !~ /^\d+\.\d+/)
	{
	    warn "Invalid genome id '$genome_id' in $org_dir/GENOME_ID";
	    return undef;
	}
	close($fh);
    }
    else
    {
	if ($org_dir =~ /(\d+\.\d+)$/)
	{
	    $genome_id = $1;
	}
	else
	{
	    warn "No GENOME_ID file found in $org_dir and the directory name is not a valid genome id";
	}
    }
    $self->{_genome} = $genome_id;

    return bless $self, $class;
}

sub genome_id
{
    return $_[0]->{_genome};
}

# return the path of the organism directory
sub organism_directory {
  return $_[0]->{_orgdir};
}

sub is_complete
{
    return 1;
}

#
# Redirect any method invocations that we don't implement out to the
# underlying FIG object.
#
sub AUTOLOAD
{
    my($self, @args) = @_;

    if (ref($self) ne "FIGV") {
	confess "BAD FIGV object passed to AUTOLOAD";
    }

    no strict 'refs';

    my $meth = $AUTOLOAD;
    $meth =~ s/.*:://;
    my $fmeth = "FIG::$meth";

    my $fig = $self->{_fig};
#    my $args = Dumper(\@args);
    if (wantarray)
    {
	my @res = $fig->$meth(@args);
#	my @res = &$fmeth($self, @args);
#	warn "FIGV invoke $meth($args) returns\n", Dumper(\@res);
	return @res;
    }
    else
    {
	my $res = $fig->$meth(@args);
#	my $res = &$fmeth($self, @args);
#	warn "FIGV invoke $meth($args) returns\n", Dumper($res);
	return $res;
    }
}

sub FIG
{
    my($self) = @_;
    return $self;
}

#
# This should be hoisted to FIGM
#
sub sort_fids_by_taxonomy
{
    my($self,@fids) = @_;

    return map     { $_->[0] }
           sort    { $a->[1] cmp $b->[1] }
           map     { [$_,$self->taxonomy_of($self->genome_of($_))] }
           @fids;
}

sub get_basic_statistics
{
    my($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->get_basic_statistics($genome);
    }

    #
    # Check cache.
    #

    my $cache = "$newGdir/cache.basic_statistics";
    my $fh = new FileHandle($cache);
    if ($fh)
    {
	my $stats = {};
	while (<$fh>)
	{
	    chomp;
	    my($k, $v) = split(/\t/);
	    $stats->{$k} = $v;
	}
	close($fh);
	return $stats;
    }


    my $subsystem_data = $self->get_genome_subsystem_data($genome);

    my %sscount = map { $_->[0] => 1 } @$subsystem_data;
    my $nss=scalar(keys(%sscount));

    my $statistics = {
	num_subsystems => $nss,
	num_contigs    => scalar($self->all_contigs($genome)),
	num_basepairs  => $self->genome_szdna($genome),
	genome_name    => $self->genus_species($genome),
	genome_domain  => $self->genome_domain($genome),
	genome_pegs    => $self->genome_pegs($genome),
	genome_rnas    => $self->genome_rnas($genome),
	genome_version => $self->genome_version($genome)
	};

    $fh = new FileHandle(">$cache");
    if ($fh)
    {
	while (my($k, $v) = each %$statistics)
	{
	    print $fh join("\t", $k, $v), "\n";
	}
	close($fh);
    }

    return $statistics;
}


sub get_peg_statistics {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->get_peg_statistics($genome);
    }

    #
    # Check cache.
    #

    my $cache = "$newGdir/cache.peg_statistics";
    my $fh = new FileHandle($cache);
    if ($fh)
    {
	my $stats = {};
	while (<$fh>)
	{
	    chomp;
	    my($k, $v) = split(/\t/);
	    $stats->{$k} = $v;
	}
	close($fh);
	return $stats;
    }


    my $subsystem_data = $self->get_genome_subsystem_data($genome);
    my $assignment_data = $self->get_genome_assignment_data($genome);

    my $hypo_sub = 0;
    my $hypo_nosub = 0;
    my $nothypo_sub = 0;
    my $nothypo_nosub = 0;
    my %in = map { $_->[2] => 1 } @$subsystem_data;
    my $in = keys(%in);

    my %sscount = map { $_->[0] => 1 } @$subsystem_data;

    foreach $_ (@$assignment_data)
    {
	my($peg,$func) = @$_;
	my $is_hypo = &FIG::hypo($func);

	if    ($is_hypo && $in{$peg})           { $hypo_sub++ }
	elsif ($is_hypo && ! $in{$peg})         { $hypo_nosub++ }
	elsif ((! $is_hypo) && (! $in{$peg}))   { $nothypo_nosub++ }
	elsif ((! $is_hypo) && $in{$peg})       { $nothypo_sub++ }
    }
    my $tot = $hypo_sub + $nothypo_sub + $hypo_nosub + $nothypo_nosub;

    my ($fracHS, $fracNHS, $fracHNS, $fracNHNS);

    if ($tot == 0) {
	$fracHS = sprintf "%.2f", 0.0;
	$fracNHS = sprintf "%.2f", 0.0;
	$fracHNS = sprintf "%.2f", 0.0;
	$fracNHNS = sprintf "%.2f", 0.0;
    } else {
	$fracHS = sprintf "%.2f", $hypo_sub / $tot * 100;
	$fracNHS = sprintf "%.2f", $nothypo_sub / $tot * 100;
	$fracHNS = sprintf "%.2f", $hypo_nosub / $tot * 100;
	$fracNHNS = sprintf "%.2f", $nothypo_nosub / $tot * 100;
    }

    my $statistics = {
	hypothetical_in_subsystem => $hypo_sub,
	hypothetical_not_in_subsystem => $hypo_nosub,
	non_hypothetical_in_subsystem => $nothypo_sub,
	non_hypothetical_not_in_subsystem => $nothypo_nosub,
	hypothetical_in_subsystem_percent => $fracHS,
	hypothetical_not_in_subsystem_percent => $fracHNS,
	non_hypothetical_in_subsystem_percent => $fracNHS,
	non_hypothetical_not_in_subsystem_percent => $fracNHNS
	};

    $fh = new FileHandle(">$cache");
    if ($fh)
    {
	while (my($k, $v) = each %$statistics)
	{
	    print $fh join("\t", $k, $v), "\n";
	}
	close($fh);
    }

    return $statistics;
}

#
# To retrieve a subsystem in FIGV, we create the subsystem as normal via $fig->get_subsystem,
# then insert the row for the virtual org dir we are processing.
#

sub get_subsystem
{
    my($self,$ssa) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $ss = $fig->get_subsystem($ssa);
    return undef unless $ss;

    $self->load_ss_data();

    my $bindings = $self->{_ss_bindings}->{$ssa};
    my $variant = $self->{_ss_variants}->{$ssa};

#    warn "Adding virtual genome " . Dumper(\%bindings);
    $ss->add_virtual_genome($self->genus_species(), $newG, $variant, $bindings);

    return $ss;
}

sub active_subsystems
{
    my($self, $genome, $all) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome  ne $newG)
    {
	return $fig->active_subsystems($genome, $all);
    }

    $self->load_ss_data();

    my $slist = {};

    if ($self->{_ss_variants})
    {
	%{$slist} = %{$self->{_ss_variants}};
    }

    if (not $all)
    {
	for my $ss (keys %$slist)
	{
	    my $var = $slist->{$ss};
	    delete $slist->{$ss} if $var eq 0 or $var eq -1;
	}
    }
    return $slist;
}

sub peg_to_subsystems
{
    my($self, $peg) = @_;

    my $variant = $self->{_ss_variants};
    my %sub = map { $_ => 1 }
              grep { $variant->{$_} !~ /^(-1)|0$/ }
              map { $_->[0] }
              $self->peg_to_roles_in_subsystems($peg);
    return sort keys(%sub);
}

sub peg_to_roles_in_subsystems
{
    my($self,$peg) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($peg !~ /fig\|$newG\.peg/)
    {
	return $fig->peg_to_roles_in_subsystems($peg);
    }

    $self->load_ss_data();

    my $ret  = $self->{_ss_peg_index}->{$peg};

    return $ret ? @$ret : ();
}

sub subsystems_for_peg
{
    my($self,$peg) = @_;
    return $self->peg_to_roles_in_subsystems($peg);
}

sub genomes {
    my($self,$complete) = @_;
    my $fig = $self->{_fig};

    return ($self->{_genome},$fig->genomes($complete));
}

sub genus_species {
    my($self,$genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($genome eq $newG) && open(GENOME,"<$newGdir/GENOME"))
    {
	my $x = <GENOME>;
	close(GENOME);
	chop $x;
	return $x;
    }
    else
    {
	return $fig->genus_species($genome);
    }
}

sub get_genome_assignment_data {
    my($self,$genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome eq $newG)
    {
	my @fnfiles = <$newGdir/proposed*functions>;
	if (@fnfiles == 0)
	{
	    @fnfiles = <$newGdir/assigned*functions>;
	}

	if (@fnfiles == 0)
	{
	    return [];
	}
	my %assign = map { ( $_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S.*\S)/) ? ($1 => $2) : () }
		             `cat @fnfiles`;
	return [map { [$_,$assign{$_}] } sort { &FIG::by_fig_id($a,$b) } keys(%assign)];
    }
    else
    {
	return $fig->get_genome_assignment_data($genome);
    }
}

sub org_of {
    my($self,$peg) = @_;

    if ($peg =~ /^fig\|(\d+\.\d+)\.peg\.\d+/)
    {
	return $self->genus_species($1);
    }
    return "";
}

sub get_genome_subsystem_data {
    my($self,$genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome eq $newG)
    {
	return [map { ($_ =~ /^(\S[^\t]+\S)\t(\S[^\t]*\S)\t(\S+)/) ? [$1,$2,$3] : () }
		`cat $newGdir/Subsystems/bindings`];
    }
    else
    {
	return $fig->get_genome_subsystem_data($genome);
    }
}

sub get_genome_subsystem_count
{
    my($self,$genome) = @_;
    
    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};
    
    if ($genome eq $newG)
    {
	my $count = 0;
	my ($entry, $vc);
	open(SUBSYSTEMS, "<$newGdir/Subsystems/subsystems");
	while (defined($entry = <SUBSYSTEMS>))
	{
	    chomp $entry;
	    (undef, $vc) = split /\t/, $entry;
	    if ($vc != -1) { ++$count; }
	}
	close(SUBSYSTEMS);
	return $count;
    }
    else
    {
	return $fig->get_genome_subsystem_count($genome);
    }
}

sub orgname_of_orgid {
    my($self,$genome) = @_;

    return $self->genus_species($genome);
}

sub orgid_of_orgname {
    my($self,$genome_name) = @_;

    my @genomes = $self->genomes('complete');
    my $i;
    for ($i=0; ($i < @genomes) && ($genome_name ne $self->genus_species($genomes[$i])); $i++) {}
    return ($i == @genomes) ? undef : $genomes[$i];
}

sub genus_species_domain {
    my($self,$genome) = @_;

    return [$self->genus_species($genome),$self->genome_domain($genome)];
}

sub protein_subsystem_to_roles {
    my ($self,$peg,$subsystem) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (&FIG::genome_of($peg) ne $newG)
    {
	return $fig->protein_subsystem_to_roles($peg,$subsystem);
    }
    else
    {
	my @roles = map { (($_ =~ /^([^\t]+)\t([^\t]+)\t(\S+)$/) && ($1 eq $subsystem) && ($3 eq $peg)) ?
			  $2 : () } `cat $newGdir/Subsystems/bindings`;
	my %roles = map { $_ => 1 } @roles;
	return [sort keys(%roles)];
    }
}

sub contig_lengths {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->contig_lengths($genome);
    }
    else
    {
	my $contig_lengths = $self->{_contig_len_index};

	if (!tied(%$contig_lengths))
	{
	     $contig_lengths = {};
	     if (open(CONTIGS,"<$newGdir/contigs"))
	     {
		 $/ = "\n>";
		 while (defined(my $x = <CONTIGS>))
		 {
		     chomp $x;
		     if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		     {
			 my $id = $1;
			 my $seq = $2;
			 $seq =~ s/\s//gs;
			 $contig_lengths->{$id} = length($seq);
		     }
		 }
		 close(CONTIGS);
		 $/ = "\n";
	     }
	}
	return $contig_lengths;
    }
}

sub contig_ln {
    my ($self, $genome, $contig) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->contig_ln($genome, $contig);
    }
    else
    {
	my $contig_len = $self->{_contig_len_index};

	if (tied(%$contig_len))
	{
	    return $contig_len->{$contig};
	}

	if (open(CONTIGS,"<$newGdir/contigs"))
	{
	    local $/ = "\n>";
	    while (defined(my $x = <CONTIGS>))
	    {
		chomp $x;
		if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		{
		    my $id = $1;
		    my $seq = $2;
		    $seq =~ s/\s//gs;
		    if ($id eq $contig)
		    {
			return length($seq);
		    }
		}
	    }
	    close(CONTIGS);
	}
    }
}

sub contigs_of
{
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->contigs_of($genome);
    }
    else
    {
	my @out;
	$self->load_contigs_index();

	my $contigs = $self->{_contigs_index};
	if (tied(%$contigs))
	{
	    return keys %$contigs;
	}

	if (open(CONTIGS,"<$newGdir/contigs"))
	{
	    local $/ = "\n>";
	    while (defined(my $x = <CONTIGS>))
	    {
		chomp $x;
		if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		{
		    my $id = $1;
		    push(@out, $id);
		}
	    }
	    close(CONTIGS);
	}
	return @out;
    }
}

=head3 dna_seq

usage: $seq = dna_seq($genome,@locations)

Returns the concatenated subsequences described by the list of locations.  Each location
must be of the form

    Contig_Beg_End

where Contig must be the ID of a contig for genome $genome.  If Beg > End the location
describes a stretch of the complementary strand.

=cut
#: Return Type $;
sub dna_seq {
    my($self,$genome,@locations) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->dna_seq($genome, @locations);
    }

    my $contigs = $self->{_contigs_index};
    if (!tied %$contigs)
    {
	$contigs = {};
	if (open(CONTIGS,"<$newGdir/contigs"))
	{
	    local $/ = "\n>";
	    while (defined(my $x = <CONTIGS>))
	    {
		chomp $x;
		if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		{
		    my $id = $1;
		    my $seq = $2;
		    $seq =~ s/\s//gs;
		    $contigs->{$id} = $seq;
		}
	    }
	    close(CONTIGS);
	}
    }

    my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH);

    @locations = map { split(/,/,$_) } @locations;
    @pieces = ();
    foreach $loc (@locations)
    {
        if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
        {
            ($contig,$beg,$end) = ($1,$2,$3);
	    my $seq = $contigs->{$contig};

            $ln = length($seq);

            if (! $ln) {
                print STDERR "$genome/$contig: could not get length\n";
                return "";
            }

            if (&FIG::between(1,$beg,$ln) && &FIG::between(1,$end,$ln))
            {
                if ($beg < $end)
                {
                    push(@pieces, substr($seq, $beg - 1, ($end - $beg) + 1));
                }
                else
                {
                    push(@pieces, &FIG::reverse_comp(substr($seq, $end - 1, ($beg - $end) + 1)));
                }
            }
        }
    }
    return lc(join("",@pieces));
}

sub genome_szdna {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->genome_szdna($genome);
    }
    else
    {
	my $contig_lens = $self->contig_lengths($genome);
	my $tot = 0;
	while ( my($contig,$len) = each %$contig_lens)
	{
	    $tot += $len;
	}
	return $tot;
    }
}

sub genome_version {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->genome_version($genome);
    }
    else
    {
	return "$genome.0";
    }
}

sub genome_pegs {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->genome_pegs($genome);
    }
    else
    {
	my @tmp = $self->all_features($genome,"peg");
	my $n = @tmp;
	return $n;
    }
}

sub genome_rnas {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->genome_rnas($genome);
    }
    else
    {
	my @tmp = $self->all_features($genome,"rna");
	my $n = @tmp;
	return $n;
    }
}

sub genome_domain {
    my ($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->genome_domain($genome);
    }
    else
    {
	my $tax = $self->taxonomy_of($genome);
	return ($tax =~ /^([^ \t;]+)/) ? $1 : "unknown";
    }
}

sub genes_in_region {
    my($self,$genome,$contig,$beg,$end) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->genes_in_region($genome,$contig,$beg,$end);
    }
    else
    {
	$self->load_feature_indexes();
	#
	# Use the recno index if exists.
	#

	my $maxV = 0;
	my $minV = 1000000000;
	my $genes = [];

	my $recnos = $self->{_feat_recno};

	if (ref($recnos))
	{
	    while (my($ftype, $list) = each (%$recnos))
	    {
		#
		# Look up start/end of this contig in the btree index.
		#
		
		my $inf = $self->{_feat_btree}->{$ftype}->{$contig};
		my($istart, $iend);
		if ($inf)
		{
		    ($istart, $iend) = split(/$;/, $inf);
		}
		else
		{
		    $istart = 0;
		    $iend = $#$list;
		}

		for (my $idx = $istart; $idx <= $iend; $idx++)
		{
		    my($fid, $fcontig, $fbeg, $fend, $fstrand) = split(/$;/, $list->[$idx]);
		    if ($contig eq $fcontig and &overlaps($beg, $end, $fbeg, $fend))
		    {
			$minV = &FIG::min($minV,$fbeg,$fend);
			$maxV = &FIG::max($maxV,$fbeg,$fend);
			push(@$genes,$fid);
		    }
		}
	    }
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    while ( my($fid,$tuple) = each %$tblH)
	    {
		if (($tuple->[0]->[0] =~ /^(\S+)_(\d+)_\d+$/) && ($1 eq $contig))
		{
		    my $beg1 = $2;
		    my $last = @{$tuple->[0]} - 1;
		    if (($tuple->[0]->[$last] =~ /^(\S+)_\d+_(\d+)$/) && ($1 eq $contig))
		    {
			my $end1 = $2;
			if (&overlaps($beg,$end,$beg1,$end1))
			{
			    $minV = &FIG::min($minV,$beg1,$end1);
			    $maxV = &FIG::max($maxV,$beg1,$end1);
			push(@$genes,$fid);
			}
		    }
		}
	    }
	}
	return ($genes,$minV,$maxV);
    }
}

sub overlaps {
    my($b1,$e1,$b2,$e2) = @_;

    if ($b1 > $e1) { ($b1,$e1) = ($e1,$b1) }
    if ($b2 > $e2) { ($b2,$e2) = ($e2,$b2) }
    return &FIG::between($b1,$b2,$e1) || &FIG::between($b2,$b1,$e2);
}

sub all_contigs {
    my($self,$genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->all_contigs($genome);
    }
    else
    {
	$self->load_feature_indexes();

	my %contigs;
	my $btrees = $self->{_feat_btree};

	if (ref($btrees))
	{
	    while (my($ftype, $btree) = each (%$btrees))
	    {
		map { $contigs{$_} = 1 } grep { !/^fig/ } keys %$btree ;
	    }
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    while ( my($fid,$tuple) = each %$tblH)
	    {
		if ($tuple->[0]->[0] =~ /^(\S+)_\d+_\d+$/)
		{
		    $contigs{$1} = 1;
		}
	    }
	}
	return keys(%contigs);
    }
}

sub all_features {
    my($self,$genome,$type) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->all_features($genome,$type);
    }
    else
    {
	$self->load_feature_indexes();

	my %contigs;
	my $btrees = $self->{_feat_btree};

	if (ref($btrees))
	{
	    my $btree = $btrees->{$type};
	    return sort { &FIG::by_fig_id($a, $b) }
	    	grep { /^fig/ } keys %$btree;
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    return sort { &FIG::by_fig_id($a,$b) }
	    grep { ($_ =~ /^fig\|\d+\.\d+\.([^\.]+)/) && ($1 eq $type) }
	    	keys(%$tblH);
	}
    }
}

sub all_features_detailed_fast {
    my($self,$genome, $regmin, $regmax, $contig) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->all_features_detailed_fast($genome, $regmin, $regmax, $contig);
    }
    else
    {
	$self->load_feature_indexes();
	my $feat_details = [];
	my $recnos = $self->{_feat_recno};

	if (ref($recnos))
	{
	    while (my($ftype, $list) = each (%$recnos))
	    {
		#
		# Look up start/end of this contig in the btree index.
		#
		
		my $inf = $self->{_feat_btree}->{$ftype}->{$contig};
		my($istart, $iend);
		if ($inf)
		{
		    ($istart, $iend) = split(/$;/, $inf);
		}
		else
		{
		    $istart = 0;
		    $iend = $#$list;
		}

		for (my $idx = $istart; $idx <= $iend; $idx++)
		{
		    my($fid, $fcontig, $fbeg, $fend, $fstrand) = split(/$;/, $list->[$idx]);

		    if (not defined($regmin) or not defined($regmax) or not defined($contig) or
			(($contig eq $fcontig) and
			 ($fbeg < $regmin and $regmin < $fend) or ($fbeg < $regmax and $regmax < $fend) or ($fbeg > $regmin and $fend < $regmax)))
		    {
			my($loc, $index, @aliases) = split(/$;/, $self->{_feat_btree}->{$ftype}->{$fid});
			
			push(@$feat_details,[$fid, $loc, join(",", @aliases), $ftype, $fbeg, $fend, $self->function_of($fid),'master','']);
		    }
		}
	    }
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    while ( my($fid,$tuple) = each %$tblH)
	    {
		if ($fid =~ /^fig\|\d+\.\d+\.(\S+)\.\d+/)
		{
		    my $type = $1;
		    if ($tuple->[0]->[0] =~ /^(\S+)_(\d+)_(\d+)$/)
		    {
			my($ctg, $beg, $end) = ($1, $2, $3);
			next if (defined($contig) and $contig ne $ctg);
			
			my($min,$max);
			if ($beg < $end)
			{
			    $min = $beg;
			    $max = $end;
			}
			else
			{
			    $min = $end;
			    $max = $beg;
			}
			
			if (not defined($regmin) or not defined($regmax) or
			    ($min < $regmin and $regmin < $max) or ($min < $regmax and $regmax < $max) or ($min > $regmin and $max < $regmax))
			{
			    push(@$feat_details,[$fid,$tuple->[0]->[0],join(",",@{$tuple->[1]}),$type,$min,$max,$self->function_of($fid),'master','']);
			}
		    }
		}
	    }
	}
	return $feat_details;
    }
}

sub compute_clusters {
    # Get the parameters.
    my ($self, $pegList, $subsystem, $distance) = @_;
    if (! defined $distance) {
        $distance = 5000;
    }

    my($peg,%by_contig);
    foreach $peg (@$pegList)
    {
        my $loc;
        if ($loc = $self->feature_location($peg))
        {
            my ($contig,$beg,$end) = &FIG::boundaries_of($loc);
            my $genome = &FIG::genome_of($peg);
            push(@{$by_contig{"$genome\t$contig"}},[($beg+$end)/2,$peg]);
        }
    }

    my @clusters = ();
    foreach my $tuple (keys(%by_contig))
    {
        my $x = $by_contig{$tuple};
        my @pegs = sort { $a->[0] <=> $b->[0] } @$x;
        while ($x = shift @pegs)
        {
            my $clust = [$x->[1]];
            while ((@pegs > 0) && (abs($pegs[0]->[0] - $x->[0]) <= $distance))
            {
                $x = shift @pegs;
                push(@$clust,$x->[1]);
            }

            if (@$clust > 1)
            {
                push(@clusters,$clust);
            }
        }
    }
    return sort { @$b <=> @$a }  @clusters;
}

sub boundaries_of {
    my($self,@args) = @_;

    my $fig     = $self->{_fig};
    return $fig->boundaries_of(@args);
}



sub feature_location {
    my($self,$fid) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($fid =~ /^fig\|(\d+\.\d+)\.([^.]+)/) && ($1 eq $newG))
    {
	my $ftype = $2;
	
	$self->load_feature_indexes();

	my $btree = $self->{_feat_btree}->{$ftype};
	if ($btree)
	{
	    my($loc, $idx, @aliases) = split(/$;/, $btree->{$fid});
	    return wantarray ? split(/,/, $loc) : $loc;
	}
	else
	{
	    &load_tbl($self);
	    if (my $x = $self->{_tbl}->{$fid})
	    {
		if (wantarray)
		{
		    return @{$x->[0]};
		}
		else
		{
		    return join(",",@{$x->[0]});
		}
	    }
	    else
	    {
		return undef;
	    }
	}
    }
    else
    {
	return scalar $fig->feature_location($fid);
    }
}

sub assign_function
{
    my($self, $fid, $user, $function, $confidence) = @_;

    if (!$self->is_virtual_feature($fid))
    {
	return $self->{_fig}->assign_function($fid, $user, $function, $confidence);
    }
    
    $user =~ s/^master://i;
    if ((! $self->is_real_feature($fid)) || (! $user)) { return 0 }
    my $genome = $self->genome_of($fid);

    #  Just to make this a little less fragile, convert user of form
    #  master:name to master. -- GJO

    $user = 'master';   # from this point on all assignments are treated as master assignments

    $function =~ s/\s+/ /sg;
    $function =~ s/^\s+//;
    $function =~ s/\s+$//;
    if ($function =~ /^(.*?)[\!](.*)/)
    {
	my $kvs;
        ($function,$kvs) = ($1,$2);
	#
	# We don't have support for these any more
	#
	warn "Ignoring key/value pairs in assignment to $fid\n";
        if (0 and $kvs)
        {
            $kvs =~ s/^\s+//;
            $kvs =~ s/\s+$//;
            foreach my $kv (split(/\s+[\!\#]\s+/,$kvs))
            {
                if ($kv =~ /^([A-Za-z0-9._\-\+\%]+)\s+\^\s+(.*)$/)
                {
                    my ($k,$v) = ($1,$2);
                    if ($v !~ /\S/)
                    {
                        &replace_peg_key_value($self,$fid,$k,"");
                    }
                    else
                    {
                        &replace_peg_key_value($self,$fid,$k,$v);
                    }
                }
                elsif ($kv =~ /^([A-Za-z0-9._\-\+\%]+)$/)
                {
                    &replace_peg_key_value($self,$fid,$1,1);
                }
            }
        }
    }

    $confidence = $confidence ? $confidence : "";

    $self->{_functions}->{$fid} = $function;

    my $file = "$self->{_orgdir}/proposed_user_functions";
    if ( ! open( TMP, ">>$file" ) )
    {
        print STDERR "FAILED ASSIGNMENT: $fid\t$function\t$confidence\n";
        return 0;
    }

    flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions";
    seek(TMP,0,2)      || confess "failed to seek to the end of the file";
    print TMP "$fid\t$function\t$confidence\n";
    close(TMP);
    chmod(0777,$file);

    return 1;
}

sub add_annotation {
    my($self,$feature_id,$user,$annotation, $time_made) = @_;
    my($genome);

    if (!$self->is_virtual_feature($feature_id))
    {
	return $self->{_fig}->add_annotation($feature_id,$user,$annotation, $time_made);
    }

    $time_made = time unless $time_made =~ /^\d+$/;

    if ($self->is_deleted_fid($feature_id)) { return 0 }

#   print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n";
    if ($genome = $self->genome_of($feature_id))
    {
	my $file = "$self->{_orgdir}/annotations";
        my $ma   = ($annotation =~ /^Set master function to/);

        if (open(TMP,">>$file"))
        {
            flock(TMP,LOCK_EX) || confess "cannot lock annotations";
            seek(TMP,0,2)      || confess "failed to seek to the end of the file";

            my $dataLine = "$feature_id\n$time_made\n$user\n$annotation" . ((substr($annotation,-1) eq "\n") ? "" : "\n");
            print TMP $dataLine . "//\n";
            close(TMP);
            chmod 0777, $file;

	    #
	    # Update local cache.
	    #
	    my $ann = $self->{_ann};
	    push(@{$ann->{$feature_id}}, [$feature_id, $time_made, $user, $annotation . "\n"]);
        }
    }
    return 0;
}


sub function_of {
    my($self,$fid) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($fid =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_functions($self);
	
	my $fn = $self->{_functions}->{$fid};
	if (wantarray)
	{
	    return ['master', $fn];
	}
	else
	{
	    return $fn;
	}
    }
    else
    {
	return $fig->function_of($fid);
    }
}


=pod

find_features_by_annotation

Takes a reference to a hash of functions to find and an optional case boolean, and returns a hash with keys are the function and values are a reference to an array of the IDs that have that function.

If the case boolean is set the search is case insensitive. Otherwise case sensitive.

=cut

sub find_features_by_annotation {
	my($self,$anno_hash, $case)=@_;
	$self->load_functions;

	if ($case) {map {$anno_hash->{uc($_)}=1} keys %$anno_hash}
	
	my $res={};
	foreach my $id (keys %{$self->{_functions}})
	{
		my $fn = $self->{_functions}->{$id};
		$case ? $fn = uc($fn) : 1;
		if ($anno_hash->{$fn}) {push @{$res->{$fn}}, $id}
	}
	
	return $res;
}


=pod 

search_features_by_annotation

Takes a string to find and an optional case boolean, and returns a hash with keys are the function and values are a reference to an array of the IDs that have that function.

If the case boolean is set the search is case insensitive. Otherwise case sensitive.

Note that this was originally based on the find above, but this uses a regexp for matching. Will likely be a lot slower which is why I only search for a single term. There may be an SQL way of doing this, if so let Rob know how to do it and I'll replace this method.


=cut

sub search_features_by_annotation {
	my($self,$term, $case)=@_;
	$self->load_functions;

	# to make case insensitive convert everything to uppercase
	# alternative is to use two regexps, one for case insens and one for not case insens
	# but the bad thing about that approach is that if you have a case insensitive search
	# you do two regexps for each failed match
	
	$case ? $term = uc($term) : 1;

	my $res={};
	foreach my $id (keys %{$self->{_functions}})
	{
		# we set two variables, one that has case changed for case insensitive searches
		my $fn = my $fnc = $self->{_functions}->{$id};
		$case ? $fn = uc($fn) : 1;
		if ($fn =~ m/$term/) {push @{$res->{$fnc}}, $id}
	}
	
	return $res;
}


sub feature_aliases {
    my($self,$fid) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my @aliases;
    if (($fid =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_tbl($self);
	if (my $x = $self->{_tbl}->{$fid})
	{
	    @aliases = @{$x->[1]};
	}
	else
	{
	    @aliases = ();
	}
    }
    else
    {
	@aliases = $fig->feature_aliases($fid);
    }
    return wantarray() ? @aliases : join(",",@aliases);
}

sub feature_annotations {
    my($self,$fid,$rawtime) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my @annotations;
    if (($fid =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_ann($self);
	if (my $x = $self->{_ann}->{$fid})
	{
	    @annotations = @{$x};
	}
	else
	{
	    @annotations = ();
	}

	if ($rawtime)
	{
	    return @annotations;
	}
	else
	{
	    return map { $_->[1] = localtime($_->[1]); $_ } @annotations;
	}
    }
    else
    {
	return $fig->feature_annotations($fid);
    }
}

sub get_translation {
    my($self,$peg) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_feature_indexes($self);

	my $out = $self->{_feat}->{peg}->{$peg};
	if (defined($out))
	{
	    return $out;
	}

	#
	# If we have a blast-formatted fasta, use fastacmd to
	# do the lookup, and cache the output for later use.
	#

	if ($self->{_feat_fasta}->{peg})
	{
	    my $id = "gnl|$peg";
	    my $cmd = "$FIG_Config::ext_bin/fastacmd -d $self->{_feat_fasta}->{peg} -s '$id'";
	    open(P, "$cmd|") or die "get_translation: cmd failed with $!: $cmd";
	    $_ = <P>;
	    my $out;
	    while (<P>)
	    {
		s/\s+//g;
		$out .= $_;
	    }
	    close(P);
	    $self->{_feat}->{$peg} = $out;
	    return $out;
	}
	else
	{
	    return $self->{_feat}->{$peg};
	}
    }
    else
    {
	return $fig->get_translation($peg);
    }
}

sub translation_length
{
    my($self, $peg) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	my $t = $self->get_translation($peg);
	return length($t);
    }
    else
    {
	return $fig->translation_length($peg);
    }
}

sub translatable
{
    my($self, $peg) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	return $self->translation_length($peg) > 0 ? 1 : 0;
    }
    else
    {
	return $fig->translatable($peg);
    }
}

sub pick_gene_boundaries {
    return &FIG::pick_gene_boundaries(@_);
}

sub call_start {
    return &FIG::call_start(@_);
}

sub is_real_feature
{
    my($self, $fid) = @_;

    if ($self->is_virtual_feature($fid))
    {
	return 1;
    }
    else
    {
	return $self->{_fig}->is_real_feature($fid);
    }

}

sub is_deleted_fid
{
    my($self, $fid) = @_;

    if ($self->is_virtual_feature($fid))
    {
	return 0;
    }
    else
    {
	return $self->{_fig}->is_real_feature($fid);
    }

}

sub pegs_of
{
    my($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->pegs_of($genome);
    }

    $self->load_feature_indexes();

    my $t = $self->{_feat_btree}->{peg};
    if ($t)
    {
	return keys %$t;
    }
    else
    {
	warn "Reverting to load_tbl for $newG\n";
	$self->load_tbl();
	return grep { /\.peg\./ } keys %{$self->{_tbl}};
    }
}


sub rnas_of
{
    my($self, $genome) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($genome ne $newG)
    {
	return $fig->rna_of($genome);
    }

    $self->load_feature_indexes();

    my $t = $self->{_feat_btree}->{rna};
    if ($t)
    {
	return keys %$t;
    }
    else
    {
	warn "Reverting to load_tbl for $newG\n";
	$self->load_tbl();
	return grep { /\.rna\./ } keys %{$self->{_tbl}};
    }

}

sub is_virtual_feature
{
    my($self, $peg) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	return 1;
    }
    else
    {
	return 0;
    }
}

####################################################
#
# Following are some MG-RAST specific features. FIGV seems a good a place as any to include them.#
#
#

=head3 taxa_to_seed_ids

Given a prefix of a taxonomy string, return the list of metagenome fids that
mapped to SEED organisms in that taxonomy.

=cut

sub taxa_to_seed_ids
{
    my($self, $tax) = @_;

    my $btree_file = "$self->{_orgdir}/taxa_summary_by_blast.btree";
    my %btree;

    my $btree_tie = tie %btree, 'DB_File', $btree_file, O_RDONLY, 0666, $DB_BTREE;

    if (!$btree_tie)
    {
	warn "Cannot tie $btree_file: $!\n";
	return undef;
    }

    my $key = $tax;
    my ($res, $value);
    my @vals;
    my %seen;

    for ($res = $btree_tie->seq($key, $value, R_CURSOR);
	 $res == 0 and $key =~ /^$tax/;
	 $res = $btree_tie->seq($key, $value, R_NEXT))
    {
	print "$key\n";
	push(@vals, map { [$key, $_] } grep { not $seen{$_}++ } split(/$;/, $value));
    }
    return @vals;
}

sub load_feature_indexes
{
    my($self) = @_;

    if ($self->{_feat}) { return };

    my $newGdir = $self->{_orgdir};

    for my $fdir (<$newGdir/Features/*>)
    {
	my $ftype = basename($fdir);

	#
	# If we have a tbl.btree, tie that for our use.
	#
	
	my $tbl_idx = {};
	my $tie = tie %$tbl_idx, 'DB_File', "$fdir/tbl.btree", O_RDONLY, 0666, $DB_BTREE;
	if ($tie)
	{
	    $self->{_feat_tie}->{$ftype} = $tie;
	    $self->{_feat_btree}->{$ftype} = $tbl_idx;
	}
	
	my $tbl_list = [];
	my $ltie = tie @$tbl_list, 'DB_File', "$fdir/tbl.recno", O_RDONLY, 0666, $DB_RECNO;
	if ($tie)
	{
	    $self->{_feat_ltie}->{$ftype} = $ltie;
	    $self->{_feat_recno}->{$ftype} = $tbl_list;
	}
	
	#
	# If we have fasta.norm.phr, set _pseq_fasta to the fasta file to use with fastacmd.
	#
	
	my $pseq     = {};
	
	if (-f "$fdir/fasta.norm.phr")
	{
	    $self->{_feat_fasta}->{$ftype} = "$fdir/fasta.norm";
	}
	else
	{
	    #
	    # Otherwise, we need to load the data.
	    #

	    if (open(FASTA,"<$newGdir/Features/peg/fasta"))
	    {
		local $/ = "\n>";
		my $x;
		while (defined($x = <FASTA>))
		{
		    chomp $x;
		    if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		    {
			my $peg = $1;
			my $seq = $2;
			$seq =~ s/\s//gs;
			$pseq->{$peg} = $seq;
		    }
		}
		close(FASTA);
	    }
	}
	$self->{_feat}->{$ftype} = $pseq;
    }
}

sub load_pseq {
    my($self) = @_;

    if ($self->{_pseq}) { return };

    my $newGdir = $self->{_orgdir};
    my $fdir = "$newGdir/Features/peg";

    #
    # If we have a tbl.btree, tie that for our use.
    #

    my $tbl_idx = {};
    my $tie = tie %$tbl_idx, 'DB_File', "$fdir/tbl.btree", O_RDONLY, 0666, $DB_BTREE;
    if ($tie)
    {
	$self->{_tbl_tie} = $tie;
	$self->{_tbl_btree} = $tbl_idx;
    }

    #
    # If we have fasta.norm.phr, set _pseq_fasta to the fasta file to use with fastacmd.
    #

    my $pseq     = {};

    if (-f "$fdir/fasta.norm.phr")
    {
	$self->{_pseq_fasta} = "$fdir/fasta.norm";
    }
    else
    {
	#
	# Otherwise, we need to load the data.
	#

	if (open(FASTA,"<$newGdir/Features/peg/fasta"))
	{
	    local $/ = "\n>";
	    my $x;
	    while (defined($x = <FASTA>))
	    {
		chomp $x;
		if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		{
		    my $peg = $1;
		    my $seq = $2;
		    $seq =~ s/\s//gs;
		    $pseq->{$peg} = $seq;
		}
	    }
	    close(FASTA);
	}
    }
    $self->{_pseq} = $pseq;
}

sub load_ss_data
{
    my($self) = @_;

    return if defined($self->{_ss_bindings});

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    open(S, "<$newGdir/Subsystems/bindings") or die "Cannot open $newGdir/Subsystems/bindings: $!";

    my $peg_index;
    my $bindings;
    my $genome_index;
    my %seen;
    while (<S>)
    {
	chomp;
	my($sname, $role, $peg) = split(/\t/);

	my($genome) = ($peg =~ /fig\|(\d+\.\d+)/);

	# my $genome = &FIG::genome_of($peg);

	push(@{$bindings->{$sname}->{$role}}, $peg);
	push(@{$peg_index->{$peg}}, [$sname, $role]);

	if (!$seen{$genome, $sname, $role})
	{
	    push(@{$genome_index->{$genome}}, [$sname, $role]);
	}

    }
    close(S);

    open(S, "<$newGdir/Subsystems/subsystems") or die "Cannot open $newGdir/Subsystems/subsystems: $!";
    my $variant;
    while (<S>)
    {
	chomp;
	my($sname, $v) = split(/\t/);
	$variant->{$sname} = $v;
    }
    close(S);

    $self->{_ss_bindings} = $bindings;
    $self->{_ss_variants} = $variant;
    $self->{_ss_peg_index} = $peg_index;
    $self->{_ss_genome_index} = $genome_index;
}

sub load_tbl {
    my($self) = @_;

    if ($self->{_tbl}) { return };

    my $newGdir = $self->{_orgdir};
    my $tbl     = {};
    foreach my $x (`cat $newGdir/Features/*/tbl`)
    {
	if ($x =~ /^(\S+)\t(\S+)(\t(\S.*\S))?/)
	{
	    my $fid = $1;
	    my $loc = [split(/,/,$2)];
	    my $aliases = $4 ? [split(/\t/,$4)] : [];
	    $tbl->{$fid} = [$loc,$aliases];
	}
    }
    $self->{_tbl} = $tbl;
}

sub load_functions {
    my($self) = @_;

    if ($self->{_functions}) { return };

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $functions  = {};

    my $bfile = "$newGdir/assigned_functions.btree";
    my $tie;
    if (-f $bfile)
    {
	$tie = tie %$functions, 'DB_File', $bfile, O_RDONLY, 0666, $DB_BTREE;
    }

    if (!$tie)
    {
	foreach my $x (`cat $newGdir/*functions`)
	{
	    if (($x =~ /^(fig\|(\d+\.\d+)\.\S+)\t(\S[^\t]*\S)/) && ($2 eq $newG))
	    {
		$functions->{$1} = $3;
	    }
	}
    }
    
    $self->{_functions} = $functions;
}


sub bbhs
{
    my($self,$peg,$cutoff) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    $self->load_bbh_index();

    $cutoff = 1.0e-10 unless defined($cutoff);

    my $tie = $self->{_bbh_tie};

    my @bbhs = $tie->get_dup($peg);

    # @bbhs now a list of comma-separated tuples (id2, score, bitscore)
#    print Dumper(\@bbhs);

    @bbhs = grep { $_->[1] < $cutoff } map { [split(/,/)] } @bbhs;

    if (not ($peg =~ /^fig\|(\d+\.\d+)/ && ($1 eq $newG)))
    {
	#
	# If this isn't one of our pegs, we retrieve the bbhs from the underlying
	# SEED and merge in the bbhs that we have here.
	#

	my @fbbhs = $fig->bbhs($peg, $cutoff);
#	print Dumper(\@fbbhs);
	push(@bbhs, @fbbhs);
    }
    return sort { $a->[1] <=> $b->[1] } @bbhs;
}

sub sims
{
    my($self,$pegarg,$max,$maxP,$select, $max_expand, $filters) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};
    $max     = $max ? $max : 10000;
    $maxP    = $maxP ? $maxP : 1.0e-5;
    $select     = $select ? $select : "all";

    my $prefix = "_sims";
    my $flip_prefix = "_flips";

    if ($select eq 'raw')
    {
	$prefix = "_raw_sims";
	$flip_prefix = "_raw_flips";
    }

    #
    # Partition pegs into one set that is part of this organism
    # and another set that is not.
    #
    my @pegs = ref($pegarg) ? @$pegarg : ($pegarg);
    my(@part, @not_part, %part);
    my %sims;

    for my $peg  (@pegs)
    {
	if ($peg =~ /^fig\|(\d+\.\d+)/ and $1 eq $newG)
	{
	    push(@part, $peg);
	    $part{$peg}++;
	}
	else
	{
	    push(@not_part, $peg);
	    $sims{$peg} = [];
	}
    }

    #
    # Retrieve a batch of the sims for the not-part pegs, and partition
    # into %sims.
    #
    for my $sim ($fig->sims(\@not_part, $max, $maxP, $select, $max_expand, $filters))
    {
	push(@{$sims{$sim->id1}}, $sim);
    }

    my @out;
    my $start_len;

    for my $peg (@pegs)
    {
	$start_len = @out;
	if (not $part{$peg})
	{
	    my @flips = $self->retrieve_sims($peg, 	$flip_prefix, $maxP, $select);

	    @flips = sort { $b->bsc <=> $a->bsc } @flips;

	    # my @old = $fig->sims($peg,$max,$maxP,$select);

	    my @old = sort { $b->bsc <=> $a->bsc } @{$sims{$peg}};

	    # my @merged = ();
	    my $i1 = 0;
	    my $i2 = 0;
	    while (($i1 < @flips) || ($i2 < @old))
	    {
		if (($i1 == @flips) || (($i2 < @old) && ($flips[$i1]->[11] < $old[$i2]->[11])))
		{
		    # push(@merged,$old[$i2]);
		    push(@out,$old[$i2]);
		    $i2++;
		}
		else
		{
		    # push(@merged,$flips[$i1]);
		    push(@out,$flips[$i1]);
		    $i1++;
		}
	    }
	}
	else
	{
	    my @sims = $self->retrieve_sims($peg, $prefix, $maxP, $select);
	    push(@out, @sims);
	}

	if (@out - $start_len > $max)
	{
	    $#out = $start_len + $max - 1;
	}
    }

    return @out;
}

sub retrieve_sims
{
    my($self, $peg, $prefix, $maxP, $select) = @_;

    $self->load_sims_index();

    my $info = $self->{"${prefix}_index"}->{$peg};

    if ($info !~ /^(\d+),(\d+)$/)
    {
	return;
    }
    my($seek, $len) = ($1, $2);

    my $sims_txt = &FIG::read_block($self->{"${prefix}_fh"}, $seek, $len);

    my @sims = map { bless $_, 'Sim' } grep { $_->[10] <= $maxP } map { [split(/\t/)] } @$sims_txt;

    if ($select =~ /^figx?$/)
    {
	@sims = grep { $_->[1] =~ /^fig/ } @sims;
    }

    return @sims;
}

sub sims_old {
    my($self,$peg,$max,$maxP,$select) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};
    $max     = $max ? $max : 10000;
    $maxP    = $maxP ? $maxP : 1.0e-5;
    $select     = $select ? $select : "all";

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_sims($self);
	my @sims = ();
	my $raw_sims = $self->{_sims}->{$peg};
	if ($raw_sims)
	{
	    foreach my $sim ( grep { $_->[10] <= $maxP } @$raw_sims )
	    {
		my $id2 = $sim->id2;
		my $id1 = $sim->id1;
		my @relevant = ();

		my @maps_to = $fig->mapped_prot_ids( $id2 );
		my $ref_len = $maps_to[0]->[1];

		@maps_to = grep { $_->[0] !~ /^xxx\d+/ } @maps_to;

		if ( $select =~ /^figx?$/ )          # Only fig
		{
		    @relevant = grep { $_->[0] =~ /^fig/ } @maps_to;
		}
		else                                 # All
		{
		    @relevant = @maps_to;
		}

		my $seen = {};
		foreach my $x ( @relevant )
		{
		    my ( $x_id, $x_ln ) = @$x;

		    next if $seen->{$x_id};
		    $seen->{$x_id} = 1;

		    my $delta2  = $ref_len - $x_ln;   # Coordinate shift
		    my $sim1    = [ @$sim ];                  # Make a copy
		    $sim1->[1]  = $x_id;
		    $sim1->[8] -= $delta2;
		    $sim1->[9] -= $delta2;
		    bless( $sim1, "Sim" );
		    push( @sims, $sim1 );
		}
	    }
	}

	if (@sims > $max) { $#sims = $max-1; }
	return @sims;
    }
    else
    {
	return $fig->sims($peg,$max,$maxP,$select);
    }
}


sub coupled_to
{
    my($self,$peg) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($peg =~ /^fig\|$newG\.peg/)
    {

	$self->load_coupling_index();

	my $tie = $self->{_pch_tie};

	if (not defined($tie))
	{
	    return;
	}

	my @dat = $tie->get_dup($peg);

	return map { [split(/$;/, $_)] } @dat;
    }
    else
    {
	return $fig->coupled_to($peg);
    }

}

sub coupling_evidence
{
    my($self,$peg1, $peg2) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($peg1 =~ /^fig\|$newG\.peg/)
    {
	$self->load_coupling_index();

	my $tie = $self->{_pch_ev_tie};

	if (not defined($tie))
	{
	    return;
	}

	my @dat = $tie->get_dup("$peg1$;$peg2");

	my @a;
	return map { @a = split(/$;/, $_); [@a[0,1,4]] } @dat;
    }
    else
    {
	return $fig->coupling_evidence($peg1, $peg2);
    }

}

sub coupling_and_evidence
{
    my($self,$peg1) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($peg1 =~ /^fig\|$newG\.peg/)
    {
	$self->load_coupling_index();

	my $tie = $self->{_pch_tie};
	my $evtie = $self->{_pch_ev_tie};

	if (not(defined($tie) and defined($evtie)))
	{
	    return;
	}

	my @out;
	my @coupled_to = $tie->get_dup($peg1);
	for my $ent (@coupled_to)
	{
	    my ($peg2, $score) = split(/$;/, $ent);

	    my @ev = $evtie->get_dup("$peg1$;$peg2");

	    my $l = [];
	    for my $event (@ev)
	    {
		my($peg3, $peg4, $iden3, $iden4, $rep) = split(/$;/, $event);
		push(@$l, [$peg3, $peg4]);
	    }
	    push(@out, [$score, $peg2, $l]);
	}
	return @out;
    }
    else
    {
	return $fig->coupling_and_evidence($peg1);
    }

}

sub in_pch_pin_with
{
    my($self, $peg1, $diverse) = @_;

    my @all = $self->in_pch_pin_with_and_evidence($peg1);

    if ($diverse)
    {
	return map { $_->[0] } grep { $_->[1] == 1 } @all;
    }
    else
    {
	return map { $_->[0] } @all;
    }
}

sub in_pch_pin_with_and_evidence
{
    my($self,$peg1) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($peg1 !~ /^fig\|$newG\.peg/)
    {
	return $fig->in_pch_pin_with_and_evidence($peg1);
    }

    $self->load_coupling_index();

    my $evtie = $self->{_pch_ev_tie};

    my($key, $value, $st);

    my %max;

    $key = "$peg1$;";
    my $qkey = quotemeta($key);
    for ($st = $evtie->seq($key, $value, R_CURSOR); $st == 0 and $key =~ /^$qkey/; $st = $evtie->seq($key, $value, R_NEXT))
    {
#	print "key=$key value=$value\n";

	my($peg3, $peg4, $iden3, $iden4, $rep) = split(/$;/, $value);

	if (exists($max{$peg3}))
	{
	    $max{$peg3} = $rep if $rep > $max{$peg3};
	}
	else
	{
	    $max{$peg3} = $rep;
	}
    }

    return map { [$_, $max{$_}] } keys %max;
}


sub load_coupling_index
{
    my($self) = @_;

    return if defined($self->{_pch_index});

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $pch_btree = "$newGdir/pchs.btree";
    my $ev_btree = "$newGdir/pchs.evidence.btree";

    my $pch_index = {};
    my $ev_index = {};

    my $tied = tie %$pch_index, 'DB_File', $pch_btree, O_RDONLY, 0666, $DB_BTREE;
    my $evtied = tie %$ev_index, 'DB_File', $ev_btree, O_RDONLY, 0666, $DB_BTREE;

    #
    # Set these even if failed so we don't keep trying to open and failing.
    #
    $self->{_pch_index} = $pch_index;
    $self->{_pch_tie} = $tied;
    $self->{_pch_ev_index} = $ev_index;
    $self->{_pch_ev_tie} = $evtied;

    if (not $tied)
    {
	warn "Cannot tie pch index $pch_btree: $!\n";
    }
}

sub load_bbh_index
{
    my($self) = @_;

    return if defined($self->{_bbh_index});

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $bbh_file = "$newGdir/bbhs";
    my $bbh_index_file = "$bbh_file.index";
    my $bbh_index = {};

    my $tied = tie %$bbh_index, 'DB_File', $bbh_index_file, O_RDONLY, 0666, $DB_BTREE;

    #
    # Set these even if failed so we don't keep trying to open and failing.
    #
    $self->{_bbh_index} = $bbh_index;
    $self->{_bbh_tie} = $tied;

    if (not $tied)
    {
	warn "Cannot tie bbh index $bbh_index_file: $!\n";
    }
}

sub load_contigs_index
{
    my($self) = @_;

    return if defined($self->{_contigs_index});

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $contig_index_file = "$newGdir/contigs.btree";
    my $contig_index = {};
    my $contig_len_index_file = "$newGdir/contig_len.btree";
    my $contig_len_index = {};

    my $tied = tie %$contig_index, 'DB_File', $contig_index_file, O_RDONLY, 0666, $DB_BTREE;
    if (not $tied)
    {
	# warn "Cannot tie contig index $contig_index_file: $!\n";
    }

    my $ltied = tie %$contig_len_index, 'DB_File', $contig_len_index_file, O_RDONLY, 0666, $DB_BTREE;
    if (not $ltied)
    {
	# warn "Cannot tie contig length index $contig_len_index_file: $!\n";
    }

    #
    # Set these even if failed so we don't keep trying to open and failing.
    #
    $self->{_contigs_index} = $contig_index;
    $self->{_contigs_tie} = $tied;
    $self->{_contig_len_index} = $contig_len_index;
    $self->{_contig_len_tie} = $ltied;

}

sub load_sims_index
{
    my($self) = @_;

    return if defined($self->{_sims_index});

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $sims_file = "$newGdir/expanded_similarities";
    my $sims_index_file = "$sims_file.index";
    my $sims_index = {};

    my $flips_file = "$newGdir/expanded_similarities.flips";
    my $flips_index_file = "$flips_file.index";
    my $flips_index = {};

    my $raw_sims_file = "$newGdir/similarities";
    my $raw_sims_index_file = "$raw_sims_file.index";
    my $raw_sims_index = {};

    my $raw_flips_file = "$newGdir/similarities.flips";
    my $raw_flips_index_file = "$raw_flips_file.index";
    my $raw_flips_index = {};

    $self->open_sims($sims_index, $sims_file, $sims_index_file, "_sims");
    $self->open_sims($flips_index, $flips_file, $flips_index_file, "_flips");
    $self->open_sims($raw_sims_index, $raw_sims_file, $raw_sims_index_file, "_raw_sims");
    $self->open_sims($raw_flips_index, $raw_flips_file, $raw_flips_index_file, "_raw_flips");
}

#
# Open a sims file, tie it to a hash, and store the info into the $self obj.
#
sub open_sims
{
    my($self, $hash, $sims_file, $index_file, $prefix) = @_;

    my $tied = tie %$hash, 'DB_File', $index_file, O_RDONLY, 0666, $DB_BTREE;

    #
    # Set these even if failed so we don't keep trying to open and failing.
    #
    $self->{"${prefix}_index"} = $hash;
    $self->{"${prefix}_tie"} = $tied;

    if (not $tied)
    {
	warn "Cannot tie sims index $index_file: $!\n";
    }

    #
    # open the sims file as well.
    #

    $self->{"${prefix}_fh"} = new FileHandle("<$sims_file");

    if (!$self->{"${prefix}_fh"})
    {
	warn "Cannot open sims file $sims_file: $!\n";
    }
}

sub load_sims {
    my($self) = @_;

    if ($self->{_sims}) { return };

    my $newGdir = $self->{_orgdir};

    my $sims     = {};
    foreach my $x (`cat $newGdir/similarities`)
    {
	chomp $x;
	if ($x =~ /^(\S+)/)
	{
	    push(@{$sims->{$1}},bless([split(/\t/,$x)],'Sim'));
	}
    }
    $self->{_sims} = $sims;
}

sub get_attributes {
    my($self, $id, $attr) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ((($id =~ /^fig\|(\d+\.\d+)\.peg\.\d+$/) and ($1 eq $newG)) or
	(($id =~ /^(\d+\.\d+)/  and $1 eq $newG)))
    {
	&load_attr($self);

	my $t = $self->{_attr_id_tie};
	if (!$t)
	{
	    #
	    # Tied index not present, bail back to simple attributes.
	    #

	    my $l = $self->{_attr_id}->{$id};
	    return $l ? @$l : ();
	}

	my @v = $t->get_dup($id);

	my @out;
	for my $v (@v)
	{
	    my @a = split(/$;/, $v);

	    if (!defined($attr) or $attr eq $a[1])
	    {
		push(@out, [@a]);
	    }
	}
	return @out;
    }
    else
    {
	my @f = $fig->get_attributes($id, $attr);
	if (!defined($id) and defined(my $t = $self->{_attr_key_tie}))
	{
	    #
	    # lookup locally for $attr matches and merge with other output.
	    #

	    my @mine = $t->get_dup($attr);
	    @mine = map { [split(/$;/, $_)] } @mine;

	    return (@mine, @f);
	}
	else
	{
	    return @f;
	}
    }
}

sub load_attr {
    my($self) = @_;

    if ($self->{_attr_id}) { return };

    my $newGdir = $self->{_orgdir};

    my $id = {};
    my $key = {};

    $self->{_attr_id_tie} = tie %$id, 'DB_File', "$newGdir/attr_id.btree", O_RDONLY, 0, $DB_BTREE;
    $self->{_attr_key_tie} = tie %$key, 'DB_File', "$newGdir/attr_key.btree", O_RDONLY, 0, $DB_BTREE;

    $self->{_attr_id} = $id;
    $self->{_attr_key} = $key;

    #
    # If the tie didn't work for ids, at least load up the evidence codes.
    #

    if (!$self->{_attr_id_tie})
    {
	foreach my $x (`cat $newGdir/evidence.codes`)
	{
	    if ($x =~ /^(\S+)\t(\S+)/)
	    {
		push(@{$id->{$1}},[$1,"evidence_code",$2,""]);
	    }
	}
    }

}

sub load_ann {
    my($self) = @_;

    if ($self->{_ann}) { return };

    my $newGdir = $self->{_orgdir};
    my $ann     = {};
    if (open(ANN,"<$newGdir/annotations"))
    {
	$/ = "\n//\n";
	while (defined(my $x = <ANN>))
	{
	    chomp $x;
	    if ($x =~ /^(\S+)\n([^\n]+)\n([^\n]+)\n(.*)/s)
	    {
		push(@{$ann->{$1}},[$1,$2,$3,"$4\n"]);
	    }
	}
	$/ = "\n";
	close(ANN);
    }
    $self->{_ann} = $ann;
}

sub taxonomy_of {
    my($self,$genome) = @_;
    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if ($newG ne $genome)
    {
	return $fig->taxonomy_of($genome);
    }

    my $tax;
    if (open(TAX,"<$newGdir/TAXONOMY") && ($tax = <TAX>))
    {
	chop $tax;
	return $tax;
    }
    else
    {
	return "unknown";
    }
}

sub build_tree_of_complete {
    my($self,$min_for_label) = @_;
    return $self->build_tree_of_all($min_for_label, "complete");
}

sub build_tree_of_all {
    my($self, $min_for_label, $complete)=@_;
    my(@last,@tax,$i,$prefix,$lev,$genome,$tax);

    $min_for_label = $min_for_label ? $min_for_label : 10;
    open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$";
    print TMP "1. root\n";

    @last = ();


    foreach $genome (grep { ! $self->is_environmental($_) } $self->sort_genomes_by_taxonomy($self->genomes($complete)))
    {
        $tax = $self->taxonomy_of($genome);
        @tax = split(/\s*;\s*/,$tax);
        push(@tax,$genome);
        for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {}
        while ($i < @tax)
        {
            $lev = $i+2;
            $prefix = " " x (4 * ($lev-1));
            print TMP "$prefix$lev\. $tax[$i]\n";
            $i++;
        }
        @last = @tax;
    }
    close(TMP);
    my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$");
    $tree->[0] = 'All';
    &FIG::limit_labels($tree,$min_for_label);
    unlink("/tmp/tree$$");
    return ($tree,&tree_utilities::tips_of_tree($tree));
}

sub sort_genomes_by_taxonomy {
    my($self,@genomes) = @_;

    return map     { $_->[0] }
           sort    { $a->[1] cmp $b->[1] }
           map     { [$_,$self->taxonomy_of($_)] }
           @genomes;
}

sub taxonomic_groups_of_complete {
    my($self,$min_for_labels) = @_;

    my($tree,undef) = $self->build_tree_of_complete($min_for_labels);
    return &FIG::taxonomic_groups($tree);
}

=head2 Search Database

Searches the database for objects that match the query string in some way.

Returns a list of results if the query is ambiguous or an unique identifier
otherwise.

=cut

sub search_database {
    # get parameters
    my ($self, $query, $options) = @_;

    my $fig     = $self->{_fig};
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    #
    # Check for local ids and genome.
    #


    if ($query eq $newG or lc($query) eq lc($self->genus_species($newG)))
    {
	return { type => 'organism', result => $newG };
    }

    if ($query =~ /^fig\|(\d+\.\d+)\.peg/ and $1 eq $newG)
    {
	return { type => 'feature', result => $query };
    }

    #
    # Match on functions
    #

    &load_functions($self);

    my @protlist;
    my $fn = $self->{_functions};
    for my $id (keys %$fn)
    {
	if ($fn->{$id} =~ /$query/i)
	{
	    push @protlist, [$id, $fn->{$id}, $newG];
	}
    }

    #
    # Pull FIG lookups.
    #

    my $res = $fig->search_database($query, $options);

    #
    # And integrate our protein list
    #

    my $res_prot;
    if (ref($res) eq 'ARRAY')
    {
	for my $ent (@$res)
	{
	    if ($ent->{type} eq 'proteins')
	    {
		$res_prot = $ent->{result};
	    }
	}
	if (!$res_prot)
	{
	    $res_prot = {type => 'proteins', result => \@protlist}
	}
	else
	{
	    push(@$res_prot, @protlist);
	}
    }

    return $res;

}


=head3 model_directory

    FIG->model_directory($organism);

Returns the model directory of an organism.

=over 4

=item $organism

The seed-taxonomy id of the organism, e.g. 83333.1. If you pass the
string 'All', you will get the directory for the global model.

=back

=cut

sub model_directory {
  my ($self, $organism) = @_;

  my $directory;

  if ($organism eq $self->{_genome})
  {
      $directory = $self->{_orgdir} . "/Models";
      $directory .= "/$organism" if defined($organism);
  }
  elsif(!defined $organism && defined $self->{_genome})
  {
      $directory = $self->{_orgdir} . "/Models";
  }
  else {
      $directory = $self->{_fig}->model_directory($organism);
  }

  return $directory;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3