[Bio] / FigKernelPackages / SeedV.pm Repository:
ViewVC logotype

View of /FigKernelPackages/SeedV.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Thu Dec 17 21:35:51 2009 UTC (10 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2010_0526, rast_rel_2010_0118
Changes since 1.1: +5 -1 lines
remove Tracer ref

# -*- perl -*-

#
# This is a SAS Component
#

#########################################################################
# Copyright (c) 2003-2008 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#########################################################################

package SeedV;

use Carp;
use strict;
use Data::Dumper;
use DB_File;
use File::Basename;

sub new {
    my ($class, $org_dir ) = @_;

    $org_dir || confess("$org_dir must be a valid SEED directory");
    my $self         = {};
    $self->{_orgdir} = $org_dir;
    ($org_dir =~ /(\d+\.\d+)$/) || confess("$org_dir must be a path ending in the genome ID");
    $self->{_genome} = $1;
    return bless $self, $class;
}

sub genome_id
{
    return $_[0]->{_genome};
}

# return the path of the organism directory
sub organism_directory {
  return $_[0]->{_orgdir};
}

sub get_basic_statistics
{
    my($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    #
    # Check cache.
    #

    my $cache = "$newGdir/cache.basic_statistics";
    my $fh = new FileHandle($cache);
    if ($fh)
    {
	my $stats = {};
	while (<$fh>)
	{
	    chomp;
	    my($k, $v) = split(/\t/);
	    $stats->{$k} = $v;
	}
	close($fh);
	return $stats;
    }


    my $subsystem_data = $self->get_genome_subsystem_data();

    my %sscount = map { $_->[0] => 1 } @$subsystem_data;
    my $nss=scalar(keys(%sscount));

    my $statistics = {
	num_subsystems => $nss,
	num_contigs    => scalar($self->all_contigs()),
	num_basepairs  => $self->genome_szdna(),
	genome_name    => $self->genus_species(),
	genome_domain  => $self->genome_domain(),
	genome_pegs    => $self->genome_pegs(),
	genome_rnas    => $self->genome_rnas(),
	};

    $fh = new FileHandle(">$cache");
    if ($fh)
    {
	while (my($k, $v) = each %$statistics)
	{
	    print $fh join("\t", $k, $v), "\n";
	}
	close($fh);
    }
    return $statistics;
}


sub get_peg_statistics {
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $cache = "$newGdir/cache.peg_statistics";
    my $fh = new FileHandle($cache);
    if ($fh)
    {
	my $stats = {};
	while (<$fh>)
	{
	    chomp;
	    my($k, $v) = split(/\t/);
	    $stats->{$k} = $v;
	}
	close($fh);
	return $stats;
    }


    my $subsystem_data = $self->get_genome_subsystem_data();
    my $assignment_data = $self->get_genome_assignment_data();

    my $hypo_sub = 0;
    my $hypo_nosub = 0;
    my $nothypo_sub = 0;
    my $nothypo_nosub = 0;
    my %in = map { $_->[2] => 1 } @$subsystem_data;
    my $in = keys(%in);

    my %sscount = map { $_->[0] => 1 } @$subsystem_data;

    foreach $_ (@$assignment_data)
    {
	my($peg,$func) = @$_;

	my $is_hypo = &SeedUtils::hypo($func);
	    
	if    ($is_hypo && $in{$peg})           { $hypo_sub++ }
	elsif ($is_hypo && ! $in{$peg})         { $hypo_nosub++ }
	elsif ((! $is_hypo) && (! $in{$peg}))   { $nothypo_nosub++ }
	elsif ((! $is_hypo) && $in{$peg})       { $nothypo_sub++ }
    }
    my $tot = $hypo_sub + $nothypo_sub + $hypo_nosub + $nothypo_nosub;

    my ($fracHS, $fracNHS, $fracHNS, $fracNHNS);

    if ($tot == 0) {
	$fracHS = sprintf "%.2f", 0.0;
	$fracNHS = sprintf "%.2f", 0.0;
	$fracHNS = sprintf "%.2f", 0.0;
	$fracNHNS = sprintf "%.2f", 0.0;
    } else {
	$fracHS = sprintf "%.2f", $hypo_sub / $tot * 100;
	$fracNHS = sprintf "%.2f", $nothypo_sub / $tot * 100;
	$fracHNS = sprintf "%.2f", $hypo_nosub / $tot * 100;
	$fracNHNS = sprintf "%.2f", $nothypo_nosub / $tot * 100;
    }

    my $statistics = {
	hypothetical_in_subsystem => $hypo_sub,
	hypothetical_not_in_subsystem => $hypo_nosub,
	non_hypothetical_in_subsystem => $nothypo_sub,
	non_hypothetical_not_in_subsystem => $nothypo_nosub,
	hypothetical_in_subsystem_percent => $fracHS,
	hypothetical_not_in_subsystem_percent => $fracHNS,
	non_hypothetical_in_subsystem_percent => $fracNHS,
	non_hypothetical_not_in_subsystem_percent => $fracNHNS
	};

    $fh = new FileHandle(">$cache");
    if ($fh)
    {
	while (my($k, $v) = each %$statistics)
	{
	    print $fh join("\t", $k, $v), "\n";
	}
	close($fh);
    }

    return $statistics;
}
sub get_variant_and_bindings
{
    my($self, $ssa) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    $self->load_ss_data();

    my $bindings = $self->{_ss_bindings}->{$ssa};
    my $variant = $self->{_ss_variants}->{$ssa};

    unless ($bindings) {
	$variant = '*-1';
	$bindings = {};	
    }

    return ($variant, $bindings);
}

sub active_subsystems
{
    my($self, $all) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    $self->load_ss_data();

    my $slist = {};

    if ($self->{_ss_variants})
    {
	%{$slist} = %{$self->{_ss_variants}};
    }

    if (not $all)
    {
	for my $ss (keys %$slist)
	{
	    my $var = $slist->{$ss};
	    delete $slist->{$ss} if $var eq 0 or $var eq -1;
	}
    }
    return $slist;
}

sub peg_to_subsystems
{
    my($self, $peg) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    $self->load_ss_data();

    my $variant = $self->{_ss_variants};
    my %sub = map { $_ => 1 }
              grep { $variant->{$_} !~ /^(-1)|0$/ }
              map { $_->[0] }
              $self->peg_to_roles_in_subsystems($peg);
    return sort keys(%sub);
}

sub peg_to_roles_in_subsystems
{
    my($self,$peg) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    $self->load_ss_data();

    my $ret  = $self->{_ss_peg_index}->{$peg};

    return $ret ? @$ret : ();
}

sub subsystems_for_peg
{
    my($self,$peg) = @_;
    return $self->peg_to_roles_in_subsystems($peg);
}

sub subsystems_for_peg_complete {
    my ($self, $peg) = @_;
    
    my $newG = $self->{_genome};

    $self->load_ss_data();

    my $ret = $self->{_ss_peg_index}->{$peg};
    if ($ret) {
	return map { [ $_->[0], $_->[1], $self->{_ss_variants}->{$_->[0] }, $peg ] } @$ret;
    } else {
	return ();
    }
}

sub genus_species {
    my($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (open(GENOME,"<$newGdir/GENOME"))
    {
	my $x = <GENOME>;
	close(GENOME);
	chop $x;
	return $x;
    }
    else
    {
	return "";
    }
}

sub get_genome_assignment_data {
    my($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my @fnfiles = <$newGdir/proposed*functions>;
    if (@fnfiles == 0)
    {
	@fnfiles = <$newGdir/assigned*functions>;
    }

    if (@fnfiles == 0)
    {
	return [];
    }
    my %assign;
    foreach $_ (`cat @fnfiles`)
    {
	if ( $_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S.*\S)/)
	{
	    my($fid,$func) = ($1,$2);
	    $assign{$fid} = $func;
	}
	return [map { [$_,$assign{$_}] } sort { &SeedUtils::by_fig_id($a,$b) } keys(%assign)];
    }
}

sub get_genome_subsystem_data {
    my($self,$genome) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my %operational = map { (($_ =~ /^(\S.*\S)\t(\S+)/) && (($2 ne '-1') && ($2 ne '0'))) ? ($1 => 1) : () }
		          `cat $newGdir/Subsystems/subsystems`;
	           
	return [grep { ! $self->is_deleted_fid($_->[2]) }
	        map { (($_ =~ /^(\S[^\t]+\S)\t(\S[^\t]*\S)\t(\S+)/) && $operational{$1} ) ? [$1,$2,$3] : () }
		`cat $newGdir/Subsystems/bindings`];
}

sub get_genome_subsystem_count
{
    my($self,$genome) = @_;
    
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};
    

	my $count = 0;
	my ($entry, $vc);
	open(SUBSYSTEMS, "<$newGdir/Subsystems/subsystems");
	while (defined($entry = <SUBSYSTEMS>))
	{
	    chomp $entry;
	    (undef, $vc) = split /\t/, $entry;
	    if ($vc != -1) { ++$count; }
	}
	close(SUBSYSTEMS);
	return $count;
}

sub orgname_of_orgid {
    my($self,$genome) = @_;

    return $self->genus_species();
}

sub genus_species_domain {
    my($self,$genome) = @_;

    return [$self->genus_species(),$self->genome_domain()];
}

sub protein_subsystem_to_roles {
    my ($self,$peg,$subsystem) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my @roles = map { (($_ =~ /^([^\t]+)\t([^\t]+)\t(\S+)$/) && ($1 eq $subsystem) && ($3 eq $peg)) ?
			  $2 : () } `cat $newGdir/Subsystems/bindings`;
	my %roles = map { $_ => 1 } @roles;
	return [sort keys(%roles)];
}

sub contig_lengths {
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my $contig_lengths = $self->{_contig_len_index};

	$self->load_contigs_index();
	if (!tied(%$contig_lengths))
	{
	    $self->load_contig_len_cache();
	    return $self->{_contig_len_cache};
	}
	return $contig_lengths;
}

sub load_contig_len_cache
{
    my($self) = @_;

    return if ref $self->{_contig_len_cache};
		  
    my $newGdir = $self->{_orgdir};

    my $contig_lengths = {};
    if (open(CONTIGS,"<$newGdir/contigs"))
    {
	local $/ = "\n>";
	while (defined(my $x = <CONTIGS>))
	{
	    chomp $x;
	    if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
	    {
		my $id = $1;
		my $seq = $2;
		$seq =~ s/\s//gs;
		$contig_lengths->{$id} = length($seq);
	    }
	}
	close(CONTIGS);
    }
    $self->{_contig_len_cache} = $contig_lengths;
}

sub contig_ln {
    my ($self, $contig) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my $contig_len = $self->{_contig_len_index};
	$self->load_contigs_index();
	if (tied(%$contig_len))
	{
	    return $contig_len->{$contig};
	}
	$self->load_contig_len_cache();
	return $self->{_contig_len_cache}->{$contig};
}

sub contigs_of
{
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my @out;
	$self->load_contigs_index();

	my $contigs = $self->{_contigs_index};
	if (tied(%$contigs))
	{
	    return keys %$contigs;
	}

	$self->load_contig_len_cache();

	return keys %{$self->{_contig_len_cache}};
}

=head3 dna_seq

usage: $seq = dna_seq(@locations)

Returns the concatenated subsequences described by the list of locations.  Each location
must be of the form

    Contig_Beg_End

where Contig must be the ID of a contig for genome $genome.  If Beg > End the location
describes a stretch of the complementary strand.

=cut
#: Return Type $;
sub dna_seq {
    my($self,@locations) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $contigs = $self->{_contigs_index};
    if (!tied %$contigs)
    {
	$self->load_contig_seq();
	$contigs = $self->{_contigs_seq_cache};
    }

    my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH);

    @locations = map { split(/,/,$_) } @locations;
    @pieces = ();
    foreach $loc (@locations)
    {
        if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
        {
            ($contig,$beg,$end) = ($1,$2,$3);
	    my $seq = $contigs->{$contig};

            $ln = length($seq);

            if (! $ln) {
                print STDERR "$contig: could not get length\n";
                return "";
            }

            if (&SeedUtils::between(1,$beg,$ln) && &SeedUtils::between(1,$end,$ln))
            {
                if ($beg < $end)
                {
                    push(@pieces, substr($seq, $beg - 1, ($end - $beg) + 1));
                }
                else
                {
                    push(@pieces, &SeedUtils::reverse_comp(substr($seq, $end - 1, ($beg - $end) + 1)));
                }
            }
        }
    }
    return lc(join("",@pieces));
}

sub load_contig_seq
{
    my($self) = @_;

    return if ref($self->{_contigs_seq_cache});

    my $newGdir = $self->{_orgdir};

    my $contigs = {};

    if (open(CONTIGS,"<$newGdir/contigs"))
    {
	local $/ = "\n>";
	while (defined(my $x = <CONTIGS>))
	{
	    chomp $x;
	    if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
	    {
		my $id = $1;
		my $seq = $2;
		$seq =~ s/\s//gs;
		$contigs->{$id} = $seq;
	    }
	}
	close(CONTIGS);
    }
    $self->{_contigs_seq_cache} = $contigs;
}

sub genome_szdna {
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my $contig_lens = $self->contig_lengths();
	my $tot = 0;
	while ( my($contig,$len) = each %$contig_lens)
	{
	    $tot += $len;
	}
	return $tot;
}

sub genome_pegs {
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my @tmp = $self->all_features("peg");
    my $n = @tmp;
    return $n;
}

sub genome_rnas {
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my @tmp = $self->all_features("rna");
	my $n = @tmp;
	return $n;
}

sub genome_domain {
    my ($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	my $tax = $self->taxonomy_of();
	return ($tax =~ /^([^ \t;]+)/) ? $1 : "unknown";
}

sub genes_in_region {
    my($self,$contig,$beg,$end) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	$self->load_feature_indexes();
	#
	# Use the recno index if exists.
	#

	my $maxV = 0;
	my $minV = 1000000000;
	my $genes = [];

	my $recnos = $self->{_feat_recno};

	if (ref($recnos))
	{
	    while (my($ftype, $list) = each (%$recnos))
	    {
		#
		# Look up start/end of this contig in the btree index.
		#
		
		my $inf = $self->{_feat_btree}->{$ftype}->{$contig};
		my($istart, $iend);
		if ($inf)
		{
		    ($istart, $iend) = split(/$;/, $inf);
		}
		else
		{
		    $istart = 0;
		    $iend = $#$list;
		}

		for (my $idx = $istart; $idx <= $iend; $idx++)
		{
		    my($fid, $fcontig, $fbeg, $fend, $fstrand) = split(/$;/, $list->[$idx]);
		    if ($contig eq $fcontig and &overlaps($beg, $end, $fbeg, $fend))
		    {
			$minV = &SeedUtils::min($minV,$fbeg,$fend);
			$maxV = &SeedUtils::max($maxV,$fbeg,$fend);
			push(@$genes,$fid);
		    }
		}
	    }
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    while ( my($fid,$tuple) = each %$tblH)
	    {
		if (($tuple->[0]->[0] =~ /^(\S+)_(\d+)_\d+$/) && ($1 eq $contig))
		{
		    my $beg1 = $2;
		    my $last = @{$tuple->[0]} - 1;
		    if (($tuple->[0]->[$last] =~ /^(\S+)_\d+_(\d+)$/) && ($1 eq $contig))
		    {
			my $end1 = $2;
			if (&overlaps($beg,$end,$beg1,$end1))
			{
			    $minV = &SeedUtils::min($minV,$beg1,$end1);
			    $maxV = &SeedUtils::max($maxV,$beg1,$end1);
			push(@$genes,$fid);
			}
		    }
		}
	    }
	}
	return ($genes,$minV,$maxV);
}

sub overlaps {
    my($b1,$e1,$b2,$e2) = @_;

    if ($b1 > $e1) { ($b1,$e1) = ($e1,$b1) }
    if ($b2 > $e2) { ($b2,$e2) = ($e2,$b2) }
    return &SeedUtils::between($b1,$b2,$e1) || &SeedUtils::between($b2,$b1,$e2);
}

sub all_contigs {
    my($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	$self->load_feature_indexes();

	my %contigs;
	my $btrees = $self->{_feat_btree};

	if (ref($btrees))
	{
	    while (my($ftype, $btree) = each (%$btrees))
	    {
		map { $contigs{$_} = 1 } grep { !/^fig/ } keys %$btree ;
	    }
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    while ( my($fid,$tuple) = each %$tblH)
	    {
		if ($tuple->[0]->[0] =~ /^(\S+)_\d+_\d+$/)
		{
		    $contigs{$1} = 1;
		}
	    }
	}
	return keys(%contigs);
}

sub all_features {
    my($self,$type) = @_;
    if (not defined($type)) { $type = qq(); }
    
    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};
    
	#warn "Loading feature indices";
	$self->load_feature_indexes();
	
	my %contigs;
	my $btrees = $self->{_feat_btree};
	
	if (ref($btrees)) {
	    warn "B-tree already loaded" if $ENV{FIG_VERBOSE};
	    
	    my $btree = $btrees->{$type};
	    return sort { &SeedUtils::by_fig_id($a, $b) }
	    	grep { /^fig/ } keys %$btree;
	}
	else {
	    warn "Loading contig B-tree" if $ENV{FIG_VERBOSE};
	    
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};

	    return sort { 
		&SeedUtils::by_fig_id($a,$b)
		} grep {
		    #...NOTE: Matches all feature types if $type is the null string
		    ($_ =~ /^fig\|\d+\.\d+\.([^\.]+)/) && ($1 =~ m/$type/)
		    } keys(%$tblH);
	}
}

sub all_features_detailed_fast {
    my($self,$regmin, $regmax, $contig) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

	$self->load_feature_indexes();
	my $feat_details = [];
	my $recnos = $self->{_feat_recno};

	if (ref($recnos))
	{
	    while (my($ftype, $list) = each (%$recnos))
	    {
		#
		# Look up start/end of this contig in the btree index.
		#
		
		my $inf = $self->{_feat_btree}->{$ftype}->{$contig};
		my($istart, $iend);
		if ($inf)
		{
		    ($istart, $iend) = split(/$;/, $inf);
		}
		else
		{
		    $istart = 0;
		    $iend = $#$list;
		}

		for (my $idx = $istart; $idx <= $iend; $idx++)
		{
		    my($fid, $fcontig, $fbeg, $fend, $fstrand) = split(/$;/, $list->[$idx]);

		    if (not defined($regmin) or not defined($regmax) or not defined($contig) or
			(($contig eq $fcontig) and
			 ($fbeg < $regmin and $regmin < $fend) or ($fbeg < $regmax and $regmax < $fend) or ($fbeg > $regmin and $fend < $regmax)))
		    {
			my($loc, $index, @aliases) = split(/$;/, $self->{_feat_btree}->{$ftype}->{$fid});

			my $function = $self->function_of($fid);			
			push(@$feat_details,[$fid, $loc, join(",", @aliases), $ftype, $fbeg, $fend, $function,'master','']);
		    }
		}
	    }
	}
	else
	{
	    &load_tbl($self);
	    my $tblH = $self->{_tbl};
	    while ( my($fid,$tuple) = each %$tblH)
	    {
		if ($fid =~ /^fig\|\d+\.\d+\.(\S+)\.\d+/)
		{
		    my $type = $1;
		    if ($tuple->[0]->[0] =~ /^(\S+)_(\d+)_(\d+)$/)
		    {
			my($ctg, $beg, $end) = ($1, $2, $3);
			next if (defined($contig) and $contig ne $ctg);
			
			my($min,$max);
			if ($beg < $end)
			{
			    $min = $beg;
			    $max = $end;
			}
			else
			{
			    $min = $end;
			    $max = $beg;
			}
			
			if (not defined($regmin) or not defined($regmax) or
			    ($min < $regmin and $regmin < $max) or ($min < $regmax and $regmax < $max) or ($min > $regmin and $max < $regmax))
			{
			    my $function = $self->function_of($fid);
			    push(@$feat_details,[$fid,$tuple->[0]->[0],join(",",@{$tuple->[1]}),$type,$min,$max,$function,'master','']);
			}
		    }
		}
	    }
	}
	return $feat_details;
}

sub feature_location {
    my($self,$fid) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($fid =~ /^fig\|(\d+\.\d+)\.([^.]+)/) && ($1 eq $newG))
    {
	my $ftype = $2;
	
	$self->load_feature_indexes();

	my $btree = $self->{_feat_btree}->{$ftype};
	if ($btree)
	{
	    my($loc, $idx, @aliases) = split(/$;/, $btree->{$fid});
	    return wantarray ? split(/,/, $loc) : $loc;
	}
	else
	{
	    &load_tbl($self);
	    if (my $x = $self->{_tbl}->{$fid})
	    {
		if (wantarray)
		{
		    return @{$x->[0]};
		}
		else
		{
		    return join(",",@{$x->[0]});
		}
	    }
	    else
	    {
		return undef;
	    }
	}
    }
    return undef;
}

sub function_of {
    my($self,$fid) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($fid =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_functions($self);
	
	my $fn = $self->{_functions}->{$fid};
	if (wantarray)
	{
	    return ['master', $fn];
	}
	else
	{
	    return $fn;
	}
    }
    return "";
}


=pod

find_features_by_annotation

Takes a reference to a hash of functions to find and an optional case boolean, and returns a hash with keys are the function and values are a reference to an array of the IDs that have that function.

If the case boolean is set the search is case insensitive. Otherwise case sensitive.

=cut

sub find_features_by_annotation {
	my($self,$anno_hash, $case)=@_;
	$self->load_functions;

	if ($case) {map {$anno_hash->{uc($_)}=1} keys %$anno_hash}
	
	my $res={};
	foreach my $id (keys %{$self->{_functions}})
	{
		my $fn = $self->{_functions}->{$id};
		$case ? $fn = uc($fn) : 1;
		if ($anno_hash->{$fn}) {push @{$res->{$fn}}, $id}
	}
	
	return $res;
}


=pod 

search_features_by_annotation

Takes a string to find and an optional case boolean, and returns a hash with keys are the function and values are a reference to an array of the IDs that have that function.

If the case boolean is set the search is case insensitive. Otherwise case sensitive.

Note that this was originally based on the find above, but this uses a regexp for matching. Will likely be a lot slower which is why I only search for a single term. There may be an SQL way of doing this, if so let Rob know how to do it and I'll replace this method.


=cut

sub search_features_by_annotation {
	my($self,$term, $case)=@_;
	$self->load_functions;

	# to make case insensitive convert everything to uppercase
	# alternative is to use two regexps, one for case insens and one for not case insens
	# but the bad thing about that approach is that if you have a case insensitive search
	# you do two regexps for each failed match
	
	$case ? $term = uc($term) : 1;

	my $res={};
	foreach my $id (keys %{$self->{_functions}})
	{
		# we set two variables, one that has case changed for case insensitive searches
		my $fn = my $fnc = $self->{_functions}->{$id};
		$case ? $fn = uc($fn) : 1;
		if ($fn =~ m/$term/) {push @{$res->{$fnc}}, $id}
	}
	
	return $res;
}


sub feature_aliases {
    my($self,$fid) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my @aliases;

    if (($fid =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_tbl($self);
	if (my $x = $self->{_tbl}->{$fid})
	{
	    @aliases = @{$x->[1]};
	}
	else
	{
	    @aliases = ();
	}
    }
    else
    {
	@aliases = ();
    }
    return wantarray() ? @aliases : join(",",@aliases);
}

sub get_corresponding_ids {
    my($self, $id, $with_type_info) = @_;
    
    my $newG    = $self->{_genome};

    if (($id =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG)) {
	my @aliases = $self->feature_aliases($id);
	my @corresponding_ids = ();
	foreach my $alias (@aliases) {
	    if ($alias =~ /^gi\|/) {
		if ($with_type_info) {
		    push(@corresponding_ids, [$alias, 'NCBI']);
		} else {
		    push(@corresponding_ids, $alias);
		}
		last;
	    }
	}
	return @corresponding_ids;
    }
}

sub feature_annotations {
    my($self,$fid,$rawtime) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my @annotations;
    if (($fid =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_ann($self);
	if (my $x = $self->{_ann}->{$fid})
	{
	    @annotations = @{$x};
	}
	else
	{
	    @annotations = ();
	}

	if ($rawtime)
	{
	    return @annotations;
	}
	else
	{
	    return map { my $r = [@$_]; $r->[1] = localtime($r->[1]); $r } @annotations;
	}
    }
    return ();
}

sub get_translation {
    my($self,$peg) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	&load_feature_indexes($self);

	my $out = $self->{_feat}->{peg}->{$peg};
	if (defined($out))
	{
	    return $out;
	}

	#
	# If we have a blast-formatted fasta, use fastacmd to
	# do the lookup, and cache the output for later use.
	#

	if ($self->{_feat_fasta}->{peg})
	{
	    my $id = "gnl|$peg";
	    my $cmd = "$FIG_Config::ext_bin/fastacmd -d $self->{_feat_fasta}->{peg} -s '$id'";
	    open(P, "$cmd|") or die "get_translation: cmd failed with $!: $cmd";
	    $_ = <P>;
	    my $out;
	    while (<P>)
	    {
		s/\s+//g;
		$out .= $_;
	    }
	    close(P);
	    $self->{_feat}->{$peg} = $out;
	    return $out;
	}
	else
	{
	    return $self->{_feat}->{$peg};
	}
    }
    else
    {
	return undef;
    }
}

sub translation_length
{
    my($self, $peg) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    if (($peg =~ /^fig\|(\d+\.\d+)/) && ($1 eq $newG))
    {
	my $t = $self->get_translation($peg);
	return length($t);
    }
    return undef;
}

sub load_feature_hash {
    my($self,$type) = @_;

    my $newGdir = $self->{_orgdir};
    my $typeH = {};
    if (open(FIDS,"<$newGdir/Features/$type/tbl"))
    {
	while ($_ = <FIDS>)
	{
	    if ($_ =~ /^(\S+)/)
	    {
		my $fid = $1;
		if (! $self->is_deleted_fid($fid))
		{
		    $typeH->{$fid} = 1;
		}
	    }
	}
	close(FIDS);
    }
    return $typeH;
}
    
sub load_feature_indexes
{
    my($self) = @_;

    if ($self->{_feat}) { return };

    my $newGdir = $self->{_orgdir};

    for my $fdir (<$newGdir/Features/*>)
    {
	my $ftype = basename($fdir);

	#
	# If we have a tbl.btree, tie that for our use.
	#
	
	my $tbl_idx = {};
	my $tie = tie %$tbl_idx, 'DB_File', "$fdir/tbl.btree", O_RDONLY, 0666, $DB_BTREE;
	if ($tie)
	{
	    $self->{_feat_tie}->{$ftype} = $tie;
	    $self->{_feat_btree}->{$ftype} = $tbl_idx;
	}

	my $tbl_list = [];
	my $ltie = tie @$tbl_list, 'DB_File', "$fdir/tbl.recno", O_RDONLY, 0666, $DB_RECNO;
	if ($tie)
	{
	    $self->{_feat_ltie}->{$ftype} = $ltie;
	    $self->{_feat_recno}->{$ftype} = $tbl_list;
	}

	#
	# If we have fasta.norm.phr, set _pseq_fasta to the fasta file to use with fastacmd.
	#
	
	my $pseq     = {};

	if (-f "$fdir/fasta.norm.phr")
	{
	    $self->{_feat_fasta}->{$ftype} = "$fdir/fasta.norm";

	}
	else
	{
	    #
	    # Otherwise, we need to load the data.
	    #

	    if (open(FASTA,"<$newGdir/Features/peg/fasta"))
	    {
		local $/ = "\n>";
		my $x;
		while (defined($x = <FASTA>))
		{
		    chomp $x;
		    if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		    {
			my $peg = $1;
			my $seq = $2;
			$seq =~ s/\s//gs;
			if (! $self->is_deleted_fid($peg))
			{
			    $pseq->{$peg} = $seq;
			}
		    }
		}
		close(FASTA);
	    }
	}
	$self->{_feat}->{$ftype} = $pseq;
    }
}

sub load_pseq {
    my($self) = @_;

    if ($self->{_pseq}) { return };

    my $newGdir = $self->{_orgdir};
    my $fdir = "$newGdir/Features/peg";

    #
    # If we have a tbl.btree, tie that for our use.
    #

    my $tbl_idx = {};
    my $tie = tie %$tbl_idx, 'DB_File', "$fdir/tbl.btree", O_RDONLY, 0666, $DB_BTREE;
    if ($tie)
    {
	$self->{_tbl_tie} = $tie;
	$self->{_tbl_btree} = $tbl_idx;
    }

    #
    # If we have fasta.norm.phr, set _pseq_fasta to the fasta file to use with fastacmd.
    #

    my $pseq     = {};

    if (-f "$fdir/fasta.norm.phr")
    {
	$self->{_pseq_fasta} = "$fdir/fasta.norm";
    }
    else
    {
	#
	# Otherwise, we need to load the data.
	#

	if (open(FASTA,"<$newGdir/Features/peg/fasta"))
	{
	    local $/ = "\n>";
	    my $x;
	    while (defined($x = <FASTA>))
	    {
		chomp $x;
		if ($x =~ />?(\S+)[^\n]*\n(.*)/s)
		{
		    my $peg = $1;
		    my $seq = $2;
		    $seq =~ s/\s//gs;
		    if (! $self->is_deleted_fid($peg))
		    {
			$pseq->{$peg} = $seq;
		    }
		}
	    }
	    close(FASTA);
	}
    }
    $self->{_pseq} = $pseq;
}

sub load_ss_data
{
    my($self, $force) = @_;

    return if defined($self->{_ss_bindings}) && (! $force);

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    open(S, "<$newGdir/Subsystems/bindings") or die "Cannot open $newGdir/Subsystems/bindings: $!";

    my $peg_index;
    my $bindings;

    while (<S>)
    {
	chomp;
	my($sname, $role, $peg) = split(/\t/);
	if (! $self->is_deleted_fid($peg))
	{
	    push(@{$bindings->{$sname}->{$role}}, $peg);
	    push(@{$peg_index->{$peg}}, [$sname, $role]);
	}

    }
    close(S);

    open(S, "<$newGdir/Subsystems/subsystems") or die "Cannot open $newGdir/Subsystems/subsystems: $!";
    my $variant;
    while (<S>)
    {
	chomp;
	my($sname, $v) = split(/\t/);
	$variant->{$sname} = $v;
    }
    close(S);

    $self->{_ss_bindings}     = $bindings;
    $self->{_ss_variants}     = $variant;
    $self->{_ss_peg_index}    = $peg_index;
}

sub load_tbl {
    my($self) = @_;

    if ($self->{_tbl}) { return };

    my $newGdir = $self->{_orgdir};
    my $tbl     = {};

    foreach my $x (`cat $newGdir/Features/*/tbl`)
    {
	chomp $x;
	if ($x =~ /^(\S+)\t(\S+)(\t(\S.*\S))?/)
	{
	    my $fid = $1;
	    my $loc = [split(/,/,$2)];
	    my $aliases = $4 ? [split(/\t/,$4)] : [];
	    if (! $self->is_deleted_fid($fid))
	    {
		$tbl->{$fid} = [$loc,$aliases];
	    }
	}
	else {
	    warn "Bad feature line in $newGdir:$x:\n";
	}
    }
    print STDERR ("Loaded ", (scalar keys %$tbl), " features from $newGdir\n") if $ENV{FIG_VERBOSE};
    $self->{_tbl} = $tbl;
}

sub load_functions {
    my($self, $force) = @_;

    if (!$force && $self->{_functions}) { return };

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};
    my $functions = {};
    my $roles     = {};

    # order of "cat" is important - proposed_user_functions must be last
    foreach my $x (`cat $newGdir/*functions`)
    {
	if (($x =~ /^(fig\|(\d+\.\d+)\.\S+)\t(\S[^\t]*\S)/) && ($2 eq $newG))
	{
	    my $peg = $1;
	    my $f = $3;
	    if (! $self->is_deleted_fid($peg))
	    {
		# user has overridden a function, so delete old roles
		if (exists($functions->{$peg})) {
		    my @roles = &SeedUtils::roles_of_function($functions->{$peg});
		    foreach $_ (@roles)
		    {
			delete $roles->{$_};
		    }
		}

		# add new roles
		my @roles = &SeedUtils::roles_of_function($f);
		foreach $_ (@roles)
		{
		    push(@{$roles->{$_}},$peg);
		}
		
		# set function
		$functions->{$peg} = $f;
	    }
	}
    }
    $self->{_functions} = $functions;
    $self->{_roles} = $roles;
}

sub seqs_with_role {
    my($self,$role,$who,$genome) = @_;

    my $newG    = $self->{_genome};
    if ($genome && $newG && ($genome eq $newG)) {
	&load_functions($self);
    	my $pegsL = $self->{_roles}->{$role};	
	return ($pegsL ? @{$pegsL} : ());
    }
    return ();
}
 
sub load_ann {
    my($self) = @_;

    if ($self->{_ann}) { return };

    my $newGdir = $self->{_orgdir};
    my $ann     = {};
    if (open(ANN,"<$newGdir/annotations"))
    {
	$/ = "\n//\n";
	while (defined(my $x = <ANN>))
	{
	    chomp $x;
	    if ($x =~ /^(\S+)\n([^\n]+)\n([^\n]+)\n(.*)/s)
	    {
		push(@{$ann->{$1}},[$1,$2,$3,"$4\n"]);
	    }
	}
	$/ = "\n";
	close(ANN);
    }
    $self->{_ann} = $ann;
}

sub taxonomy_of {
    my($self) = @_;

    my $newG    = $self->{_genome};
    my $newGdir = $self->{_orgdir};

    my $tax;
    if (open(TAX,"<$newGdir/TAXONOMY") && ($tax = <TAX>))
    {
	chop $tax;
	return $tax;
    }
    else
    {
	return "unknown";
    }
}

sub is_deleted_fid {

    return 0;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3