[Bio] / FigKernelPackages / FigFams.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FigFams.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.46 - (download) (as text) (annotate)
Wed Jun 18 15:56:56 2008 UTC (11 years, 8 months ago) by arodri7
Branch: MAIN
CVS Tags: rast_rel_2008_07_21, rast_2008_0924, rast_rel_2008_09_30, mgrast_rel_2008_0923, mgrast_rel_2008_0924, mgrast_rel_2008_0625, rast_rel_2008_08_07, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0919, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29
Changes since 1.45: +6 -0 lines
Figfams update

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

package FigFams;

use strict;
use DB_File;

use FIG;
use Data::Dumper;
use DBrtns;
use Carp;
use FigFam;
use FFs;
use Cwd 'abs_path';

# This is the constructor.  Presumably, $class is 'FigFams'.  
#

sub new {
    my($class,$fig,$fam_data) = @_;
    my $figfams = {};
    
    $figfams->{fig} = defined($fig->{_fig}) ? $fig->{_fig} : $fig;

    $fam_data = $fig->get_figfams_data($fam_data);

    if ($ENV{REPORT_FIGFAM_DETAILS})
    {
	eval {
	    my $abs = abs_path($fam_data);
	    print STDERR "FigFams.pm: $0 using data in $abs ($fam_data)\n";
	};
    }
    $figfams->{dir} = $fam_data;
    $figfams->{ffs} = new FFs($fam_data);

    &verify_dbs_made($fam_data,$figfams->{fig});

    bless $figfams,$class;
    return $figfams;;
}

sub families_implementing_role {
    my($self,$role) = @_;

    return $self->{ffs}->families_implementing_role($role);
}

sub rebuild_dbs {
    my($self) = @_;

    my $dir = $self->{dir};
    $self->rebuild_repdb;
    &FIG::run("formatdb -p T -i $dir/repdb");

    foreach my $db ("role","function","peg","genome","relevant.peg.data","PDB.connections")
    {
	&remove_old($dir,$db);
    }
    &verify_dbs_made($dir,$self->{fig});
}

sub remove_old {
    my($dir,$db) = @_;

    if ((-s "$dir/$db.db") && ((-M "$dir/$db.db") > (-M "$dir/$db")))
    {
	unlink("$dir/$db.db");
    }
}

sub families_with_function {
    my($self,$function) = @_;

    return $self->{ffs}->families_with_function($function);
}

sub families_with_functional_role {
    my($self,$functional_role) = @_;

    return $self->families_implementing_role($functional_role);
}

sub families_containing_peg_bulk {
    my ($self, $peg_bulk) = @_;
    my $family_hash = {};

    foreach my $peg (@$peg_bulk) {
	my @fam = $self->{ffs}->families_containing_peg($peg);
	if (@fam > 0){
	    $family_hash->{$peg} = $fam[0];
	}
    }
    return $family_hash;
}

sub families_containing_peg {
    my($self,$peg) = @_;

    return $self->{ffs}->families_containing_peg($peg);
}

sub families_in_genome {
    my($self,$genome) = @_;

    return $self->{ffs}->families_in_genome($genome);
}

sub all_figfamilies {
    my($self) = @_;

    my $fig = $self->{fig};
    my $dir = $self->{dir};
    my @families;
    open (FH, "$dir/family.functions");
    while (my $line = <FH>)
    {
	my ($fam, $func) = split (/\t/, $line);
	push (@families, $fam);
    }
    return sort @families;
}

sub figfam_active_subsystems {
    my($self,$tmp) = @_;

    return $self->{ffs}->figfam_active_subsystems($tmp);
}

sub all_families {
    my($self) = @_;

    return sort map { chomp; $_ } `cut -f1 $self->{dir}/family.functions`;
}

sub reset_functions {
    my($self,$fams) = @_;

    my $fig = $self->{fig};    
    my $dir = $self->{dir};

    if (!$fams){
	$fams = [];
	push (@$fams, $self->all_families);
    }

#    foreach my $fam_id ($self->all_families)
    foreach my $fam_id (@$fams)
    {
	print STDERR "...resetting function for $fam_id\n";
	my $figfam = new FigFam($fig,$fam_id,$dir);
	if ($figfam)
	{
	    $figfam->reset_function;
	}
    }
    $self->rebuild_family_funcs;
}

sub place_in_family {
    my($self,$seq,$debug,$loose,$debug_peg) = @_;

    return $self->{ffs}->place_in_family($seq,$debug,$loose,$debug_peg);
}

sub is_fid_in_family {
    my($self,$seq,$debug_peg) = @_;

    return $self->{ffs}->is_fid_in_family($seq,$debug_peg);
}

sub next_id {
    my($self) = @_;

    my $fam_data = $self->{dir};
    my @tmp = map { $_ =~ /^(\S+)/ } `cat $fam_data/next_id`;
    return $tmp[0];
}

sub rebuild_repdb {
    my($self) = @_;
    
    my $dir = $self->{dir};
    my($sub,$fam);
    if (-e "$dir/repdb") { system "rm $dir/repdb*" }
    open(REPDB,">$dir/repdb") || die "could not open $dir/repdb";

    opendir(D1,"$dir/FIGFAMS") || die "could not open $dir/FIGFAMS";
    
    foreach $sub (sort grep { $_ =~ /^(\d+)/ } readdir(D1))
    {
#	print STDERR "$sub\n";
	opendir(SUB,"$dir/FIGFAMS/$sub") || die "could not open $dir/FIGFAMS/$sub";
	my @ff_dirs = sort grep { $_ =~ /^FIG\d{6}$/ } readdir(SUB);
	closedir(SUB);
	foreach my $ff (@ff_dirs)
	{
	    my $final_dir = "$dir/FIGFAMS/$sub/$ff";
	    my $rc = open(REPS,"<$final_dir/reps");
	    if (! $rc) 
	    {
		unlink("$final_dir/built");
		my $fig = $self->{fig};    
		my $figfam = new FigFam($fig,$ff,$dir);
		$rc = open(REPS,"<$final_dir/reps");
	    }
	    if ($rc)
	    {
		my $line;
		while (defined($line = <REPS>))
		{
		    print REPDB $line;
		}
		close(REPS);
	    }
	}
#	system "cat $dir/FIGFAMS/$sub/*/reps >> $dir/repdb";
    }
    closedir(D1);
    close(REPDB);
}

sub rebuild_family_funcs {
    my($self) = @_;
    
    my $dir = $self->{dir};
    my($sub,$fam);
    if (-e "$dir/family.functions") { system "rm $dir/family.functions*" }

    open(OUT,">$dir/family.functions") || die "could not open $dir/family.functions";

    opendir(D1,"$dir/FIGFAMS") || die "could not open $dir/FIGFAMS";
    
    foreach $sub (grep { $_ =~ /^(\d+)/ } readdir(D1))
    {
	opendir(D2,"$dir/FIGFAMS/$sub") || die "could not open $dir/FIGFAMS/$sub";
	foreach $fam (grep { $_ =~ /^FIG/ } readdir(D2))
	{
	    if (open(IN,"<$dir/FIGFAMS/$sub/$fam/function") && ($_ = <IN>))
	    {
		
		print OUT "$fam\t$_";
		close(IN);
	    }
	}
	closedir(D2);
    }
    closedir(D1);
    close(OUT);
    &verify_function_to_fams($dir);

#   my $fig = $self->{fig};
#   my $dbf = $fig->db_handle;
#   $dbf->drop_table( tbl => 'ff_funcs');
#   $dbf->create_table( tbl => 'ff_funcs', 
#		flds => 'family char(9),func varchar(400)'
#                     );
#   $dbf->load_table( tbl => "ff_funcs", file => "$dir/family.functions");
#    $dbf->create_index( idx  => "ff_funcs_family_ix",
#		tbl  => "ff_funcs",
#		type => "btree",
#		flds => "family" );
#    print STDERR "finished loading ff_funcs\n";
}

sub is_paralog_figfams{
    my ($self, $fig, $figfamObject1, $figfamObject2) = @_;

    # get the sequences for figfamObject1 and figfamObject2
    my @ff1_ids = $figfamObject1->list_members();
    my @ff2_ids = $figfamObject2->list_members();
    
    foreach my $ff1_id (@ff1_ids){
	foreach my $ff2_id (@ff2_ids){
	    if ($fig->genome_of($ff1_id) eq $fig->genome_of($ff2_id)){
		return 1; # if there are paralogs of each other in the two figfams
	    }
	}
    }
}

sub are_figfams_same_gene_context{
    my ($self, $fig, $figfamObject1, $figfamObject2, $threshold) = @_;

    # get the sequences for figfamObject1 and figfamObject2
    my @ff1_ids = $figfamObject1->list_members();
    my @ff2_ids = $figfamObject2->list_members();
    
    foreach my $ff1_id (@ff1_ids){
	# get the gene context of the $ff1_id
	my $genome = $fig->genome_of($ff1_id);
	
	# get the contig informmation
	my $data = $fig->feature_location($ff1_id);
	my ($contig, $beg, $end);
	if ($data =~ /(.*)_(\d+)_(\d+)$/){
	    $contig = $1;
	    if ($2 < $3)
	    {
		$beg = $2-4000;
		$end = $3+4000;
	    }
	    else
	    {
		$beg = $2+4000;
		$end = $3-4000;
	    }
	}
	
	my ($gene_features, $reg_beg, $reg_end) = $fig->genes_in_region($genome, $contig, $beg, $end);
	my %bbh_feature_hash = ();
	foreach my $fid1 (@$gene_features){
	    my @bbhs  = $fig->bbhs($fid1);
	    my @featureList = map { $_->[0] } @bbhs;
	    foreach my $feature (@featureList){	
		$bbh_feature_hash{$feature} = $fid1;
	    }
	}
	my $gene_context_count = scalar(@$gene_features);
	next if ($gene_context_count < 1);

	foreach my $ff2_id (@ff2_ids){
	    # check if the majority (>.80) of the genes are in the second id
	    # get the gene context of the $ff2_id
	    my $genome2 = $fig->genome_of($ff2_id);
	
	    # get the contig informmation
	    my $data2 = $fig->feature_location($ff2_id);
	    my ($contig2, $beg2, $end2);
	    if ($data2 =~ /(.*)_(\d+)_(\d+)$/){
		$contig2 = $1;
		if ($2 < $3)
		{
		    $beg2 = $2-4000;
		    $end2 = $3+4000;
		}
		else
		{
		    $beg2 = $2+4000;
		    $end2 = $3-4000;
		}
	    }
	
	    my ($gene_features2, $reg_beg2, $reg_end2) = $fig->genes_in_region($genome2, $contig2, $beg2, $end2);
	    my $agree = 0;
	    foreach my $fid2 (@$gene_features2){
		if ($bbh_feature_hash{$fid2}){
		    $agree++;
		}
	    }
	    if ($agree/$gene_context_count >= $threshold){
		return (1,$ff1_id, $ff2_id);  # if they have same gene context.
	    }
	}
    }
    return (0, "empty", "empty");
}

sub verify_dbs_made {
    my($fam_data,$fig) = @_;

    &verify_role_to_fams($fam_data);
    &verify_function_to_fams($fam_data);
    &verify_peg_to_fams($fam_data);
    &verify_genome_to_fams($fam_data);
    &verify_relevant_peg_data($fam_data,$fig);
    &verify_PDB_connections($fam_data) if (-f "$fam_data/PDB.connections");
}

sub verify_PDB_connections {
    my($fam_data) = @_;

    my $db = "$fam_data/PDB.connections.db";
    my %PDB_hash;

    if (! -s $db)
    {
	my $PDB_hash_tie = tie %PDB_hash, 'DB_File', $db, O_RDWR | O_CREAT, 0666, $DB_HASH;
	$PDB_hash_tie || die "tie failed";
	my %fam_to_PDB_sims;
	open(TMP,"<$fam_data/PDB.connections")
	|| die "could not open $fam_data/PDB.connections";
	while (defined($_ = <TMP>))
	{
	    if ($_ =~ /^(FIG\d{6})\t(\S.*\S)/)
	    {
		my $fam        = $1;
		my $pdb_data   = $2;
		push(@{$fam_to_PDB_sims{$fam}},$2);
	    }
	}
	close(TMP);
    
	foreach my $fam (keys(%fam_to_PDB_sims))
	{
	    $PDB_hash{$fam} = join("\n",@{$fam_to_PDB_sims{$fam}});
	}
	untie %PDB_hash;
    }
}


sub verify_role_to_fams {
    my($fam_data) = @_;

    my $db = "$fam_data/role.db";
    my %role_hash;

    if (! -s $db)
    {
	my $role_hash_tie;
	($role_hash_tie = tie %role_hash, 'DB_File', $db, O_RDWR | O_CREAT, 0666, $DB_HASH)
	    || die qq(Attempt to tie \%role_hash to "$db" failed: \$\! = "$!");
	
	open(TMP,"<$fam_data/family.functions")
	|| die "could not open $fam_data/family.functions";
	my %role_to_fams;
	while (defined($_ = <TMP>))
	{
	    if ($_ =~ /^(FIG\d{6})\t(\S.*\S)/)
	    {
		my $fam  = $1;
		my $func = $2;
		$func =~ s/^FIG\d{6}[^:]+:\s*//;
		foreach my $role (&FIG::roles_of_function($func))
		{
		    push(@{$role_to_fams{$role}},$fam);
		}
	    }
	}
	close(TMP);
    
	foreach my $role (keys(%role_to_fams))
	{
	    my $fams = $role_to_fams{$role};
	    $role_hash{$role} = join("\t",@$fams);
	}
	untie %role_hash;
    }
}

sub verify_function_to_fams {
    my($fam_data) = @_;

    my $db = "$fam_data/function.db";
    my %function_hash;

    if (! -s $db)
    {
	my $function_hash_tie = tie %function_hash, 'DB_File', $db, O_RDWR | O_CREAT, 0666, $DB_HASH;
	$function_hash_tie || die "tie failed";

	open(TMP,"<$fam_data/family.functions")
	|| die "could not open $fam_data/family.functions";
	my %function_to_fams;
	while (defined($_ = <TMP>))
	{
	    if ($_ =~ /^(FIG\d{6})\t(\S.*\S)/)
	    {
		my $fam  = $1;
		my $func = $2;
		$func =~ s/^FIG\d{6}[^:]+:\s*//;
		push(@{$function_to_fams{$func}},$fam);
	    }
	}
	close(TMP);
    
	foreach my $function (keys(%function_to_fams))
	{
	    my $fams = $function_to_fams{$function};
	    $function_hash{$function} = join("\t",@$fams);
	}
	untie %function_hash;
    }
}

sub verify_peg_to_fams {
    my($fam_data) = @_;

    my $db = "$fam_data/peg.db";
    my %peg_hash;

    if (! -s $db)
    {
	my $peg_hash_tie = tie %peg_hash, 'DB_File', $db, O_RDWR | O_CREAT, 0666, $DB_HASH;
	$peg_hash_tie || die "tie failed";

	open(TMP,"<$fam_data/families.2c")
	|| die "could not open $fam_data/families.2c";
	my %peg_to_fams;
	while (defined($_ = <TMP>))
	{
	    if ($_ =~ /^(FIG\d{6})\t(\S+)/)
	    {
		my $fam  = $1;
		my $peg  = $2;
		push(@{$peg_to_fams{$peg}},$fam);
	    }
	}
	close(TMP);
    
	foreach my $peg (keys(%peg_to_fams))
	{
	    my $fams = $peg_to_fams{$peg};
	    $peg_hash{$peg} = join("\t",@$fams);
	}
	untie %peg_hash;
    }
}

sub verify_genome_to_fams {
    my($fam_data) = @_;

    my $db = "$fam_data/genome.db";
    my %genome_hash;

    if (! -s $db)
    {
	my $genome_hash_tie = tie %genome_hash, 'DB_File', $db, O_RDWR | O_CREAT, 0666, $DB_HASH;
	$genome_hash_tie || die "tie failed";

	open(TMP,"<$fam_data/families.2c")
	|| die "could not open $fam_data/families.2c";
	my %genome_to_fams;
	while (defined($_ = <TMP>))
	{
	    if ($_ =~ /^(FIG\d{6})\tfig\|(\d+\.\d+)/)
	    {
		my $fam  = $1;
		my $genome  = $2;
		$genome_to_fams{$genome}->{$fam} = 1;
	    }
	}
	close(TMP);
    
	foreach my $genome (keys(%genome_to_fams))
	{
	    my @fams = keys(%{$genome_to_fams{$genome}});
	    $genome_hash{$genome} = join("\t",@fams);
	}
	untie %genome_hash;
    }
}


sub verify_relevant_peg_data {
    my($fam_data,$fig) = @_;

    my $db = "$fam_data/relevant.peg.data.db";
    my %relevant_peg_data_hash;
    if (! -s $db)
    {
	if (! -s "$fam_data/relevant.peg.data")
	{
	    open(PEG,"(cut -f2 $fam_data/families.2c; grep \"^fig\" $fam_data/partitions.input) | sort -u | function_of |")
		|| die "could not make relevant.peg.data";
	    open(OUT,">$fam_data/relevant.peg.data")
		|| die "could not open relevant.peg.data";

	    my($line,$seq,$aliases,$peg);
	    while (defined($line = <PEG>))
	    {
		chop $line;
		if (($line =~ /^(\S+)/) && ($peg = $1) &&
		    ($seq = $fig->get_translation($peg)))
		{
		    $aliases = $fig->feature_aliases($peg);
		    print OUT "$line\t$seq\t$aliases\n";
		}
	    }
	    close(PEG);
	    close(OUT);
	}
	my $relevant_peg_data_hash_tie = tie %relevant_peg_data_hash, 'DB_File', $db, O_RDWR | O_CREAT, 0666, $DB_HASH;
	$relevant_peg_data_hash_tie || die "tie failed";

	open(TMP,"<$fam_data/relevant.peg.data")
	    || die "could not open $fam_data/relevant.peg.data";
	while (defined($_ = <TMP>))
	{
	    if ($_ =~ /^(\S+)\t(\S.*\S)/)
	    {
		$relevant_peg_data_hash{$1} = $2;
	    }
	}
	close(TMP);
	untie %relevant_peg_data_hash;
    }
}

=head3
usage: $figfams->family_functions();

returns a hash of all the functions for all figfams from the family.functions file

=cut

sub family_functions{
    my $ff_data = &FIG::get_figfams_data();
    my $ff_file     = "$ff_data/family.functions";

    my $contents;
    my $len = -s $ff_file;
    sysopen(FF, $ff_file, 0) or die "could not open file '$ff_file': $!";
    sysread(FF, $contents, $len);
    close(FF) or die "could not close file '$ff_file': $!";
    my %ff_name = map {split(/\t/)} split("\n", $contents);

    return \%ff_name;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3