[Bio] / FigKernelPackages / FFs.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FFs.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.25 - (download) (as text) (annotate)
Thu Sep 4 15:36:34 2008 UTC (11 years, 5 months ago) by arodri7
Branch: MAIN
CVS Tags: rast_rel_2008_12_18, rast_2008_0924, rast_rel_2008_09_30, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0923, mgrast_rel_2008_0919, mgrast_rel_2008_1110, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, rast_rel_2008_11_24
Changes since 1.24: +3 -3 lines
corrected errors for merge_figfams

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

package FFs;

use strict;
use DB_File;

use FF;

use Data::Dumper;
use Carp;
use FF;

# This is the constructor.  Presumably, $class is 'FFs'.  
#

sub new {
    my($class,$fam_data) = @_;

    my $figfams = {};

    defined($fam_data) || return undef;
    $figfams->{dir} = $fam_data;

    bless $figfams,$class;
    return $figfams;
}

sub create_tie
{
    my($self, $hash, $file, $type) = @_;

    my $mode = -w $file ? O_RDWR : O_RDONLY;

    return tie %$hash, 'DB_File', $file, $mode, 0666, $type;	
}

sub check_db_role {
    my($self) = @_;
    my $fam_data = $self->{dir};

    my $db_role = "$fam_data/role.db";
    if (! -s $db_role) { return undef }

    my %role_hash;
    my $role_hash_tie = $self->create_tie(\%role_hash, $db_role, $DB_HASH);
    $role_hash_tie || die "tie $db_role failed: $!";
    $self->{role_db} = \%role_hash;
}

sub check_db_function {
    my($self) = @_;
    my $fam_data = $self->{dir};

    my $db_function = "$fam_data/function.db";
    if (! -s $db_function) { return undef }
    my %function_hash;
    my $function_hash_tie = $self->create_tie(\%function_hash, $db_function,  $DB_HASH);
    $function_hash_tie || die "tie $db_function failed: $!";
    $self->{function_db} = \%function_hash;
}

sub check_db_peg_to_fams {
    my($self) = @_;
    my $fam_data = $self->{dir};

    my $db_peg_to_fams = "$fam_data/peg.db";
    if (! -s $db_peg_to_fams) { return undef }
    my %peg_to_fams_hash;
    my $peg_to_fams_hash_tie = $self->create_tie(\%peg_to_fams_hash, $db_peg_to_fams,  $DB_HASH);
    $peg_to_fams_hash_tie || die "tie $db_peg_to_fams failed: $!";
    $self->{peg_to_fams_db} = \%peg_to_fams_hash;
}

sub check_db_genome_to_fams {
    my($self) = @_;
    my $fam_data = $self->{dir};

    my $db_genome_to_fams = "$fam_data/genome.db";
    if (! -s $db_genome_to_fams) { return undef }
    my %genome_to_fams_hash;
    my $genome_to_fams_hash_tie = $self->create_tie(\%genome_to_fams_hash, $db_genome_to_fams, $DB_HASH);
    $genome_to_fams_hash_tie || die "tie $db_genome_to_fams failed: $!";
    $self->{genome_to_fams_db} = \%genome_to_fams_hash;
}

sub check_db_relevant_peg_data {
    my($self) = @_;
    my $fam_data = $self->{dir};

    my $db_relevant_peg_data = "$fam_data/relevant.peg.data.db";
    if (! -s $db_relevant_peg_data) { return undef }
    my %relevant_peg_data_hash;
    my $relevant_peg_data_hash_tie = $self->create_tie(\%relevant_peg_data_hash, $db_relevant_peg_data, $DB_HASH);
    $relevant_peg_data_hash_tie || die "tie $db_relevant_peg_data failed: $!";
    $self->{relevant_peg_data_db} = \%relevant_peg_data_hash;
}

sub check_db_PDB_connections {
    my($self) = @_;

    my $fam_data = $self->{dir};
    my $db_PDB_connections = "$fam_data/PDB.connections.db";
    if (! -s $db_PDB_connections) { return undef }
    my %PDB_hash;
    my $PDB_hash_tie = $self->create_tie(\%PDB_hash, $db_PDB_connections, $DB_HASH);
    $PDB_hash_tie || die "tie $db_PDB_connections failed: $!";
    $self->{PDB_connections_db} = \%PDB_hash;
}

sub PDB_connections {
    my($self,$fam,$raw) = @_;

    $self->check_db_PDB_connections;
    my $sims = $self->{PDB_connections_db}->{$fam};
    my @sims = map { $_ =~ /pdb\|([0-9a-zA-Z]+)/; [$1,[split(/\t/,$_)]] } split(/\n/,$sims);
    if (! $raw)  { @sims = map { $_->[0] } grep { ($_->[1]->[11] > 0.5) && ((($_->[1]->[4] - $_->[1]->[3]) / $_->[1]->[5]) > 0.8) } @sims}
    return \@sims;
}

sub function_of {
    my($self,$peg,$ignore_comments) = @_;

    $self->check_db_relevant_peg_data;  # lazily tie DB

    my $peg_data;
    if ($peg_data = $self->{relevant_peg_data_db}->{$peg})
    {
	my($org,$func,$seq,$aliases) = split(/\t/,$peg_data);
	if ($ignore_comments)
	{
	    $func =~ s/\s*\#.*$//;
	}
	return $func;
    }
    return "";
}
	
sub org_of {
    my($self,$peg) = @_;

    $self->check_db_relevant_peg_data;  # lazily tie DB

    my $peg_data;
    if ($peg_data = $self->{relevant_peg_data_db}->{$peg})
    {
	my($org,$func,$seq,$aliases) = split(/\t/,$peg_data);
	return $org;
    }
    return "";
}

sub seq_of {
    my($self,$peg) = @_;

    $self->check_db_relevant_peg_data;  # lazily tie DB

    my $peg_data;
    if ($peg_data = $self->{relevant_peg_data_db}->{$peg})
    {
	my($org,$func,$seq,$aliases) = split(/\t/,$peg_data);
	return $seq;
    }
    return "";
}

sub aliases_of {
    my($self,$peg) = @_;

    $self->check_db_relevant_peg_data;  # lazily tie DB

    my $peg_data;
    if ($peg_data = $self->{relevant_peg_data_db}->{$peg})
    {
	my($org,$func,$seq,$aliases) = split(/\t/,$peg_data);
	return wantarray() ? split(/,/,$aliases) : $aliases;
    }
    return "";
}

sub families_implementing_role {
    my($self,$role) = @_;

    $self->check_db_role;  # lazily tie DB

    my $fams = $self->{role_db}->{$role};
    return $fams ? split("\t",$fams) : ();
}

sub families_with_function {
    my($self,$function) = @_;

    $self->check_db_function;  # lazily tie DB

    my $fams = $self->{function_db}->{$function};
    return $fams ? split("\t",$fams) : ();
}

sub family_containing_peg {
    my($self,$peg) = @_;
    my @fams = $self->families_containing_peg($peg);
    return (@fams > 0) ? $fams[0] : undef;
}

sub families_containing_peg {
    my($self,$peg) = @_;

    $self->check_db_peg_to_fams;  # lazily tie DB
    my $fams = $self->{peg_to_fams_db}->{$peg};
    return $fams ? split("\t",$fams) : ();
}

sub families_in_genome {
    my($self,$genome) = @_;

    $self->check_db_genome_to_fams;  # lazily tie DB

    my $fams = $self->{genome_to_fams_db}->{$genome};
    return $fams ? split("\t",$fams) : ();
}

sub all_families {
    my($self) = @_;

    return sort map { chomp; $_ } `cut -f1 $self->{dir}/family.functions`;
}

sub figfam_active_subsystems {
    my ($self, $fig, $tmp) = @_;

    my $dir = $self->{dir};
    my ($ff_ss, $ff_func);
    if ($tmp){
	open (FH, "$tmp/figfam_subsystem.dat");
    }
    else{
	open (FH, "$dir/release_history/figfam_subsystem.dat");
    }
    while (my $line = <FH>)
    {
        chomp $line;
        my ($fam, $function, @subsystems) = split (/\t/, $line);
        foreach my $ss (@subsystems){
            push (@{$ff_ss->{$ss}}, $fam);
        }
        $ff_func->{$fam} = $function;
    }
    return ($ff_ss, $ff_func);
}

sub get_quality_control_data{
    my ($self, $fig, $tmp) = @_;
    my $data;
    my $dir = $self->{dir};
    
    if ($tmp){
	open (FH, "$tmp/QC.dat");
    }
    else{
	open (FH, "$dir/release_history/QC.dat");
    }

    $/ = "//\n";
    while (my $line = <FH>){
	chomp ($line);
	$line =~ s/\n//ig;
	my ($name, $org, $total, $annotated, $misannotated) = ($line) =~ /NAME\tRelease (.*)ORGANISM\t(.*)TOTAL\t(.*)ANNOTATED\t(.*)MISANNOTATED\t(.*)/;

	if (! defined $data->{$org}->{annotations}){
	    my $genus = $fig->genus_species($org);
	    push (@{$data->{$org}->{annotations}}, ['name', $genus]);
	    push (@{$data->{$org}->{correct}}, ['name', $genus]);
	}

	push (@{$data->{$org}->{annotations}}, [$name, $annotated/$total]);
	push (@{$data->{$org}->{correct}}, [$name, ($annotated-$misannotated)/$annotated]);

    }
    close FH;
    $/ = "\n";

    return $data;
}

sub get_current_figfam_size_data {
    my ($self, $fig, $tmp) = @_;

    my $dir = $self->{dir};
    my ($ff_size);

    if ($tmp){
        open (FH, "$tmp/size_distribution.dat");
	open (VERSION, "$tmp/VERSION");
    }
    else{
        open (FH, "$dir/release_history/size_distribution.dat");
	open (VERSION, "$dir/release_history/VERSION");
    }

    my ($total_pegs,$total_ff);
    my $distribution = [];
    while (my $line = <FH>){
	chomp($line);
	my ($ff_size, $ff_qty) = split(/\t/, $line);
	$total_pegs += $ff_size*$ff_qty;
	$total_ff += $ff_qty;
	my $array = [$ff_size, $ff_qty];
	push(@$distribution, $array);
    }
    close FH;

    my $rel_name;
    while (my $line =<VERSION>){
	chomp $line;
	if ($line =~ m/^(version)\t(.*)/){
	    $ff_size->{release} = $2;
	    push(@$distribution, ['name',$2]);
	    $ff_size->{name}=$2;
	    #last;
	}
	else{
	    my ($key, $value) = split(/\t/, $line);
	    $ff_size->{$key}=$value;
	}
    }
    close VERSION;

    $ff_size->{distribution} = $distribution;
    $ff_size->{ff_peg_qty} = $total_pegs;
    $ff_size->{ff_qty} =  $total_ff;

    return $ff_size;
}

sub get_previous_figfam_size_data {
    my ($self, $fig, $tmp) = @_;

    my $dir = $self->{dir};
    my ($ff_size);
    my $prev_dir;
    my $prev_releases ={};

    if ($tmp){
	$prev_dir = "$tmp/old_releases";
    }
    else{
	$prev_dir = "$dir/release_history/old_releases";
    }

    my @releases = glob("$prev_dir/*");
    foreach my $release (@releases){
        open (FH, "$release/size_distribution.dat");
	open (VERSION, "$release/VERSION");

	my ($total_pegs,$total_ff);
	my $distribution = [];
	while (my $line = <FH>){
	    chomp($line);
	    my ($ff_size, $ff_qty) = split(/\t/, $line);
	    $total_pegs += $ff_size*$ff_qty;
	    $total_ff += $ff_qty;
	    my $array = [$ff_size, $ff_qty];
	    push(@$distribution, $array);
	}
	close FH;
    
	my $release_name;
	while (my $line =<VERSION>){
	    chomp $line;
	    if ($line =~ m/^(version)\t(.*)/){
		$ff_size->{release} = $2;
		push(@$distribution, ['name',$2]);
		$release_name = $2;
		$ff_size->{name} = $2;
		last;
	    }
	}
	close VERSION;

	$ff_size->{distribution} = $distribution;
	$ff_size->{ff_peg_qty} = $total_pegs;
	$ff_size->{ff_qty} =  $total_ff;
	$prev_releases->{$release_name} = $ff_size;
    }
    
    return $prev_releases;
}

sub is_fid_in_family {
    my($self,$seq,$debug_peg) = @_;
    my $id_2_ff = {};

    my $ffD   = $self->{dir};
    my $ff_file     = "$ffD/families.2c";

    my $contents;
    sysopen(FAM_FIDS, $ff_file, 0) or die "could not open file '$ff_file': $!";
    sysread(FAM_FIDS, $contents, 1000000000);
    close(FAM_FIDS) or die "could not close file '$ff_file': $!";
    my %ff_fids = map { $_ =~ /(FIG.*?)\t(fig\|\d+\.\d+\.peg\.\d+)/; $2 => $1 } split("\n", $contents);

    foreach (@$seq){
	if ($ff_fids{$_}){
	    $id_2_ff->{$_} = $ff_fids{$_};
	}
    }
    return $id_2_ff;
}

sub place_in_family {
    my($self,$seq,$debug,$loose,$debug_peg,$nuc) = @_;
    my($figfam,$should_be,$sims);

    my $old_sep = $/;
    $/ = "\n";

    my $dir = $self->{dir};
    my $tmpF = "$FIG_Config::temp/tmp$$.fasta";
    open(TMP,">$tmpF") 
	|| die "could not open $tmpF";
    print TMP ">query\n$seq\n";
    close(TMP);

    if ($nuc){
#	open(BLAST,"$FIG_Config::ext_bin/blastall -i $tmpF -d $dir/repdb -m 8 -FF -p blastx -g F | head -n 20 |")
	open(BLAST,"$FIG_Config::ext_bin/blastall -i $tmpF -d $dir/repdb -m 8 -FF -p blastx -g F |")
	    || die "could not open blast";
    }
    else{
#	open(BLAST,"$FIG_Config::ext_bin/blastall -i $tmpF -d $dir/repdb -m 8 -FF -p blastp | head -n 20 |")
	open(BLAST,"$FIG_Config::ext_bin/blastall -i $tmpF -d $dir/repdb -m 8 -FF -p blastp |")
	|| die "could not open blast";
    }

    my(%seen);
    my $got = undef;
    my $min_sc = 1;
    my $checked = 0;
    while ((! $got) && ($checked < 30) && (defined($_ = <BLAST>)))
    {
	if ($debug) { print STDERR $_ }
	chop;
	my $sim = [split(/\t/,$_)];
	if ($min_sc > $sim->[10]) { $min_sc = $sim->[10] }
	my $fam_id = ($sim->[1] =~ /(FIG\d+)/) ? $1 : "";
	if (! $seen{$fam_id})
	{
	    $seen{$fam_id} = 1;
	    $figfam = new FF($fam_id,$dir);

	    if (not defined($figfam))
	    {
		next;
	    }
	    else
	    {
		$checked++;
		if ($debug) { print STDERR "checking family $fam_id ",$figfam->family_function,"\n" }
		($should_be, $sims) = $figfam->should_be_member($seq,$debug,$loose,$debug_peg,$nuc);
		if ($debug) { print STDERR "    should_be=$should_be\n" }
		my $psc;
		if ($should_be && defined($psc = &psc($sims->[0])) && ($psc <= $min_sc))
		{
		    if ($debug) { print STDERR "min_sc=$min_sc best=$psc\n" }
		    $got = $figfam;
		}
	    }
	}
    }

    if ($debug)
    {
	while ($_ && (defined($_ = <BLAST>)))  ### JUST PRINT OUT REMAINING BLASTS AGAINST REPDB
	{
	    print STDERR $_;
	}
    }
    $/ = $old_sep;
    close(BLAST);

    unlink($tmpF);
    return $got ? ($got,$sims) : (undef,undef);
}

sub psc {
    my ($sim) = @_;
    return ($sim->[10] =~ /^e-/) ? "1.0" . $sim->[10] : $sim->[10];
}


=head3
usage: $figfams->family_functions();

returns a hash of all the functions for all figfams from the family.functions file

=cut

sub family_functions {
    my($self) = @_;
    my $ffD   = $self->{dir};
    my $ff_file     = "$ffD/family.functions";

    my $contents;
    sysopen(FAM_FUNC, $ff_file, 0) or die "could not open file '$ff_file': $!";
    sysread(FAM_FUNC, $contents, 1000000000);
    close(FAM_FUNC) or die "could not close file '$ff_file': $!";
    my %ff_name = map {split(/\t/)} split("\n", $contents);

    return \%ff_name;
}

sub merge_figfams{
    my ($self, $ff_list, $user) = @_;

    my @list = sort @$ff_list;
    my $main_ff = shift @list;

#    print STDERR "$main_ff and merged are" . join(",",@list) . "\n";
    if ($self->add_to_merge_file($main_ff, \@list)){
	$self->update_peg_to_ff_db($main_ff, \@list);
	return 1;
    }
#    return 1;
}

sub update_peg_to_ff_db{
    my ($self, $main_ff, $ff_list) = @_;
    
    my $db = "$FIG_Config::FigfamsData/peg.db";
#    my $db = "$FIG_Config::temp/peg.db";
    my %peg_hash;
    my $peg_hash_tie = tie %peg_hash, 'DB_File', $db, O_RDWR, 0666, $DB_HASH;
    ($peg_hash_tie) || die "failed the tie";
    
    foreach my $fam_id (@$ff_list){
	my $figfam = new FF($fam_id,"$FIG_Config::FigfamsData");
	#print STDERR "STEP1\n";
	foreach my $peg (@{$figfam->pegs_of}){
	    #print STDERR "STEP2\n";
	    #print join("\t",($peg,$peg_hash{$peg})),"\n";
	    $peg_hash{$peg} = $main_ff;
	    #print STDERR join("\t",($peg,$peg_hash{$peg})),"\n";
	}
    }
    untie %peg_hash;
    return 1;
}

sub add_to_merge_file{
    my ($self, $main_ff, $mergers) = @_;
    my $merge_file = "$FIG_Config::global/figfams.merge";
#    print STDERR "MERGE FILE: $merge_file";

    open (FH, ">>$merge_file") || die "could not open $merge_file";
    foreach my $to_merge (@$mergers){
	print FH "$main_ff\t$to_merge\n";
    }
    close FH;
    return 1;
}

sub should_be_merged_to{
    my ($self, $ff) = @_;

    my $ffD = $self->{dir};
    my $merge_file = "$ffD/figfams.merge";

    open (FH, ">>$merge_file") || die "could not open $merge_file";
    while (my $line = <FH>){
	chomp ($line);
	my ($mainFF, $mergeFF, $user) = split (/\t/, $line);
	return $mainFF if ($ff eq $mergeFF);
    }

    return;
}

sub get_merged_children{
     my ($self, $ff) = @_;

     my $ffD = $self->{dir};
     my $merge_file = "$ffD/figfams.merge";
     my $merged_associations = [];

     open (FH, ">>$merge_file") || die "could not open $merge_file";
     while (my $line = <FH>){
	 chomp ($line);
	 my ($mainFF, $mergerFF, $user) = split (/\t/, $line);

	 push @$merged_associations, $mergerFF if ($mainFF eq $ff); 
     }
     return $merged_associations if (scalar @$merged_associations > 0);
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3