[Bio] / FigKernelPackages / FigFam.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FigFam.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Sat Apr 1 18:24:23 2006 UTC (14 years ago) by overbeek
Branch: MAIN
Changes since 1.4: +9 -3 lines
fixes and errors to STAG

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package FigFam;

use strict;
use FIG;
use SameFunc;

use Carp;
use Data::Dumper;

# This is the constructor.  Presumably, $class is 'FigFam'.  
#
# $fam_id is the ID of the family (of the form FIGnnnnnnn).  It is required.
#
# $fam_data is optional.  If it exists,
# it gives the directory that is (or will) contain the FigFam data.  
# $FIG_Config::data/FigfamsData will be used as a default.
#
# $fam_file must exist within $fam_data.  It contains a 2-column version of the FIG families.
# If the $fam_dir/yyy/FIGxxxxyyy directory (which is where the directory for the specific family
# will go) does not exist, the specification of the PEGs in $fam_file will be used to initialize
# the family.
#

sub new {
    my($class,$fig,$fam_id,$fam_data) = @_;

    ($fam_id =~ /^FIG\d{3}(\d{3})$/) || die "$fam_id is not a valid FIG family ID";

    my $yyy = $1;
    my $fam = {};
    $fam->{id}  = $fam_id;
    $fam->{fig} = $fig;

    if (! defined($fam_data))
    {
	$fam_data = "$FIG_Config::data/FigfamsData";
    }
    $fam->{data} = $fam_data;

    my $fam_dir = "$fam_data/FIGFAMS";
    my $fam_file = "$fam_data/families.2c";
    my $dir = "$fam_dir/$yyy/$fam_id";
    $fam->{dir} = $dir;

    &FIG::verify_dir($dir);
    if ((! -s "$dir/PEGs") && defined($fam_file))
    {
	if (-s $fam_file)
	{
	    system("grep $fam_id \"$fam_file\" > \"$dir/PEGs\"");
	    if (! -s "$dir/PEGs")
	    {
		system "rm -rf $dir";
		confess "PEG file $dir/PEGs does not exist --- $dir deleted";
		return undef;
	    }
	}
	else
	{
	    confess "Family file $fam_file doe not exist";
	    return undef;
	}
	
	my @first = `head -n 1 \"$dir/PEGs\"`;
	if ($first[0] =~ /^\S+\t\S+\t(\S.*\S)/)
	{
	    open(FUNC,">$dir/function") || die "could not open $dir/function";
	    print FUNC "$1\n";
	    close(FUNC);
	}
	else
	{
	    die "$dir has no valid function file";
	}
    }
    open(FUNC,"<$dir/function") || die "could not open $dir/function";
    my $func = <FUNC>;
    chomp $func;
    close(FUNC);
    $fam->{function} = $func;
    
    my($peg,$pegs);
    my $pegsL = [sort { &FIG::by_fig_id($a,$b) } map { $_ =~ /^\S+\t(\S+)/; $1 } `cat \"$dir/PEGs\"`];

    $fam->{pegsL} = $pegsL;
    my $pegsH = {};
    foreach $peg (@$pegsL)
    {
	$pegsH->{$peg} = 1;
    }
    $fam->{pegsH} = $pegsH;
    
    
    if (-s "$dir/PEGs.fasta")
    {
	open(FASTA,"<$dir/PEGs.fasta") || die "could not read-open $dir/PEGs.fasta";
	while (my ($peg, $seqP) = &FIG::read_fasta_record(\*FASTA))
	{
	    $fam->{peg_lengths}->{$peg} = length($$seqP);
	}
	close(FASTA);
    }
    else
    {
	open(FASTA,">$dir/PEGs.fasta") || die "could not write-open $dir/PEGs.fasta";
	foreach $peg (@$pegsL)
	{
	    my $seq = $fig->get_translation($peg);
	    if ($seq)
	    {
		print FASTA ">$peg\n$seq\n";
		$fam->{peg_lengths}->{$peg} = length($seq);
	    }
	    else
	    {
		confess "Could not get seq for $peg";
	    }
	}
	close(FASTA);
    }
    
    if ((! -s "$dir/PEGs.fasta.pin") || ((-M "$dir/PEGs.fasta.pin") > (-M "$dir/PEGs.fasta")))
    {
	&FIG::run("$FIG_Config::ext_bin/formatdb -i \"$dir/PEGs.fasta\" -p");
    }
    
    
    if (! -s "$dir/bounds")
    {
	open(BOUNDS,">$dir/bounds") || die "could not open $dir/bounds";
	foreach $peg (@$pegsL)
	{
	    my @sims = $fig->sims($peg,1000,1.0e-20,"fig");

	    my($i,$hits,$bad,$bad_sc,$last_good,$func2);
	    for ($i=0, $hits=0, $bad="", $bad_sc=0, $last_good=1; ($i < @sims) && (! $bad); $i++)
	    {
		my $sim = $sims[$i];
		my $id2 = $sim->id2;
		if ($pegsH->{$id2})
		{
		    $last_good = $sim->psc;
		    $hits++;
		}
		else
		{
		    $func2 = $fig->function_of($id2);
		    if (! &ok_func($fig,$func,$func2,$id2))
		    {
			$bad = $id2;
			$bad_sc = $sim->psc;
		    }
		}
	    }

	    if ($hits > 0)
	    {
		print BOUNDS join("\t",($peg,$hits,$last_good,$bad,$bad_sc)),"\n";
	    }
	}
	close(BOUNDS);
    }
    open(BOUNDS,"<$dir/bounds") || die "could not open $dir/bounds";

    my $bounds = {};
    my $x;
    while (defined($x = <BOUNDS>))
    {
	chomp $x;
	my @flds = split(/\t/,$x);
	my $peg =shift @flds;
	$bounds->{$peg} = [@flds];
    }
    close(BOUNDS);
    $fam->{bounds} = $bounds;

    my $blast_file = "$dir/blastout";
    my $rep_file   = "$dir/reps";
    if (! -d "$dir/Split")
    {
	&FIG::run("split_and_trim_sequences $dir/Split blastout=$blast_file < $dir/PEGs.fasta");
	if (-s $blast_file)
	{
	    my @out = map { chop; [split(/\t/,$_) ] } `cat $blast_file`;
	    my @keep = ();
	    my $write = 0;
	    my $i;
	    for ($i=0; ($i < @out); $i++)
	    {
		if ((($i == 0) ||
		     (($out[$i-1]->[0] ne $out[$i]->[0]) || ($out[$i-1]->[1] ne $out[$i]->[1]))) &&
		    ($out[$i-1]->[6] <= 1.0e-10))
		{
		    push(@keep,$out[$i]);
		}
		else
		{
		    $write = 1;
		}
	    }
	    if ($write)
	    {
		open(OUT,">$blast_file") || die $blast_file;
		foreach my $x (@keep)
		{
		    print OUT join("\t",@$x),"\n";
		}
		close(OUT);
	    }
	}
    }

    if ((-s $blast_file) && (! -s $rep_file))
    {
	my $n = 1;
	open(REP,">$rep_file")          || die "could not open $rep_file";
	open(FASTA,"<$dir/PEGs.fasta")  || die "could not open $dir/PEGs.fasta";
	my %seen;
	my($seq,$peg,$sim);
	$/ = "\n>";
	while (defined($_ = <FASTA>))
	{
	    chomp;
	    if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	    {
		$peg  =  $1;
		if (! $seen{$peg})
		{
		    $seq =  $2;
		    $seq =~ s/\s//gs;
		    print REP ">$fam_id-$n\n$seq\n";
		    $n++;

		    $/ = "\n";
		    $seen{$peg} = 1;

		    open(BLAST,"<$blast_file") || die "could not open $blast_file";
		    while (defined($sim = <BLAST>))
		    {
			if (($sim =~ /^(\S+)\t(\S+)(\t\d+){4}\t(\S+)/) && ($1 eq $peg) &&
			    ($4 < 1.0e-10))
			{
			    $seen{$2} = 1;
			}
		    }
		    close(BLAST);
		    $/ = "\n>";
		}
	    }
	}
	$/ = "\n";
	close(FASTA);
	close(REP);
    }
    $fam->{reps} = [map { $_ =~ /^(\S+)/; $1 } `grep -v "^>" $rep_file`];

    bless $fam,$class;
    return $fam;
}

sub ok_func {
    my($fig,$func,$func2,$id2) = @_;

    my $i;
    if (&SameFunc::same_func($func,$func2)) { return 1 }
    if (! &FIG::hypo($func2))               { return 0 }
    my @sims = $fig->sims($id2,5,1.0e-30,"fig");
    for ($i=0; ($i < @sims) && ($func eq $fig->function_of($sims[$i]->id2)); $i++) {}
    return ($i == 5);
}

sub representatives {
    my($self) = @_;

    my $reps = $self->{reps};
    return @$reps;
}

sub should_be_member {
    my($self,$seq) = @_;
    
    my $dir = $self->{dir};
    open(TMP,">$FIG_Config::temp/tmp$$.fasta") 
	|| die "could not open $FIG_Config::temp/tmp$$.fasta";
    print TMP ">query\n$seq\n";
    close(TMP);

    my $query_ln = length($seq);
    open(BLAST,"blastall -i $FIG_Config::temp/tmp$$.fasta -d $dir/PEGs.fasta -m 8 -FF -p blastp |")
	|| die "could not open blast";
    my $sims = [];
    my %seen;
    my $should_be = 0;
    while (defined($_ = <BLAST>))
    {
	chop;
	my $sim = [split(/\t/,$_)];
	my $peg = $sim->[1];
	my $sc = $sim->[10];
	if ((! $seen{$peg}) && ($sc <= 1.0e-10))
	{
	    $seen{$peg} = 1;
	    push @$sim, $query_ln, $self->{peg_lengths}->{$peg};
	    bless $sim, 'Sim';
	    push @$sims, $sim;
	    my $bounds = $self->{bounds}->{$peg};
	    if ($bounds && ($sc <= $bounds->[1]) && ((! $bounds->[2]) || $bounds->[3] > $sc))
	    {
		$should_be = $peg;
	    }
	}
    }
    return ($should_be,$sims);
}

sub family_function {
    my($self) = @_;

    return $self->{function}
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3