[Bio] / FigKernelPackages / FF.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FF.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (download) (as text) (annotate)
Mon Aug 11 16:16:24 2008 UTC (11 years, 6 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2008_12_18, rast_2008_0924, rast_rel_2008_09_30, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, mgrast_rel_2008_1110, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, rast_rel_2008_11_24, rast_rel_2008_08_07
Changes since 1.21: +4 -4 lines
Fix bug where return of should_be_member is lost.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

package FF;

use strict;
use Carp;
use Data::Dumper;

=head1 Module to access FIGfams

=head3 new

usage:
    my $figfam_obj = FF->new($fam_id, $fam_dir);

C<$fam_id> is the ID of the family, of the form C<FIGnnnnnn> where C<n> is a digit;
it is required.

C<$fam_data> is required
as the directory that contains (or will contain) FigFam data.

=cut

sub new {
    my($class,$fam_id,$fam_data) = @_;

    ($fam_id =~ /^FIG\d{6}$/) || confess "invalid family id: $fam_id";
    my $fam = {};
    $fam->{id}  = $fam_id;
    defined($fam_data) || confess "fam_data is undefined";
    $fam->{data} = $fam_data;

    my $fam_dir = "$fam_data/FIGFAMS";
    my $dir = &fam_dir($fam_data,$fam_id);
    $fam->{dir} = $dir;
    (-d $dir) || return undef;

    open(FUNC,"<$dir/function") || die "could not open $dir/function";
    my $func = <FUNC>;
    chomp $func;
    close(FUNC);
    $fam->{function} = $func;
    my($peg,$pegs);
    my $pegsL = [
		 sort { &by_fig_id($a,$b) }
		 map { $_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/; $1 }
		 `cat \"$dir/PEGs\"`
		 ];
    
    if (@$pegsL < 2)
    {
	return undef;
    }

    $fam->{pegsL} = $pegsL;
    my $pegsH = {};
    foreach $peg (@$pegsL)
    {
	$pegsH->{$peg} = 1;
    }
    $fam->{pegsH} = $pegsH;


    if (-s "$dir/PEGs.fasta")
    {
	open(FASTA,"<$dir/PEGs.fasta") || die "could not read-open $dir/PEGs.fasta";
	while (my ($peg, $seqP) = &read_fasta_record(\*FASTA))
	{
	    $fam->{peg_lengths}->{$peg} = length($$seqP);
	}
	close(FASTA);
    }
    else
    {
	confess "$fam_id is missing PEGs.fasta";
    }

    if (&use_ross_bounds($dir) && (-s "$dir/bounds"))
    {
	$fam->{bounds} = &load_bounds("$dir/bounds");
    }
    bless $fam,$class;
    return $fam;
}

sub use_blast {
    my($dir) = @_;

    return ((-s "$dir/decision.procedure") && &which_dec("$dir/dec",'blast'));
}

sub use_ross_bounds {
    my($dir) = @_;

    return ((! (-s "$dir/decision.procedure")) || &which_dec("$dir/dec",'ross'));
}

sub which_dec {
    my($dir,$pat) = @_;

    if (open(DEC,"<$dir/decision.procedure"))
    {
	my $x = <DEC>;
	close(DEC);
	if ($x)
	{
	    
	    return ($x =~ /^$pat/);
	}
    }
    return 0;
}
	
sub load_bounds {
    my($file) = @_;

    my $bounds;
    if (open(BOUNDS,"<$file"))
    {
	$bounds = {};
	my $x;
	while (defined($x = <BOUNDS>))
	{
	    chomp $x;
	    my @flds = split(/\t/,$x);
	    my $peg  = shift @flds;
	    $bounds->{$peg} = [@flds];
	}
	close(BOUNDS);
    }
    return $bounds;
}

=head3 pegs_of

usage:
    print $figfam_obj->pegs_of();

Returns a list of just pegs.

=cut

sub pegs_of {
    my($self) = @_;
    return [$self->list_members];
}

=head3 list_members

usage:
    @ids = $figfam_obj->list_members();

Returns a list of the PEG FIDs in a family.

=cut

sub list_members {
    my ($self)  = @_;

    my $fam_dir = $self->{dir};
    my @pegs    = map { ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) ? $1 : () } `cat $fam_dir/PEGs`;

    return @pegs;
}

=head3 representatives

usage:
    C<< @rep_seqs = $figfam_obj->representatives(); >>

Returns a list of the "representative sequences" characterizing a FIGfam.

=cut

sub representatives {
    my($self) = @_;

    my $reps = $self->{reps};
    if (! $reps)
    {
	my $rep_file = "$self->{dir}/reps";
	$reps = $self->{reps} = (-s $rep_file) ? [map { $_ =~ /(\S+)/; $1 } `fgrep  -v ">" $rep_file`] : [];
    }
    return @$reps;
}


=head3 should_be_member

usage:
    if ( $figfam_obj->should_be_member( $seq ) ) { #...do something... }

Returns ($placed,$sims).  $placed will be
C<TRUE> if the protein sequence in C<$seq> is judged to be
"similar enough" to the members of a family to potentially be included.

I have added the "loose" argument as an optional last argument.  This means that

    if ( $figfam_obj->should_be_member( $seq,0,1 ) ) { #...do something... }

will return true, even if the input sequence is truncated (i.e., we do not force the
similarity to go across 80% of matched sequences in the family).

=cut

sub should_be_member {
    my($self,$seq,$debug,$loose,$debug_peg,$nuc) = @_;

    my $old_eol = $/;
    $/ = "\n";

    my $dir = $self->{dir};
    my($in,@rc);
    if (open(DEC,"<$dir/decision.procedure") && ($in = <DEC>) && ($in =~ /^(\S+)(\s+(\S.*\S))?/))
    {
	no strict 'refs';

	my $procedure = $1;
	my @args      = $3 ? split(/\s+/,$3) : ();
	close(DEC);
	@rc =  &{$procedure}($self,$debug,$loose,$seq,$dir,$debug_peg,$nuc,@args);
    }
    else
    {
	@rc = &ross_hack($self,$debug,$loose,$seq,$dir,$debug_peg,$nuc);
    }
    $/ = $old_eol;
    return @rc;
}

sub min {
    my($x,$y) = @_;

    return ($x <= $y) ? $x : $y;
}

use FFs;

sub blast_vote {
    my($self,$debug,$loose,$seq,$dir,$debug_peg,$nuc,$partition,$min_bsc) = @_;

    if ($ENV{DEBUG}) { $debug = 1 }
    (-s "$dir/PEGs") || return undef;

    my $ffs;
    $ffs = new FFs($self->{data});

    if ($debug) { print STDERR "checking: ",$self->family_id," min_bsc=$min_bsc\n" }
    my %pegs_in = map { $_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/; $1 => 1 } `cat $dir/PEGs`;

    my $N = &min(10,scalar keys(%pegs_in) - ($debug_peg ? 1 : 0));
    my $tmpF = "$FIG_Config::temp/tmp$$.fasta";
    open(TMP,">$tmpF")
	|| die "could not open tmp$$.fasta";
    print TMP ">query\n$seq\n";
    close(TMP);

    my $query_ln = length($seq);
    my $partitionF = "$self->{data}/Partitions/" . ($partition % 1000) . "/$partition/fasta";
    my @tmp;
    if ($nuc){
	@tmp = `$FIG_Config::ext_bin/blastall -i $tmpF -d $partitionF -m 8 -FF -p blastx -g F`;
    }
    else{
	@tmp = `$FIG_Config::ext_bin/blastall -i $tmpF -d $partitionF -m 8 -FF -p blastp`;
    }
    unlink($tmpF);

    my $sims = [];
    my $in = 0;
    my $out = 0;
    my(%seen);
    for (my $simI=0; ($simI < @tmp); $simI++)
    {
	$_ = $tmp[$simI];
	if ($debug) { print STDERR $_ }
	chop;

	my $sim = [split(/\t/,$_)];
	my $peg = $sim->[1];
	next if ($debug_peg && ($debug_peg eq $peg));
	my $bit_score = $sim->[11];
	my $matched1 = abs($sim->[7] - $sim->[6]) + 1;
	my $matched2 = abs($sim->[9] - $sim->[8]) + 1;
	if ($debug) { print STDERR "normalized bit-score=",sprintf("%3.2f",$bit_score / $matched2),"\n" }
	if (! $seen{$peg})
	{
	    $seen{$peg} = 1;
	    my $count_in = &count_in($self,$ffs,\%pegs_in,$loose,$peg);
	    my $ln2 = &get_len2($self,$ffs,$peg,\%pegs_in);

	    if (sprintf("%3.2f",($bit_score / $matched2)) >= $min_bsc)
	    {
		if ($nuc){
		    if ($count_in                             &&                 # (print "in-ok\n") &&
			$ln2                                  &&
			($matched1 >= (0.7 * $query_ln))                         # (print "mat1-ok\n") &&
			)
		    {
			push @$sim, $query_ln, $self->{peg_lengths}->{$peg};
			bless $sim, 'Sim';
			push @$sims, $sim;
			if ($N > 0)
			{
			    $in++;
			}
		    }
		    else
		    {
			if ($N > 0)
			{
			    $out++;
			}
		    }
		    if ($debug) {print STDERR      &Dumper(["in=$in",
							    "out=$out",
							    $count_in,
							    "ln1=$query_ln",
							    "ln2=$ln2",
							    "matched1=$matched1",
							    "matched2=$matched2",
							    "bsc=$bit_score",
							    $ln2 ? sprintf("%3.2f",$bit_score/$matched2) : ""]); }
		    if ($N > 0)
		    {
			$N--;
		    }
		}
		else{
		    if ($count_in                             &&                 # (print "in-ok\n") &&
			$ln2                                  &&
			($matched1 >= (0.7 * $query_ln))      &&                 # (print "mat1-ok\n") &&
			($matched2 >= (0.7 * $ln2))                              # (print "mat2-ok\n") &&
			)
		    {
			push @$sim, $query_ln, $self->{peg_lengths}->{$peg};
			bless $sim, 'Sim';
			push @$sims, $sim;
			if ($N > 0)
			{
			    $in++;
			}
		    }
		    else
		    {
			if ($N > 0)
			{
			    $out++;
			}
		    }
		    if ($debug) {print STDERR      &Dumper(["in=$in",
							    "out=$out",
							    $count_in,
							    "ln1=$query_ln",
							    "ln2=$ln2",
							    "matched1=$matched1",
							    "matched2=$matched2",
							    "bsc=$bit_score",
							    $ln2 ? sprintf("%3.2f",$bit_score/$matched2) : ""]); }
		    if ($N > 0)
		    {
			$N--;
		    }
		}
	    }
	}
    }
    if ($debug) { print STDERR "in=$in out=$out, FINAL VOTE\n" }
    return (($in > $out),$sims);
}

sub get_len2 {
    my($self,$ffs,$peg,$pegs_in) = @_;

    if ($pegs_in->{$peg})
    {
	return $self->{peg_lengths}->{$peg};
    }
    else
    {
	return length($ffs->seq_of($peg));
    }
}

sub count_in {
    my($self,$ffs,$pegs_in,$loose,$peg) = @_;

    if ($pegs_in->{$peg}) { return 1 }
    if (! $loose)         { return 0 }
    my $fam_func = $self->family_function;
    my $peg_func = $ffs->function_of($peg);
    if ($peg_func)
    {
	$peg_func =~ s/\s*\#.*$//;
	return ($peg_func eq $fam_func);
    }
    return 0;
}

sub ross_hack {
    my($self,$debug,$loose,$seq,$dir,$debug_peg,$nuc,$boundsFile) = @_;

    my $all_bounds;
    if ($boundsFile)
    {
	if    ($boundsFile eq "bounds")     { $all_bounds = $self->{bounds} }
	elsif (-s "$boundsFile")            { $all_bounds = &load_bounds($boundsFile) }
	elsif (-s "$dir/$boundsFile")       { $all_bounds = &load_bounds("$dir/$boundsFile") }
	else                                { $all_bounds = $self->{bounds} }
    }
    else
    {
	$all_bounds = $self->{bounds};
    }

    my $tmpF = "$FIG_Config::temp/tmp$$.fasta";
    open(TMP,">$tmpF")
	|| die "could not open tmp$$.fasta";
    print TMP ">query\n$seq\n";
    close(TMP);

    my $query_ln = length($seq);
    my @tmp;
    if ($nuc){
	@tmp = `blastall -i $tmpF -d $dir/PEGs.fasta -m 8 -FF -p blastx -g F`;
    }
    else{
	@tmp = `blastall -i $tmpF -d $dir/PEGs.fasta -m 8 -FF -p blastp`;
    }

    unlink($tmpF);

    my %seen;
    my $should_be = 0;
    my $yes = 0;
    my $no  = 0;

    my $ln1 = length($seq);
    my $good = 0;
    my $bad = 0;

    my $sims = [];
    my $required_better = ((@{$self->{pegsL}} - ($debug_peg ? 1 : 0)) > 1) ? 1 : 0;
#   print STDERR "required_better=$required_better\n";

    for (my $simI=0; ($simI < @tmp) && (! $good) && (! $bad); $simI++)
    {
	$_ = $tmp[$simI];
	if ($debug) { print STDERR $_ }
	chop;

	my $sim = [split(/\t/,$_)];
	my $peg = $sim->[1];
	next if ($debug_peg && ($debug_peg eq $peg));

	my $sc = $sim->[10];
	my $bit_score = $sim->[11];

	my $matched1 = abs($sim->[7] - $sim->[6]) + 1;
	my $matched2 = abs($sim->[9] - $sim->[8]) + 1;
	my $ln2 = $self->{peg_lengths}->{$peg};

	if ($nuc) {
	    if ((! $seen{$peg}) &&
		($loose ||
		 (($sc <= 1.0e-10) &&
		  ($matched1 >= (0.7 * $ln1)) )
		 )
		)
	    {
		$seen{$peg} = 1;
		push @$sim, $query_ln, $self->{peg_lengths}->{$peg};
		bless $sim, 'Sim';
		push @$sims, $sim;
		
		my $bounds = $all_bounds->{$peg};
		if ($bounds && (@$sims <= 10))
		{
		    if ((($bit_score >= $bounds->[1]) && ((! $bounds->[2]) || $bounds->[3] < $bit_score)) ||
			($loose &&
			 ((($bit_score/$ln1) >= ($bounds->[1] / $ln2)) &&
			  ((! $bounds->[2]) || (($bounds->[3]/$ln2) < ($bit_score/$ln1))))))
		    {
			if ($debug) { print STDERR "    yes - $peg matched1=$matched1 ln1=$ln1 matched2=$matched2 ln2=$ln2\n" }
			++$yes;
			if ($yes > ($no+$required_better)) { $good = 1 }
		    }
		    else
		    {
			if ($debug) { print STDERR "    no - $peg ", join(",",@$bounds),"\n" }
			++$no;
			if ($no > ($yes+$required_better)) { $bad = 1 }
		    }
		}
		else {
		    if ($debug) {
			print STDERR "    bounds=", ($bounds ? qq(non-nil) : qq(nil))
			    , ", num_sims=", (scalar @$sims), "\n";
		    }
		}
	    }
	    else {
		if ($debug) {
		    print STDERR
			"    seen=", ($seen{$peg} ? $seen{$peg} : 0), " score=$sc,"
			, " matched1=$matched1, ln1=$ln1,"
			, " matched2=$matched2, ln2=$ln2,"
			, "\n";
		}
	    }
	}
	else{
	    if ((! $seen{$peg}) &&
		($loose ||
		 (($sc <= 1.0e-10) &&
		  ($matched1 >= (0.7 * $ln1)) &&
		  ($matched2 >= (0.7 * $ln2)))
		 )
		)
	    {
		$seen{$peg} = 1;
		push @$sim, $query_ln, $self->{peg_lengths}->{$peg};
		bless $sim, 'Sim';
		push @$sims, $sim;
		
		my $bounds = $all_bounds->{$peg};
		if ($bounds && (@$sims <= 10))
		{
		    if ((($bit_score >= $bounds->[1]) && ((! $bounds->[2]) || $bounds->[3] < $bit_score)) ||
			($loose &&
			 ((($bit_score/$ln1) >= ($bounds->[1] / $ln2)) &&
			  ((! $bounds->[2]) || (($bounds->[3]/$ln2) < ($bit_score/$ln1))))))
		    {
			if ($debug) { print STDERR "    yes - $peg matched1=$matched1 ln1=$ln1 matched2=$matched2 ln2=$ln2\n" }
			++$yes;
			if ($yes > ($no+$required_better)) { $good = 1 }
		    }
		    else
		    {
			if ($debug) { print STDERR "    no - $peg ", join(",",@$bounds),"\n" }
			++$no;
			if ($no > ($yes+$required_better)) { $bad = 1 }
		    }
		}
		else {
		    if ($debug) {
			print STDERR "    bounds=", ($bounds ? qq(non-nil) : qq(nil))
			    , ", num_sims=", (scalar @$sims), "\n";
		    }
		}
	    }
	    else {
		if ($debug) {
		    print STDERR
			"    seen=", ($seen{$peg} ? $seen{$peg} : 0), " score=$sc,"
			, " matched1=$matched1, ln1=$ln1,"
			, " matched2=$matched2, ln2=$ln2,"
			, "\n";
		}
	    }
	}
    }
    if ($yes > $no) { $good = 1 }
    if ($debug) { print STDERR "        yes=$yes no=$no good=$good\n"; }
    return ($good,$sims);
}

=head3 family_function

usage:
    $func = $figfam_obj->family_function();

Returns the "consensus function" assigned to a FIGfam object.

=cut

sub family_function {
    my($self,$full) = @_;

    my $func = $self->{function};
    if (! $full) { $func =~ s/^FIG\d{6} \(not subsystem-based\): // }
    return $func;
}



=head3 family_id

usage:
    $fam_id = $figfam_obj->family_id();

Returns the FIGfam ID of a FIGfam object.

=cut

sub family_id {
    my($self) = @_;

    return $self->{id};
}



=head3 family_dir

usage:

    $dir = &FigFam::family_dir( $fam_data, $fam_id );

Returns the path to the subdirectory of C<$fam_data>
that the FIGfam data for a FIGfam with ID C<$fam_id>
would be stored in.

=cut

sub fam_dir {
    my($fam_data,$fam_id) = @_;

    $fam_id =~ /(\d{3})$/;
    return "$fam_data/FIGFAMS/$1/$fam_id";
}

sub by_fig_id {
    my($a,$b) = @_;
    my($g1,$g2,$t1,$t2,$n1,$n2);
    if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
         ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
        ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
    } else {
        $a cmp $b;
    }
}

sub read_fasta_record {

    shift if UNIVERSAL::isa($_[0],__PACKAGE__);
    my ($file_handle) = @_;
    my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);

    if (not defined($file_handle))  { $file_handle = \*STDIN; }

    $old_end_of_record = $/;
    $/ = "\n>";

    if (defined($fasta_record = <$file_handle>)) {
        chomp $fasta_record;
        @lines  =  split( /\n/, $fasta_record );
        $head   =  shift @lines;
        $head   =~ s/^>?//;
        $head   =~ m/^(\S+)/;
        $seq_id = $1;
        if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
        $sequence  =  join( "", @lines );
        @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
    } else {
        @parsed_fasta_record = ();
    }

    $/ = $old_end_of_record;

    return @parsed_fasta_record;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3