[Bio] / FigKernelScripts / FFB2_merge_families_based_on_genus_groups.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_merge_families_based_on_genus_groups.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Nov 22 17:41:30 2010 UTC (9 years ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +176 -38 lines
Big round of figfam update changes

use SeedEnv;

use strict;
use Data::Dumper;
use FIG;
use List::MoreUtils qw(first_index part);
use List::Util qw(reduce);
use Cache::Memcached::Fast;

my $fig = new FIG;
my $sapO = SAPserver->new();

my $usage = "usage: FFB2_merge_families_based_on_genus_groups old.families.2c genus.groups new.families.2c\n";

@ARGV == 3 or die $usage;

my $oldFF = shift;
my $genus_groups = shift;
my $newFF = shift;

my $func_thresh = 0.75;

my $mchost = 'bio-data-2';
my $mcport = 11211;
my $mc;
if ($mchost && $mcport)
{
    $mc = new Cache::Memcached::Fast({ servers => ["$mchost:$mcport"] } );
}

open(OLD_FF, "<", $oldFF) or die "Cannot open old families file $oldFF: $!";
open(GGROUP, "<", $genus_groups) or die "Cannot open genus groups file $genus_groups: $!";
open(NEW_FF, ">", $newFF) or die "Cannot open new families file $newFF: $!";

my($peg_to_fam,$members_in_fam,$nextP) = &existing_families(\*OLD_FF,$fig);
close(OLD_FF);

while (<GGROUP>)
{
    chomp;
    my @set = split(/\t/);
    &process_set(\@set,$peg_to_fam,$members_in_fam,$nextP,$sapO);
}
close(GGROUP);

&print_families_2c(\*NEW_FF, $members_in_fam);

sub process_set {
    my($set,$peg_to_fam,$members_in_fam,$nextP,$sapO) = @_;

    #
    # Keys of %ffs are the figfams represented by the pegs in $set,
    # which is the contents of one of the new genus group families.
    #
    my %ffs = map { $peg_to_fam->{$_} ? ($peg_to_fam->{$_} => 1) : () } @$set;

    my @ffs = keys %ffs;

    my @bad = grep { !ref($members_in_fam->{$_}) } @ffs;
    if (@bad)
    {
	die "The following fams had no member_in_fam entries: @bad\n";
    }

    my $cluster_fn = function_of_set($sapO, $set);
    my $set_id = $set->[0];

    my @merge_ffs = ();
    if ($cluster_fn)
    {
	my %fam_fns = map { $_ => function_of_set($sapO, $members_in_fam->{$_}) } @ffs;

	my @nomatch;
	for my $ff (@ffs)
	{
	    if ($fam_fns{$ff} eq $cluster_fn)
	    {
		push(@merge_ffs, $ff);
	    }
	    else
	    {
		push(@nomatch, "$ff $fam_fns{$ff}");
	    }
	}
	#my @res = part { ($fam_fns{$_} eq $cluster_fn) ? 0 : 1} @ffs;
	#warn Dumper(\@res);
	#my ($ffs_same, $ffs_diff) = @res;
	# @merge_ffs = grep { ($fam_fns{$_} eq $cluster_fn) } @ffs;
	
	@merge_ffs = sort { @{$members_in_fam->{$b}} <=> @{$members_in_fam->{$a}} } @merge_ffs;
	
	warn "cluster_fun=$cluster_fn. Same: @merge_ffs. \n";
	warn "  nomatch: $_\n" for @nomatch;
    }
    else
    {
	warn "$set_id: no function\n";
    }

    my $merge_target;
    if (@merge_ffs)
    {
	$merge_target = shift(@merge_ffs);
	warn "Merge into $merge_target: $set_id @merge_ffs\n";
    }
    else
    {
	$merge_target = $$nextP;
	$$nextP++;
	$members_in_fam->{$merge_target} = [];
	warn "Merge into new $merge_target: $set_id\n";
    }

    for my $ff (@merge_ffs)
    {
	for my $peg (pegs_in_fam($ff))
	{
	    move_to_fam($peg, $merge_target);
	}
	delete_fam($ff);
    }

    for my $peg (@$set)
    {
	move_to_fam($peg, $merge_target);
    }
}

sub function_of_set
{
    my($sap, $set) = @_;

    my %need;

    $need{$_}++ for @$set;

    my @fns;
    if ($mc)
    {
	my $fns = $mc->get_multi(map { "f:$_" } @$set);
	for my $k (@$set)
	{
	    my $fn = $fns->{"f:$k"};
	    $fn =~ s/\s*$//;
	    $fn =~ s/^\s*//;
	    if ($fn)
	    {
		push(@fns, $fn);
		delete $need{$k};
		# warn "cache: $k => $fn\n";
	    }
	}
    }
    my $fns = $sap->ids_to_functions(-ids => [keys %need]);
    for my $peg (keys %need)
    {
	my $fn = $fns->{$peg};
	$fn =~ s/\s*$//;
	$fn =~ s/^\s*//;
	if ($fn && $mc)
	{
	    $mc->set("f:$peg", $fn);
	}
	# warn "lookup: $peg => $fn\n";
	push(@fns, $fn);
    }

    my %counts;
    $counts{$_}++ for @fns;
    my $fn = reduce { $counts{$a} > $counts{$b} ? $a : $b } @fns;
    my $n = @fns;
    my $frac = $counts{$fn} / $n;
    # warn "Winning count: $fn $counts{$fn} of $n frac=$frac\n";
    if ($fn ne 'hypothetical protein' && $frac >= $func_thresh)
    {
	return $fn;
    }
    else
    {
	return undef;
    }
}

sub pegs_in_fam
{
    my($fam) = @_;
    return @{$members_in_fam->{$fam}};
}

sub delete_fam
{
    my($fam) = @_;

    my $pegs = $members_in_fam->{$fam};
    if (ref($pegs) && @$pegs)
    {
	warn "Removing fam $fam with remaining pegs @$pegs\n";
	delete $peg_to_fam->{$_} for @$pegs;
    }

    delete $members_in_fam->{$fam};
}

sub move_to_fam
{
    my($peg, $dest) = @_;

    my $cur = $peg_to_fam->{$peg};
    if ($cur)
    {
	if ($cur eq $dest)
	{
	    warn "$peg already in $dest\n";
	    return;
	}
	my $pegs = $members_in_fam->{$cur};
	warn "Move $peg from $cur to $dest\n";
	my @orig = @$pegs;
	my $i = first_index { $_ eq $peg } @$pegs;
	if ($i >= 0)
	{
	    my $x = splice(@$pegs, $i, 1);
	    if ($x ne $peg)
	    {
		die "Bad splice: $cur idx $i: $x ne $peg in @$pegs (@orig)\n";
	    }
	}
	else
	{
	    die "Bad find: $peg not in fam $cur: @$pegs\n";
	}
    }
    $peg_to_fam->{$peg} = $dest;
    push(@{$members_in_fam->{$dest}}, $peg);
}

sub existing_families {
    my ($fh,$fig) = @_;
    
    my $peg_to_fam = {};
    my $members_in_fam = {};
    my $max_fam = "FIG00000000";

    my $last = <$fh>;
    while ($last && ($last =~ /^(FIG\d+)\t(fig\|\d+\.\d+\.peg\.\d+)/))
    {
	my $fam = $1;
	my $set = [];
	while ($last && ($last =~ /^(FIG\d+)\t(fig\|\d+\.\d+\.peg\.\d+)/) && ($1 eq $fam))
	{
	    my $peg = $2;
	    if (! $fig->is_deleted_fid($peg))
	    {
		push(@$set,$peg);
		$peg_to_fam->{$peg} = $fam;
	    }
	    $last = <$fh>;
	}
	if (scalar @$set > 1)
	{
	    $members_in_fam->{$fam} = $set;
	}
	else
	{
	    foreach my $peg (@$set)
	    {
		delete $peg_to_fam->{$peg};
	    }
	}
    
	if ($fam gt $max_fam) { $max_fam = $fam }
    }
    $max_fam++;
    return ($peg_to_fam,$members_in_fam,\$max_fam);
}


sub print_families_2c {
    my($fh, $members_in_fam) = @_;

    foreach my $fam (sort keys(%$members_in_fam))
    {
	foreach my $peg (@{$members_in_fam->{$fam}})
	{
	    print $fh join("\t",($fam,$peg)),"\n";
	}
    }
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3