[Bio] / FigKernelScripts / FFB2_split_fams_by_size.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_split_fams_by_size.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Nov 22 17:41:30 2010 UTC (9 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011
Changes since 1.1: +141 -13 lines
Big round of figfam update changes

use strict;
use SeedUtils;
use FIG;
use List::Util 'min';
use Data::Dumper;
use Parallel::Fork::BossWorker;
use Cache::Memcached::Fast;
use Graph;
use Clustering;
use YAML::Any;
use tree_utilities;
use Algorithm::Numerical::Shuffle qw(shuffle);

$| = 1;

my $fig;

@ARGV == 2 or @ARGV == 4 or die "Usage: $0 old-families.2c new-families.2c [memcache-host memcache-port]\n";

my $coverage_thresh = 0.7;
my $length_thresh = 0.3;
my $blast_thresh = 1e-5;

my $old_f2c = shift;
my $new_f2c = shift;

my $mchost = shift;
my $mcport = shift;
my $mc;

if ($mchost && $mcport)
{
    $mc = new Cache::Memcached::Fast({ servers => ["$mchost:$mcport"] } );
}

my %length_cache;

my $n_workers = 8;

open(OLD, "<", $old_f2c) or die "cannot open $old_f2c: $!";
open(NEW, ">", $new_f2c) or die "cannot open $new_f2c: $!";

my $bw = Parallel::Fork::BossWorker->new(work_handler => \&do_work,
					 result_handler => \&handle_result,
					 worker_count => $n_workers);

my $max_fam = 0;

my @work;

SeedUtils::map_to_families(\*OLD, \&process_fam);

close(OLD);

#
# Shuffle to randomize sizes so we're not doing all large or all
# small families at once.
#
# Put index in to track where we are in the overall body of work to be done.
#
my $idx = 0;
for my $item (shuffle(@work))
{
    $item->{index} = $idx++;
    $bw->add_work($item);
}
my $n_items = @work;
my $n_done = 0;
undef @work;

my $next_fam = $max_fam + 1;
$next_fam++;

$bw->process();

close(NEW);

sub process_fam
{
    my($fam, $set) = @_;
    if ($fam =~ /^(FIG)?(\d+)/)
    {
	$max_fam = $2 if ($2 > $max_fam);
    }
    else
    {
	die "Invalid family $fam";
    }
    
    my $item = { fam => $fam, set => $set};
    push(@work, $item);
}

sub do_work
{
    my($item) = @_;
    my $fam = $item->{fam};
    my $set = $item->{set};

    if (!defined($fig))
    {
	print "Create fig in $$\n";
	$fig = new FIG;
    }

    print "Work: $$ fam=$fam\n";

    my $tmp = "$FIG_Config::temp/fasta.$$";

    my $fh;
    open($fh, ">", $tmp) or die "Cannot write $tmp: $!";

    my %len;
    for my $peg (@$set)
    {
	my $trans = get_translation($fig, $peg);
	if (length($trans) == 0)
	{
	    print STDERR "Skip zero-length sequence $peg\n";
	    next;
	}
    
	$len{$peg} = length($trans);
	print $fh ">$peg\n$trans\n";
    }
    close($fh);

    # my $graph = Graph->new();
    my $conns = {};
    
    #
    # Need this so system can catch the signal for these invocations.
    #
    {
	local $SIG{CHLD};
	undef $SIG{CHLD};
	my @cmd = ("formatdb", "-p", "t", "-i", $tmp);
	my $rc = system(@cmd);
	if ($rc < 0)
	{
	    die "failure starting: $!: @cmd\n";
	}
	$rc == 0 or die "failed with rc=$rc: @cmd\n";
	
	my $bfh;
	open($bfh, "-|", "blastall", "-p", "blastp", "-i", $tmp, "-d", $tmp, "-e", $blast_thresh, "-m8")
	    or die "Cannot open blastall pipe: $!";
	while (<$bfh>)
	{
	    my $sim = Sim->new_from_line($_);

	    next if $sim->id1 eq $sim->id2;

	    my $ln1 = $len{$sim->id1};
	    my $ln2 = $len{$sim->id2};

	    my $sim_frac = ($sim->e1 + 1 - $sim->b1)/$ln1;
	    my $len_diff = abs($ln2 - $ln1) / $ln1;

	    # print $sim->id1, "\t", $sim->id2, "\t$sim_frac\t$len_diff\n";

	    if ($sim_frac > $coverage_thresh && $len_diff <= $length_thresh)
	    {
		# $graph->add_edge($sim->id1, $sim->id2);
		$conns->{$sim->id1}->{$sim->id2} = 1;
		$conns->{$sim->id2}->{$sim->id1} = 1;
	    }
	}
	close($bfh);
    }

#    my @cc = $graph->strongly_connected_components();

#    $graph->delete_edge(@$_) for $graph->edges;
#    $graph->delete_vertex($_) for $graph->vertices;

#    print Dump($conns);

#    my ($clusters, $trees) = Clustering::cluster($conns, 1, 'min_dist');
    my $clusters = tarjan($conns);

#    print Dump($clusters);
    
    return { fam => $fam, sets => $clusters };
}

sub handle_result
{
    my($res) = @_;
    my $fam = $res->{fam};
    my $sets = $res->{sets};

    my $id;
    my @ids;

    $n_done++;
    print "Finish $fam ($n_done of $n_items)\n";
    for my $set (sort { $#$b <=> $#$a } @$sets)
    {
	if (@$set < 2)
	{
	    print STDERR "$fam: Discard small set @$set\n";
	}
	else
	{
	    if (!defined($id))
	    {
		$id = $fam;
	    }
	    else
	    {
		$id = sprintf("FIG%08d", $next_fam++);
	    }
	    push(@ids, $id);
	    print NEW "$id\t$_\n" for @$set;
	}
    }
    if (@ids > 1)
    {
	print "SPLIT: @ids\n";
    }
}

sub get_translation
{
    my($fig, $peg) = @_;

    my $tr;
    if ($mc)
    {
	$tr = $mc->get("s:$peg");
    }
    if (!$tr)
    {
	$tr = $fig->get_translation($peg);
	if ($tr && $mc)
	{
	    $mc->set("s:$peg", $tr);
	}
    }
    return $tr;
}

sub tarjan
{
    my($conns) = @_;

    my $index = 0;
    my $S = [];

    my $V = {};

    my %nodes;

    $nodes{$_} = 1 for keys %$conns;
    $nodes{$_} = 1 for map { keys %$_ } values %$conns;

    for my $v (keys %nodes)
    {
	if (!defined($V->{$v}->{index}))
	{
	    tarjan1($v, \$index, $S, $V, $conns);
	}
    }
}
sub tarjan
{
    my($conns) = @_;

    my $index = 0;
    my $S = [];

    my $V = {};

    my %nodes;

    $nodes{$_} = 1 for keys %$conns;
    $nodes{$_} = 1 for map { keys %$_ } values %$conns;

    my $comps = [];

    for my $v (keys %nodes)
    {
	if (!defined($V->{$v}->{index}))
	{
	    tarjan1($v, \$index, $S, $V, $conns, $comps);
	}
    }
    return $comps;
}

sub tarjan1
{
    my($v, $indexp, $S, $V, $conns, $comps) = @_;

    $V->{$v}->{index} = $$indexp;
    $V->{$v}->{lowlink} = $$indexp;
    $$indexp++;
    
    push(@$S, $v);
    $V->{$v}->{stacked} = 1;

#    print "Tarjan $v\n";
    my @edges = keys %{$conns->{$v}};
#    print "edges @edges\n";
#    print Dumper($V);
    
    for my $vp (@edges)
    {
	if (!defined($V->{$vp}->{index}))
	{
	    tarjan1($vp, $indexp, $S, $V, $conns, $comps);
	    $V->{$v}->{lowlink} = min $V->{$v}->{lowlink}, $V->{$vp}->{lowlink};
	}
	elsif ($V->{$vp}->{stacked})
	{
	    $V->{$v}->{lowlink} = min $V->{$v}->{lowlink}, $V->{$vp}->{lowlink};
	}
    }
    if ($V->{$v}->{lowlink} == $V->{$v}->{index})
    {
	my $vp;
	my $comp;
	do
	{
	    $vp = pop(@$S);
	    $V->{$vp}->{stacked} = 0;
	    push(@$comp, $vp);
	} while ($vp ne $v);
	push(@$comps, $comp);
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3