[Bio] / FigKernelScripts / connections_between.pl Repository:
ViewVC logotype

View of /FigKernelScripts/connections_between.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Apr 22 00:20:26 2005 UTC (14 years, 11 months ago) by overbeek
Branch: MAIN
CVS Tags: merge-trunktag-bobdev_news-2, Root-bobdev_news, merge-bobdev_news-1, caBIG-dataload-0, merge-trunktag-bobdev_news-1, merge-bodev_news-3, caBIG-00-00-00, merge-bobdev_news-2, merge-trunktag-bodev_news-3
Branch point for: Branch-bobdev_news
connections_between got added

use strict;

use FIG;
my $fig = new FIG;

my($simsD,$iden_diff,$neigh_diff,$req_neigh);

my $usage = "usage: connections_between Sims IdenDiff NeighDiff  > connections";
(
 ($simsD      = shift @ARGV) &&
 ($iden_diff  = shift @ARGV) &&
 ($neigh_diff = shift @ARGV)
)
    || die $usage;

#
# Here we compute whether G1 <-> G2 by first finding out
#
#       1. Are they BBHs with a safety threshold of IdenDiff (using identify, rather than Psc)?
#          If they are, then they are  matched. 
#
#       2. If 
#              a) G1 is not a safe BBH of G2 (but is similar)
#              b) there is no other safe BBH to either
#              c) they have a superior context (by at least NeighDiff)
#
#          then we call it a match
#
# We compute all binary matches.  
#
# Then, we compute families as transitive close over the binary matches.
#
# Any resulting family with more than one gene from a single genome is registered
# as an error.

opendir(SIMS,$simsD) || die "Where are the similarities?";
my @sims_files = grep { $_ !~ /^\./ } readdir(SIMS);
closedir(SIMS);

my($file,%genomes,%sims);
foreach $file (@sims_files)
{
    if ($file =~ /^(\d+\.\d+)-(\d+\.\d+)$/)
    {
	my $genome1 = $1;
	my $genome2 = $2;
	$genomes{$genome1} = 1;
	$genomes{$genome2} = 1;

	if (open(SIMS,"<$simsD/$file"))
	{
	    while (defined($_ = <SIMS>))
	    {
		if ($_ =~ /^(\S+)\t(\S+)\t(\S+)/)
		{
		    push(@{$sims{"$genome1,$genome2"}->{$1}},[$3,$2]);
		}
	    }
	    close(SIMS);
	}
	else
	{
	    die "could not open $simsD/$file";
	}
    }
    else
    {
	die "$simsD/$file is not a valid similarity file";
    }
}

my @genomes = sort { $a <=> $b } keys(%genomes);
my $n       = @genomes;
print STDERR "Building families from $n genomes\n";
my $genome;
foreach $genome (@genomes)
{
    my $gs = $fig->genus_species($genome);
    print STDERR "$genome\t$gs\n";
}
print STDERR "\n";

my $tmpF = "/tmp/connections$$";
open(TMP,">$tmpF") || die "could not open $tmpF";
my $neighbors = {};

my($genome1,$genome2);
foreach $genome1 (@genomes)
{
    foreach $genome2 (@genomes)
    {
	if ($genome1 < $genome2)
	{
	    my $genomes = "$genome1,$genome2";
	    print STDERR "processing $genomes\n";
	    my $bbhs = &bbhs($sims{$genomes},$sims{"$genome2,$genome1"},$iden_diff);

	    my($peg1,$peg2,);

	    foreach $peg1 (sort { &FIG::by_fig_id($a,$b) } keys(%{$sims{$genomes}}))
	    {
		my $conn;
		if ($peg2 = $bbhs->{$peg1})
		{
		    print TMP "$peg1\t$peg2\n";
		}
		elsif ($conn = $sims{$genomes}->{$peg1})
		{
		    my(@hits);
		    my $neigh1 = &neighbors_of($peg1,$neighbors);

		    my $tuple;
		    foreach $tuple (@$conn)
		    {
			my($iden,$peg2) = @$tuple;
			if ($iden >= ($conn->[0]->[0] - $iden_diff))
			{
			    my $neigh2 = &neighbors_of($peg2,$neighbors);
			    push(@hits,[$peg2,&matches($neigh1,$neigh2,$bbhs)]);
			}
		    }
		    @hits = sort { $b->[1] <=> $a->[1] } @hits;
		    if ((@hits == 1) || ((@hits > 1) && (($hits[0]->[1] - $hits[1]->[1]) > $neigh_diff)))
		    {
			print TMP "$peg1\t$hits[0]->[0]\n";
		    }
		}
	    }
	}
    }
}
close(TMP);
open(CLUSTERS,"cluster_objects < $tmpF |") || die "could not run cluster_objects on $tmpF";
while (defined($_ = <CLUSTERS>))
{
    chop;

    my($peg,%genomes);
    my @pegs = split(/\t/,$_);

    undef %genomes;
    my $ok = 1;
    foreach $peg (@pegs)
    {
	if ($genomes{&FIG::genome_of($peg)}) { $ok = 0 }
	$genomes{&FIG::genome_of($peg)} = 1;
    }
    
    if ($ok)
    {
	print "$_\n";
    }
    else
    {
	print STDERR "$_\n";
    }
}
#unlink($tmpF);    

sub matches {
    my($n1,$n2,$bbhs) = @_;
    my($peg,%want);

    my $n = 0;
    foreach $peg (@$n1)
    {
	if ($_ = $bbhs->{$peg})
	{
	    $want{$_} = 1;
	}
    }

    foreach $peg (@$n2)
    {
	if ($want{$peg})
	{
	    $n++;
	}
    }
    return $n;
}
    

sub neighbors_of {
    my($peg,$neighbors) = @_;

    if (! ($_ = $neighbors->{$peg}))
    {
	$neighbors->{$peg} = [grep { $_ =~ /peg/ } $fig->close_genes($peg,5000)];
    }
    return $neighbors->{$peg};
}

sub bbhs {
    my($sims1,$sims2,$iden_diff) = @_;
    my($peg1,$best1,$best2);

    my $bbhs = {};
    foreach $peg1 (sort { &FIG::by_fig_id($a,$b) } keys(%$sims1))
    {
	if (($best1 = &best($sims1->{$peg1},$iden_diff)) &&
	    ($best2 = &best($sims2->{$best1},$iden_diff)) &&
	    ($best2 eq $peg1))
	{
	    $bbhs->{$peg1} = $best1;
	    $bbhs->{$best1} = $peg1;
	}
    }
    return $bbhs;
}

sub best {
    my($sims,$iden_diff) = @_;

    if (! $sims) { return undef }

    if ((@$sims == 1) ||
	((@$sims > 1) && (($sims->[0]->[0] - $sims->[1]->[0]) >= $iden_diff)))
    {
	return $sims->[0]->[1];
    }
    return undef;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3