[Bio] / FigKernelScripts / connections_between.pl Repository:
ViewVC logotype

View of /FigKernelScripts/connections_between.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Sun Jan 11 13:43:37 2009 UTC (10 years, 10 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, HEAD
Changes since 1.7: +65 -35 lines
fixes to making close strain data

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use strict;

use FIG;
use FIGV;

my $fig;

my($simsD,$iden_diff);


my $usage = "usage: connections_between [-orgdir orgdir]  [-cluster N1] [-insub N2] Sims IdenDiff > connections";

my $orgdir;
my $must_cluster = 0;
my $must_be_in_sub = 0;

while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-orgdir/)
    {
	$orgdir = shift @ARGV;
    }
    elsif ($arg =~ /-cluster/)
    {
	$must_cluster = shift @ARGV;
    }
    elsif ($arg =~ /-insub/)
    {
	$must_be_in_sub = shift @ARGV;
    }
    else
    {
	die "Unknown option $arg\n";
    }
}

if ($orgdir)
{
    $fig = new FIGV($orgdir);
}
else
{
    $fig = new FIG;
}

(
 ($simsD      = shift @ARGV) &&
 ($iden_diff  = shift @ARGV)
)
    || die $usage;

#
# Here we compute whether G1 <-> G2 by first finding out
#
#       1. Are they BBHs with a safety threshold of IdenDiff (using identify, rather than Psc)?
#          If they are, then they are  matched. 
#
#       2. If 
#              a) G1 is not a safe BBH of G2 (but is similar)
#              b) there is no other safe BBH to either
#              c) they have a superior context (by at least NeighDiff)
#
#          then we call it a match
#
# We compute all binary matches.  
#
# Then, we compute families as transitive close over the binary matches.
#
# Any resulting family with more than one gene from a single genome is registered
# as an error.

opendir(SIMS,$simsD) || die "Where are the similarities?";
my @sims_files = grep { $_ !~ /^\./ } readdir(SIMS);
closedir(SIMS);

my($file,%genomes,%sims,%paralogs);
foreach $file (@sims_files)
{
    if ($file =~ /^(\d+\.\d+)-(\d+\.\d+)$/)
    {
	my $genome1 = $1;
	my $genome2 = $2;
	if ($genome1 eq $genome2)
	{
	    open(PAR,"cut -f1 $simsD/$file |") || die "could not open $simsD/$file";
	    while (defined($_ = <PAR>))
	    {
		chomp;
		$paralogs{$_} = 1;
	    }
	    close(PAR);
	}
    }
}

foreach $file (@sims_files)
{
    if ($file =~ /^(\d+\.\d+)-(\d+\.\d+)$/)
    {
	my $genome1 = $1;
	my $genome2 = $2;
	$genomes{$genome1} = 1;
	$genomes{$genome2} = 1;

	if (open(SIMS,"<$simsD/$file"))
	{
	    while (defined($_ = <SIMS>))
	    {
		chomp;
		my($peg1,$peg2,$iden,$b1,$e1,$b2,$e2,$sc,$ln1,$ln2) = split(/\t/,$_);
		if (((($e1-$b1)/$ln1) > 0.7) && ((($e2-$b2)/$ln2) > 0.8) && (! $paralogs{$peg1}) && (! $paralogs{$peg2}))
		{
		    push(@{$sims{"$genome1,$genome2"}->{$peg1}},[$iden,$peg2]);
		}
	    }
	    close(SIMS);
	}
	else
	{
	    die "could not open $simsD/$file";
	}
    }
    else
    {
	die "$simsD/$file is not a valid similarity file";
    }
}

my @genomes = sort { $a <=> $b } keys(%genomes);
my $n       = @genomes;
print STDERR "Building families from $n genomes\n";
my $genome;
foreach $genome (@genomes)
{
    my $gs = $fig->genus_species($genome);
    print STDERR "$genome\t$gs\n";
}
print STDERR "\n";

my $tmpF = "/tmp/connections$$";
open(TMP,">$tmpF") || die "could not open $tmpF";
my $neighbors = {};

my($genome1,$genome2);
foreach $genome1 (@genomes)
{
    foreach $genome2 (@genomes)
    {
	if ( $genome1 lt $genome2 )
	{
	    my $genomes = "$genome1,$genome2";
	    print STDERR "processing $genomes\n";
	    my $bbhs = &bbhs($sims{$genomes},$sims{"$genome2,$genome1"},$iden_diff);

	    my($peg1,$peg2,);

	    foreach $peg1 (sort { &FIG::by_fig_id($a,$b) } keys(%{$sims{$genomes}}))
	    {
		my $conn;
		if ($peg2 = $bbhs->{$peg1})
		{
		    if (! $must_cluster)
		    {
			print TMP "$peg1\t$peg2\n";
		    }
		    else
		    { 
			my $clustered = &clustered($peg1,$peg2,\%sims,$genome1,$genome2,$bbhs);
			if (@$clustered >= $must_cluster)
			{
			    my $common_subs;
			    if ((! $must_be_in_sub) || (@{($common_subs = &insub($fig,$clustered))} >= $must_be_in_sub))
			    {
				print TMP "$peg1\t$peg2\n";
				print STDERR "connection\t$peg1\t$peg2\t";
				foreach my $x (@$clustered) { print STDERR join(",",@$x),";" }   print STDERR "\t";
				foreach my $x (@$common_subs) { print STDERR join(",",@$x),";" } print STDERR "\n";
			    }
			}
		    }
		}
	    }
	}
    }
}
close(TMP);
open(CLUSTERS,"cluster_objects < $tmpF |") || die "could not run cluster_objects on $tmpF";
while (defined($_ = <CLUSTERS>))
{
    chop;

    my($peg,%genomes);
    my @pegs = split(/\t/,$_);

    undef %genomes;
    my $ok = 1;
    foreach $peg (@pegs)
    {
	if ($genomes{&FIG::genome_of($peg)}) { $ok = 0 }
	$genomes{&FIG::genome_of($peg)} = 1;
    }
    
    if ($ok)
    {
	print "$_\n";
    }
    else
    {
	print STDERR "$_\n";
    }
}
unlink($tmpF);    

sub clustered {
    my($peg1,$peg2,$sims,$genome1,$genome2,$bbhs) = @_;
    my $neigh1 = &neighbors_of($peg1,$neighbors);
    my $neigh2 = &neighbors_of($peg2,$neighbors);
    my %neigh2H = map { $_ => 1 } @$neigh2;
    my $matches = [];
    my ($peg_in1,$peg_in2);
    foreach $peg_in1 (@$neigh1)
    {
	$peg_in2 = $bbhs->{$peg_in1};
	if ($neigh2H{$peg_in2})
	{
	    push(@$matches,[$peg_in1,$peg_in2]);
	}
    }
    return $matches;
}

sub insub {
    my($fig,$clustered) = @_;

    my $n = [];
    my $tuple;
    foreach $tuple (@$clustered)
    {
	my($peg1,$peg2) = @$tuple;
	my $func1 = $fig->function_of($peg1);
	my $func2 = $fig->function_of($peg2);
	if ($func1 eq $func2)
	{
	    my @tmp = grep { $fig->usable_subsystem($_) } $fig->peg_to_subsystems($peg1);
	    if (@tmp > 0)
	    {
		@tmp = grep { $fig->usable_subsystem($_) } $fig->peg_to_subsystems($peg2);
		if (@tmp > 0)
		{
		    push(@$n,[$peg1,$peg2,$func1]);
		}
	    }
	}
    }
    return $n;
}


sub neighbors_of {
    my($peg,$neighbors) = @_;

    if (! ($_ = $neighbors->{$peg}))
    {
	$neighbors->{$peg} = [grep { $_ =~ /peg/ } $fig->close_genes($peg,5000)];
    }
    return $neighbors->{$peg};
}

sub bbhs {
    my($sims1,$sims2,$iden_diff) = @_;
    my($peg1,$best1,$best2);

    my $bbhs = {};
    foreach $peg1 (sort { &FIG::by_fig_id($a,$b) } keys(%$sims1))
    {
	if (($best1 = &best($sims1->{$peg1},$iden_diff)) &&
	    ($best2 = &best($sims2->{$best1},$iden_diff)) &&
	    ($best2 eq $peg1))
	{
	    $bbhs->{$peg1} = $best1;
	    $bbhs->{$best1} = $peg1;
	}
    }
    return $bbhs;
}

sub best {
    my($sims,$iden_diff) = @_;

    if (! $sims) { return undef }

    if ((@$sims == 1) ||
	((@$sims > 1) && (($sims->[0]->[0] - $sims->[1]->[0]) >= $iden_diff)))
    {
	return $sims->[0]->[1];
    }
    return undef;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3