[Bio] / FigKernelScripts / close_connections.pl Repository:
ViewVC logotype

View of /FigKernelScripts/close_connections.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Aug 27 18:27:33 2007 UTC (12 years, 2 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +1 -29 lines
add strip comments argument on function_of

use FIG;
my $fig = new FIG;

use strict;

# usage: close_connections [MinNormSc] < SimsFile > Connections

# a minimum normalized bit score (bit score divided by the max of the lengths of the input seqs)
# is used to filter similarities.  We use a value of 1.3 as a default, but I have not put
# any real effort into finding what an optimal value might be.
my $min_norm_sc = (@ARGV > 0) ? $ARGV[0] : 1.3;


my(%genomes,%sims);
while (defined($_ = <STDIN>))
{
    chop;
    my($id1,$id2,undef,undef,undef,undef,undef,undef,$ln1,$ln2,$bsc) = split(/\t/,$_);
    $genomes{&FIG::genome_of($id1)} = 1;
    $genomes{&FIG::genome_of($id2)} = 1;

    my $norm_bsc = sprintf("%.3f",$bsc / &FIG::max($ln1,$ln2));
    if ($norm_bsc >= $min_norm_sc)
    {
	push(@{$sims{$id1}},[$id2,$norm_bsc]);
	push(@{$sims{$id2}},[$id1,$norm_bsc]);
    }
}

my @genomes = sort { $a <=> $b } keys(%genomes);
if (@genomes != 2)
{
    print &Dumper(\%genomes);
    die "We should have just 2 genomes";
}

my($peg,$i);
foreach $peg (keys(%sims))
{
    my @tmp = sort { $b->[1] <=> $a->[1] } @{$sims{$peg}};
    for ($i=0; ($i < @tmp) && ($tmp[$i]->[1] >= ($tmp[0]->[1] - 0.1)); $i++) {}
    $#tmp = $i-1;
    $sims{$peg} = [@tmp];
}

# We compute two hashes, one for each genome.  Each hash contains an entry for each PEG
# in that genome.  The value for a given PEG is [contig,N] where N increases by 1 as you
# go through the PEGs on a contig.  Thus, two PEGs are adjacent iff they have the same
# contigs and their N values differ by 1.
#

my $locs1  = &get_locations($fig,$genomes[0]);
my $locs2  = &get_locations($fig,$genomes[1]);

my @pegs1 = sort { ($locs1->{$a}->[0] cmp $locs1->{$b}->[0]) or 
		   ($locs1->{$a}->[1] <=> $locs1->{$b}->[1]) }
            keys(%$locs1);

$i = 0;
my($x,$j,$start,$peg1B,$peg2B,$peg2P);
while ($i < @pegs1) 
{
    if ($peg2B = &clear_bbh($pegs1[$i],\%sims)) 
    {
	$peg1B = $pegs1[$i];
	$start = $i;
	$i++;
	my $last = [$peg1B,$peg2B];
	my @matches = ($last);
	my $got_end = 0;
	while (($i < @pegs1) && (! $got_end) && ($peg2P = &poss($fig,$last,$pegs1[$i],\%sims,$locs1,$locs2)))
	{
	    $last = [$pegs1[$i],$peg2P];
	    push(@matches,$last);
	    if (&are_clear_bbhs($pegs1[$i],$peg2P,\%sims))
	    {
		$got_end = 1;
	    }
	    else
	    {
		$i++;
	    }
	}

	if ($got_end && (@matches <= 10))
	{
	    my $pair;
	    foreach $pair (@matches)
	    {
		print join("\t",@$pair),"\n";
	    }
	}
    }
    else
    {
	$i++;
    }
}

sub clear_bbh {
    my($peg1,$sims) = @_;
    my($x,$y,$peg2);

    if  (($x = $sims->{$peg1}) && (@$x == 1) && 
	 ($peg2 = $x->[0]->[0]) && 
	 ($y = $sims->{$peg2}) && (@$y == 1) &&
	 ($y->[0]->[0] eq $peg1))
    {
	return $peg2;
    }
    else
    {
	return undef;
    }
}

sub poss {
    my($fig,$last,$peg1b,$sims,$locs1,$locs2) = @_;
    my($x,$y,$i,$peg2b);

    my($peg1a,$peg2a) = @$last;
    if ($x = $sims->{$peg1b})
    {
	for ($i=0; ($i < @$x) && (! &adjacent($peg2a,$x->[$i]->[0],$locs2)); $i++) {}
	if ($i < @$x)
	{
	    $peg2b = $x->[$i]->[0];
	    if ($y = $sims->{$peg2b})
	    {
		for ($i=0; ($i < @$y) && ($y->[$i]->[0] ne $peg1b); $i++) {}
		if ($i < @$y)
		{
		    return $peg2b;
		}
	    }
	}
    }
    return undef;
}

sub adjacent {
    my($peg1,$peg2,$locs) = @_;
    my($x,$y);

    return (($x = $locs->{$peg1}) && ($y = $locs->{$peg2}) && ($x->[0] eq $y->[0]) &&
	    (abs($x->[1] - $y->[1]) == 1));
}

sub are_clear_bbhs {
    my($peg1,$peg2,$sims) = @_;
    my($x,$y);

    return (($x = $sims->{$peg1}) && (@$x == 1) && ($x->[0]->[0] eq $peg2) &&
	    ($y = $sims->{$peg2}) && (@$y == 1) && ($y->[0]->[0] eq $peg1));
}

sub get_locations {
    my($fig,$genome,$sims) = @_;
    my($peg,$i);

    my $locs = {};
    my @pegs_and_locs = ();
    foreach $peg ($fig->all_features($genome,"peg"))
    {
	my @loc = $fig->feature_location($peg);
	if ((@loc == 1) && ($loc[0] =~ /^(\S+)_(\d+)_(\d+)$/))
	{
	    push(@pegs_and_locs,[$peg,$1,int(($2+$3)/2)]);
	}
    }
    @pegs_and_locs = sort { ($a->[1] cmp $b->[1]) or ($a->[2] <=> $b->[2]) } @pegs_and_locs;
    for ($i=0; ($i < @pegs_and_locs); $i++)
    {
	$locs->{$pegs_and_locs[$i]->[0]} = [$pegs_and_locs[$i]->[1],$i];
    }
    return $locs;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3