[Bio] / FigKernelScripts / close_connections.pl Repository:
ViewVC logotype

View of /FigKernelScripts/close_connections.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sun Aug 26 14:58:12 2007 UTC (12 years, 10 months ago) by overbeek
Branch: MAIN
add new version of close_connection

#!/usr/bin/env /Volumes/seagate/FIGdisk/env/mac/bin/perl

BEGIN {
    @INC = qw(
              /Volumes/seagate/FIGdisk/dist/releases/ross/mac/lib
              /Volumes/seagate/FIGdisk/dist/releases/ross/mac/lib/FigKernelPackages
              /Volumes/seagate/FIGdisk/dist/releases/ross/mac/lib/WebApplication
              /Volumes/seagate/FIGdisk/dist/releases/ross/mac/lib/FortyEight
              /Volumes/seagate/FIGdisk/dist/releases/ross/mac/lib/PPO
              /Volumes/seagate/FIGdisk/dist/ross/mac/lib
              /Volumes/seagate/FIGdisk/dist/ross/mac/lib/FigKernelPackages
              /Volumes/seagate/FIGdisk/env/mac/lib/perl5/5.8.7/darwin-2level
              /Volumes/seagate/FIGdisk/env/mac/lib/perl5/5.8.7
              /Volumes/seagate/FIGdisk/env/mac/lib/perl5/site_perl/5.8.7/darwin-2level
              /Volumes/seagate/FIGdisk/env/mac/lib/perl5/site_perl/5.8.7
              /Volumes/seagate/FIGdisk/env/mac/lib/perl5/site_perl
              .
              /Volumes/seagate/FIGdisk/config
 
);
}
use Data::Dumper;
use Carp;
use FIG_Config;
$ENV{'BLASTMAT'} = "/Volumes/seagate/FIGdisk/BLASTMAT";
$ENV{'FIG_HOME'} = "/Volumes/seagate/FIGdisk";
# end of tool_hdr
########################################################################

use FIG;
my $fig = new FIG;

use strict;

# usage: close_connections [MinNormSc] < SimsFile > Connections

# a minimum normalized bit score (bit score divided by the max of the lengths of the input seqs)
# is used to filter similarities.  We use a value of 1.3 as a default, but I have not put
# any real effort into finding what an optimal value might be.
my $min_norm_sc = (@ARGV > 0) ? $ARGV[0] : 1.3;


my(%genomes,%sims);
while (defined($_ = <STDIN>))
{
    chop;
    my($id1,$id2,undef,undef,undef,undef,undef,undef,$ln1,$ln2,$bsc) = split(/\t/,$_);
    $genomes{&FIG::genome_of($id1)} = 1;
    $genomes{&FIG::genome_of($id2)} = 1;

    my $norm_bsc = sprintf("%.3f",$bsc / &FIG::max($ln1,$ln2));
    if ($norm_bsc >= $min_norm_sc)
    {
	push(@{$sims{$id1}},[$id2,$norm_bsc]);
	push(@{$sims{$id2}},[$id1,$norm_bsc]);
    }
}

my @genomes = sort { a <=> $b } keys(%genomes);
if (@genomes != 2)
{
    print &Dumper(\%genomes);
    die "We should have just 2 genomes";
}

my($peg,$i);
foreach $peg (keys(%sims))
{
    my @tmp = sort { $b->[1] <=> $a->[1] } @{$sims{$peg}};
    for ($i=0; ($i < @tmp) && ($tmp[$i]->[1] >= ($tmp[0]->[1] - 0.1)); $i++) {}
    $#tmp = $i-1;
    $sims{$peg} = [@tmp];
}

# We compute two hashes, one for each genome.  Each hash contains an entry for each PEG
# in that genome.  The value for a given PEG is [contig,N] where N increases by 1 as you
# go through the PEGs on a contig.  Thus, two PEGs are adjacent iff they have the same
# contigs and their N values differ by 1.
#

my $locs1  = &get_locations($fig,$genomes[0]);
my $locs2  = &get_locations($fig,$genomes[1]);

my @pegs1 = sort { ($locs1->{$a}->[0] cmp $locs1->{$b}->[0]) or 
		   ($locs1->{$a}->[1] <=> $locs1->{$b}->[1]) }
            keys(%$locs1);

$i = 0;
my($x,$j,$start,$peg1B,$peg2B,$peg2P);
while ($i < @pegs1) 
{
    if ($peg2B = &clear_bbh($pegs1[$i],\%sims)) 
    {
	$peg1B = $pegs1[$i];
	$start = $i;
	$i++;
	my $last = [$peg1B,$peg2B];
	my @matches = ($last);
	my $got_end = 0;
	while (($i < @pegs1) && (! $got_end) && ($peg2P = &poss($fig,$last,$pegs1[$i],\%sims,$locs1,$locs2)))
	{
	    $last = [$pegs1[$i],$peg2P];
	    push(@matches,$last);
	    if (&are_clear_bbhs($pegs1[$i],$peg2P,\%sims))
	    {
		$got_end = 1;
	    }
	    else
	    {
		$i++;
	    }
	}

	if ($got_end && (@matches <= 10))
	{
	    my $pair;
	    foreach $pair (@matches)
	    {
		print join("\t",@$pair),"\n";
	    }
	}
    }
    else
    {
	$i++;
    }
}

sub clear_bbh {
    my($peg1,$sims) = @_;
    my($x,$y,$peg2);

    if  (($x = $sims->{$peg1}) && (@$x == 1) && 
	 ($peg2 = $x->[0]->[0]) && 
	 ($y = $sims->{$peg2}) && (@$y == 1) &&
	 ($y->[0]->[0] eq $peg1))
    {
	return $peg2;
    }
    else
    {
	return undef;
    }
}

sub poss {
    my($fig,$last,$peg1b,$sims,$locs1,$locs2) = @_;
    my($x,$y,$i,$peg2b);

    my($peg1a,$peg2a) = @$last;
    if ($x = $sims->{$peg1b})
    {
	for ($i=0; ($i < @$x) && (! &adjacent($peg2a,$x->[$i]->[0],$locs2)); $i++) {}
	if ($i < @$x)
	{
	    $peg2b = $x->[$i]->[0];
	    if ($y = $sims->{$peg2b})
	    {
		for ($i=0; ($i < @$y) && ($y->[$i]->[0] ne $peg1b); $i++) {}
		if ($i < @$y)
		{
		    return $peg2b;
		}
	    }
	}
    }
    return undef;
}

sub adjacent {
    my($peg1,$peg2,$locs) = @_;
    my($x,$y);

    return (($x = $locs->{$peg1}) && ($y = $locs->{$peg2}) && ($x->[0] eq $y->[0]) &&
	    (abs($x->[1] - $y->[1]) == 1));
}

sub are_clear_bbhs {
    my($peg1,$peg2,$sims) = @_;
    my($x,$y);

    return (($x = $sims->{$peg1}) && (@$x == 1) && ($x->[0]->[0] eq $peg2) &&
	    ($y = $sims->{$peg2}) && (@$y == 1) && ($y->[0]->[0] eq $peg1));
}

sub get_locations {
    my($fig,$genome,$sims) = @_;
    my($peg,$i);

    my $locs = {};
    my @pegs_and_locs = ();
    foreach $peg ($fig->all_features($genome,"peg"))
    {
	my @loc = $fig->feature_location($peg);
	if ((@loc == 1) && ($loc[0] =~ /^(\S+)_(\d+)_(\d+)$/))
	{
	    push(@pegs_and_locs,[$peg,$1,int(($2+$3)/2)]);
	}
    }
    @pegs_and_locs = sort { ($a->[1] cmp $b->[1]) or ($a->[2] <=> $b->[2]) } @pegs_and_locs;
    for ($i=0; ($i < @pegs_and_locs); $i++)
    {
	$locs->{$pegs_and_locs[$i]->[0]} = [$pegs_and_locs[$i]->[1],$i];
    }
    return $locs;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3