[Bio] / FigKernelScripts / ex_get_adjacency_based_estimates.pl Repository:
ViewVC logotype

View of /FigKernelScripts/ex_get_adjacency_based_estimates.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Sat Nov 23 15:10:20 2013 UTC (5 years, 11 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.1: +2 -2 lines
minor changes

use SeedEnv;
use strict;
use Carp;
use Data::Dumper;
use Getopt::Long;

my $usage = "usage: ex_get_adjacency_based_estimates -d Data\n";
my $dataD;
my $rc  = GetOptions('d=s' => \$dataD);

if ((! $rc) || (! -d $dataD))
{ 
    print STDERR $usage; exit ;
}

open(GENOME,"<$dataD/genome") || die "$dataD/genome does not exist";
my $genome = <GENOME>;
chomp $genome;
($genome =~ /^(\d+\.\d+)/) || die "genome: $genome is invalid";
$genome = $1;

my $pccsF = "$dataD/locus.pccs";
my $aliasF = "$dataD/aliases";
open(ALIAS,"<$aliasF") || die "could not open $dataD/$aliasF";
my %to_peg;
while (defined($_ = <ALIAS>))
{
    chomp;
    my @tmp = split(/\t/,$_);
    my @fig = grep { $_ =~ /^fig\|/ } @tmp;
    if (@fig == 1)
    {
	foreach my $alias (grep { $_ ne $fig[0] } @tmp)
	{
	    $to_peg{$alias} = $fig[0];
	}
    }
}
close(ALIAS);
open(PCC,"<$dataD/locus.pccs") || die "could not open $dataD/locus.pccs";
my $corrH = {};
while (defined($_ = <PCC>))
{
    if (($_ =~ /^(\S+)\t(\S+)\t(\S+)/) &&
	(my $peg1 = $to_peg{$1}) &&
	(my $peg2 = $to_peg{$2}))
    {
	$corrH->{$peg1}->{$peg2} = $corrH->{$peg2}->{$peg1} = $3;
    }
}
close(PCC);
my $sapO    = SAPserver->new;
my $genomeH = $sapO->all_features( -ids => [$genome], -type => 'peg' );
my $pegs    = $genomeH->{$genome};
$pegs || die "could not get pegs for genome $genome";

my $pegH    = $sapO->fid_locations( -ids => $pegs, -boundaries => 1 );

my @pegs_with_loc = sort { ($a->[1]->[0] cmp $b->[1]->[0]) or
			   (($a->[1]->[1]+$a->[1]->[2]) <=> ($b->[1]->[1]+$b->[1]->[2]))
                         }
                    map  { $pegH->{$_} =~ /^\d+\.\d+:(\S+)_(\d+)([\+\-])(\d+)/; 
		           defined($1) || die $pegH->{$_};
		           [$_,[$1,$2,($3 eq '+') ? ($2+($4-1)) : ($2 -($4-1))]] }
                    keys(%$pegH);      

my $clusters = &possible_clusters(\@pegs_with_loc,$corrH);

open(COREG,">$dataD/coreg.based.on.clusters")
    || die "could not open $dataD/coreg.based.on.clusters";
foreach my $cluster (@$clusters)
{
    if (@$cluster > 1)
    {
	my @pegs = sort { &SeedUtils::by_fig_id($a,$b) } @$cluster;
	print COREG join(",",@pegs),"\tClusterOnChromosome:$pegs[0],$pegs[$#pegs]\n";
    }
}
close(COREG);

sub possible_clusters {
    my($pegs_with_locs,$corrH) = @_;

    my $clusters = [];
    &gather_from_strand($pegs_with_locs,$corrH,$clusters);
    my $flipped = &flip($pegs_with_locs);
    my @pegs_with_locR = sort { ($a->[1]->[0] cmp $b->[1]->[0]) or
				    (($a->[1]->[1]+$a->[1]->[2]) <=> ($b->[1]->[1]+$b->[1]->[2]))
                              }
                         @$flipped;
    &gather_from_strand(\@pegs_with_locR,$corrH,$clusters);
    return $clusters;
}

sub gather_from_strand {
    my($pegs_with_locs,$corrH,$clusters) = @_;

    my $max = -1;
    my $i;
    while (($i = &next_to_try($max,$pegs_with_locs)) < (@$pegs_with_locs - 1))
    {
#	print STDERR "start grouping ",&Dumper($pegs_with_locs->[$i]);
	my $peg = $pegs_with_locs->[$i]->[0];
	my $cluster = [$peg];
	$max = $i;
	my $j;
	for ($j=$i+1; 
	     ($j < @$pegs_with_locs) && 
	     &ok_in_runF($pegs_with_locs->[$j-1],$pegs_with_locs->[$j],$corrH,$cluster); 
	     $j++)
	{
#	    print STDERR "adding ",&Dumper($pegs_with_locs->[$j]);
	    push(@$cluster,$pegs_with_locs->[$j]->[0]);
	    $max = $j;
	}
#	print STDERR "before going left ",&Dumper($cluster);
	if ((($i-1) >= 0) && 
	    (&corr($pegs_with_locs->[$i-1]->[0],$cluster,$corrH) >= 0.4) &&
	    ($pegs_with_locs->[$i-1]->[1]->[0] eq $pegs_with_locs->[$i]->[1]->[0]) &&
	    ($pegs_with_locs->[$i-1]->[1]->[1] > $pegs_with_locs->[$i-1]->[1]->[2]))
	{
#	    print STDERR "going left with ",&Dumper($pegs_with_locs->[$i-1]);
	    push(@$cluster,$pegs_with_locs->[$i-1]->[0]);
	    for ($j=$i-2; 
	         ($j >= 0) && &ok_in_runB($pegs_with_locs->[$j+1],$pegs_with_locs->[$j],$corrH,$cluster); 
		 $j--)
	    {
#		print STDERR "adding $pegs_with_locs->[$j]->[0]\n";
		push(@$cluster,$pegs_with_locs->[$j]->[0]);
	    }
	}
#	print STDERR "final cluster ",&Dumper($cluster); 
	if (@$cluster > 1) 
	{ 
	    push(@$clusters,$cluster);
#	    print STDERR "keeping ",join(",",@$cluster),"\n";
	}
    }
}

sub ok_in_runB {
    my($x,$y,$corrH,$cluster) = @_;

    my $loc1 = $x->[1];
    my($c1,$b1,$e1) = @$loc1;
    my $loc2 = $y->[1];
    my($c2,$b2,$e2) = @$loc2;
    if ($c1 ne $c2) { return 0 }
    if ($b2 < $e2)  { return 0 }
    if (! &corr($y->[0],$cluster,$corrH)) { return 0 }
    return (abs($e1-$b2) < 200);
}

sub ok_in_runF {
    my($x,$y,$corrH,$cluster) = @_;

    my $loc1 = $x->[1];
    my($c1,$b1,$e1) = @$loc1;
    my $loc2 = $y->[1];
    my($c2,$b2,$e2) = @$loc2;
    if ($c1 ne $c2) { return 0 }
    if ($b2 > $e2)  { return 0 }
    if (! &corr($y->[0],$cluster,$corrH)) { return 0 }
    return (abs($b2-$e1) < 200);
}

sub corr {
    my($peg1,$cluster,$corrH) = @_;

    my $sum = 0;
    foreach my $peg2 (@$cluster)
    {
	my $v = $corrH->{$peg1}->{$peg2};
	if ((! defined($v)) || ($v < 0.4)) { return 0 }
	$sum += $v;
    }
    return (($sum / @$cluster) >= 0.6);
}

sub next_to_try {
    my($max,$pegs_with_locs) = @_;
    my $i;
    for ($i=$max+1; ($i < @$pegs_with_locs) && 
	          ($pegs_with_locs->[$i]->[1]->[1] > $pegs_with_locs->[$i]->[1]->[2]);
	 $i++) {}
    return $i;
}

sub flip {
    my($pegs_with_locs) = @_;

    my $flipped = [];
    foreach my $x (@$pegs_with_locs)
    {
	my($peg,$loc) = @$x;
	my($contig,$beg,$end) = @$loc;
	my $loc1 = [$contig,100000000-$beg,100000000-$end];
	push(@$flipped,[$peg,$loc1]);
    }
    return $flipped;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3