[Bio] / FigKernelScripts / filter_overlaps.pl Repository:
ViewVC logotype

View of /FigKernelScripts/filter_overlaps.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Sat Oct 28 14:22:29 2006 UTC (13 years ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +6 -0 lines
Added some more diagnosting lines. -- /gdp

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use FIG;
use strict;

$0 =~ m/([\/]+)$/;
my $self = $1;
my $usage = "usage: 
                     filter_overlaps MinPEGln MaxRNA MaxConvOv MaxDivOv MaxSameStrand
                                     KeepTBL1 KeepTBL2 ... KeepTBLn 
                                     < superset_tbl_entries > to_keep_entries";

my($min_peg_ln,$max_RNA_overlap,$max_convergent_overlap,$max_divergent_overlap);
my($max_same_strand_overlap,$tbl,$fid,$type,$loc,$contig,$beg,$end);
(
 ($min_peg_ln               = shift @ARGV) &&
 ($max_RNA_overlap          = shift @ARGV) &&
 ($max_convergent_overlap   = shift @ARGV) &&
 ($max_divergent_overlap    = shift @ARGV) &&
 ($max_same_strand_overlap  = shift @ARGV)
)
    || die $usage;

my $parms = { min_peg_ln                => $min_peg_ln,
	      max_RNA_overlap           => $max_RNA_overlap,
	      max_convergent_overlap    => $max_convergent_overlap,
	      max_divergent_overlap     => $max_divergent_overlap,
	      max_same_strand_overlap   => $max_same_strand_overlap
	    };

print STDERR ("$self: parms=", &FIG::flatten_dumper($parms), "\n") if $ENV{VERBOSE};

my $fig = FIG->new();
# constants for positions in "keep" lists
use constant LEFT   => 0;
use constant RIGHT  => 1;
use constant STRAND => 2;
use constant TYPE   => 3;
use constant FID    => 4;

my $keep = {};
foreach $tbl (@ARGV)
{
    print STDERR "Loading $tbl ...\n" if $ENV{VERBOSE};
    
    foreach $_ (`cat $tbl`)
    {
	if ($_ =~ /^(fig\|\d+\.\d+\.([^\.]+)\.\d+)\t(\S+)/)
	{
	    ($fid,$type,$loc) = ($1,$2,$3);
	    ($contig,$beg,$end) = $fig->boundaries_of($loc);
	    if (defined($contig))
	    {
		push(@{$keep->{$contig}},
		     [&FIG::min($beg,$end),&FIG::max($beg,$end),($beg < $end) ? "+" : "-",$type,$fid]);
	    }
	}
    }
}

foreach $contig (keys(%$keep))
{
    my $x = $keep->{$contig};
    $keep->{$contig} = [sort { ($a->[LEFT]  <=> $b->[LEFT]) or
			       ($b->[RIGHT] <=> $a->[RIGHT]) } @$x];
}

my $entry;
while (defined($entry = <STDIN>))
{
    if ($entry =~ /^(fig\|\d+\.\d+\.([^\.]+)\.\d+)\t(\S+)/)
    {
	($fid,$type,$loc) = ($1,$2,$3);
	if ($type =~ /^(rna|orf|peg|cds)$/io)
	{
	    ($contig,$beg,$end) = $fig->boundaries_of($loc);
	    if (&keep_this_one($parms,$keep,$contig,$beg,$end,$fid))
	    {
		print $entry;
	    }
	}
	else
	{
	    die "Invalid type '$type' in entry:  $entry";
	}
    }
    else
    {
	die "Malformed tbl entry: $entry";
    }
}

sub keep_this_one
{
    my ($parms,$keep,$contig,$beg,$end,$fid) = @_;
    my ($ln, $x, @overlaps, $min, $max, $strand, $i);
    
    if (($ln = (abs($end-$beg)+1)) < $min_peg_ln)
    {
	print STDERR "$fid failed length test: $beg $end $ln $min_peg_ln\n" if $ENV{VERBOSE};
	return 0;
    }
    
    print STDERR "Processing "
	, join(", ", (&FIG::min($beg, $end), &FIG::max($beg, $end), (($beg < $end) ? "+" : "-"), "     $fid"))
	,  "\n" if $ENV{VERBOSE};
    if ($x = $keep->{$contig})
    {
	@overlaps = ();
	$min = &FIG::min($beg,$end);
	$max = &FIG::max($beg,$end);
	$strand = ($beg < $end) ? "+" : "-";
	for ($i=0; ($i < @$x) && ($min > $x->[$i]->[RIGHT]); $i++) {};
	while (($i < @$x) && ($max >= $x->[$i]->[LEFT]))
	{
	    print STDERR "   pushing ", join(", ", @ { $x->[$i] } )
		, "\t(", &overlap($min, $max, $x->[$i]->[LEFT],  $x->[$i]->[RIGHT]), ")"
		, "\n" if $ENV{VERBOSE};
	    push(@overlaps,$x->[$i]);
	    $i++;
	}
	
	my $serious = 0;
	for ($i=0; ($i < @overlaps); ++$i) 
	{
	    if (&serious_overlap($parms,$min,$max,$strand,$overlaps[$i],$fid)) { ++$serious; }
	}
	print STDERR "\n" if $ENV{VERBOSE};
	return ($serious == 0);
    }
    return 1;
}

sub overlap
{
    my ($min, $max, $minO, $maxO) = @_;
    my $olap = &FIG::max(0, (&FIG::min($max,$maxO) - &FIG::max($min,$minO) + 1));
    return $olap;
}

sub serious_overlap {
    my($parms,$min,$max,$strand,$overlap,$fid) = @_;
    my($minO,$maxO,$strandO,$typeO,$fidO) = @$overlap;
    
    my $olap = &overlap($min, $max, $minO, $maxO);
    
    if (&embedded($min,$max,$minO,$maxO))
    {
	print STDERR "$fidO [kept] is embedded in $fid\n" if $ENV{VERBOSE};
	return 1;
    }
    
    if (&embedded($minO,$maxO,$min,$max))
    {
	print STDERR "$fid is embedded in $fidO [kept]\n" if $ENV{VERBOSE};
	return 1;
    }
    
    if (($typeO eq "rna") && ($olap > $parms->{max_RNA_overlap}))
    {
	print STDERR "too much RNA overlap: $fid overlaps $fidO ($olap)\n" if $ENV{VERBOSE};
	return 1;
    }
    
    if (($typeO !~ /^rna/) && ($olap > $parms->{max_convergent_overlap})
       && &convergent($min,$max,$strand,$minO,$maxO,$strandO))

    {
	print STDERR "too much convergent overlap: $fid overlaps $fidO ($olap)\n" if $ENV{VERBOSE};
	return 1;
    }
    
    if (($typeO !~ /^rna/) && ($olap > $parms->{max_divergent_overlap})
       && &divergent($min,$max,$strand,$minO,$maxO,$strandO))

    {
	print STDERR "too much divergent overlap: $fid overlaps $fidO ($olap)\n" if $ENV{VERBOSE};
	return 1;
    }
    
    if (($typeO !~ /^rna/) && ($strand eq $strandO)
       && ($olap > $parms->{max_same_strand_overlap}))
    {
	print STDERR "too much same_strand overlap: $fid overlaps $fidO ($olap)\n" if $ENV{VERBOSE};
	return 1;
    }
    
    return 0;
}

sub embedded {
    my($min,$max, $minO,$maxO) = @_;

    if (($min <= $minO) && ($maxO <= $max))
    {
	return 1;
    }
    return 0;
}

sub convergent {
    my($min,$max,$strand,$minO,$maxO,$strandO) = @_;

    if (($strand ne $strandO) && 
	((($min < $minO) && ($strand eq "+")) ||
	 (($minO < $min) && ($strandO eq "+"))))
    {
	return 1;
    }
    return 0;
}

sub divergent {
    my($min,$max,$strand,$minO,$maxO,$strandO) = @_;

    if (($strand ne $strandO) && 
	((($min < $minO) && ($strand eq "-")) ||
	 (($minO < $min) && ($strandO eq "-"))))
    {
	return 1;
    }
    return 0;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3