[Bio] / FigKernelScripts / filter_new_overlaps.pl Repository:
ViewVC logotype

View of /FigKernelScripts/filter_new_overlaps.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +17 -0 lines
Add license words.

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use FIG;
$fig  = new FIG;
$code = &FIG::standard_genetic_code();

use Carp;
use Data::Dumper;

$| = 1;
$0 =~ m/([^\/]+)$/;
$usage = "$1 [-v] [-weight_file=file -min_weight=integer] closest_org contigs tbl_rna tbl_peg1 tbl_peg2 ... > merged.tbl";

=pod

=head1         filter_new_overlaps

  Usage:       filter_tbl_overlaps [-v] [-weight_file=file -min_weight=integer] closest_org contigs tbl_rna tbl_peg1 tbl_peg2 ... > merged.tbl

  Function: Resolves overlaps and conflicts between PEGs in one or more 
    tbl files, based on similarity evidence. Similarities must be with
    respect to organisms at least a moderate distance from 'closest_org'. 
    The order in which tbl files are entered is significant: Later files
    are considered to have lower priority in the resolution process. 
    Furthermore, the first tbl file is assumed to be of RNAs, not PEGs,
    and no PEG is allowed to overlap any of its entries; hence if no RNA tbl 
    is available, this argument should be given as '/dev/null'.

  Example:     filter_new_overlaps CloseOrgID contigs tbl_rna tbl_peg.1 tbl_peg.2 > tbl_peg

  Author:      Gordon D. Pusch (2004 Jan 28)

=cut

$invert      = "";
$weight_file = "";
$min_weight  = 999999999;
$trouble     = 0;
for ($i=0; $i < @ARGV; )
{
    if ($ARGV[$i] =~ m/-v/)
    {
	$invert = splice(@ARGV, $i, 1);
    }
    elsif ($ARGV[$i] =~ m/-weight_file=(\S+)/)
    {
	$weight_file = $1;
	splice(@ARGV, $i, 1);
	if (! -s $weight_file) 
	{
	    $trouble = 1;
	    print STDERR "ERROR: weight-file $weight_file does not exist\n";
	}
    }
    elsif ($ARGV[$i] =~ m/-min_weight=(\d+)/)
    {
	$min_weight = $1;
	splice(@ARGV, $i, 1);
    }
    elsif ((($i == 0) && ($ARGV[0] =~ m/^\d+\.\d+$/) && (-d "$FIG_Config::organisms/$ARGV[0]"))
	   || (($i > 0) && (-f $ARGV[$i])))
    {
	++$i;
    }
    else
    {
	$trouble = 1;
	print STDERR "ERROR: Invalid arg $ARGV[$i]\n";
	++$i;
    }
}
die "\n\nAborting due to invalid args\n\nusage: $usage\n\n" if $trouble;

(@ARGV >= 4) || die "\n\tusage: $usage\n\n";

(($closest_org = shift @ARGV) && (-d "$FIG_Config::organisms/$closest_org"))
    || die "Closest-org directory $FIG_Config::organisms/$closest_org does not exist";

(-s ($contigs_file  = shift @ARGV))  
    || die "Contigs file $contigs_file does not exist";
((@tbl_files = @ARGV) > 1) || die "Not enough tbl files";

use constant RNA    =>  0;

use constant ENTRY  =>  0;
use constant FILE   =>  1;
use constant FID    =>  2;
use constant LOCUS  =>  3;
use constant CONTIG =>  4;
use constant START  =>  5;
use constant STOP   =>  6;
use constant LEN    =>  7;
use constant STRAND =>  8;
use constant SCORE  =>  9;
use constant WEIGHT => 10;
use constant TRUNC  => 11;
use constant SUBSYS => 12;

$min_len            =  90;
$max_convergent     =  50;
$max_divergent      = 150;
$max_normal_overlap = 150;
$max_rna_overlap    = 120;

$tbl = &load_tbls($weight_file, @tbl_files);

($seq_of, $len_of) = &load_contigs($contigs_file);

foreach $contig (sort by_len_and_id keys %$tbl)
{
    $x = $tbl->{$contig};
    
    print STDERR ("Contig $contig contains ", (scalar @$x), " loci\n") if $ENV{VERBOSE};
    print STDERR Dumper($x) if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));
    
    for ($i=0; $i < @$x; ++$i)
    {
	for($j = $i+1; (($j < @$x) && &entries_overlap($x, $i, $j)); ++$j)
	{
	    if    (&bad_rna_overlap($x, $i, $j))  { &process_rna($x, $i, $j); }
	    elsif (&bad_rna_overlap($x, $j, $i))  { &process_rna($x, $j, $i); }
	}
    }
    
    for ($i=0; $i < @$x; ++$i)
    {
	if (($x->[$i]->[LEN] < $min_len) && (not $x->[$i]->[SUBSYS]) && (not $x->[$i]->[TRUNC]))
	{
	    next if &is_rna($x, $i);
	    next if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
	    
	    print STDERR "Short PEG $i ($x->[$i]->[LEN])\n" if $ENV{VERBOSE};
	    $score = &compute_score($x, $i);
	    print STDERR "score_i = $score\n" if $ENV{VERBOSE};
	    
	    if ($score < 100*$min_len)
	    {
		if (not $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"})
		{
		    print STDERR "Deleting short PEG $i with no support: $x->[$i]->[ENTRY]\n\n" 
			if $ENV{VERBOSE};
		    $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "Short:\t$x->[$i]->[WEIGHT]\t$score\t" . $x->[$i]->[ENTRY];
		}
	    }
	    else
	    {
		print STDERR "\n" if $ENV{VERBOSE};
	    }
	}
    }
    
    for ($i=0; $i < @$x; ++$i)
    {
	print STDERR "\nProcessing $i: $x->[$i]->[FID]\n" if $ENV{VERBOSE};
	
	for($j = $i+1; (($j < @$x) && &entries_overlap($x, $i, $j)); ++$j)
	{
	    print STDERR "...Checking for embedding against $j: $x->[$j]->[FID]\n"
		if $ENV{VERBOSE};
	    
	    if    (&embedded($x, $i, $j))  { &process_embedded($x, $i, $j); }
 	    elsif (&embedded($x, $j, $i))  { &process_embedded($x, $j, $i); }
	}
    }
    
    for ($i=0; $i < @$x; ++$i)
    {
	for($j = $i+1; (($j < @$x) && &entries_overlap($x, $i, $j)); ++$j)
	{
	    if (&opposite_strand($x, $i, $j))
	    {
		if    (&convergent($x, $i, $j) > $max_convergent)  { &process_convergent($x, $i, $j); }
		elsif (&divergent( $x, $i, $j) > $max_divergent)   { &process_divergent( $x, $i, $j); }
	    }
	    elsif (&same_strand($x, $i, $j))
	    {
		if    (&same_stop($x, $i, $j))
		{ 
		    &process_same_stop($x, $i, $j);
		}
		elsif (&normal_overlap($x, $i, $j) > $max_normal_overlap)  
		{
		    &process_normal_overlap($x, $i, $j); 
		}
	    }
	    else 
	    { 
		&impossible_overlap_warning($x, $i, $j, "end of main loop");
	    }
	}
    }
}

foreach $contig (sort keys %$tbl)
{
    $x = $tbl->{$contig};
    
    for ($i=0; $i < @$x; ++$i)
    {
	$key = "$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]";
	
	if (&is_rna($x, $i))
	{
	    print STDERR "Skipping RNA entry:\t$x->[$i]->[ENTRY]\n" if $ENV{VERBOSE};
	}
	else
	{
	    if ($invert)
	    {
		if ($deleted{$key})
		{
		    print STDERR "$deleted{$key}\n";
		    print "$x->[$i]->[ENTRY]\n";
		}
	    }
	    else
	    {
		if ($deleted{$key})
		{
		    print STDERR "Skipping DEL entry:\t$deleted{$key}\n" if $ENV{VERBOSE};
		}
		else
		{
		    print STDERR "Accepted PEG entry:\t$x->[$i]->[ENTRY]\n" if $ENV{VERBOSE};
		    print "$x->[$i]->[ENTRY]\n";
		}
	    }
	}
    }
}
exit (0);


sub by_len_and_id
{
    return (($len_of->{$a} <=> $len_of->{$b}) || ($a cmp $b));
}


sub entries_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($x->[$i]->[CONTIG] ne $x->[$j]->[CONTIG]);
    
    $ov = &overlaps( $x->[$i]->[START], $x->[$i]->[STOP]
		   , $x->[$j]->[START], $x->[$j]->[STOP]);
    
    return $ov;
}

sub overlaps
{
    my ($beg1, $end1, $beg2, $end2) = @_;
    
    my ($left1, $right1, $left2, $right2);
    
    $left1  = &FIG::min($beg1, $end1);
    $left2  = &FIG::min($beg2, $end2);
    
    $right1 = &FIG::max($beg1, $end1);
    $right2 = &FIG::max($beg2, $end2);
    
    if ($left1 > $left2)
    {
        ($left1, $left2)   = ($left2, $left1);
        ($right1, $right2) = ($right2, $right1);
    }
    
    $ov = 0;
    if ($right1 >= $left2) { $ov = &FIG::min($right1,$right2) - $left2 + 1; }
    
    return $ov;
}

sub type_of
{
    my ($id) = @_;
    
    return 'rna' if ($id =~ m/^[^\|]+\|\d+\.\d+\.rna\.\d+$/);
    return 'peg' if ($id =~ m/^[^\|]+\|\d+\.\d+\.peg\.\d+$/);
    return undef;
}

sub is_rna
{
    my ($x, $i) = @_;
    
    return ($x->[$i]->[FILE] == RNA);
}

sub same_strand
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    return ($x->[$i]->[STRAND] eq $x->[$j]->[STRAND]);
}

sub opposite_strand
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    return ($x->[$i]->[STRAND] ne $x->[$j]->[STRAND]);
}

sub bad_rna_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    #...For now, treat all RNA overlaps as "non-removable"
    if (&is_rna($x, $i))
    {
	return (&overlaps($x->[$i]->[START], $x->[$i]->[STOP],  $x->[$j]->[START], $x->[$j]->[STOP]) > $max_rna_overlap);
    }
    else
    {
	return 0;
    }
}

sub embedded
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $left_i  = &FIG::min($x->[$i]->[START], $x->[$i]->[STOP]);
    my $right_i = &FIG::max($x->[$i]->[START], $x->[$i]->[STOP]);
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if (  ($left_i < $beg_j)   && ($left_i < $end_j) 
       && ($beg_j  < $right_i) && ($end_j  < $right_i)
       )
    {
	print STDERR "$j is embedded in $i\n" if $ENV{VERBOSE};
	return &overlaps($left_i, $right_i, $beg_j, $end_j);
    }
    
    return 0;
}

sub convergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if (   &FIG::between($beg_i, $end_j, $end_i) 
       &&  &FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_i, $beg_j, $end_i)
       )
    {
	return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }
    
    return 0;
}

sub divergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if (   &FIG::between($beg_i, $beg_j, $end_i) 
       &&  &FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_i, $end_j, $end_i)
       )
    {
	return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }
    
    return 0;
}

sub normal_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if    (  ($x->[$i]->[STRAND] eq '+') && ($x->[$j]->[STRAND] eq '+')
          && ($beg_i < $beg_j) && ($beg_j <= $end_i) && ($end_i < $end_j)
          )
    {
	return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
    }
    elsif (  ($x->[$i]->[STRAND] eq '-') && ($x->[$j]->[STRAND] eq '-')
	  && ($end_i < $end_j) && ($end_j <= $beg_i) && ($beg_i < $beg_j) 
	  )
    {
	return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
    }
    else
    {
	&impossible_overlap_warning($x, $i, $j, "Opposite strands in a normal_overlap");
	return 0;
    }
}


sub same_stop
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $end_i   = $x->[$i]->[STOP];
    my $end_j   = $x->[$j]->[STOP];
    
    if (&same_strand($x, $i, $j) && ($end_i == $end_j))
    {
	return 1;
    } else {
	 return 0;
    }
}


sub impossible_overlap_warning
{
    my ($x, $i, $j, $msg) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    if (defined($msg))
    {
	print STDERR ("Impossible overlap in $msg:\n"
		      , join("\t", @ { $x->[$i] }), "\n"
		      , join("\t", @ { $x->[$j] }), "\n\n")
	    if $ENV{VERBOSE};
	# confess "$msg";
    }
    else
    {
	print STDERR ("Impossible overlap:\n$i: "
		      , join("\t", @ { $x->[$i] }), "\n$j: "
		      , join("\t", @ { $x->[$j] }), "\n\n")
	    if $ENV{VERBOSE};
	# confess "aborted";
    }
    
    return 0;
}



sub process_rna
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} || $x->[$j]->[SUBSYS] || $x->[$j]->[TRUNC]);
    
    if (&is_rna($x, $i) && &is_rna($x, $j))
    {
	&impossible_overlap_warning($x, $i, $j, "process_rna");
    }
    else
    {
	#...For now, just delete $j;
	if (not $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"})
	{
	    print STDERR ("Deleting PEG $j overlapping RNA $i:\n"
			  , "$i: $x->[$i]->[ENTRY]\n$j: $x->[$j]->[ENTRY]\n\n")
		if $ENV{VERBOSE};
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "RNA:\t-9.999999e99\t" . $x->[$j]->[ENTRY];
	}
    }
}


sub process_embedded
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} || $x->[$i]->[SUBSYS] || $x->[$i]->[TRUNC]);
    return 0 if ($deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} || $x->[$j]->[SUBSYS] || $x->[$j]->[TRUNC]);
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i  = $x->[$i]->[SCORE];
    my $score_j  = $x->[$j]->[SCORE];
    
    my $weight_i = $x->[$i]->[WEIGHT];
    my $weight_j = $x->[$j]->[WEIGHT];
    
    if (($score_i < $score_j) && ($weight_i < $min_weight) && ($weight_i <= $weight_j))
    {
	print STDERR "Deleting embedding PEG $i in favor of embedded PEG $j, score_i = $score_i, score_j = $score_j\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "EMB_1:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
    }
    elsif (($score_i > $score_j) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
    {
	print STDERR "Deleting embedded PEG $j in favor of embedding PEG $i, score_j = $score_j, score_i = $score_i\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "EMB_2:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
    }
    elsif (($weight_j < $min_weight) && ($weight_j <= $weight_i))
    {
	#...In case of same score, delete $j, because $i must be longer...
	
	print STDERR "Deleting embedded PEG $j in favor of embedding PEG $i, score_j = $score_j, score_i = $score_i\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "EMB_3:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
    }
    
    return;
}


sub process_convergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} || $x->[$i]->[SUBSYS] || $x->[$i]->[TRUNC]);
    return 0 if ($deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} || $x->[$j]->[SUBSYS] || $x->[$j]->[TRUNC]);
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i  = $x->[$i]->[SCORE];
    my $score_j  = $x->[$j]->[SCORE];
    
    my $weight_i = $x->[$i]->[WEIGHT];
    my $weight_j = $x->[$j]->[WEIGHT];
    
    if (($score_i < $score_j) && ($weight_i < $min_weight) && ($weight_i <= $weight_j))
    {
	print STDERR "Deleting downstream convergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "CNV_1:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
    }
    elsif (($score_i > $score_j) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
    {
	print STDERR "Deleting upstream convergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "CNV_2:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
    }
    els
    {
	#...default to "priority"...
	if (($x->[$i]->[FILE] < $x->[$j]->[FILE]) && ($weight_i < $min_weight) && ($weight_i <= $weight_j))
	{
	    print STDERR "Deleting downstream convergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n"
		if $ENV{VERBOSE};
	    print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "CNV_3:\t$weight_i\t$score_i\t" . $x->[$j]->[ENTRY];
	}
	elsif (($x->[$i]->[FILE] > $x->[$j]->[FILE]) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
	{
	    print STDERR "Deleting upstream convergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n"
		if $ENV{VERBOSE};
	    print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "CNV_4:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
	}
	else
	{
	    #...Warn and move on...
	    
	    &impossible_overlap_warning($x, $i, $j, "process_convergent");
	}
    }
    
    return;
}


sub process_divergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} || $x->[$i]->[SUBSYS] || $x->[$i]->[TRUNC]);
    return 0 if ($deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} || $x->[$j]->[SUBSYS] || $x->[$j]->[TRUNC]);
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i  = $x->[$i]->[SCORE];
    my $score_j  = $x->[$j]->[SCORE];
        
    my $weight_i = $x->[$i]->[WEIGHT];
    my $weight_j = $x->[$j]->[WEIGHT];
    
    if (($score_i < $score_j) && ($weight_i < $min_weight) && ($weight_i <= $weight_j))
    {
	print STDERR "Deleting downstream divergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "DIV_1:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
    }
    elsif (($score_i > $score_j) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
    {
	print STDERR "Deleting upstream divergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "DIV_2:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
    }
    else
    {
	#...default to "priority"...
	if (($x->[$i]->[FILE] < $x->[$j]->[FILE]) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
	{
	    print STDERR "Deleting downstream divergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n"
		if $ENV{VERBOSE};
	    print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	    
	    $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "DIV_3:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
	}
	elsif (($x->[$i]->[FILE] > $x->[$j]->[FILE]) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
	{
	    print STDERR "Deleting upstream divergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n"
		if $ENV{VERBOSE};
	    print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "DIV_4:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
	}
	else
	{
	    #...Warn and move on...
	    
	    &impossible_overlap_warning($x, $i, $j, "process_divergent");
	}
    }
    
    return;
}

sub process_same_stop
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} || $x->[$i]->[SUBSYS] || $x->[$i]->[TRUNC]);
    return 0 if ($deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} || $x->[$j]->[SUBSYS] || $x->[$j]->[TRUNC]);
    
    my $len_i    = $x->[$i]->[LEN];
    my $len_j    = $x->[$j]->[LEN];
    
    my $score_i  = $x->[$i]->[SCORE];
    my $score_j  = $x->[$j]->[SCORE];
        
    my $weight_i = $x->[$i]->[WEIGHT];
    my $weight_j = $x->[$j]->[WEIGHT];
    
    if ($len_i < $len_j)
    {
	print STDERR "Deleting same-STOP PEG $i in favor of $j, len_i = $len_i, len_j = $len_j\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "STP_1:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
    }
    else
    {
	print STDERR "Deleting same-STOP PEG $j in favor of $i, len_i = $len_i, len_j = $len_j\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "STP_2:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
    }
    
    return;
}    

sub process_normal_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} || $x->[$i]->[SUBSYS] || $x->[$i]->[TRUNC]);
    return 0 if ($deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} || $x->[$j]->[SUBSYS] || $x->[$j]->[TRUNC]);
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i  = $x->[$i]->[SCORE];
    my $score_j  = $x->[$j]->[SCORE];
        
    my $weight_i = $x->[$i]->[WEIGHT];
    my $weight_j = $x->[$j]->[WEIGHT];
    
    if (($score_i < $score_j) && ($weight_i < $min_weight) && ($weight_i <= $weight_j))
    {
	print STDERR "Deleting downstream same-strand PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "NRM_1:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
    }
    elsif (($score_i > $score_j) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
    {
	print STDERR "Deleting upstream same-strand PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n"
	    if $ENV{VERBOSE};
	print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "NRM_2:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
    }
    else
    {
	#...default to "priority"...
	if (($x->[$i]->[FILE] < $x->[$j]->[FILE])  && ($weight_i < $min_weight) && ($weight_i <= $weight_j))
	{
	    print STDERR "Deleting downstream same-strand PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n"
		if $ENV{VERBOSE};
	    print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	    
	    $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "NRM_3:\t$weight_i\t$score_i\t" . $x->[$i]->[ENTRY];
	}
	elsif (($x->[$i]->[FILE] > $x->[$j]->[FILE]) && ($weight_j < $min_weight) && ($weight_j <= $weight_i))
	{
	    print STDERR "Deleting upstream same-strand PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n"
		if $ENV{VERBOSE};
	    print STDERR "$i: $x->[$i]->[ENTRY]\n"   if $ENV{VERBOSE};
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n" if $ENV{VERBOSE};
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "NRM_4:\t$weight_j\t$score_j\t" . $x->[$j]->[ENTRY];
	}
	else
	{
	    #...Warn and move on...
	    
	    &impossible_overlap_warning($x, $i, $j, "process_normal_overlap");
	}
    }
    
    return;
}


sub compute_score
{
    my ($x, $i) = @_;
    
    my $max_psc   = 1.0e-10;
    my $min_dist  = 0.002;
    my $min_match = 0.7;
    
    my ($pseq, @hits, $id2, $len2, $psc, $beg2, $end2);
    my ($dist, $width);
    
    my $id    = $x->[$i]->[FID];
    my $len   = $x->[$i]->[LEN];
    my $beg   = $x->[$i]->[START];
    my $end   = $x->[$i]->[STOP];
    my $locus = $x->[$i]->[LOCUS]; 
    
    $id =~ m/^([^\|]+)\|(\d+\.\d+)\.([^\.]+)/;
    my ($class, $org, $type) = ($1, $2, $3);
    
    return 9.999999e99        if ($type eq 'rna');
    return 0                  if ($deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"});
    return $x->[$i]->[SCORE]  if (defined($x->[$i]->[SCORE]));
    
    if ($class eq 'fig')
    {
	$pseq =  $fig->get_translation($id);
    }
    elsif ($class eq 'changed')
    {
	($_ = $id) =~ s/^changed/fig/;
	$pseq =  $fig->get_translation($_);
    }
    elsif ($class eq 'new')
    {
	$pseq =  $fig->translate( $fig->dna_seq($org, $locus, $code ) );
	$pseq =~ s/\*$//;
    }
    else
    {
	die "Could not handle feature $id, class $class:\n", Dumper($x->[$i]);
    }
    
    unless ($pseq)
    {
	print STDERR "Skipping $i: $id, locus $locus --- no sequence\n";
	return 0;
    }
   
    my $tmp = $pseq;
    $tmp =~ s/[^ARNDCQEGHILKMFPSTWYVBZ]//ig;
    if (length($tmp) < 10)
    {
	print STDERR "Skipping locus $locus --- too few valid AA chars: $pseq\n";
	return 0;
    }
    
    print STDERR "Getting sims for $i: $id\t$locus\t$pseq\n" if $ENV{VERBOSE};
    print STDERR "\n"     if ((length($pseq) > 90) && $ENV{VERBOSE});
    
    if (($id =~ m/^fig/) || ($id =~ m/^changed/))
    {
	@hits = $fig->sims($id, 500, 1.0e-5, 'all');
    }
    elsif ($id =~ m/^new/)
    {
	unless (-s "$FIG_Config::global/nr.pin")
	{
	    &FIG::run("formatdb -i $FIG_Config::global/nr -p T");
	}
	
	
	my $tmp_file = "tmp_file.$$";
	open(TMP, ">$tmp_file") 
	    || die "Could not write-open $tmp_file";
	print TMP ">$id\n$pseq\n";
	close(TMP) || die "Could not close $tmp_file";
	
	@hits = map { chomp $_;   
		      @_ = split(/\t/, $_);
		      $len2 = $fig->translation_length($_[1]);
		      push @_, ($len, $len2, 'blastp');
		      bless( [ @_ ], 'Sim' ) 
		      }	`blastall -i $tmp_file -d $FIG_Config::global/nr -p blastp -FF -m8`;
#	die Dumper(\@hits);
    }
    else
    {
	die "Could not compute score for $id: $locus";
    }
    
    my $bscore = 0;
    my $dscore = 0;
    my $qscore = 0;
    my $num_hits = 1;     #...pseudo "Laplace correction" (prevents divide by zero)
    foreach my $hit (@hits)
    {
	$id2  = $hit->id2;
	$len2 = $hit->ln2;
	$psc  = $hit->psc;
	$bsc  = $hit->bsc;
	$beg2 = $hit->b2;
	$end2 = $hit->e2;
	
	next unless (&type_of($id2) eq 'peg');
	
	$dist  = $fig->crude_estimate_of_distance($closest_org, $fig->genome_of($id2));
	die "Could not get dist between $closest_org and ". $fig->genome_of($id2) unless defined $dist;
	
	$width = $end2 - $beg2 + 1;
	if (($psc <= $max_psc) && ($dist >= $min_dist) && ($width > $min_match*$len2)
	    )
	{
	    ++$num_hits;
	    $bscore += $bsc;
	    $dscore += $dist;
	    $qscore += $dist * $bsc;
	    print STDERR "Accepting $id2:\t$psc\t$dist\t$width\t$len2\n"
		if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));
	}
	else
	{
	    print STDERR "Rejecting $id2:\t$psc\t$dist\t$width\t$len2\n"
		if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));
	}
    }
    
    $score   = $x->[$i]->[LEN] + 10000 * $qscore;
    if ($x->[$i]->[SUBSYS])  { $score *= 1000000; }
    
    $x->[$i]->[SCORE] = $score;
    return $score;
}


sub protseq
{
    my ($contig, $beg, $end) = @_;
    my ($seq, $pseq);
    
    if ($beg < $end)
    {
	$seq = substr($$seq_of{$contig}, $beg-1, 1+($end-$beg));
    } 
    else 
    {
	$seq = substr($$seq_of{$contig}, $end-1, 1+($beg-$end));
	$seq = $ { &rev_comp(\$seq) };
    }
    
    $pseq = &translate(\$seq);
    substr($$pseq,0,1) = &toM(\$seq);
    
    return $pseq;
}


sub load_tbls
{
    my ($weight_file, @tbl_files) = @_;
    my ($entry, $fid, $org, $locus, $contig, $beg, $end, $strand, $len);
    
    my %weight;
    if ($weight_file)
    {
	my @fids;
	open(TMP, "<$weight_file") || die "Could not open $weight_file";
	while (defined($entry = <TMP>))
	{
	    chomp $entry;
	    @fids = split /\t/, $entry;
	    foreach $fid (@fids)
	    {
		#...lazy hack --- avoids testing entries for Org-ID...
		$weight{$fid} = (scalar @fids);
	    }
	}
    }
    
    my $tbl  = {};
    my $file = 0;
    foreach my $tbl_file (@tbl_files)
    {
	my $num_fids = 0;
	open(TBL,"<$tbl_file") || die "could not open $tbl_file";
	print STDERR "Loading $tbl_file ...\n" if $ENV{VERBOSE};
	
	while (defined($entry = <TBL>))
	{
	    ++$num_fids;
	    chomp $entry;
	    if ($entry =~ /^(\S+)\s+(\S+)/)
	    {
		($fid, $locus) = ($1, $2);
		$fid =~ m/^[^\|]+\|(\d+\.\d+)/;
		$org =  $1;
		
		($contig, $beg, $end) = $fig->boundaries_of($locus);
		
		if ($beg < $end) { $strand = '+'; } else { $strand = '-'; }
		
		$len = 1 + abs($end-$beg);
		
		unless (defined($tbl->{$contig})) { $tbl->{$contig} = []; }
		push(@ { $tbl->{$contig} }
		     , [$entry, $file, $fid, $locus, $contig, $beg, $end, $len, $strand
			, (($file == RNA) ? 9.999999e99 : undef), (defined($weight{$fid}) ? $weight{$fid} : 1)
			, $fig->possibly_truncated($org, $locus), $fig->peg_to_subsystems($fid)
			] );
	    }
	    else
	    {
		print STDERR "Skipping invalid entry $entry\n";
	    }
	}
	print STDERR "Loaded $num_fids features from $tbl_file\n\n" if $ENV{VERBOSE};
	close(TBL);
	++$file;
    }
    
    foreach my $contig (keys(%$tbl))
    {
	my $x = $tbl->{$contig};
	$tbl->{$contig} = [sort { &FIG::min($a->[START],$a->[STOP]) <=> &FIG::min($b->[START],$b->[STOP]) } @$x];
    }
    
    return $tbl;
}



sub load_contigs
{
    my ($contigs_file) = @_;
    
    my $seq_of = {};
    my $len_of = {};
    
    open (CONTIGS, "<$contigs_file") or die "could not open $contigs_file to read";
    while ( (! eof(CONTIGS)) && ( my ($id, $seqP) = &get_a_fasta_record(\*CONTIGS) ) )
    {
    	$$seqP =~ tr/a-z/A-Z/;
	$seq_of->{$id} = $$seqP;
	$len_of->{$id} = length($$seqP);
    }
    close(CONTIGS) or die "could not close $contigs_file";
    
    return ($seq_of, $len_of);
}
 
sub get_a_fasta_record
{
    my ($fh) = @_;
    my ( $old_eol, $entry, @lines, $head, $id, $seq );
 
    if (not defined($fh))  { $fh = \*STDIN; }
 
    $old_eol = $/;
    $/ = "\n>";
 
    $entry =  <$fh>;
    chomp $entry;
    @lines =  split( /\n/, $entry );
    $head  =  shift @lines;
    $head  =~ s/^>?//;
    $head  =~ m/^(\S+)/;
    $id    =  $1;
    
    $seq   = join( "", @lines );
 
    $/ = $old_eol;
    return ( $id, \$seq );
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3