[Bio] / StandaloneTools / filter_tbl_overlaps.pl Repository:
ViewVC logotype

View of /StandaloneTools/filter_tbl_overlaps.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Jun 22 20:51:29 2005 UTC (14 years, 4 months ago) by parrello
Branch: MAIN
CVS Tags: HEAD
Additional tools from Gordon for processing feature overlaps.

#!/disks/space0/fig/FIGdisk.Apr1/env/linux-gentoo/bin/perl

BEGIN {
    @INC = qw(
	/disks/space0/fig/FIGdisk.Apr1/env/linux-gentoo/lib/perl5/5.8.6/i686-linux
	/disks/space0/fig/FIGdisk.Apr1/env/linux-gentoo/lib/perl5/5.8.6
	/disks/space0/fig/FIGdisk.Apr1/env/linux-gentoo/lib/perl5/site_perl/5.8.6/i686-linux
	/disks/space0/fig/FIGdisk.Apr1/env/linux-gentoo/lib/perl5/site_perl/5.8.6
	/disks/space0/fig/FIGdisk.Apr1/env/linux-gentoo/lib/perl5/site_perl
);
}
use Data::Dumper;
use Carp;
# Following block is expanded by switch_to_release to add use lib directives
# to point at the correct locations in the release directory.
#BEGIN switch_to_release generated code
use lib '/disks/space0/fig/FIGdisk.Apr1/dist/releases/cvs.1114807519/linux-gentoo/lib';
use lib '/disks/space0/fig/FIGdisk.Apr1/dist/releases/cvs.1114807519/linux-gentoo/lib/FigKernelPackages';
#END switch_to_release generated code

use lib "/disks/space0/fig/FIGdisk.Apr1/config";
use FIG_Config;

#### END tool_hdr ####

# -*- perl -*-

use FIG;
$fig = new FIG;

use Carp;
use Data::Dumper;

$| = 1;
$0 =~ m/([^\/]+)$/;
$usage = "$1 closest_org contigs tbl_rna tbl_peg1 tbl_peg2 ... > merged.tbl";

=pod

=head1         filter_tbl_overlaps

  Usage:       filter_tb_overlaps closest_org contigs tbl_rna tbl_peg1 tbl_peg2 ... > merged.tbl

  Function: Resolves overlaps and conflicts between PEGs in one or more 
    tbl files, based on similarity evidence. Similarities must be with
    respect to organisms at least a moderate distance from 'closest_org'. 
    The order in which tbl files are entered is significant: Later files
    are considered to have lower priority in the resolution process. 
    Furthermore, the first tbl file is assumed to be of RNAs, not PEGs,
    and no PEG is allowed to overlap any of its entries; hence if no RNA tbl 
    is available, this argument should be given as '/dev/null'.

  Example:     filter_tbl_overlaps CloseOrgID contigs tbl_rna tbl_peg.1 tbl_peg.2 > tbl_peg

  Author:      Gordon D. Pusch (2004 Jan 28)

=cut

(@ARGV >= 4) || die "\n\tusage: $usage\n\n";

(($closest_org = shift @ARGV) && (-d "$FIG_Config::organisms/$closest_org"))
    || die "Org directory $FIG_Config::organsisms/$closest_org does not exist";

(-s ($contigs_file  = shift @ARGV))  
    || die "Contigs file $contigs_file does not exist";
((@tbl_files = @ARGV) > 1) || die "Not enough tbl files";

use constant RNA    => 0;

use constant ENTRY  => 0;
use constant FILE   => 1;
use constant FID    => 2;
use constant LOCUS  => 3;
use constant CONTIG => 4;
use constant START  => 5;
use constant STOP   => 6;
use constant LEN    => 7;
use constant STRAND => 8;
use constant SCORE  => 9;

$min_len            =  90;
$max_convergent     =  30;
$max_divergent      = 150;
$max_normal_overlap = 150;

$tbl = &load_tbls(@tbl_files);
($seq_of, $len_of) = &load_contigs($contigs_file);

foreach $contig (sort by_len_and_id keys %$tbl)
{
    $x = $tbl->{$contig};
    
    print STDERR "Contig $contig contains ", (scalar @$x), " loci\n";
    print STDERR Dumper($x) if ($ENV{VERBOSE} && ($ENV{VERBOSE} > 1));
    
    for ($i=0; $i < @$x; ++$i)
    {
	for($j = $i+1; (($j < @$x) && &entries_overlap($x, $i, $j)); ++$j)
	{
	    if    (&bad_rna_overlap($x, $i, $j))  { &process_rna($x, $i, $j); }
	    elsif (&bad_rna_overlap($x, $j, $i))  { &process_rna($x, $j, $i); }
	}
    }
    
    for ($i=0; $i < @$x; ++$i)
    {
	if ($x->[$i]->[LEN] < $min_len)
	{
	    next if &is_rna($x, $i);
	    next if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
	    
	    print STDERR "Short PEG $i ($x->[$i]->[LEN])\n";
	    $score = &compute_score($x, $i);
	    print STDERR "score_i = $score\n";
	    
	    if ($score < 100*$min_len)
	    {
		if (not $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"})
		{
		    print STDERR "Deleting short PEG $i with no support: $x->[$i]->[ENTRY]\n\n";
		    $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "Short:\t" . $x->[$i]->[ENTRY];
		}
	    }
	    else
	    {
		print STDERR "\n";
	    }
	}
    }
    
    for ($i=0; $i < @$x; ++$i)
    {
	print STDERR "\nProcessing $i: $x->[$i]->[FID]\n";
	
	for($j = $i+1; (($j < @$x) && &entries_overlap($x, $i, $j)); ++$j)
	{
	    print STDERR "...Checking for embedding against $j: $x->[$j]->[FID]\n";
	    
	    if    (&embedded($x, $i, $j))  { &process_embedded($x, $i, $j); }
 	    elsif (&embedded($x, $j, $i))  { &process_embedded($x, $j, $i); }
	}
    }
    
    for ($i=0; $i < @$x; ++$i)
    {
	for($j = $i+1; (($j < @$x) && &entries_overlap($x, $i, $j)); ++$j)
	{
	    if (&opposite_strand($x, $i, $j))
	    {
		if    (&convergent($x, $i, $j) > $max_convergent)  { &process_convergent($x, $i, $j); }
		elsif (&divergent( $x, $i, $j) > $max_divergent)   { &process_divergent( $x, $i, $j); }
	    }
	    elsif (&same_strand($x, $i, $j))
	    {
		if    (&same_stop($x, $i, $j))
		{ 
		    &process_same_stop($x, $i, $j);
		}
		elsif (&normal_overlap($x, $i, $j) > $max_normal_overlap)  
		{
		    &process_normal_overlap($x, $i, $j); 
		}
	    }
	    else 
	    { 
		&impossible_overlap_warning($x, $i, $j, "end of main loop");
	    }
	}
    }
}

foreach $contig (sort keys %$tbl)
{
    $x = $tbl->{$contig};
    
    for ($i=0; $i < @$x; ++$i)
    {
	$key = "$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]";
	if (&is_rna($x, $i))
	{
	    print STDERR "Skipping RNA entry:\t$x->[$i]->[ENTRY]\n";
	}
	elsif ($deleted{$key})
	{
	    print STDERR "Skipping DEL entry:\t$deleted{$key}\n";
	}
	else
	{
	    print STDERR "Accepted PEG entry:\t$x->[$i]->[ENTRY]\n";
	    print "$x->[$i]->[ENTRY]\n";
	}
    }
}
exit (0);


sub by_len_and_id
{
    return (($len_of->{$a} <=> $len_of->{$b}) || ($a cmp $b));
}


sub entries_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($x->[$i]->[CONTIG] ne $x->[$j]->[CONTIG]);
    
    $ov = &overlaps( $x->[$i]->[START], $x->[$i]->[STOP]
		   , $x->[$j]->[START], $x->[$j]->[STOP]);
    
    return $ov;
}

sub overlaps
{
    my ($beg1, $end1, $beg2, $end2) = @_;
    
    my ($left1, $right1, $left2, $right2);
    
    $left1  = &FIG::min($beg1, $end1);
    $left2  = &FIG::min($beg2, $end2);
    
    $right1 = &FIG::max($beg1, $end1);
    $right2 = &FIG::max($beg2, $end2);
    
    if ($left1 > $left2)
    {
        ($left1, $left2)   = ($left2, $left1);
        ($right1, $right2) = ($right2, $right1);
    }
    
    $ov = 0;
    if ($right1 >= $left2) { $ov = &FIG::min($right1,$right2) - $left2 + 1; }
    
    return $ov;
}

sub type_of
{
    my ($id) = @_;
    
    return 'rna' if ($id =~ m/^fig\|\d+\.\d+\.rna\.\d+$/);
    return 'peg' if ($id =~ m/^fig\|\d+\.\d+\.peg\.\d+$/);
    return undef;
}

sub is_rna
{
    my ($x, $i) = @_;
    
    return ($x->[$i]->[FILE] == RNA);
}

sub same_strand
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    return ($x->[$i]->[STRAND] eq $x->[$j]->[STRAND]);
}

sub opposite_strand
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    return ($x->[$i]->[STRAND] ne $x->[$j]->[STRAND]);
}

sub bad_rna_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    #...For now, treat all RNA overlaps as "non-removable"
    if (&is_rna($x, $i))
    {
	return &overlaps($x->[$i]->[START], $x->[$i]->[STOP],  $x->[$j]->[START], $x->[$j]->[STOP]);
    }
    else
    {
	return 0;
    }
}

sub embedded
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $left_i  = &FIG::min($x->[$i]->[START], $x->[$i]->[STOP]);
    my $right_i = &FIG::max($x->[$i]->[START], $x->[$i]->[STOP]);
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if (  ($left_i < $beg_j)   && ($left_i < $end_j) 
       && ($beg_j  < $right_i) && ($end_j  < $right_i)
       )
    {
	print STDERR "$j is embedded in $i\n";
	return &overlaps($left_i, $right_i, $beg_j, $end_j);
    }
    
    return 0;
}

sub convergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if (   &FIG::between($beg_i, $end_j, $end_i) 
       &&  &FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_i, $beg_j, $end_i)
       )
    {
	return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }
    
    return 0;
}

sub divergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if (   &FIG::between($beg_i, $beg_j, $end_i) 
       &&  &FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_i, $end_j, $end_i)
       )
    {
	return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }
    
    return 0;
}

sub normal_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];
    
    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];
    
    if    (  ($x->[$i]->[STRAND] eq '+') && ($x->[$j]->[STRAND] eq '+')
          && ($beg_i < $beg_j) && ($beg_j <= $end_i) && ($end_i < $end_j)
          )
    {
	return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
    }
    elsif (  ($x->[$i]->[STRAND] eq '-') && ($x->[$j]->[STRAND] eq '-')
	  && ($end_i < $end_j) && ($end_j <= $beg_i) && ($beg_i < $beg_j) 
	  )
    {
	return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
    }
    else
    {
	&impossible_overlap_warning($x, $i, $j, "Opposite strands in a normal_overlap");
	return 0;
    }
}


sub same_stop
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $end_i   = $x->[$i]->[STOP];
    my $end_j   = $x->[$j]->[STOP];
    
    if (&same_strand($x, $i, $j) && ($end_i == $end_j))
    {
	return 1;
    } else {
	 return 0;
    }
}


sub impossible_overlap_warning
{
    my ($x, $i, $j, $msg) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    if (defined($msg))
    {
	print STDERR "Impossible overlap in $msg:\n", join("\t", @ { $x->[$i] }), "\n", join("\t", @ { $x->[$j] }), "\n\n";
	# confess "$msg";
    }
    else
    {
	print STDERR "Impossible overlap:\n$i: ", join("\t", @ { $x->[$i] }), "\n$j: ", join("\t", @ { $x->[$j] }), "\n\n";
	# confess "aborted";
    }
    
    return 0;
}



sub process_rna
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    if (&is_rna($x, $i) && &is_rna($x, $j))
    {
	&impossible_overlap_warning($x, $i, $j, "process_rna");
    }
    else
    {
	#...For now, just delete $j;
	if (not $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"})
	{
	    print STDERR "Deleting PEG $j overlapping RNA $i:\n"
		, "$i: $x->[$i]->[ENTRY]\n$j: $x->[$j]->[ENTRY]\n\n";
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "RNA:\t" . $x->[$j]->[ENTRY];
	}
    }
}


sub process_embedded
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i = $x->[$i]->[SCORE];
    my $score_j = $x->[$j]->[SCORE];
    
    if ($score_i < $score_j)
    {
	print STDERR "Deleting embedding PEG $i in favor of embedded PEG $j, score_i = $score_i, score_j = $score_j\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "EMB_1:\t" . $x->[$i]->[ENTRY];
    }
    elsif ($score_i > $score_j)
    {
	print STDERR "Deleting embedded PEG $j in favor of embedding PEG $i, score_j = $score_j, score_i = $score_i\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "EMB_2:\t" . $x->[$j]->[ENTRY];
    }
    else
    {
	#...In case of same score, delete $j, because $i must be longer...
	
	print STDERR "Deleting embedded PEG $j in favor of embedding PEG $i, score_j = $score_j, score_i = $score_i\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "EMB_3:\t" . $x->[$j]->[ENTRY];
    }
    
    return;
}


sub process_convergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i = $x->[$i]->[SCORE];
    my $score_j = $x->[$j]->[SCORE];
    
    if ($score_i < $score_j)
    {
	print STDERR "Deleting downstream convergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "CNV_1:\t" . $x->[$i]->[ENTRY];
    }
    elsif ($score_i > $score_j)
    {
	print STDERR "Deleting upstream convergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "CNV_2:\t" . $x->[$j]->[ENTRY];
    }
    else
    {
	#...default to "priority"...
	if ($x->[$i]->[FILE] < $x->[$j]->[FILE])
	{
	    print STDERR "Deleting downstream convergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n";
	    print STDERR "$i: $x->[$i]->[ENTRY]\n";
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "CNV_3:\t" . $x->[$j]->[ENTRY];
	}
	elsif ($x->[$i]->[FILE] > $x->[$j]->[FILE])
	{
	    print STDERR "Deleting upstream convergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n";
	    print STDERR "$i: $x->[$i]->[ENTRY]\n";
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "CNV_4:\t" . $x->[$j]->[ENTRY];
	}
	else
	{
	    #...Warn and move on...
	    
	    &impossible_overlap_warning($x, $i, $j, "process_convergent");
	}
    }
    
    return;
}


sub process_divergent
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i = $x->[$i]->[SCORE];
    my $score_j = $x->[$j]->[SCORE];
    
    if ($score_i < $score_j)
    {
	print STDERR "Deleting downstream divergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "DIV_1:\t" . $x->[$i]->[ENTRY];
    }
    elsif ($score_i > $score_j)
    {
	print STDERR "Deleting upstream divergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "DIV_2:\t" . $x->[$j]->[ENTRY];
    }
    else
    {
	#...default to "priority"...
	if ($x->[$i]->[FILE] < $x->[$j]->[FILE])
	{
	    print STDERR "Deleting downstream divergent PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n";
	    print STDERR "$i: $x->[$i]->[ENTRY]\n";
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	    
	    $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "DIV_3:\t" . $x->[$i]->[ENTRY];
	}
	elsif ($x->[$i]->[FILE] > $x->[$j]->[FILE])
	{
	    print STDERR "Deleting upstream divergent PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n";
	    print STDERR "$i: $x->[$i]->[ENTRY]\n";
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "DIV_4:\t" . $x->[$j]->[ENTRY];
	}
	else
	{
	    #...Warn and move on...
	    
	    &impossible_overlap_warning($x, $i, $j, "process_divergent");
	}
    }
    
    return;
}

sub process_same_stop
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    my $len_i = $x->[$i]->[LEN];
    my $len_j = $x->[$j]->[LEN];
    
    if ($len_i < $len_j)
    {
	print STDERR "Deleting same-STOP PEG $i in favor of $j, len_i = $len_i, len_j = $len_j\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "STP_1:\t" . $x->[$i]->[ENTRY];
    }
    else
    {
	print STDERR "Deleting same-STOP PEG $j in favor of $i, len_i = $len_i, len_j = $len_j\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "STP_2:\t" . $x->[$j]->[ENTRY];
    }
    
    return;
}    

sub process_normal_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    unless (defined($x->[$i]->[SCORE]))  { $x->[$i]->[SCORE] = &compute_score($x, $i); }
    unless (defined($x->[$j]->[SCORE]))  { $x->[$j]->[SCORE] = &compute_score($x, $j); }
    
    my $score_i = $x->[$i]->[SCORE];
    my $score_j = $x->[$j]->[SCORE];
    
    if ($score_i < $score_j)
    {
	print STDERR "Deleting downstream same-strand PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "NRM_1:\t" . $x->[$i]->[ENTRY];
    }
    elsif ($score_i > $score_j)
    {
	print STDERR "Deleting upstream same-strand PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n";
	print STDERR "$i: $x->[$i]->[ENTRY]\n";
	print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	
	$deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "NRM_2:\t" . $x->[$j]->[ENTRY];
    }
    else
    {
	#...default to "priority"...
	if ($x->[$i]->[FILE] < $x->[$j]->[FILE])
	{
	    print STDERR "Deleting downstream same-strand PEG $i in favor of $j, score_i = $score_i, score_j = $score_j\n";
	    print STDERR "$i: $x->[$i]->[ENTRY]\n";
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	    
	    $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"} = "NRM_3:\t" . $x->[$i]->[ENTRY];
	}
	elsif ($x->[$i]->[FILE] > $x->[$j]->[FILE])
	{
	    print STDERR "Deleting upstream same-strand PEG $j in favor of $i, score_j = $score_j, score_i = $score_i\n";
	    print STDERR "$i: $x->[$i]->[ENTRY]\n";
	    print STDERR "$j: $x->[$j]->[ENTRY]\n\n";
	    
	    $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"} = "NRM_4:\t" . $x->[$j]->[ENTRY];
	}
	else
	{
	    #...Warn and move on...
	    
	    &impossible_overlap_warning($x, $i, $j, "process_normal_overlap");
	}
    }
    
    return;
}


sub compute_score
{
    my ($x, $i) = @_;
    
    my $max_psc   = 1.0e-10;
    my $min_dist  = 0.002;
    my $min_match = 0.7;
    
    my ($id2, $len2, $psc, $beg2, $end2);
    my ($dist, $width);
    
    return 0   if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return $x->[$i]->[SCORE]  if (defined($x->[$i]->[SCORE]));
    
    my $id    = $x->[$i]->[FID];
    my $beg   = $x->[$i]->[START];
    my $end   = $x->[$i]->[STOP];
    my $locus = $x->[$i]->[LOCUS]; 

#     my $pseq  = $fig->get_translation($id);
#    
#     unless ($pseq)
#     {
# 	print STDERR "Skipping $i: $id, locus $locus --- no sequence\n";
# 	return 0;
#     }
#    
#     my $tmp = $pseq;
#     $tmp =~ s/[^ARNDCQEGHILKMFPSTWYVBZ]//ig;
#     if (length($tmp) < 10)
#     {
# 	print STDERR "Skipping locus $locus --- too few valid AA chars: $pseq\n";
# 	return 0;
#     }
    
    if (not exists $in_subsystem{$id})
    {
	$in_subsystem{$id} = $fig->peg_to_subsystems($id);
    }
    else
    {
	$in_subsystem{$id} = 0;
    }
    
    print STDERR "Getting sims for $i: $id\t$locus\t$pseq\n";  
    print STDERR "\n" if (length($pseq) > 90);
    my @hits = $fig->sims($id, 500, 1.0e-5, 'all');
    
    my $pscore = 0;
    my $dscore = 0;
    my $num_hits = 1;     #...pseudo "Laplace correction" (prevents divide by zero)
    foreach my $hit (@hits)
    {
	$id2  = $hit->id2;
	$len2 = $hit->ln2;
	$psc  = $hit->psc;
	$beg2 = $hit->b2;
	$end2 = $hit->e2;

	next unless (&type_of($id2) eq 'peg');
	
	$dist  = $fig->crude_estimate_of_distance($closest_org, $fig->genome_of($id2));
	die "Could not get dist between $closest_org and ". $fig->genome_of($id2) unless defined $dist;
	
	$width = $end2 - $beg2 + 1;
	if (($psc <= $max_psc) && ($dist >= $min_dist)
	    && ($width > $min_match*$len2)
	    )
	{
	    ++$num_hits;
	    $pscore += -log(2**(-1074) + $psc);
	    $dscore += $dist;
	    print STDERR "Accepting $id2:\t$psc\t$dist\t$width\t$len2\n" if $ENV{VERBOSE};
	}
	else
	{
	    print STDERR "Rejecting $id2:\t$psc\t$dist\t$width\t$len2\n" if $ENV{VERBOSE};
	}
    }
    
    $pscore /= $num_hits;
    $score   = $x->[$i]->[LEN] + 10000 * $pscore;
    if ($in_subsystem{$id})    { $score *= 100; }
    
    $x->[$i]->[SCORE] = $score;
    return $score;
}


sub protseq
{
    my ($contig, $beg, $end) = @_;
    my ($seq, $pseq);
    
    if ($beg < $end)
    {
	$seq = substr($$seq_of{$contig}, $beg-1, 1+($end-$beg));
    } 
    else 
    {
	$seq = substr($$seq_of{$contig}, $end-1, 1+($beg-$end));
	$seq = $ { &rev_comp(\$seq) };
    }
    
    $pseq = &translate(\$seq);
    substr($$pseq,0,1) = &toM(\$seq);
    
    return $pseq;
}


sub load_tbls
{
    my (@tbl_files) = @_;
    my ($entry, $fid, $locus, $contig, $beg, $end, $strand, $len);
    
    my $tbl  = {};
    my $file = 0;
    foreach my $tbl_file (@tbl_files)
    {
	my $num_fids = 0;
	open(TBL,"<$tbl_file") || die "could not open $tbl_file";
	print STDERR "Loading $tbl_file ...\n";
	
	while (defined($entry = <TBL>))
	{
	    ++$num_fids;
	    chomp $entry;
	    if ($entry =~ /^(\S+)\s+(\S+)/)
	    {
		($fid, $locus) = ($1, $2);
		
		($contig, $beg, $end) = $fig->boundaries_of($locus);
		
		if ($beg < $end) { $strand = '+'; } else { $strand = '-'; }
		
		$len = 1 + abs($end-$beg);
		
		unless (defined($tbl->{$contig})) { $tbl->{$contig} = []; }
		push(@ { $tbl->{$contig} }, [$entry, $file, $fid, $locus, $contig, $beg, $end, $len, $strand, undef] );
	    }
	    else
	    {
		print STDERR "Skipping invalid entry $entry\n";
	    }
	}
	print STDERR "Loaded $num_fids features from $tbl_file\n\n";
	close(TBL);
	++$file;
    }
    
    foreach my $contig (keys(%$tbl))
    {
	my $x = $tbl->{$contig};
	$tbl->{$contig} = [sort { &FIG::min($a->[START],$a->[STOP]) <=> &FIG::min($b->[START],$b->[STOP]) } @$x];
    }
    
    return $tbl;
}



sub load_contigs
{
    my ($contigs_file) = @_;
    
    my $seq_of = {};
    my $len_of = {};
    
    open (CONTIGS, "<$contigs_file") or die "could not open $contigs_file to read";
    while ( (! eof(CONTIGS)) && ( my ($id, $seqP) = &get_a_fasta_record(\*CONTIGS) ) )
    {
    	$$seqP =~ tr/a-z/A-Z/;
	$seq_of->{$id} = $$seqP;
	$len_of->{$id} = length($$seqP);
    }
    close(CONTIGS) or die "could not close $contigs_file";
    
    return ($seq_of, $len_of);
}
 
sub get_a_fasta_record
{
    my ($fh) = @_;
    my ( $old_eol, $entry, @lines, $head, $id, $seq );
 
    if (not defined($fh))  { $fh = \*STDIN; }
 
    $old_eol = $/;
    $/ = "\n>";
 
    $entry =  <$fh>;
    chomp $entry;
    @lines =  split( /\n/, $entry );
    $head  =  shift @lines;
    $head  =~ s/^>?//;
    $head  =~ m/^(\S+)/;
    $id    =  $1;
    
    $seq   = join( "", @lines );
 
    $/ = $old_eol;
    return ( $id, \$seq );
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3