[Bio] / StandaloneTools / find_peg_overlaps.pl Repository:
ViewVC logotype

View of /StandaloneTools/find_peg_overlaps.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Fri May 26 22:30:20 2006 UTC (13 years, 5 months ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +29 -70 lines
Fixed counting bug. -- /gdp

#!/usr/bin/env /Volumes/raid2/FIGdisk.v5/env/mac/bin/perl

BEGIN {
    @INC = qw(
              /Volumes/raid2/FIGdisk.v5/dist/releases/dev/mac/lib
              /Volumes/raid2/FIGdisk.v5/dist/releases/dev/mac/lib/FigKernelPackages
              /Volumes/raid2/FIGdisk.v5/dist/dev/mac/lib
              /Volumes/raid2/FIGdisk.v5/dist/dev/mac/lib/FigKernelPackages
              /Volumes/raid2/FIGdisk.v5/env/mac/lib/perl5/5.8.7/darwin-2level
              /Volumes/raid2/FIGdisk.v5/env/mac/lib/perl5/5.8.7
              /Volumes/raid2/FIGdisk.v5/env/mac/lib/perl5/site_perl/5.8.7/darwin-2level
              /Volumes/raid2/FIGdisk.v5/env/mac/lib/perl5/site_perl/5.8.7
              /Volumes/raid2/FIGdisk.v5/env/mac/lib/perl5/site_perl
              .
              /Volumes/raid2/FIGdisk.v5/config
 
);
}
use Data::Dumper;
use Carp;
use FIG_Config;
$ENV{'BLASTMAT'} = "/Volumes/raid2/FIGdisk.v5/BLASTMAT";
$ENV{'FIG_HOME'} = "/Volumes/raid2/FIGdisk.v5";
# end of tool_hdr
########################################################################

# -*- perl -*-

use FIG;
$fig = new FIG;

$usage = "usage: find_peg_overlaps Contigs Tbl_1 Tbl_2 ...";
# First file is always the RNA file.

$max_acceptable_overlap = 20;
if (@ARGV && ($ARGV[0] =~ m/^-lap=(\d+)/)) { $max_acceptable_overlap = $1; shift; }

(($contigs_file = shift @ARGV) && (@tbl_files = @ARGV)) || die $usage;


$seq_of = &load_contigs($contigs_file);
$tbl    = &load_tbl(@tbl_files);

use constant RNA     => 0;

use constant FID     => 0;
use constant TYPE    => 1;
use constant START   => 2;
use constant STOP    => 3;
use constant STRAND  => 4;
use constant LEN     => 5;

use constant CONTIG  => 0;
use constant ID1     => 1;
use constant ID2     => 2;
use constant OVERLAP => 3;

@bad_starts = ();
@bad_stops  = ();

@same_stop_pegs       = ();
@embedded_pegs        = ();
@same_strand_overlaps = ();
@convergent_overlaps  = ();
@divergent_overlaps   = ();
@impossible_overlaps  = ();

%start = map { $_ => 1 } qw(ATG GTG TTG);
%stop  = map { $_ => 1 } qw(TAA TAG TGA);

# Tuning on when something is significant overlap.
$min_len            = 150;
$max_convergent     =  20;
$max_divergent      = 120;
$max_normal_overlap = 120; # max allowable same-strand overlap

$num_bad = 0;
foreach $contig (sort keys %$tbl)   # $tbl is the TBL files, ref to hash of contigs to lists of pegs
{
    $contig_seq = $seq_of->{$contig};
    $contig_len = length($contig_seq);
    
    $x = $tbl->{$contig};
    for ($i=0; $i < @$x; ++$i)
    {
	$bad = 0;
	($fid, $type, $beg, $end, $strand, $len) = @ { $x->[$i] };

#...Checking for most serious type of problem: Invalid START or STOP codon. 
# If we find this problem for non-truncated feature, $bad will be 1, 
# and info will be in appropriate list.
	
	# invalid starts and stops are okay near beginning and end of contigs
	$truncated_start = 0;
	$truncated_stop  = 0;  
	
	$start = $stop = "";
	if (&type_of($fid) ne 'rna')
	{
	    if ($strand eq '+')
	    {
		if ($beg <= 3)  { $truncated_start = 1;   $margin = $beg - 1; }
		$start = uc(substr($$seq_of{$contig}, $beg-1, 3));
		
		if ($end <= ($contig_len - 3))
		{
		    $stop = uc(substr($contig_seq, ($end-3), 3));
		}
		else
		{
		    $truncated_stop = 1;   $margin = $contig_len - $end + 3;
		}
	    }
	    else
	    {
		
		if ($beg >= ($contig_len - 3))  { $truncated_start = 1;   $margin = $contig_len - 3; }
		$start = substr($contig_seq, $beg-3, 3);
		$start = uc(&FIG::reverse_comp($start));
		
		if ($end >= 4)
		{
		    $stop  = substr($contig_seq, ($end-1), 3);
		    $stop  = uc(&FIG::reverse_comp($stop));
		}
		else
		{
		    $truncated_stop = 1;   $margin = $end - 4;
		}
	    }
	    
	    unless ($truncated_start || $start{$start})
	    {
		$bad = 1;
		push @bad_starts, [$fid, $strand, $start, $len];
	    }
	    
	    unless ($truncated_stop  || $stop{$stop}) 
	    {
		$bad = 1;
		push @bad_stops,  [$fid, $strand, $stop, $len];
	    }
	}
	
	
	for ($j = &FIG::max(0, $i-100); $j < $i; ++$j)
	{
	    $overlap = &overlaps($x->[$i]->[START], $x->[$i]->[STOP], $x->[$j]->[START], $x->[$j]->[STOP]);
	    
	    if ($overlap) # errors processed below are in order from most severe to least severe
	    {
		if (&same_stop($x, $i, $j))
		{
		    $bad = 1;
		    push @same_stop_pegs, [$contig, $i, $j, $overlap];
		}
		elsif ( &embedded($x, $i, $j) )
		{
		    $bad = 1;
		    push @embedded_pegs,  [$contig, $i, $j, $overlap];
		}
		elsif ( &embedded($x, $j, $i) )
		{
		    $bad = 1;
		    push @embedded_pegs,  [$contig, $j, $i, $overlap];
		}
		else
		{
		    if    (&same_strand($x, $i, $j))
		    {
			if (&normal_overlap($x, $i, $j) >= $max_normal_overlap)
			{
			    $bad = 1;
			    push @same_strand_overlaps, [$contig, $j, $i, $overlap];
			}
		    }
		    elsif (&opposite_strand($x, $i, $j))
		    {
			if    ( &convergent($x, $i, $j) )
			{
			    $bad = 1;
			    push @convergent_overlaps, [$contig, $j, $i, $overlap];
			}
			elsif ( &divergent($x, $i, $j) )
			{
			    if ($overlap >= $max_divergent)
			    {
				$bad = 1;
				push @divergent_overlaps, [$contig, $j, $i, $overlap];
			    }
			}
			else
			{
			    $bad = 1;
			    push @impossible_overlaps, [$contig, $j, $i, $overlap];
			}
		    }
		    else
		    {
			$bad = 1;
			push @impossible_overlaps, [$contig, $j, $i, $overlap];
		    }
		}
	    }
	}
	
	if ($bad) { ++$num_bad; }
    }
}


print STDOUT ("Bad STOP codons:\t", (scalar @bad_stops), "\n") if @bad_stops;
print STDERR ("Bad STOP codons:\t", (scalar @bad_stops), "\n") if @bad_stops;
foreach $bad_stop (sort {$a->[0] cmp $b->[0]} @bad_stops)
{
    print STDOUT "$bad_stop->[1]$bad_stop->[0] ($bad_stop->[3]) has bad STOP codon $bad_stop->[2]\n";
}
print STDOUT "\n" if @bad_stops;


print STDOUT ("Bad START codons:\t", (scalar @bad_starts), "\n") if @bad_starts;
print STDERR ("Bad START codons:\t", (scalar @bad_starts), "\n") if @bad_starts;
foreach $bad_start (sort {$a->[0] cmp $b->[0]} @bad_starts)
{
    print STDOUT "$bad_start->[1]$bad_start->[0] ($bad_start->[3]) has bad START codon $bad_start->[2]\n";
}
print STDOUT "\n" if @bad_starts;


print STDOUT ("Same-STOP PEGs:\t", (scalar @same_stop_pegs), "\n") if @same_stop_pegs;
print STDERR ("Same-STOP PEGs:\t", (scalar @same_stop_pegs), "\n") if @same_stop_pegs;
foreach $pair (sort by_overlap @same_stop_pegs)
{
    &print_pair_report($pair, 'same-STOP');
}
print STDOUT "\n" if @same_stop_pegs;


print STDOUT ("Embedded PEGs:\t", (scalar @embedded_pegs), "\n") if @embedded_pegs;
print STDERR ("Embedded PEGs:\t", (scalar @embedded_pegs), "\n") if @embedded_pegs;
foreach $pair (sort by_overlap @embedded_pegs)
{
    &print_pair_report($pair, 'embedded');
}
print STDOUT "\n" if @embedded_pegs;


print STDOUT ("Convergent overlaps:\t", (scalar @convergent_overlaps), "\n") if @convergent_overlaps;
print STDERR ("Convergent overlaps:\t", (scalar @convergent_overlaps), "\n") if @convergent_overlaps;
foreach $pair (sort by_overlap @convergent_overlaps)
{
    &print_pair_report($pair, 'convergent overlap');
}
print STDOUT "\n" if @convergent_overlaps;


print STDOUT ("Divergent overlaps:\t", (scalar @divergent_overlaps), "\n") if @divergent_overlaps;
print STDERR ("Divergent overlaps:\t", (scalar @divergent_overlaps), "\n") if @divergent_overlaps;
foreach $pair (sort by_overlap @divergent_overlaps)
{
    &print_pair_report($pair, 'divergent overlap');
}
print STDOUT "\n" if @divergent_overlaps;


print STDOUT ("Same-strand overlaps:\t", (scalar @same_strand_overlaps), "\n") if @same_strand_overlaps;
print STDERR ("Same-strand overlaps:\t", (scalar @same_strand_overlaps), "\n") if @same_strand_overlaps;
foreach $pair (sort by_overlap @same_strand_overlaps)
{
    &print_pair_report($pair, 'same-strand');
}
print STDOUT "\n" if @same_strand_overlaps;


print STDOUT ("Impossible overlaps:\t", (scalar @impossible_overlaps), "\n") if @impossible_overlaps;
print STDERR ("Impossible overlaps:\t", (scalar @impossible_overlaps), "\n") if @impossible_overlaps;
foreach $pair (sort by_overlap @impossible_overlaps)
{
    &print_pair_report($pair, 'impossible');
}
print STDOUT "\n" if @impossible_overlaps;


if ($num_bad) {
    print STDOUT   "$num_bad PEGs had problems\n";
    print STDERR "\n$num_bad PEGs had problems\n\n";
    # exit(1);
} else {
    print STDERR "\nNo problems found\n\n";
    # exit(0);
}
exit(0);


sub by_overlap
{
    return (($b->[OVERLAP] <=> $a->[OVERLAP]) || ($a->[ID1] <=> $b->[ID1]) || ($a->[ID2] <=> $b->[ID2]));
}

sub print_pair_report
{
    my ($pair, $msg) = @_;

    my $x = $tbl->{$pair->[CONTIG]};

    my ($i, $j) = ($pair->[ID1], $pair->[ID2]);

    print "$x->[$i]->[FID]\t$x->[$j]->[FID]\t"
	, "($x->[$i]->[STRAND],$x->[$i]->[LEN] / $x->[$j]->[STRAND],$x->[$j]->[LEN]) $msg ($pair->[OVERLAP]bp)\n";
}


sub load_contigs
{
    my ($contigs_file) = @_;
    my $contigs = {};

    open (CONTIGS, "<$contigs_file") or die "Could not read-open $contigs_file";

    while ( (! eof(CONTIGS)) && ( ($id, $seqP) = &get_a_fasta_record(\*CONTIGS) ) )
    {
    	$$seqP =~ tr/a-z/A-Z/;
	$contigs->{$id} = $$seqP;
    }

    return $contigs;
}

sub get_a_fasta_record
{
    my ($fh) = @_;
    my ( $old_eol, $entry, @lines, $head, $id, $seq );

    if (not defined($fh))  { $fh = \*STDIN; }

    $old_eol = $/;
    $/ = "\n>";

    $entry =  <$fh>;
    chomp $entry;
    @lines =  split( /\n/, $entry );
    $head  =  shift @lines;
    $head  =~ s/^>?//;
    $head  =~ m/^(\S+)/;
    $id    =  $1;

    $seq   = join( "", @lines );

    $/ = $old_eol;
    return ( $id, \$seq );
}


sub load_tbl
{
    my(@tbl_files) = @_;

    my ($num_fids, $num_kept);
    my ($file_num, $entry, $fid, $contig, $beg, $end, $strand, $len);

    my $tbl = {};
    for ($file_num=0; $file_num < @tbl_files; ++$file_num)
    {
	$tbl_file = $tbl_files[$file_num];

	$num_fids = 0;
	open(TBL,"<$tbl_file") || die "Could not read-open $tbl_file";
	while (defined($entry = <TBL>))
	{
	    ++$num_fids;
            chomp $entry;

	    if ($entry =~ /^(\S+)\s+(\S+)/)
	    {
		($fid, $locus) = ($1, $2);

		($contig, $beg, $end) = $fig->boundaries_of($locus);

		if ($beg < $end) { $strand = '+'; } else { $strand = '-'; }

		$len = 1 + abs($end-$beg);

		unless (defined($tbl->{$contig})) { $tbl->{$contig} = []; }
		push(@{$tbl->{$contig}}, [$fid,$file_num,$beg,$end,$strand,$len] );
	    }
	    else
	    {
		print STDERR "Skipping invalid entry $entry\n";
	    }
	}
	close(TBL);
    }

    foreach $contig (keys(%$tbl))
    {
	my $x = $tbl->{$contig};
	$tbl->{$contig} = [sort { &FIG::min($a->[1],$a->[2]) <=> &FIG::min($b->[1],$b->[2]) } @$x];
    }

    return $tbl;
}

sub overlaps
{
    my ($beg1, $end1, $beg2, $end2) = @_;

    my ($left1, $right1, $left2, $right2);

    $left1  = &FIG::min($beg1, $end1);
    $left2  = &FIG::min($beg2, $end2);

    $right1 = &FIG::max($beg1, $end1);
    $right2 = &FIG::max($beg2, $end2);

    if ($left1 > $left2)
    {
        ($left1, $left2)   = ($left2, $left1);
        ($right1, $right2) = ($right2, $right1);
    }

    $ov = 0;
    if ($right1 >= $left2) { $ov = &FIG::min($right1,$right2) - $left2 + 1; }

    return $ov;
}

sub type_of
{
    my ($id) = @_;
    
    if ($id =~ m/^fig\|\d+\.\d+\.([^\.]+)\.\d+$/) { return $1; }
    return undef;
}

sub is_rna
{
    my ($x, $i) = @_;

    return ($x->[$i]->[FILE] == RNA);
}

sub same_strand
{
    my ($x, $i, $j) = @_;

    return ($x->[$i]->[STRAND] eq $x->[$j]->[STRAND]);
}

sub opposite_strand
{
    my ($x, $i, $j) = @_;

    return ($x->[$i]->[STRAND] ne $x->[$j]->[STRAND]);
}

sub bad_rna_overlap
{
    my ($x, $i, $j) = @_;

    #...For now, treat all RNA overlaps as "non-removable"
    if (&is_rna($x, $i))
    {
	return &overlaps($x->[$i]->[START], $x->[$i]->[STOP],  $x->[$j]->[START], $x->[$j]->[STOP]);
    }
    else
    {
	return 0;
    }
}

sub embedded
{
    my ($x, $i, $j) = @_;

    my $left_i  = &FIG::min($x->[$i]->[START], $x->[$i]->[STOP]);
    my $right_i = &FIG::max($x->[$i]->[START], $x->[$i]->[STOP]);

    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];

    if (  ($left_i < $beg_j)   && ($left_i < $end_j)
       && ($beg_j  < $right_i) && ($end_j  < $right_i)
       )
    {
	return &overlaps($left_i, $right_i, $beg_j, $end_j);
    }

    return 0;
}

sub convergent
{
    my ($x, $i, $j) = @_;

    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];

    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];

    if (   &FIG::between($beg_i, $end_j, $end_i)
       &&  &FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_i, $beg_j, $end_i)
       )
    {
	return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }

    return 0;
}

sub divergent
{
    my ($x, $i, $j) = @_;

    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];

    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];

    if (   &FIG::between($beg_i, $beg_j, $end_i)
       &&  &FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_i, $end_j, $end_i)
       )
    {
	return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }

    return 0;
}

sub normal_overlap
{
    my ($x, $i, $j) = @_;

    my $beg_i    = $x->[$i]->[START];
    my $end_i    = $x->[$i]->[STOP];
    my $strand_i = $x->[$i]->[STRAND];

    my $beg_j    = $x->[$j]->[START];
    my $end_j    = $x->[$j]->[STOP];
    my $strand_j = $x->[$j]->[STRAND];

#   print STDERR "$beg_i, $end_i, $strand_i\t$beg_j, $end_j, $strand_j\n";

    if    (($strand_i eq '+') && ($strand_j eq '+'))
    {
	if (($beg_i < $beg_j) && ($beg_j <= $end_i) && ($end_i < $end_j))
	{
	    return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
	}
    }
    elsif (($strand_i eq '-') && ($strand_j eq '-'))
    {
	if (($end_i < $end_j) && ($end_j <= $beg_i) && ($beg_i < $beg_j))
	{
	    return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
	}
    }
    else
    {
	&impossible_overlap_warning($x, $i, $j, "Opposite strands in a normal_overlap");
	return 0;
    }
}


sub same_stop
{
    my ($x, $i, $j) = @_;

    my $end_i   = $x->[$i]->[STOP];
    my $end_j   = $x->[$j]->[STOP];

    if (&same_strand($x, $i, $j) && ($end_i == $end_j))
    {
	return 1;
    } else {
	 return 0;
    }
}


sub impossible_overlap_warning
{
    my ($x, $i, $j, $msg) = @_;

    if (defined($msg))
    {
	print STDERR "Impossible overlap in $msg:\n$i:\t", join("\t", @ { $x->[$i] })
	                                      , "\n$j:\t", join("\t", @ { $x->[$j] }), "\n\n";
	confess "$msg";
    }
    else
    {
	print STDERR "Impossible overlap:\n$i:\t", join("\t", @ { $x->[$i] })
	                              , "\n$j:\t", join("\t", @ { $x->[$j] }), "\n\n";
	confess "aborted";
    }

    return 0;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3