[Bio] / FigKernelScripts / build_initial_objects_for_start_predictions.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_initial_objects_for_start_predictions.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Sat Nov 5 22:59:44 2005 UTC (14 years, 1 month ago) by overbeek
Branch: MAIN
CVS Tags: caBIG-00-00-00
Changes since 1.7: +236 -1 lines
fixes to extract_genomes

# -*- perl -*-

use FIG;
my $fig = new FIG;

$usage = "usage: build_initial_objects_for_start_predictions [tbl] < pegs_to_call > initial_objects";

use constant FID    =>  0;
use constant LOCUS  =>  1;
use constant CONTIG =>  2;
use constant START  =>  3;
use constant STOP   =>  4;
use constant LEFT   =>  5;
use constant RIGHT  =>  6;
use constant LEN    =>  7;
use constant STRAND =>  8;


$tbl_file = "";
$trouble  = 0;
while (@ARGV)
{
    if ($ARGV[0] =~ m/-h(elp)?/)
    {
	die "\n\tusage: $usage\n\n";
    }
    elsif (-s $ARGV[0])
    {
	$tbl_file = $ARGV[0];
    }
    else
    {
	warn "Invalid argument $ARGV[0]\n";
    }
    shift @ARGV;
}
die "\nusage: $usage\n\n" if $trouble;

@pegs = <STDIN>;
chomp @pegs;
# print "NPEGS=", scalar(@pegs), "\n";

if ($tbl_file)
{
    open(TBL, "<$tbl_file") || die "Could not read-open $tbl_file";
    while (defined($entry = <TBL>))
    {
	if ($entry =~ m/^(\S+)\t(\S+)/)
	{
	    ($fid, $loc) = ($1, $2);
	    ($contig, $beg, $end, $left, $right, $len, $strand) = &from_locus($loc);
	    $tbl->{$contig}->{$strand.$end}
	        = [ $fid, $loc, $contig, $beg, $end, $left, $right, $len, $strand ];
	}
	else
	{
	    print STDERR "Skipping invalid tbl entry: $entry";
	}
    }
}

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...If there are "new" PEGs, overwrite any existing tbl entries...
#-----------------------------------------------------------------------
foreach $peg (@pegs)
{
    if ($peg =~ /^(new\|(\d+\.\d+)\.peg\.\d+)\t(\S+)/)
    {
	$fid    = $1;
	$genome = $2;
	$loc    = $3;
	
	($contig, $beg, $end, $left, $right, $len, $strand) = &from_locus($loc);
	$tbl->{$contig}->{$strand.$end} 
	    = [ $fid, $loc, $contig, $beg, $end, $left, $right, $len, $strand ];
    }
}

foreach $contig (keys %$tbl)
{
    @$x = sort { $a->[LEFT] <=> $b->[LEFT] } values % { $tbl->{$contig} };
    
    for ($i=0; $i < @$x; ++$i)
    {
	unless (defined($overlaps{$x->[$i]->[FID]}))  { $overlaps{$x->[$i]->[FID]} = 0; }
	
	for ($j=$i+1; (($j < @$x) && ($overlap = &entries_overlap($x, $i, $j))); ++$j)
	{
	    next unless (&same_strand_overlap($x, $i, $j) || &divergent($x, $i, $j));
	    
	    if ($x->[$j]->[STRAND] eq '+')
	    {
		$overlaps{$x->[$j]->[FID]} = &FIG::max( $overlap, $overlaps{$x->[$j]->[FID]} );
	    }
	    
	    if ($x->[$i]->[STRAND] eq '-')
	    {
		$overlaps{$x->[$i]->[FID]} = &FIG::max( $overlap, $overlaps{$x->[$i]->[FID]} );
	    }
	}
    }
}



foreach $peg (@pegs)
{
    if ($peg =~ /^fig\|\d+\.\d+\.peg\.\d+$/)
    {
	$genome = $fig->genome_of($peg);
	$loc = $fig->feature_location($peg);
	# print "$len{$peg}\t$peg\t$loc\n";
	if ($loc =~ /,/)
	{
	    print STDERR "skipping $peg - do not handle complex locs\n";
	    next;
	}
    }
    elsif ($peg =~ /^(new\|(\d+\.\d+)\.peg\.\d+)\t(\S+)/)
    {
	$peg = $1;
	$genome = $2;
	$loc = $3;
    }

    ($genome && $loc) || confess $peg;
   
    my $lnC = $fig->contig_ln($genome,$contig);
    ($contig,$begin,$end) = $fig->boundaries_of($loc);
    if ($begin < $end)
    {
        $l = sprintf("%s_%d_%d",$contig,1,$begin-1);
	if ($end < ($lnC-3))
	{
	    $end += 3;
	}
    }
    else
    {
        $l = sprintf("%s_%d_%d",$contig,$fig->contig_ln($genome,$contig),$begin+1);
	if ($end > 3)
	{
	    $end -= 3;
	}
    }

    $loc = join("_",($contig,$begin,$end));
    $seq = uc $fig->dna_seq($genome,$loc);

    if (! $seq)
    {
	print STDERR &Dumper($peg,$loc,$seq); die "could not get sequence for $peg";
    }

#   The following hack handles uncertainty about whether SEED gives back the STOP or not
    if (substr($seq,-6,3) =~ /TAA|TAG|TGA/i)
    {
	$end = ($begin < $end) ? $end-3 : $end+3;
	$seq = substr($seq,0,length($seq) - 3);
    }
    elsif (substr($seq,-3,3) !~ /TAA|TAG|TGA/i)
    {
	warn "BAD STOP: $peg $genome $loc $seq\n";
    }

    $loc = join("_",($contig,$begin,$end));


    # BACK INTO THE STOP FROM START OF PEG
    $pre_peg = uc $fig->dna_seq($genome,$l);
    $found = 0;
    $len_pre_peg = length($pre_peg);

    for ($i=$len_pre_peg-3; $i > 0 && ! $found; $i-=3)
    {
        $stop =  substr($pre_peg,$i,3);
        if ($stop eq "TAG"  ||  $stop eq "TAA"  ||  $stop eq "TGA")
        {
            # print "FOUND $stop for $l\n";
            $orf = substr($pre_peg,$i+3) . $seq;
            if (($i-27) > 0)
            {
                $pre_orf = substr($pre_peg,$i-27,30);
		my $gs = $fig->genus_species($genome);
		my @aliases = ();
		if ($peg =~ /^fig/)
		{
		    @aliases = grep { $_ =~ /^([NXYZA]P_[0-9\.]+)|(uni\|)|(sp\|)/ } $fig->feature_aliases($peg);
		}

                print "ID=$peg\n";
	        print "GENOME=$gs\n";
	        if (@aliases > 0)
		{
		    print "ALIASES=",join(",",@aliases),"\n";
		}
                print "CONTIG=$contig\n";

                if ($begin < $end)
                {
                    print "BEGIN=", $begin-($len_pre_peg-$i-3), "\n";
                }
                else
                {
                    print "BEGIN=", $begin+($len_pre_peg-$i-3), "\n";
                }
                print "END=$end\n";
                print "SEQ=$orf\n";
                print "PREFIX=$pre_orf\n";
                print "OLD_START_POS=", $len_pre_peg-$i-2, "\n";
		print "OVERLAP=$overlaps{$peg}\n";
                print "///\n";
            	$found = 1;
            }
            else
            {
                print STDERR "skipping $peg - not enough prefix before orf\n";
            }
            last;
        }
    }
    if ( ! $found)
    {
        print STDERR "DID NOT FIND STOP BEFORE $peg  $contig $begin $end $l\n";
    }
}



sub from_locus
{
    my ($locus) = @_;
    my ($contig, $beg, $end) = $fig->boundaries_of($locus);
    my ($left, $right, $len, $strand);
	
    if ($contig && $beg && $end)
    {
	$left   = &FIG::min($beg, $end);
	$right  = &FIG::max($beg, $end);
	$len    = (1+abs($end-$beg));
	$strand = ($beg < $end) ? '+' : '-';
	
        return ($contig, $beg, $end, $left, $right, $len, $strand);
    }
    else
    {
        die "Invalid locus $locus";
    }

    return ();
}

sub divergent
{
    my ($x, $i, $j) = @_;

    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];

    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];

    if (   &FIG::between($beg_i, $beg_j, $end_i)
       &&  &FIG::between($beg_j, $beg_i, $end_j)
       && !&FIG::between($beg_j, $end_i, $end_j)
       && !&FIG::between($beg_i, $end_j, $end_i)
       )
    {
        return &overlaps($beg_i, $end_i, $beg_j, $end_j);
    }

    return 0;
}

sub same_strand_overlap
{
    my ($x, $i, $j) = @_;

    my $beg_i   = $x->[$i]->[START];
    my $end_i   = $x->[$i]->[STOP];

    my $beg_j   = $x->[$j]->[START];
    my $end_j   = $x->[$j]->[STOP];

    if    (  ($x->[$i]->[STRAND] eq '+') && ($x->[$j]->[STRAND] eq '+')
          && ($beg_i < $beg_j) && ($beg_j <= $end_i) && ($end_i < $end_j)
          )
    {
        return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
    }
    elsif (  ($x->[$i]->[STRAND] eq '-') && ($x->[$j]->[STRAND] eq '-')
          && ($end_i < $end_j) && ($end_j <= $beg_i) && ($beg_i < $beg_j)
          )
    {
        return &overlaps($beg_i, $end_i,  $beg_j, $end_j);
    }
    else
    {
        &impossible_overlap_warning($x, $i, $j, "Opposite strands in a normal_overlap");
        return 0;
    }
}

sub entries_overlap
{
    my ($x, $i, $j) = @_;
    
    return 0 if ($x->[$i]->[CONTIG] ne $x->[$j]->[CONTIG]);
    
    $ov = &overlaps( $x->[$i]->[START], $x->[$i]->[STOP]
		   , $x->[$j]->[START], $x->[$j]->[STOP]);
    
    return $ov;
}

sub overlaps
{
    my ($beg1, $end1, $beg2, $end2) = @_;
    
    my ($left1, $right1, $left2, $right2);
    
    $left1  = &FIG::min($beg1, $end1);
    $left2  = &FIG::min($beg2, $end2);
    
    $right1 = &FIG::max($beg1, $end1);
    $right2 = &FIG::max($beg2, $end2);
    
    if ($left1 > $left2)
    {
        ($left1, $left2)   = ($left2, $left1);
        ($right1, $right2) = ($right2, $right1);
    }
    
    $ov = 0;
    if ($right1 >= $left2) { $ov = &FIG::min($right1,$right2) - $left2 + 1; }
    
    return $ov;
}

sub impossible_overlap_warning
{
    my ($x, $i, $j, $msg) = @_;
    
    return 0 if $deleted{"$x->[$i]->[FILE]\t$x->[$i]->[LOCUS]"};
    return 0 if $deleted{"$x->[$j]->[FILE]\t$x->[$j]->[LOCUS]"};
    
    if (defined($msg))
    {
	print STDERR ("Impossible overlap in $msg:\n"
		      , join("\t", @ { $x->[$i] }), "\n"
		      , join("\t", @ { $x->[$j] }), "\n\n")
	    if $ENV{VERBOSE};
	# confess "$msg";
    }
    else
    {
	print STDERR ("Impossible overlap:\n$i: "
		      , join("\t", @ { $x->[$i] }), "\n$j: "
		      , join("\t", @ { $x->[$j] }), "\n\n")
	    if $ENV{VERBOSE};
	# confess "aborted";
    }
    
    return 0;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3