[Bio] / FigKernelScripts / map_pegs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/map_pegs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Fri Jul 14 22:21:16 2006 UTC (13 years, 4 months ago) by parrello
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +3 -1 lines
Fixed obsolete form of boundaries_of.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use FIG;

my(@orgs,$org);
my($usage,$old_contigs,$old_tbl,$new_contigs,$new_tbl,$max);
my($nxt,$id,$pre,$rest,$old,$version,$mapF);

$usage = "usage: map_pegs OldContigs OldTbl NewContigs NewTbl Map > adjusted.new.tbl";

(($old_contigs = shift @ARGV) &&
 ($old_tbl     = shift @ARGV) &&
 ($new_contigs = shift @ARGV) &&
 ($new_tbl     = shift @ARGV) &&
 ($mapF         = shift @ARGV) && open(MAP,">$mapF")
)
    || die $usage;

my $fig = FIG->new();

foreach $id (map { $_ =~ /^(\S+)/; $1 ? $1 : () } `cat $old_tbl`)
{
    $id =~ /peg\.(\d+)$/;
    if ((! $max) || ($1 > $max)) { $max = $1 };
}
$nxt = $max+1;

# print STDERR "nxt=$nxt\n";

my %remap = map { $_->[0] => $_->[1] } &cluster_genes($old_contigs,$old_tbl,$new_contigs,$new_tbl);
# print STDERR &Dumper(\%remap);
foreach $_ (keys(%remap))
{
    print MAP "$_\t$remap{$_}\n";
}

foreach $_ (`cat $new_tbl`)
{
    if ($_ =~ /^((fig\|\d+\.\d+\.peg\.)\d+)\t(.*)$/)
    {
	$id   = $1;
	$pre  = $2;
	$rest = $3;
	if ($old = $remap{$id})
	{
	    print "$old\t$rest\n";
	}
	else
	{
	    print "$pre$nxt\t$rest\n";
	    print MAP "$id\t$pre$nxt\n";
	    $nxt++;
	}
    }
}

sub cluster_genes {
    my($old_contigs,$old_tbl,$new_contigs,$new_tbl) = @_;
    my($contigs,$genes,$gene,$dna,$max,$kmer,$i,$line);
    my($curr,@set,$j,$pair,$g1,$g2,@possibly_correspond,%possible);

    my $tmp_kmers = "tmp.kmers.$$";
    my $clusters = [];
    my @genes = ();
    
    open(KMERS,"| sort +0 -1 +1n -2 > $tmp_kmers") || die "aborted";
    foreach $version (([$new_contigs,$new_tbl,"new"],[$old_contigs,$old_tbl,"old"]))
    {
	my($contigsF,$tblF,$which) = @$version;
#	print STDERR "loading $which\n";

	if (($contigs = &get_contigs($contigsF)) &&
	    ($genes   = &get_genes($contigs,$tblF,$which)))
	{
#	    $_ = @$genes; print STDERR "$_ genes loaded\n";

	    foreach $gene (@$genes)     # $gene is [Id,Contig,Beg,End,DNA,which]
	    {
		$dna = $gene->[4];
		push(@genes,$gene);
		$max = length($dna) - 20;
		for ($i=0; ($i <= $max); $i++)
		{
		    $kmer = uc substr($dna,$i,20);
		    $kmer =~ tr/U/T/;
		    if ($kmer !~ /[^ACGT]/)
		    {
			print KMERS "$kmer\t$#genes\n";
		    }
		}
	    }
	}
#	print STDERR "finishished loading $which\n";
    }
    close(KMERS);

#    print STDERR "processing kmers\n";
    open(KMERS,"<$tmp_kmers") 
	|| die "could not open $tmp_kmers";

    $line = <KMERS>;
    while ($line && ($line =~ /^(\S+)\t(\d+)$/))
    {
	$curr = $1;
	@set = ();
	while ($line && ($line =~ /^$curr\t(\d+)/))
	{
	    push(@set,$1);
	    $line = <KMERS>;
	}

	for ($i=0; ($i < $#set); $i++)
	{
	    for ($j=$i+1; ($j < @set); $j++)
	    {
		if ($genes[$set[$i]]->[5] ne $genes[$set[$j]]->[5])
		{
		    $possible{"$set[$i],$set[$j]"}++;
		}
	    }
	}
    }
    close(KMERS);
    unlink($tmp_kmers);
#    print STDERR "built kmer matches\n";

    foreach $pair (keys(%possible))
    {
	if ($possible{$pair} > 4)
	{
	    ($g1,$g2) = split(/,/,$pair);
#	    print STDERR "possibly related: $genes[$g1]->[0] $genes[$g2]->[0]\n";
	    if (&possibly_correspond($genes[$g1]->[4],$genes[$g2]->[4]))
	    {
		push(@possibly_correspond,[$g1,$g2]);
	    }
	}
    }
    return map { [$genes[$_->[0]]->[0],$genes[$_->[1]]->[0]] } &compute_correspondence(\@possibly_correspond,\@genes);
}

sub compute_correspondence {
    my($possible,$genes) = @_;
    my($clearly,$possibly,$pair);

    ($clearly,$possibly) = &map_unique($possible);
#    print STDERR &Dumper(["possibly",$possibly]);

#    print STDERR "CLEARLY\n";
    foreach $pair (@$clearly)
    {
	my($g1,$g2) = @$pair;
#	print STDERR &Dumper($genes->[$g1],$genes->[$g2]);
    }
#    print STDERR "============\n";

    foreach $pair (@$possibly)
    {
#	print STDERR "checking for possibility: ",join(" ",@$pair),"\n";
	if (&using_bounds($pair,$possibly,$clearly,$genes))
	{
#	    print STDERR "adding possible pair to clearly ",join(" ",@$pair),"\n";
	    push(@$clearly,$pair);
	}
	else
	{
#	    print STDERR "failed: ",&Dumper($genes->[$pair->[0]],$genes->[$pair->[1]]),"\n";
	}
    }
    return @$clearly;
}

sub using_bounds {
    my($pair,$possibly,$clearly,$genes) = @_;
    my($g1,$g2,$g1A,$g2A,$pairA,$lg1,$rg1,$lg2,$rg2);

#    print STDERR "USING BOUNDS\n";
    ($g1,$g2) = @$pair;
#    print &Dumper($genes->[$g1],$genes->[$g2]);
    foreach $pairA (@$clearly)
    {
	($g1A,$g2A) = @$pairA;
	if (&better_left($genes,$g1,$lg1,$g1A) && &better_left($genes,$g2,$lg2,$g2A))
	{
	    $lg1 = $g1A;
	    $lg2 = $g2A;
	}

	if (&better_right($genes,$g1,$rg1,$g1A) && &better_right($genes,$g2,$rg2,$g2A))
	{
	    $rg1 = $g1A;
	    $rg2 = $g2A;
	}
    }
#    print STDERR &Dumper($lg1,$g1,$rg1);
    if (defined($lg1) && defined($g1) && defined($rg1))
    {
#	print STDERR &Dumper($genes->[$lg1],$genes->[$g1],$genes->[$rg1]);
    }

#    print STDERR &Dumper($lg2,$g2,$rg2);
    if (defined($lg2) && defined($g2) && defined($rg2))
    {
#        print STDERR &Dumper($genes->[$lg2],$genes->[$g2],$genes->[$rg2]);
    }

    if (defined($lg1) && defined($rg1) && defined($lg2) && defined($rg2))
    {
	foreach $pairA (@$possibly)
	{
	    ($g1A,$g2A) = @$pairA;
	    if (($g1A eq $g1) && ($g2A ne $g2) && &in_bounds($genes,$g2A,$lg2,$rg2)) { return 0 }
	    if (($g2A eq $g2) && ($g1A ne $g1) && &in_bounds($genes,$g1A,$lg1,$rg1)) { return 0 }
	}
#	print STDERR "OK: $g1 $g2\n";
	return 1;
    }
    else
    {
#	print STDERR "$g1 $g2 FAILED\n";
	return 0;
    }
}

sub in_bounds {
    my($genes,$g,$lg,$rg) = @_;

    return (($genes->[$g]->[1] eq $genes->[$lg]->[1]) && 
	    &left($genes->[$lg]->[2],$genes->[$lg]->[3],$genes->[$g]->[2],$genes->[$g]->[3]) && 
	    &left($genes->[$g]->[2],$genes->[$g]->[3],$genes->[$rg]->[2],$genes->[$rg]->[3]));
}

sub better_left {
    my($genes,$g,$lg,$ga) = @_;
    my($rc);

    if (($genes->[$g]->[1] ne $genes->[$ga]->[1]) || 
	(! &left($genes->[$ga]->[2],$genes->[$ga]->[3],$genes->[$g]->[2],$genes->[$g]->[3]))) 
    { 
	$rc = 0;
    }
    elsif ((! $lg) || &left($genes->[$lg]->[2],$genes->[$lg]->[3],$genes->[$ga]->[2],$genes->[$ga]->[3])) 
    { 
	$rc = 1;
    }
    else
    {
	$rc = 0;
    }
#    print STDERR "better left: $g $ga = $rc\n";
    return $rc;
}

sub left {
    my($b1,$e1,$b2,$e2) = @_;
    my($min,$max);

    return (&FIG::max($b1,$e1) < &FIG::min($b2,$e2));
}

sub better_right {
    my($genes,$g,$rg,$ga) = @_;

    if (($genes->[$g]->[1] ne $genes->[$ga]->[1]) || 
	(! &left($genes->[$g]->[2],$genes->[$g]->[3],$genes->[$ga]->[2],$genes->[$ga]->[3]))) { return 0 }
    if ((! $rg) || &left($genes->[$ga]->[2],$genes->[$ga]->[3],$genes->[$rg]->[2],$genes->[$rg]->[3])) { return 1 }
    return 0;
}

sub map_unique {
    my($possible) = @_;
    my($pair,%hits1,%hits2,$g1,$g2);
    my $clearly = [];
    my $possibly = [];

    foreach $pair (@$possible)
    {
	($g1,$g2) = @$pair;
	$hits1{$g1}++;
	$hits2{$g2}++;
    }

    foreach $pair (@$possible)
    {
	($g1,$g2) = @$pair;
	if (($hits1{$g1} == 1) && ($hits2{$g2} == 1))
	{
	    push(@$clearly,$pair);
	}
	else
	{
	    push(@$possibly,$pair);
	}
    }
    return ($clearly,$possibly);
}

sub possibly_correspond {
    my($dna1,$dna2) = @_;
    my(@ambig,$best,$where1,$where2,$i1,$i2,$i,$max,$chunk);

    if (($dna1 =~ /[^ACGT]{4}/) || ($dna2 =~ /[^ACGT]{4}/)) { return 0 }

    if (length($dna1) > length($dna2))
    {
	($dna1,$dna2) = ($dna2,$dna1);
    }

    @ambig = (0);
    while ($dna1 =~ /[MRWSYKBDHVN]/g)
    {
	push(@ambig,pos($dna1));
    }
    push(@ambig,length($dna1));

    $best = 0;
    for ($i=0; ($i < $#ambig); $i++)
    {
	if (($ambig[$i+1] - $ambig[$i]) > $best)
	{
	    $best   = $ambig[$i+1] - $ambig[$i];
	    $where1 = $ambig[$i];
	}
    }
    $chunk = substr($dna1,$where1,$best);
    $chunk =~ s/[MRWSYKBDHVN]/./g;
    
    if ($dna2 =~ /$chunk/g)
    {
	$where2 = pos($dna2) - length($chunk);
	$max = length($dna1);
	if ($where2 >= $where1)
	{
	    for ($i1=0, $i2=$where2-$where1; ($i1 < $max) && &match(substr($dna1,$i1,1),substr($dna2,$i2,1)); $i1++, $i2++) {}
	    return ($i1 == $max);
	}
    }
    return 0;
}

sub match {
    my($c1,$c2) = @_;

    if ($c1 eq $c2) { return 1 }
    if ($c1 !~ /^[ACGT]/) { ($c1,$c2) = ($c2,$c1) }
    if ($c1 !~ /^[ACGT]/) { return 0 }
    if    ($c1 eq "A") { return ($c2 =~ /[MRWDHVN]/) }
    elsif ($c1 eq "C") { return ($c2 =~ /[MSYBHVN]/) }
    elsif ($c1 eq "G") { return ($c2 =~ /[RSKBDVN]/) }
    elsif ($c1 eq "T") { return ($c2 =~ /[WYKBDHN]/) }
    return 0;
}

sub get_contigs {
    my($file) = @_;
    my($contigs,$id,$seq);

    open(CONTIGS,"<$file") || die "could not open $file";
    print STDERR "loading contigs from $file\n";

    $contigs = {};
    $/ = "\n>";
    while (defined($_ = <CONTIGS>))
    {
	chomp;
	if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	{
	    $id  =  $1;
	    $seq =  $2;
	    $seq =~ s/\s//gs;
	    $contigs->{$id} = $seq;
	}
    }
    $/ = "\n";
    close(CONTIGS);
    return $contigs;
}

sub get_genes {
    my($contigs,$tblF,$which) = @_;
    my($contig,$loc,$dna);
    my($beg,$end,$line,$seq);

    open(TBL,"<$tblF") || die "could not open $tblF";
    my $genes = [];
    while (defined($line = <TBL>))
    {
	if ($line =~ /^(\S+)\t(\S+)/)
	{
	    $id  = $1;
	    $loc = $2;
	    ($contig,$beg,$end) = $fig->boundaries_of($loc);
	    $seq = $contigs->{$contig};
	    if ($beg < $end)
	    {
		if (($beg > 0) && ($end <= length($seq)))
		{
		    $dna = uc substr($seq,$beg-1,($end+1)-$beg);
		    push(@$genes,[$id,$contig,$beg,$end,$dna,$which]);
		}
	    }
	    else
	    {
		if (($end > 0) && ($beg <= length($seq)))
		{
		    $dna = uc &reverse_complement(substr($seq,$end-1,($beg+1)-$end));
		    push(@$genes,[$id,$contig,$beg,$end,$dna,$which]);
		}
	    }
	}
    }
    return $genes;
}

sub reverse_complement {
    my($seq) = (@_ == 1) ? $_[0] : $_[1];
    my($rev);

    $rev =  reverse($seq);
    $rev =~ tr/a-z/A-Z/;
    $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
    return $rev;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3