[Bio] / FigKernelScripts / make_probes_to_genes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_probes_to_genes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (download) (as text) (annotate)
Fri Mar 25 21:08:46 2011 UTC (9 years ago) by dejongh
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_release_3_0_4, mgrast_release_3_0_3, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_10262011, HEAD
Changes since 1.7: +1 -1 lines
check for same contig when matching probe to peg

use strict;
use SeedEnv;
use gjoseqlib;
use Data::Dumper;
# Fig was only used for testing deleted pegs; we now require
# the calling code to elide removed deleted genomes from any tbl files provided.
#use FIG;
#my $fig = new FIG;

my $usage = "usage: make_probes_to_genes Probes Contigs Tbl peg.probe.table probe.occ.table [more tbl files]";
my($probesF,$contigsF,$tblF,$peg_probeF,$probe_occF);

(
 ($probesF    = shift @ARGV) &&
 ($contigsF   = shift @ARGV) &&
 ($tblF       = shift @ARGV) &&
 ($peg_probeF = shift @ARGV) &&
 ($probe_occF = shift @ARGV)
)
    || die $usage;
my @other_tblF = @ARGV;

my @contigs = map { $_->[2] = lc $_->[2]; $_ } &gjoseqlib::read_fasta($contigsF);

my @probes  = ();
my %probelens;

my $fh;
open($fh, "<", $probesF) or die "Cannot read $probesF: $!";

while (<$fh>)
{
    if ($_ =~ /^(\S+)\t(\S+)/)
    {
	my $probe_id = $1;
	my $seq      = lc $2;
	my $seqR     = lc &SeedUtils::rev_comp($seq);
	push(@probes,[$probe_id,$seq,$seqR]);
	$probelens{length($seq)}++;
    }
}
close($fh);
my %contig_index;
my $probe_size;

#
# If we have a single probe length, create an index of the contigs
# and use for later lookups.
#
if (keys %probelens == 1)
{
    $probe_size = (keys %probelens)[0];
    for my $contig (@contigs)
    {
	my($id, undef, $seq) = @$contig;
	for (my $i = 0; $i < length($seq) - $probe_size + 1; $i++)
	{
	    my $s = substr($seq, $i, $probe_size);
	    push(@{$contig_index{$s}}, [$id, $i]);
	}
    }
}
#print STDERR "got probes\n";

#my @pegs = map { ($_ =~ /^(\S+)\t(\S+)_(\d+)_(\d+)\s/) ? [$1,[$2,$3,$4]] : () } `cat $tblF`;
my @pegs;
for my $tbl_file ($tblF, @other_tblF)
{
    open($fh, "<", $tbl_file) or die "Cannot open $tbl_file: $!";
    while (<$fh>)
    {
	if ($_ =~ /^(fig\|\d+\.\d+\.(peg|rna)\.\d+)\t(\S+)/)
	{
	    my $peg = $1;
	    my($c1,$b1,$e1) = &boundaries($3);

	    # It is responsibility of calling code to provide tbl files with
	    # deleted genomes removed.
# 	    if (! $fig->is_deleted_fid($peg))
# 	    {
# 		push(@pegs,[$peg,[$c1,$b1,$e1]]);
# 	    }
	    push(@pegs,[$peg,[$c1,$b1,$e1]]);
	}
    }
    close($fh);
}

my %multiple;
my @all_hits = ();
foreach my $probe (@probes)
{
    my($probe_id,$seq,$seqR) = @$probe;
    my @matches = &matches_in_contigs($seq,\@contigs,'+', \%contig_index);
    push(@matches,&matches_in_contigs($seqR,\@contigs,'-', \%contig_index));
    if (@matches == 0)
    {
	print STDERR "$probe_id does not occur\n";
    }
    elsif (@matches > 1)
    {
	$multiple{$probe_id} = scalar @matches;
	print STDERR "$probe_id occurs ",scalar @matches," times: ",join(",",map {join(":",@$_) } @matches),"\n";
    }
    push(@all_hits,map { [$probe_id,$_] } @matches);
}
@all_hits = sort { ($a->[1]->[0] cmp $b->[1]->[0]) or ($a->[1]->[1] <=> $b->[1]->[1]) } @all_hits;
@pegs     = sort { ($a->[1]->[0] cmp $b->[1]->[0]) or 
		       (&min($a->[1]->[1],$a->[1]->[2]) <=> &min($b->[1]->[1],$b->[1]->[2])) }
	    @pegs;
# print STDERR &Dumper(\@all_hits,\@pegs);

my @corr = ();
foreach my $hit_tuple (@all_hits)
{
    my($probe_id,$hit)            = @$hit_tuple;
    my($contigH,$bH,$eH,$strandH) = @$hit;
    foreach my $peg_tuple (@pegs)
    {
	my($peg,$loc)        = @$peg_tuple;
	my($contigP,$bP,$eP) = @$loc;
	if ($contigH eq $contigP && &SeedUtils::between($bP,$bH,$eP))
	{
	    my $n;
	    if ((($bP < $eP) && (($bH - $bP) > 3)) ||
		(($bP > $eP) && (($bH - $eP) > 3)))
	    {
		$n = $multiple{$probe_id} ? $multiple{$probe_id} : 0;
	    }
	    my $strandP;
	    if ($bP > $eP)
	    {
		($bP,$eP) = ($eP,$bP);
		$strandP = '-';
	    }
	    else
	    {
		$strandP = '+';
	    }
	    push(@corr,[$peg,$probe_id,$n,$strandP,$strandH,$contigH,$bH,$eH]);
	}
    }
}

open(P2P,">$peg_probeF") || die "could not open $peg_probeF";
open(OCC,">$probe_occF") || die "could not open $probe_occF";
my %seen;
my %probes_in_peg;
foreach my $x (sort { &SeedUtils::by_fig_id($a->[0],$b->[0]) or ($a->[1] cmp $b->[1]) } @corr)
{
    my($peg,$probe,$n,$strandH,$strandP,$contigH,$bH,$eH) = @$x;
    if ($strandH eq $strandP)
    {
	print P2P "$peg\t$probe\n";
	if (! $seen{$probe})
	{
	    $seen{$probe} = 1;
	    print OCC "$probe\t$n\t$contigH,$bH,$eH\n";
	}
	$probes_in_peg{$x->[0]} = 1;
    }
}

foreach my $peg (map { $_->[0] } @pegs)
{
    if (! $probes_in_peg{$peg})
    {
	print STDERR "$peg is not matched by probes\n";
    }
}

sub matches_in_contigs {
    my($seq,$contigs,$strand, $idx) = @_;
    my @hits;

    if (%$idx)
    {
	my $hits = $idx->{$seq};
	if ($hits)
	{
	    for my $hit (@$hits)
	    {
		my($ctg, $loc) = @$hit;
		push(@hits, [$ctg, $loc + 1, $loc + length($seq), $strand]);
	    }
	}
	return @hits;
    }
    foreach my $contig (@$contigs)
    {
	my($idC,undef,$seqC) = @$contig;
	my $off = 0;
	while (($_ = index($seqC,$seq,$off)) >= 0)
	{
	    push(@hits,[$idC,$_+1,$_+length($seq),$strand]);
	    $off = $_ + 1;
	}
    }
    return @hits;
}

sub boundaries {
    if (!UNIVERSAL::isa($_[0],__PACKAGE__)) {
        my ($package, $filename, $line) = caller;
#       warn "Deprecated boundaries_of called from $filename line $line.";
    } else {
        shift;
    }
    my($location) = (@_ == 1) ? $_[0] : $_[1];
    my($contigQ);

    if (defined($location))
    {
        my @exons = split(/\s*,\s*/,$location);
        my($contig, $beg, $end);

        if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/))
        {   
            ($contig, $beg) = ($1,$2);
            $contigQ = quotemeta $contig;

            if ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/)
            {
                $end = $1;
		if ($beg > 0 && $end > 0)
		{
		    my $strand = (($beg < $end) ? qq(+) : qq(-));
		    return ($contig, $beg, $end, $strand);
		}
            }
        }
        Cluck("Could not parse loc=$location.") if T(0);
    }
    return ();
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3