[Bio] / FigKernelScripts / generate_intergenic_blocks.pl Repository:
ViewVC logotype

View of /FigKernelScripts/generate_intergenic_blocks.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (14 years, 6 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use FIG;
my $fig = new FIG;

my $usage = "usage: generate_intergenic_blocks Dir";

(
 ($dir         = shift @ARGV)
)
    || die $usage;

if (! -d $dir)    { die "$dir must exist with the gene blocks in it" }

open(REG,"<$dir/region.tbl") || die "could not open $dir/region.tbl";
while (defined($_ = <REG>))
{
    ($peg,undef,$contig,$beg,$end,$blockid) = split(/\t/,$_);
    $in{$peg} = [$blockid,$contig,$beg,$end];
    push(@{$contains{$blockid}},$peg);
}
close(REG);

open(BLOCK,"<$dir/block.tbl") || die "could not open $dir/block.tbl";
$max = 0;
while (defined($_ = <BLOCK>))
{
    chop;
    ($blockid,$gene) = split(/\t/,$_);
    $gene_of{$blockid} = $gene;
    $max = &FIG::max($max,$blockid);
}
close(BLOCK);
$blockid = $max+1;

open(BLOCKS,">$dir/intergenic_block.tbl")   || die "could not open intergenic_block.tbl";
open(REGIONS,">$dir/intergenic_region.tbl") || die "could not open intergenic_region.tbl";


open(NEIGH,">neigh") || die "aborted";
foreach $peg (keys(%in))
{
    ($peg1,$peg2) = &conn($peg);

    $n = $in{$peg}->[0];
    if ($peg1 && ($n1 = $in{$peg1}))  { $precedes{$n}->{$n1->[0]}++ }
    if ($peg2 && ($n1 = $in{$peg2}))  { $follows{$n}->{$n1->[0]}++ }
    $peg1 = $peg1 ? $peg1 : "";
    $peg2 = $peg2 ? $peg2 : "";
    print NEIGH "$peg\t$peg1\t$peg2\n";
}
#close(NEIGH);

####################
#open(NEIGH,"<neigh") || die "aborted";
#while (defined($_ = <NEIGH>))
#{
#    if ($_ =~ /^(\S+)\t(\S+)\t(\S+)/)
#    {
#	($peg,$peg1,$peg2) = ($1,$2,$3);
#	
#       $n = $in{$peg}->[0];
#       if ($peg1 && ($n1 = $in{$peg1}))  { $precedes{$n}->{$n1->[0]}++ }
#       if ($peg2 && ($n1 = $in{$peg2}))  { $follows{$n}->{$n1->[0]}++ }
#    }
#}
#close(NEIGH);
####################

foreach $n (keys(%follows))
{
    $x = $follows{$n};
    @poss = sort { $x->{$b} <=> $x->{$a} } keys(%$x);
    if ((@poss == 1) || ($x->{$poss[0]} > $x->{$poss[1]}) && ($x->{$poss[0]} > 2))
    {
	$key = join("\t",sort ($n,$poss[0]));
	$pair{$key}++;
    }
}

foreach $n (keys(%precedes))
{
    $x = $precedes{$n};
    @poss = sort { $x->{$b} <=> $x->{$a} } keys(%$x);
    if ((@poss == 1) || ($x->{$poss[0]} > $x->{$poss[1]}) && ($x->{$poss[0]} > 2))
    {
	$key = join("\t",sort ($n,$poss[0]));
	$pair{$key}++;
    }
}

foreach $tuple (keys(%pair))
{
    ($n1,$n2) = split(/\t/,$tuple);
    $pegs1 = $contains{$n1};
    $pegs2 = $contains{$n2};
    @poss = ();

    foreach $peg1 (@$pegs1)
    {
	$genome1 = &FIG::genome_of($peg1);
	(undef,$contig1,$beg1,$end1) = @{$in{$peg1}};
	foreach $peg2 (@$pegs2)
	{
	    $genome2 = &FIG::genome_of($peg2);
	    if ($genome1 eq $genome2)
	    {
		(undef,$contig2,$beg2,$end2) = @{$in{$peg2}};
		if ($contig1 eq $contig2)
		{
#		    warn "$peg1 $peg2 $contig1 $beg1 $end1 $contig2 $beg2 $end2\n";
		    if (($beg1 < $beg2) && ($end1 < $beg2) && ($beg1 < $end2) && ($end1 < $end2))
		    {
			$start = &FIG::max($beg1,$end1) + 1;
			$end   = &FIG::min($beg2,$end2) - 1;
			if ($start < $end)
			{
			    push(@poss,[$peg1,$peg2,$contig1,$start,$end,
					$fig->dna_seq($genome1,join("_",($contig1,$start,$end)))]);
			}
		    }
		    elsif (($beg1 > $beg2) && ($end1 > $beg2) && ($beg1 > $end2) && ($end1 > $end2))
		    {
			$start = &FIG::min($beg1,$end1) - 1;
			$end   = &FIG::max($beg2,$end2) + 1;
			if ($start > $end)
			{
			    push(@poss,[$peg1,$peg2,$contig1,$start,$end,
					$fig->dna_seq($genome1,join("_",($contig1,$start,$end)))]);
			}
		    }
#		    print &Dumper(\@poss);
		}
	    }
	}
    }

    @poss = sort { length($a->[5]) <=> length($b->[5]) } @poss;
    if ((@poss > 1) &&
	(((length($poss[$#poss]->[5]) - length($poss[$#poss]->[0])) / length($poss[$#poss]->[5])) < 0.1))
    {
	my @reordered = map { [&pegN($_->[0]) . ":" . &pegN($_->[1]),
			       &FIG::genome_of($_->[0]),$_->[5],join("_",($_->[2],$_->[3],$_->[4]))] } @poss;
	my @aligned = &align_DNA_for_a_set_of_seqs($fig,\@reordered);
	$name = $gene_of{$n1} . "-" . $gene_of{$n2};
	print BLOCKS "$blockid\t$name\n";

	foreach $tuple (@aligned)
	{
	    &write_tuple($blockid,\*REGIONS,$tuple);
	}
	$blockid++;
    }
}   

sub pegN {
    my($peg) = @_;

    if ($peg =~ /(\d+)$/) { return $1 }
    return undef;
}

sub write_tuple {
    my($blockid,$fh,$tuple) = @_;

    my($pegs,$genome,$ali_seq,$loc) = @$tuple;
    if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
    {
	print $fh join("\t",($pegs,$genome,$1,$2,$3,$blockid,$ali_seq)),"\n";
    }
}


sub align_DNA_for_a_set_of_seqs {
    my($fig,$tuples) = @_;
    my($tuple,$id,$dseq,$hdr,$aseq,%aligned);

    my $tmpS = "/tmp/seq$$";

    open(SEQS,">$tmpS.fasta") || die "could not open $tmpS";
    foreach $tuple (@$tuples)
    {
	(undef,$id,$dseq) = @$tuple;
	print SEQS ">$id\n$dseq\n";
    }
    close(SEQS);
    system "$FIG_Config::ext_bin/clustalw -infile=$tmpS.fasta -align -outorder=aligned > /dev/null";
    my @ali = `$FIG_Config::bin/clustal_to_fasta < $tmpS.aln`;
    while (($hdr = shift @ali) && ($aseq = shift @ali))
    {
	if ($hdr =~ /^>(\S+)/)
	{
	    chop $aseq;
	    $aligned{$1} = $aseq;
	}
    }
    return map { my($pegs,$genome,undef,@rest) = @$_; $aligned{$genome} ? [$pegs,$genome,$aligned{$genome},@rest] : () } @$tuples;
}

sub conn {
    my($peg) = @_;
    my($beg,$end,$i,@with_pos,@close_genes);

    (undef,$beg,$end) = $fig->boundaries_of($fig->feature_location($peg));
    @close_genes = $fig->close_genes($peg,4000);
    @with_pos = sort { $a->[0] <=> $b->[0] }
                map  { 
		          my $peg1 = $_;
		          my(undef,$beg,$end) = $fig->boundaries_of($fig->feature_location($peg1));
		          [int($beg+$end)/2,$peg1]
		     }
                     @close_genes;
    for ($i=0; ($i < @with_pos) && ($with_pos[$i]->[1] ne $peg); $i++) {}

    my($after,$before);
    if    (($i < $#with_pos) && ($beg < $end)) { $after  = $with_pos[$i+1]->[1] }
    if    (($i < $#with_pos) && ($beg > $end)) { $before = $with_pos[$i+1]->[1] }
    if    (($i > 0) && ($beg < $end))          { $before = $with_pos[$i-1]->[1] }
    if    (($i > 0) && ($beg > $end))          { $after  = $with_pos[$i-1]->[1] }
    return ($before,$after);
}

sub before {
    my($n,$strand) = @_;
    my($n1,$strand1);

    if ($strand eq "+")
    {
	$n1 = $really_precedes{$n};
	$strand1 = ($really_follows{$n1} == $n) ? "+" : "-";
    }
    else
    {
	$n1 = $really_follows{$n};
	$strand1 = ($really_precedes{$n1} == $n) ? "-" : "+";
    }
    return ($n1,$strand1);
}

sub after {
    my($n,$strand) = @_;
    my($n1,$strand1);

    if ($strand eq "+")
    {
	$n1 = $really_follows{$n};
	$strand1 = ($n1 && ($really_precedes{$n1} == $n)) ? "+" : "-";
    }
    else
    {
	$n1 = $really_precedes{$n};
	$strand1 = ($n1 && ($really_follows{$n1} == $n)) ? "-" : "+";
    }
    return ($n1,$strand1);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3