[Bio] / FigKernelScripts / check_for_possible_mobile_elements.pl Repository:
ViewVC logotype

View of /FigKernelScripts/check_for_possible_mobile_elements.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sun Sep 16 21:51:21 2007 UTC (12 years, 5 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
add code to search for mobile elements

$usage = "usage: check_for_possible_mobile_elements [-p Pad] [-b Blast] [-s ShowConnections] [-i MinIdentity] [-d GenomeDir] Genome Dir";

my $blast = 1;
my $pad      = 20;
my $show_conn = 0;
my $min_iden = 95;
my $gdir = undef;

while ($ARGV[0] =~ /^-/)
{
    $_ = shift @ARGV;
    if ($_ =~ s/^-b//)    { $blast = (length($_) > 0)     ? $_ :  shift @ARGV }
    elsif ($_ =~ s/^-p//) { $pad = (length($_) > 0)       ? $_ :  shift @ARGV }
    elsif ($_ =~ s/^-s//) { $show_conn = (length($_) > 0) ? $_ :  shift @ARGV }
    elsif ($_ =~ s/^-g//) { $gdir = (length($_) > 0)      ? $_ :  shift @ARGV }
    elsif ($_ =~ s/^-i//) { $min_iden = (length($_) > 0)  ? $_ :  shift @ARGV }
    else                  { die "Bad Flag: $_" }
}

(
 ($genome  = shift @ARGV) && 
 ($tmp_dir = shift @ARGV)
)
    || die $usage;

use FIGV;
my $fig = new FIGV($gdir);

if (! $gdir) { $gdir = "$FIG_Config::organisms/$genome" }

foreach $fid ($fig->all_features($genome,"rna"))
{
    ($contig,$beg,$end) = $fig->boundaries_of($fig->feature_location($fid));
    if ($contig)
    {
	push(@{$rnas{$contig}},[&FIG::min($beg,$end),&FIG::max($beg,$end)]);
    }
}

&FIG::verify_dir($tmp_dir);
$tmp_ext_repeats = "$tmp_dir/extended.repeats";
$tmp_repeats = "$tmp_dir/repeats";
$tmp_hits    = "$tmp_dir/hits";
$tmp_summ    = "$tmp_dir/summary";

&FIG::verify_dir($tmp_dir);

if ($blast)
{
    &FIG::run("$FIG_Config::bin/blastn_chunks $gdir/contigs $gdir/contigs 10000 1.0e-50 $min_iden 100 -g F -r 1 -q -1 -FF | sort -u > $tmp_dir/repeats");
}

open(REPEATS,"<$tmp_dir/repeats")
    || die "could not open $tmp_dir/repeats";

open(EXTREP,">$tmp_ext_repeats")
    || die "could not open $tmp_ext_repeats";
$contigs = $fig->contig_lengths($genome);

while (defined($_ = <REPEATS>))
{
    chop;
    ($id1,$id2,undef,$b1,$e1,$b2,$e2) = split(/\t/,$_);
    if (&not_rna(\%rnas,$id1,$b1,$e1))
    {
	my $key1 = &key($id1,$b1,$e1);
	my $key2 = &key($id2,$b2,$e2);
	push(@{$connected{$key1}},$key2); if ($show_conn) { print STDERR "$key1 connects to $key2 via repeats\n"; }
	push(@{$connected{$key2}},$key1); if ($show_conn) { print STDERR "$key2 connects to $key1 via repeats\n"; }
	$ln = abs($e1-$b1)+1;
	$min1 = &FIG::min($b1,$e1);
	$max1 = &FIG::max($b1,$e1);
	if (($min1 > $pad) && ($max1 < ($contigs->{$id1} - $pad)))
	{
	    $b1a = ($b1 < $e1) ? ($b1 - $pad) : ($b1 + $pad);
	    $e1a = ($b1 < $e1) ? ($e1 + $pad) : ($e1 - $pad);
	    $loc = join("_",($id1,$b1a,$e1a));
	    $seq = $fig->dna_seq($genome,$loc);
	    if (! $seen{$loc})
	    {
		&FIG::display_id_and_seq("$loc\_$ln",\$seq,\*EXTREP);
		$seen{$loc} = 1;
	    }
	}
	else
	{
#	    die "b1=$b1 e1=$e1 $contigs->{$id1}";
	}
    }
}
close(REPEATS);
close(EXTREP);

open(EXTREP,"<$tmp_ext_repeats")
    || die "could not open $tmp_ext_repeats";

open(HITS,">$tmp_hits") || die "could not open $tmp_hits";

undef %seen;
$/ = "\n>";
while (defined($_ = <EXTREP>))
{
    chomp;
    if ($_ =~ /^>?(\S+)_(\d+)\n(.*)/s)
    {
	my($id,$ln,$seq) = ($1,$2,$3);
#	print STDERR "checking $id ln=$ln\n";
	$seq =~ s/\s//g;
	
	$tmp_seq = "$tmp_dir/one_seq";
	open(SEQ,">$tmp_seq") || die "could not open $tmp_seq";
	print SEQ ">$id\n$seq\n";
	close(SEQ);

	my $min = $ln-70;
	$tmp_pat = "$tmp_dir/pattern";
	open(PAT,">$tmp_pat") || die "could not open $tmp_pat";
	print PAT "p1=8...10 0...6 p2=5...10 $min...$ln ~p2[1,0,0] 0...6 p1\n";
#	print STDERR "p1=8...10 0...6 p2=5...10 $min...$ln ~p2[1,0,0] 0...6 p1\n";
	close(PAT);

	open(SCAN,"$FIG_Config::ext_bin/scan_for_matches $tmp_pat < $tmp_seq |")
	    || die "could not open scan";

	if (my $hit = <SCAN>)
	{
	    chomp $hit;
	    if ($hit =~ /^>?(\S+)[^\n]*\n(.*)/s)
	    {
		my($id,$seq) = ($1,$2);
		@pieces = split(/\s+/,$seq);
		print HITS ">$id\n";
		if ($id =~ /^(\S+)_(\d+)_(\d+):/) 
		{
		    push(@{$hits{$1}},($2 < $3) ? [$2,$3] : [$3,$2]) ;
		}
		foreach $piece (@pieces)
		{
		    for ($i=0; ($i < length($piece)); $i += 100)
		    {
			$ln = ($i < (length($piece) - 100)) ? 100 : (length($piece) - $i);
			print HITS substr($piece,$i,$ln),"\n";
		    }
		    print HITS "---\n";
		}
	    }
	}
	close(SCAN);
    }
}
close(HITS);

@sets = &trans_closure_connected(\%connected,$show_conn);
#print STDERR &Dumper(\@sets); die "aborted";

open(SUMM,">$tmp_summ") || die "could not open $tmp_sum";
$n = 1;
open(SEQS,">$tmp_dir/sequences.fasta") || die "could not open $tmp_dir/sequences.fasta";
foreach $set (@sets)
{
    ($hits_in,$non_hits_in) = &matching_hits($set,\%hits);
    if (@$hits_in > 0)
    {
	print SUMM "SET $n\n\tPossible IS elements\n";
	foreach $_ (@$hits_in)
	{
	    if ($_ =~ /^\S+_(\d+)_(\d+)$/)
	    {
		print SUMM "\t\t$_\t",abs($2 - $1)+1,"\n";
	        &print_seq($fig,$genome,$_,\*SEQS);
	    }
	}
	print SUMM "\n";

	print SUMM "\tPossible Degraded IS elements\n";
	foreach $_ (@$non_hits_in)
	{
	    if ($_ =~ /^\S+_(\d+)_(\d+)$/)
	    {
		print SUMM "\t\t$_\t",abs($2 - $1)+1,"\n";
	        &print_seq($fig,$genome,$_,\*SEQS);
	    }
	}
	print SUMM "\n";
	$n++;
    }
}
close(SUMM);
close(SEQS);

sub matching_hits {
    my($set,$hits) = @_;

#    print STDERR &Dumper('set and hits',$set,$hits);
    my $in  = [];
    my $out = [];

    foreach my $x (@$set)
    {
	$x =~ /^(\S+)_(\d+)_(\d+)$/;
	my($c,$b,$e) = ($1,$2,$3);
	my $y;
	if ($y = $hits->{$c})
	{
	    my $i;
	    for ($i=0; ($i < @$y) && (! &overlaps($b,$e,$y->[$i]->[0],$y->[$i]->[1])); $i++) {}
	    if ($i < @$y)
	    {
		push(@$in,$x);
	    }
	    else
	    {
		push(@$out,$x);
	    }
	}
	else
	{
	    push(@$out,$x);
	}
    }
#    print STDERR &Dumper('in and out',$in,$out);
    return ($in,$out);
}

sub trans_closure_connected {
    my($connected,$show_conn) = @_;
    my @locs = sort keys(%$connected);

    my @clusters = ();
    my %in_clust;
    while (@locs > 0)
    {
	my $loc = shift @locs; 
	undef %in_clust;

	my $cluster = [];
	&add_clust($cluster,$loc,\%in_clust);
	
	my $x;
	if ($x = $connected{$loc})
	{
	    foreach my $y (@$x)
	    {
		&add_clust($cluster,$y,\%in_clust);
	    }
	}
#	print STDERR &Dumper(['cluster',$cluster]);
	my $i = 0;
	while ($i < @$cluster)
	{
	    $x = $cluster->[$i];
	    $x =~ /^(\S+)_(\d+)_(\d+)$/;
	    my($c1,$b1,$e1) = ($1,$2,$3);

	    my $j = @locs - 1;
	    while ($j >= 0)
	    {
		if (! $in_clust{$locs[$j]})
		{
		    $locs[$j] =~ /^(\S+)_(\d+)_(\d+)$/;
		    my($c2,$b2,$e2) = ($1,$2,$3);
		    if (($c1 eq $c2) && &overlaps($b1,$e1,$b2,$e2))
		    {
			if ($show_conn)
			{
			    my $key1 = &key($c1,$b1,$e1);
			    my $key2 = &key($c2,$b2,$e2);
			    print STDERR "$key1 connects to $key2 due to overlap\n";
			    print STDERR "$key2 connects to $key1 due to overlap\n";
			}
			&add_clust($cluster,$locs[$j],\%in_clust);
			if (my $x = $connected{$locs[$j]})
			{
			    foreach my $y (@$x)
			    {
				&add_clust($cluster,$y,\%in_clust);
			    }
			}
		    }
		}
		else
		{
#		    print STDERR "$locs[$j] does not overlap $c1 $b1 $e1\n";
		}
		$j--;
	    }
	    $i++;
#	    print STDERR &Dumper(["cluster with i=$i",$cluster]);
	}
#	print STDERR &Dumper($cluster,\%in_clust);
	push(@clusters,$cluster);
#	print STDERR &Dumper($cluster,\@locs);
	for(my $i = $#locs; ($i >= 0); $i--)
	{
	    if ($in_clust{$locs[$i]})
	    {
#		print STDERR "cutting $locs[$i]\n";
		splice(@locs,$i,1);
	    }
	}
#	print STDERR &Dumper(\@locs); die "aborted";
    }
    return @clusters;
}

sub not_rna {
    my($rnas,$contig,$beg,$end) = @_;

    if (my $x = $rnas->{$contig})
    {
	my $i;
	for ($i=0; ($i < @$x) && (! &overlaps($beg,$end,$x->[$i]->[0],$x->[$i]->[1])) &&
	                         (! &overlaps($end,$beg,$x->[$i]->[0],$x->[$i]->[1])); $i++) 
	{
#	    print &Dumper([$beg,$end,$x->[$i]->[0],$x->[$i]->[1]]);
	}
#	print &Dumper($i,scalar @$x);
	return ($i == @$x);
    }
    return 1;
}

sub overlaps {
    my($b1,$e1,$b2,$e2) = @_;
    my($sz);
    if ((($b1 < $e1) && ($b2 > $e2)) || ($b1 > $e1) && ($b2 < $e2)) { return 0 }
    ($b1,$e1) = ($b1 < $e1) ? ($b1,$e1) : ($e1,$b1);
    ($b2,$e2) = ($b2 < $e2) ? ($b2,$e2) : ($e2,$b2);
    if (&FIG::between($b1,$b2,$e1))
    {
	$sz = &FIG::min($e1,$e2) - $b2;
	return ($sz > (0.3 * ($e1 - $b1)));
    }
    if (&FIG::between($b2,$b1,$e2))
    {
	$sz = &FIG::min($e1,$e2) - $b1;
	return ($sz > (0.3 * ($e2 - $b2)));
    }
    return 0;
}

sub key {
    my($c,$b,$e) = @_;

    return ($b < $e) ? join("_",($c,$b,$e)) : join("_",($c,$e,$b));
}

sub add_clust {
    my($cluster,$x,$in_clust) = @_;

    if (! $in_clust->{$x})
    {
	push(@$cluster,$x);
	$in_clust->{$x} = 1;
    }
}

sub print_seq {
    my($fig,$genome,$loc,$fh) = @_;

    $seq = $fig->dna_seq($genome,$loc);
    &FIG::display_id_and_seq($loc,\$seq,$fh);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3