[Bio] / FigKernelScripts / check_for_new_genomes.pl Repository:
ViewVC logotype

View of /FigKernelScripts/check_for_new_genomes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Tue May 19 02:27:26 2009 UTC (10 years, 5 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +7 -7 lines
print stats as you go

########################################################################
use strict;

my $chunkF   = "tmp_chunk_$$.fasta";

use FIG;
my $fig = new FIG;

my($old,$new);
my $usage = "usage: check_new_genomes OldF NewF";
(
 ($old = shift @ARGV) &&
 ($new = shift @ARGV)
)
    || die $usage;

my $file_of = {};
foreach my $pair (map { $_ =~ /^(\S+)\s+(\S+)/; [$1,$2] } `cat $old $new`)
{
    my($genome,$file) = @$pair;

    if ($file_of->{$genome})     { die "$genome is duplicated" }
    if ($genome !~ /^\d+\.\d+$/) { die "$genome is not in the proper format" }
    $file_of->{$genome} = $file;
    if (! -s "$file.nsq")
    { 
	&FIG::run("formatdb -i $file -p F");
   }
}

my $replace = {};
my @poss = &get_poss($chunkF,$new,$old);
# my @poss = map { chomp; [split(/\s+/,$_)] } `cat tmp_poss`;

foreach my $x (@poss)
{
    my($g1,$g2) = @$x;
    my $gs1 = $fig->genus_species($g1);
    my $gs2 = $fig->genus_species($g2);
    print STDERR "Checking $gs1 [$g1] and $gs2 [$g2]\n";
    &test(@$x,$file_of,$replace,$fig);
}

my %oldH = map { $_ =~ /^(\S+)/; $1 => 1 } `cat $old`;
my %newH = map { $_ =~ /^(\S+)/; $1 => 1 } `cat $new`;

my($from,$to,$n);
foreach $_ (keys(%$replace))
{
    $from = $_;
    $n = 10;
    while ($n-- && $replace->{$replace->{$from}})
    {
	$from = $replace->{$from};
    }

    if ($n == 0)
    {
	print STDERR &Dumper(['CYCLE',$from,$replace]);
	die "aborted";
    }
    else
    {
	if ($oldH{$_} && $newH{$replace->{$_}})
	{
	    print $replace->{$_}," REPLACES $_\n";
	}
	else
	{
	    print "DELETE\t$_\n";
	}
    }
}

sub get_poss {
    my($chunkF,$new,$old) = @_;

    my @against = map { $_ =~ /^(\S+)\s+(\S+)/; [$1,$2] } `cat $old $new`;

    my $hits = {};
    foreach my $pair (map { $_ =~ /^(\S+)\s+(\S+)/; [$1,$2] } `cat $new`)
    {
	my($genome,$file) = @$pair;
	$/ = "\n>";
	open(IN,"<$file") || die "could not open $file";
	my $done = 0;
	while ((! $done) && (defined($_ = <IN>)))
	{
	    chomp;
	    if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	    {
		my $id = $1;
		my $seq = $2;
		$seq =~ s/\s//g;
		if (length($seq) > 700)
		{
		    $done = 1;
		    $seq = substr($seq,100,500);
		    open(CHUNK,">$chunkF") || die "could not open $chunkF";
		    print CHUNK ">$genome:$id\n$seq\n";
		    close(CHUNK);
		    $/ = "\n";
		    &process_chunk($chunkF,\@against,$hits,$genome);
		    unlink($chunkF);
		}
		$/ = "\n>";
	    }
	}
	close(IN);
	$/ = "\n";
    }
    return sort { $a->[0] <=> $b->[0] } map { [split(/\t/,$_)] } keys(%$hits);
}

sub process_chunk {
    my($chunkF,$against,$hits,$genome) = @_;

    foreach my $pair (@$against)
    {
	my($genome2,$contigsF) = @$pair;
	my @blastout = `blastall -m8 -i $chunkF -d $contigsF -p blastn -FF -e 1.0e-100`;
	foreach my $hit (map { chomp; [split(/\t/,$_)] } @blastout)
	{
	    my($id1,$id2,$iden,undef,undef,undef,$b1,$e1,$b2,$e2) = @$hit;
	    if (($genome ne $genome2) &&
		($iden >= 98) &&
		(abs($e1 - $b1) > 495))
	    {
		my($g1,$g2) = sort { $a <=> $b } ($genome,$genome2);
		$hits->{"$g1\t$g2"} = 1;
	    }
	}
    }
}

use Digest::MD5;

sub test {
    my($genome1,$genome2,$file_of,$replace,$fig) = @_;

    my ($md52id1,$id2md51) = &load_md5($file_of->{$genome1});
    my ($md52id2,$id2md52) = &load_md5($file_of->{$genome2});
    
    my $tmpF1 = "tmp_left_1_$$.fasta";
    my $tmpF2 = "tmp_left_2_$$.fasta";
    my $sizes1 = &build_remaining($file_of->{$genome1},$id2md51,$md52id2,$tmpF1);
    my $sizes2 = &build_remaining($file_of->{$genome2},$id2md52,$md52id1,$tmpF2);
#   print STDERR &Dumper($md52id1,$id2md51,$md52id2,$id2md52,$sizes1,$sizes2);

    my($gs1,$gs2);
    if ((! -s $tmpF1) && (! -s $tmpF2))  ### exactly the same
    {
	if ((-M $file_of->{$genome1}) <= -M $file_of->{$genome2})  ## if Genome1 is more recent
	{
	    &replace($fig,$replace,$genome1,$genome2,1.0,1.0);
	}
	else
	{
	    &replace($fig,$replace,$genome2,$genome1,1.0,1.0);
	}
    }
    elsif (-s $tmpF1 && (! -s $tmpF2))     ###  Genome1 has some extra stuff
    {
	&replace($fig,$replace,$genome2,$genome1,0.0,1.0);
    }
    elsif (-s $tmpF2 && (! -s $tmpF1))     ###  Genome2 has some extra stuff
    {
	&replace($fig,$replace,$genome1,$genome2,1.0,0.0);
    }
    else
    {
	&resolve_with_blast($fig,$genome1,$genome2,$tmpF1,$tmpF2,$replace,$sizes1,$sizes2,$file_of);
    }
    system "rm $tmpF1\* $tmpF2\*";
}

sub build_remaining {
    my($file,$id2md5,$md52id,$tmpF) = @_;

    my $sizes = {};
    open(IN,"<$file") || die "could not open $file";
    open(OUT,">$tmpF") || die "could not open $tmpF";
    $/ = "\n>";
    
    while (defined($_ = <IN>))
    {
	chomp;
	if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	{
	    my $id = $1;
	    my $seq = $2;
	    if (! $md52id->{$id2md5->{$id}})
	    {
		my $ln = length($seq);
		if ($ln > 200)
		{
		    $sizes->{$id} += ($ln - 200);
		    print OUT ">$id\n$seq\n";
		}
	    }
	}
    }
    close(IN);
    close(OUT);
    $/ = "\n";

    return $sizes;
}

sub resolve_with_blast {
    my($fig,$genome1,$genome2,$tmpF1,$tmpF2,$replace,$sizes1,$sizes2,$file_of) = @_;

    &FIG::run("formatdb -i $tmpF1 -p F");
    &FIG::run("formatdb -i $tmpF2 -p F");
    my @blast1 = map { chomp; [split(/\t/,$_)] }`blastall -m8 -i $tmpF1 -d $tmpF2 -p blastn -FF -e 1.0e-100`;
    my @blast2 = map { chomp; [split(/\t/,$_)] }`blastall -m8 -i $tmpF2 -d $tmpF1 -p blastn -FF -e 1.0e-100`;

    my $cov1 = &covered(\@blast1,$sizes1,$genome1);
    my $cov2 = &covered(\@blast2,$sizes2,$genome2);
    print STDERR "cov1=$cov1 cov2=$cov2\n";

    my $THRESH = 0.98;
    if    (($cov1 >= $THRESH) && (! ($cov2 >= $THRESH))) { &replace($fig,$replace,$genome1,$genome2,$cov1,$cov2) }
    elsif (($cov2 >= $THRESH) && (! ($cov1 >= $THRESH))) { &replace($fig,$replace,$genome2,$genome1,$cov2,$cov1) }
    elsif (($cov1 >= $THRESH) && ($cov2 >= $THRESH))
    {
	my($gs1,$gs2);
	if ((-M $file_of->{$genome1}) <= (-M $file_of->{$genome2})) 
	{ 
	    &replace($fig,$replace,$genome1,$genome2,$cov1,$cov2); 
	}
	else                                                        
	{ 
	    &replace($fig,$replace,$genome2,$genome1,$cov2,$cov1) ;
	}
    }
}

sub covered {
    my($blast,$sizes,$genome) = @_;

    my $all = 0;
    foreach $_ (keys(%$sizes))
    {
#	print STDERR "contig: $_ $sizes->{$_}\n";
	$all += $sizes->{$_};
    }
#   print STDERR &Dumper($all,$sizes);

    my @cov = sort { ($a->[0] cmp $b->[0]) or ($a->[1] <=> $b->[1]) }
	      map { [$_->[0],&FIG::min($_->[6],$_->[7]),&FIG::max($_->[6],$_->[7])] } 
              grep { $_->[2] >= 98 } 
              @$blast;

    my $by_contig = {};

    my($i,$j,$bound);
    $i = 0;

    my $tot = 0;
    while ($i < @cov)
    {
#	print STDERR join("\t",($cov[$i]->[0],$cov[$i]->[1],$cov[$i]->[2])),"\n";
	$tot += ($cov[$i]->[2] - $cov[$i]->[1]) + 1;

	if ($i == $#cov)
	{
	    $i++;
	}
	else
	{
	    $bound = $cov[$i]->[2];
#	    print STDERR "bound set to $bound initially\n";
	    $j = $i+1;
	    while (($j < @cov) && ($cov[$j]->[0] eq $cov[$i]->[0]) && ($cov[$j]->[1] <= $bound))
	    {
		if ($cov[$j]->[2] > $bound)
		{
		    $tot += ($cov[$j]->[2] - $bound) + 1;
#		    print STDERR join("\t",($cov[$j]->[0],$bound,$cov[$j]->[2])),"\n";
		    $bound = $cov[$j]->[2];
#		    print STDERR "bound set to $bound\n";
		}
		$j++;
	    }
	    $i = $j;
	}
    }
    my $frac = sprintf("%0.3f",$tot/$all);
#   print STDERR "coverage: $frac\n";
    return $frac;
}

sub load_md5 {
    my($file) = @_;

    my $md52id = {};
    my $id2md5 = {};

    $/ = "\n>";
    open(CONTIGS,"<$file") || confess "could not open $file";
    while (defined($_ = <CONTIGS>))
    {
	chomp;
	if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)
	{
	    my $id = $1;
	    my $seq = uc $2;
	    $seq =~ s/\s//g;
	    my $md5 = Digest::MD5::md5_hex( $seq );
	    $md52id->{$md5} = $id;
	    $id2md5->{$id}  = $md5;
	}
    }
    $/ = "\n";
    close(CONTIGS);
    return ($md52id,$id2md5);
}

sub replace {
    my($fig,$replace,$genome1,$genome2) = @_;

    my $gs1 = $fig->genus_species($genome1); 
    my $gs2 = $fig->genus_species($genome2); 
    print STDERR "$genome1 [$gs1] => $genome2 [$gs2]\n";
    $replace->{$genome1} = $genome2;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3