[Bio] / FigKernelScripts / all_bbhs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/all_bbhs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (download) (as text) (annotate)
Wed Sep 5 21:15:22 2007 UTC (12 years, 2 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.15: +1 -1 lines
add $ENV{SORT_ARGS} to sort invocations for running on large-memory machines

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

#
# TODO: seed FIG genome list with extra OK genomes, for use in
# RAST import.
#

use strict;

my $tmpF      = "$FIG_Config::temp";
my $cutoff    = 1.0e-10;

use FIG;
my $fig = new FIG;

my $usage = "usage: all_bbhs [-pegsyn peg.synonyms] [-sims SimsDir] GenomesF BBHs";

my($genomesF, $simsDir, $bbhs, $peg_syn);

$simsDir = "$FIG_Config::data/Sims";
$peg_syn = "$FIG_Config::global/peg.synonyms";

while (@ARGV > 0 and $ARGV[0] =~ /^-/)
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-sims/)
    {
	$simsDir = shift @ARGV;
    }
    elsif ($arg =~ /^-pegsyn/)
    {
	$peg_syn = shift @ARGV;
    }
    else
    {
	die $usage;
    }
}

(
 ($genomesF = shift @ARGV) &&
 ($bbhs     = shift @ARGV)
)
    || die $usage;

#
# Globals.
#
my %opened;

system "rm -rf $bbhs";
&FIG::verify_dir($bbhs);

#
# Ensure that all the genomes in the given list are asserted to be present.
#

my %genomes = map { ($_ =~ /^(\d+\.\d+)/) ? ($1 => 1) : () } `cat $genomesF`;

my @not_present = grep { not $fig->is_genome($_) } keys %genomes ;
warn "genomes not present: @not_present\n";
$fig->assert_genomes(@not_present);

&make_binary_format($fig,\%genomes);

sub make_binary_format {
    my($fig,$genomes) = @_;
    my $tmpF = "$FIG_Config::temp/best.by.genome.$$";
    &make_best_by_genome($fig,$genomes,$tmpF);

    foreach my $file (sort keys(%opened))
    {
	open(SBEST,"sort $ENV{SORT_ARGS} $tmpF/$file |") 
	    || die "could not open sorted $tmpF/best.by.genome";
	open(TMP,">$tmpF.bbhs.1") || die "could not open $tmpF.bbhs.1";

	my $line = <SBEST>;
	while ($line && ($line =~ /^(\S+)\t(\S+)/))
	{
	    my $curr1 = $1;
	    my $curr2 = $2;
	    my @set = ();
	    while ($line && ($line =~ /^(\S+)\t(\S+)\t(\S+)\t(\S+)/) && ($1 eq $curr1) && ($2 eq $curr2))
	    {
		push(@set,[$1,$2,$3,$4]);
		$line = <SBEST>;
	    }
	    @set = sort { $a->[2] <=> $b->[2] } @set;
	    if ((@set > 1) && ($set[0]->[1] eq $set[1]->[1]))
	    {
		print TMP join("\t",($set[0]->[0],$set[0]->[1],$set[0]->[2],$set[0]->[3])),"\n";
		print TMP join("\t",($set[0]->[1],$set[0]->[0],$set[0]->[2],$set[0]->[3])),"\n";

	    }
	}
	close(SBEST);
	close(TMP);
	&FIG::run("sort_and_split_bbhs $tmpF.bbhs.1 $bbhs");
    }
}

sub make_best_by_genome {
    my($fig,$genomes,$dir) = @_;
    my($id_trans,$sim_file,%seen,$id1,$x,$y,@ids1,@ids2,$line,$sc);
    my($pair,$key,$rewritten_peg);

    my $bbh_by_iden = {};

    &FIG::verify_dir($dir);
    open(TMPBBH,">$dir/identical") || die "could not open $dir/identical";

    opendir(SIMS,"$simsDir") || die "could not open $simsDir";
    my @sim_files = grep { $_ !~ /^\./ } readdir(SIMS);
    closedir(SIMS);

    ($id_trans,$rewritten_peg) = &load_syns($fig,\*TMPBBH,$bbh_by_iden,$genomes);
    print STDERR "loaded synonyms\n";

    foreach $sim_file (@sim_files)
    {
	print STDERR "processing $sim_file\n";
	open(SIMS,"cut -f1,2,11,12,13,14 $simsDir/$sim_file |")  
	    || die "could not open $sim_file";

	$line = <SIMS>;
	while ($line && ($line =~ /^(\S+)/))
	{
	    $id1 = $1;
	    undef %seen;
	    @ids1 = &trans_id($id_trans,$id1);

	    if ((@ids1 == 0) || $rewritten_peg->{$id1})
	    {
		while ($line && ($line =~ /^(\S+)/) && ($id1 eq $1))
		{
		    $line = <SIMS>;
		}
	    }
	    else
	    {
		while ($line && ($line =~ /^(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)\t(\S+)/) && ($1 eq $id1))
		{
		    if ($3 <= $cutoff)
		    {
			$sc = $3;
			if (! $rewritten_peg->{$2})
			{
			    my $nsc = sprintf("%0.3f",$4 / (($5 > $6) ? $5 : $6));
			    @ids2 = &trans_id($id_trans,$2);
			    foreach $y (@ids2)
			    {
				$y =~ /^fig\|(\d+\.\d+)/;
				my $genome2 = $1;
				foreach $x (@ids1)
				{
				    $x =~ /^fig\|(\d+\.\d+)/;
				    my $genome1 = $1;
				    if ($genomes->{$genome1} &&
					$genomes->{$genome2} &&
					($genome1 != $genome2) &&
					(! $fig->is_deleted_fid($x)) &&
					(! $fig->is_deleted_fid($y)))
				    {
					if ($x lt $y)
					{
					    $pair = "$x\t$y";
					}
					else
					{
					    $pair = "$y\t$x";
					}
					
					$key = "$x\t$genome2";
					if ((! $seen{$key}) && (! $bbh_by_iden->{$key}))
					{
					    $seen{$key} = [$sc,$pair,$nsc,1];
					}
					elsif ($seen{$key}->[1] ne $pair)   # handles possible duplicates in sims
					{
					    my $old = &FIG::neg_log($seen{$key}->[0]);
					    my $new = &FIG::neg_log($sc);
					    if (($new - $old) > 5)
					    {
						$seen{$key} = [$sc,$pair,$nsc,1];
					    }
					    elsif ($new > ($old-5))
					    {
						$seen{$key}->[3] = 0;
						if ($new > $old)
						{
						    $seen{$key}->[0] = $sc;
						}
					    }
					}
				    }
				}
			    }
			}
		    }
		    $line = <SIMS>;
		}

		foreach $key (keys(%seen))
		{
		    my $tuple = $seen{$key};
		    my($sc,$pair,$nsc,$ok) = @$tuple;
		    if (defined($pair) && $ok)
		    {
			my $msg = join("\t",($pair,$sc,$nsc)) . "\n";
			&write_pair($dir,$msg);
		    }
		}
	    }
	}
#	print STDERR &Dumper([$sim_file,\%seen]);
	close(SIMS);
    }

    foreach my $file (keys(%opened))
    {
	undef $opened{$file};
    }
    close(TMPBBH);
    $opened{"identical"} = 1;
}
	
use FileHandle;

sub write_pair {
    my($dir,$line) = @_;

    if ($line =~ /^fig\|\d*(\d\d)\./)
    {
	my($file,$fh);

	$file = $1;
	$fh   = $opened{$file};
	if (! $fh)
	{
	    if (defined($fh = new FileHandle ">$dir/$file")) 
	    {
		$opened{$file} = $fh;
	    }
	    else
	    {
		die "could not open $dir/$file";
	    }
	}
	print $fh $line;
    }
}
	    
sub trans_id {
    my($id_trans,$id) = @_;

    my $ids = $id_trans->{$id};
    if ($ids && (@$ids > 0))
    {
	return @$ids;
    }
    elsif ($id =~ /^fig\|/)
    {
	return ($id);
    }
    else
    {
	return ();
    }
}

sub load_syns {
    my($fig,$fh,$bbh_by_iden,$genomes) = @_;

    my($syns,$maj,@fig,$rewritten_peg);    

    open(SYNS,"<$peg_syn") || die "could not open $peg_syn: $!";
    $syns = {};
    $rewritten_peg = {};
    while (defined($_ = <SYNS>))
    {
	if ($_ =~ /^([^,]+),\d+\t(\S+)/)
	{
	    $maj = $1;
	    my $same_set = $2;
	    @fig = map { $_ =~ /^([^,]+)/; $1 } grep { $_ =~ /^fig\|/ } (split(/;/,$same_set),$maj);
	    if (@fig > 0)
	    {
		&dump_bbhs_for_identical($fig,\@fig,$fh,$bbh_by_iden,$genomes);
		$syns->{$maj} = [@fig];
		foreach my $peg (@fig)
		{
		    $rewritten_peg->{$peg} = 1;
		}
	    }
	}
    }
    close(SYNS);
    return ($syns,$rewritten_peg);
}
    

sub dump_bbhs_for_identical {
    my($fig,$syn,$fh,$bbh_by_iden,$genomes) = @_;
    my($peg1,$peg2,%genome_count,$key);

    foreach $peg1 (@$syn)
    {
	if (! $fig->is_deleted_fid($peg1))
	{
	    $genome_count{&FIG::genome_of($peg1)}++;
	}
    }

    foreach $peg1 (@$syn)
    {
	next if ($fig->is_deleted_fid($peg1));
	if ($genome_count{&FIG::genome_of($peg1)} == 1)
	{
	    foreach $peg2 (@$syn)
	    {
		next if ($fig->is_deleted_fid($peg2));
		if (($genome_count{&FIG::genome_of($peg2)} == 1) && ($peg2 ne $peg1))
		{
		    if ($genomes->{&FIG::genome_of($peg2)} && $genomes->{&FIG::genome_of($peg1)})
		    {
			print $fh join("\t",sort ($peg1,$peg2)),"\t0\t2\n";
			$key = "$peg1\t" . &FIG::genome_of($peg2);
			$bbh_by_iden->{$key} = 1;
		    }
		}
	    }
	}
    }
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3