[Bio] / FigKernelScripts / FFB2_compare_tests.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_compare_tests.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Thu Apr 1 19:28:40 2010 UTC (9 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2010_0526
figfam build updates

#
# Compare the results of two tested FIGfam dirs, writing the results into the
# newer one.
#

use Data::Dumper;
use YAML::Any;
use strict;
use File::Basename;
use SeedUtils;

@ARGV == 2 or die "Usage: FFB2_compare_tests FFdir1 FFdir2\n";

my $ff_dir_1 = shift;
my $ff_dir_2 = shift;

my $ff_rel_1 = validate($ff_dir_1);
my $ff_rel_2 = validate($ff_dir_2);

print "Comparing release $ff_rel_1 ($ff_dir_1) to release $ff_rel_2 ($ff_dir_2)\n";

my $out_dir = "$ff_dir_2/comparisons/$ff_rel_1";
&SeedUtils::verify_dir($out_dir);

#
# Find the set of kmers & genomes in common between the two test runs.
#

my %all_genomes;
my @to_compare;

for my $f1kdir (<$ff_dir_1/tests/*>)
{
    my $k = basename($f1kdir);
    my $f2kdir = "$ff_dir_2/tests/$k";
    if (! -d $f2kdir)
    {
	warn "Skipping $k, not present in release $ff_rel_2\n";
	next;
    }

    for my $outfile (<$f1kdir/*.out>)
    {
	my $outbase = basename($outfile);

	if (! -f "$f2kdir/$outbase")
	{
	    warn "Skipping $outbase, not present in release $ff_rel_2\n";
	    next;
	}

	my ($genome) = $outbase =~ /^(\d+\.\d+)/;
	$all_genomes{$genome}++;
	push(@to_compare, [$k, $genome, $outfile, "$f2kdir/$outbase"]);
    }
}

#
# Find genome peg counts (common data).
#

my %genome_peg_count;
for my $genome (keys %all_genomes)
{
    open(F, "<", "/vol/figfam-prod/test_genomes/$genome.peg.fasta") or die "cannot open /vol/figfam-prod/test_genomes/$genome.peg.fasta: $!";
    my $c = 0;
    while (<F>)
    {
	/^>/ and $c++;
    }
    close(F);
    $genome_peg_count{$genome} = $c;
}

for my $comp (sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] } @to_compare)
{
    my($k, $genome, $f1, $f2) = @$comp;

    print "Compare @$comp\n";

    &compare($k, $genome, $f1, $f2);
}

#
# Compare two runs of the kmers.
#
# Compute a bunch of metrics, put in data structure, and
# write the YAML version into $ff_dir_2/comparisons/$ff_rel_1/$k.$genome.
#
sub compare
{
    my($k, $genome, $f1, $f2) = @_;

    my $results = {};

    $results->{peg_count} = $genome_peg_count{$genome};
    $results->{kmer} = $k;
    $results->{genome} = $genome;
    $results->{this_version} = $ff_rel_2;
    $results->{prev_version} = $ff_rel_1;
    $results->{this_data} = $f2;
    $results->{prev_data} = $f1;

    #
    # Read first run.
    #
    open(F, "<", $f1) or die "Cannot open $f1: $!";

    my %r1;
    my $num_assigned_1 = 0;
    while (<F>)
    {
	chomp;
	my($id, $func) = split(/\t/);
	$func = SeedUtils::strip_func($func);
	$r1{$id} = $func;
	$num_assigned_1++;
    }

    $results->{prev_num_assigned} = $num_assigned_1;
    
    #
    # Read second run.
    #
    open(F, "<", $f2) or die "Cannot open $f2: $!";
    my $num_assigned_2 = 0;
    while (<F>)
    {
	chomp;
	my($id, $func) = split(/\t/);
	$func = SeedUtils::strip_func($func);

	my $pfunc = $r1{$id};

	if ($func eq '' && $pfunc eq '')
	{
	    # neither assigned, don't care.
	}
	elsif ($func eq '' && $pfunc ne '')
	{
	    # last assigned, we did not
	    $results->{num_calls_lost}++;
	    push(@{$results->{calls_lost}}, [$id, $pfunc, $func]);
	}
	elsif ($func ne '' && $pfunc eq '')
	{
	    # we assigned, last did not
	    $results->{num_calls_gained}++;
	    push(@{$results->{calls_gained}}, [$id, $pfunc, $func]);
	}
	else
	{
	    # we both assigned, are same?
	    if ($func eq $pfunc)
	    {
		$results->{num_calls_same}++;
	    }
	    else
	    {
		$results->{num_calls_diff}++;
		push(@{$results->{calls_diff}}, [$id, $pfunc, $func]);
	    }
	}

	$num_assigned_2++;
    }

    $results->{this_num_assigned} = $num_assigned_2;

    close(F);

    open(OUT, ">", "$out_dir/$k.$genome") or die "Cannot write $out_dir/$k.$genome: $!";
    print OUT Dump($results);
    close(OUT);
}


sub validate
{
    my($dir) = @_;

    -d $dir or die "$dir not a directory\n";
    -d "$dir/tests" or die "$dir has no test data available\n";

    my $base = basename($dir);
    if ($base =~ /^Release(\d+)$/)
    {
	return $1;
    }

    die "$dir doesn't have the right directory name (should be ReleaseXX)\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3