[Bio] / FigKernelScripts / FFB2_compare_tests.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_compare_tests.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Mon Feb 14 22:44:07 2011 UTC (8 years, 10 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.3: +10 -2 lines
Update to figfam build code

#
# Compare the results of two tested FIGfam dirs, writing the results into the
# newer one.
#

use Data::Dumper;
use YAML::Any;
use strict;
use File::Basename;
use SeedUtils;

@ARGV == 2 or die "Usage: FFB2_compare_tests FFdir1 FFdir2\n";

my $ff_dir_1 = shift;
my $ff_dir_2 = shift;

my $ff_rel_1 = validate($ff_dir_1);
my $ff_rel_2 = validate($ff_dir_2);

print "Comparing release $ff_rel_1 ($ff_dir_1) to release $ff_rel_2 ($ff_dir_2)\n";

my $out_dir = "$ff_dir_2/comparisons/$ff_rel_1";
&SeedUtils::verify_dir($out_dir);

#
# Find the set of kmers & genomes in common between the two test runs.
#

my %all_genomes;
my @to_compare;

for my $f1kdir (<$ff_dir_1/tests/*>)
{
    my $k = basename($f1kdir);
    my $f2kdir = "$ff_dir_2/tests/$k";
    if (! -d $f2kdir)
    {
	warn "Skipping $k, not present in release $ff_rel_2\n";
	next;
    }

    for my $outfile (<$f1kdir/*.out>)
    {
	my $outbase = basename($outfile);

	if (! -f "$f2kdir/$outbase")
	{
	    warn "Skipping $outbase, not present in release $ff_rel_2\n";
	    next;
	}

	my ($genome) = $outbase =~ /^(\d+\.\d+)/;
	$all_genomes{$genome}++;
	push(@to_compare, [$k, $genome, $outfile, "$f2kdir/$outbase"]);
    }
}

#
# Find genome peg counts (common data).
#

my %genome_peg_count;
for my $genome (keys %all_genomes)
{
    open(F, "<", "/vol/figfam-prod/test_genomes/$genome.peg.fasta") or die "cannot open /vol/figfam-prod/test_genomes/$genome.peg.fasta: $!";
    my $c = 0;
    while (<F>)
    {
	/^>/ and $c++;
    }
    close(F);
    $genome_peg_count{$genome} = $c;
}

for my $comp (sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] } @to_compare)
{
    my($k, $genome, $f1, $f2) = @$comp;

    print "Compare @$comp\n";

    &compare($k, $genome, $f1, $f2);
}

#
# Compare two runs of the kmers.
#
# Compute a bunch of metrics, put in data structure, and
# write the YAML version into $ff_dir_2/comparisons/$ff_rel_1/$k.$genome.
#
sub compare
{
    my($k, $genome, $f1, $f2) = @_;

    my $results = {};

    $results->{peg_count} = $genome_peg_count{$genome};
    $results->{kmer} = $k;
    $results->{genome} = $genome;
    $results->{this_version} = $ff_rel_2;
    $results->{prev_version} = $ff_rel_1;
    $results->{this_data} = $f2;
    $results->{prev_data} = $f1;
    $results->{num_calls_lost} = 0;
    $results->{num_calls_gained} = 0;
    $results->{num_calls_same} = 0;
    $results->{num_calls_diff} = 0;
    
    #
    # Read first run.
    #
    open(F, "<", $f1) or die "Cannot open $f1: $!";

    my %r1;
    my $num_assigned_1 = 0;
    while (<F>)
    {
	chomp;
	my($id, $func) = split(/\t/);
	$func = SeedUtils::strip_func($func);
	$r1{$id} = $func;
	$num_assigned_1++;
    }

    $results->{prev_num_assigned} = $num_assigned_1;

    #
    # Read second run.
    #
    open(F, "<", $f2) or die "Cannot open $f2: $!";
    my $num_assigned_2 = 0;
    while (<F>)
    {
	chomp;
	my($id, $func) = split(/\t/);
	$func = SeedUtils::strip_func($func);

	my $pfunc = delete $r1{$id};

	if ($func eq '' && $pfunc eq '')
	{
	    # neither assigned, don't care.
	}
	elsif ($func eq '' && $pfunc ne '')
	{
	    # last assigned, we did not
	    $results->{num_calls_lost}++;
	    push(@{$results->{calls_lost}}, [$id, $pfunc, $func]);
	}
	elsif ($func ne '' && $pfunc eq '')
	{
	    # we assigned, last did not
	    $results->{num_calls_gained}++;
	    push(@{$results->{calls_gained}}, [$id, $pfunc, $func]);
	}
	else
	{
	    # we both assigned, are same?
	    if ($func eq $pfunc)
	    {
		$results->{num_calls_same}++;
	    }
	    else
	    {
		$results->{num_calls_diff}++;
		push(@{$results->{calls_diff}}, [$id, $pfunc, $func]);
	    }
	}

	$num_assigned_2++;
    }

    for my $id (keys %r1)
    {
	# last assigned, we did not
	$results->{num_calls_lost}++;
	my $pfunc = $r1{$id};
	push(@{$results->{calls_lost}}, [$id, $pfunc, undef]);
    }

    $results->{this_num_assigned} = $num_assigned_2;

    close(F);

    #
    # Compute differences in metabolic reconstructions.
    # They are in 
    #
    my $meta_old = $f1;
    $meta_old =~ s/\.out$/.reconstruction/;
    my $meta_new = $f2;
    $meta_new =~ s/\.out$/.reconstruction/;
    
    my ($in_rec_old, $ss_old) = &count_lines($meta_old);
    my ($in_rec_new, $ss_new) = &count_lines($meta_new);

    $results->{prev_ss_in_reconstruction} = $ss_old;
    $results->{this_ss_in_reconstruction} = $ss_new;
    $results->{ss_in_reconstruction_gained} = int($ss_new - $ss_old);

    $results->{prev_pegs_in_reconstruction} = $in_rec_old;
    $results->{this_pegs_in_reconstruction} = $in_rec_new;
    $results->{pegs_in_reconstruction_gained} = int($in_rec_new - $in_rec_old);

    open(OUT, ">", "$out_dir/$k.$genome") or die "Cannot write $out_dir/$k.$genome: $!";
    print OUT Dump($results);
    close(OUT);
}

sub count_lines
{
    my($f) = @_;
    if (!open(F, "<", $f))
    {
	return (0, 0);
    }
    my $n = 0;
    my %ss;
    while (<F>)
    {
	chomp;
	my($id, $fn, $subsys) = split(/\t/);
	$ss{$subsys}++;
	$n++;
    }
    close(F);
    return $n, int(keys %ss);
}

sub validate
{
    my($dir) = @_;

    -d $dir or die "$dir not a directory\n";
    -d "$dir/tests" or die "$dir has no test data available\n";

    my $base = basename($dir);
    if ($base =~ /^Release(\S+)$/)
    {
	return $1;
    }

    die "$dir doesn't have the right directory name (should be ReleaseXX)\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3