[Bio] / FigKernelScripts / FFB2_compare_to_anno3.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_compare_to_anno3.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Aug 13 21:46:34 2010 UTC (9 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.1: +2 -2 lines
fixes for release names

#
# Compare the results of a kmer test run to the annotations on the annotator seed.
#

use FIG;
use Data::Dumper;
use YAML::Any;
use strict;
use File::Basename;
use SeedUtils;

@ARGV == 1 or die "Usage: FFB2_compare_to_anno3 FFdir\n";

my $fig = new FIG;

my $ff_dir = shift;

my $ff_rel = validate($ff_dir);

print "Comparing release $ff_rel ($ff_dir) to annotator seed\n";

my $out_dir = "$ff_dir/comparisons/to_anno3";
&SeedUtils::verify_dir($out_dir);

my %all_genomes;
my @to_compare;

for my $kdir (<$ff_dir/tests/*>)
{
    my $k = basename($kdir);
    for my $outfile (<$kdir/*.out>)
    {
	my $outbase = basename($outfile);

	my ($genome) = $outbase =~ /^(\d+\.\d+)/;
	$all_genomes{$genome}++;
	push(@to_compare, [$k, $genome, $outfile]);
    }
}

#
# Find genome peg counts (common data).
#

my %genome_peg_count;
my %genome_fns;
for my $genome (keys %all_genomes)
{
    open(F, "<", "/vol/figfam-prod/test_genomes/$genome.peg.fasta") or die "cannot open /vol/figfam-prod/test_genomes/$genome.peg.fasta: $!";
    my $c = 0;
    my @pegs;
    while (<F>)
    {
	if (/^>(\S+)/)
	{
	    $c++;
	    push(@pegs, $1);
	}
    }
    close(F);
    $genome_peg_count{$genome} = $c;
    $genome_fns{$genome} = $fig->function_of_bulk(\@pegs);
}

for my $comp (sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] } @to_compare)
{
    my($k, $genome, $f1) = @$comp;

    print "Compare @$comp\n";

    &compare($k, $genome, $f1, $genome_fns{$genome});
}

sub compare
{
    my($k, $genome, $calls, $anno_calls) = @_;

    my $results = {
	num_calls_anno_only => 0,
	num_calls_kmer_only => 0,
	num_calls_same => 0,
	num_calls_diff => 0,
    };
    

    open(F, "<", $calls) or die "Cannot open $calls: $!";

    while (<F>)
    {
	chomp;
	my($id, $func) = split(/\t/);
	$func = SeedUtils::strip_func($func);

	my $pfunc = SeedUtils::strip_func($anno_calls->{$id});

	if ($func eq '' && $pfunc eq '')
	{
	    # neither assigned, don't care.
	}
	elsif ($func eq '' && $pfunc ne '')
	{
	    # anno assigned, we did not
	    $results->{num_calls_anno_only}++;
	    push(@{$results->{calls_anno_only}}, [$id, $pfunc, $func]);
	}
	elsif ($func ne '' && $pfunc eq '')
	{
	    # we assigned, last did not
	    $results->{num_calls_kmer_only}++;
	    push(@{$results->{calls_kmer_only}}, [$id, $pfunc, $func]);
	}
	else
	{
	    # we both assigned, are same?
	    if ($func eq $pfunc)
	    {
		$results->{num_calls_same}++;
	    }
	    else
	    {
		$results->{num_calls_diff}++;
		push(@{$results->{calls_diff}}, [$id, $pfunc, $func]);
	    }
	}
    }

    close(F);

    open(OUT, ">", "$out_dir/$k.$genome") or die "Cannot write $out_dir/$k.$genome: $!";
    print OUT Dump($results);
    close(OUT);
}

sub count_lines
{
    my($f) = @_;
    if (!open(F, "<", $f))
    {
	return (0, 0);
    }
    my $n = 0;
    my %ss;
    while (<F>)
    {
	chomp;
	my($id, $fn, $subsys) = split(/\t/);
	$ss{$subsys}++;
	$n++;
    }
    close(F);
    return $n, int(keys %ss);
}

sub validate
{
    my($dir) = @_;

    -d $dir or die "$dir not a directory\n";
    -d "$dir/tests" or die "$dir has no test data available\n";

    my $base = basename($dir);
    if ($base =~ /^Release(\S+)$/)
    {
	return $1;
    }

    die "$dir doesn't have the right directory name (should be ReleaseXX)\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3