[Bio] / FigKernelScripts / p3x-compare-patric-calls.pl Repository:
ViewVC logotype

View of /FigKernelScripts/p3x-compare-patric-calls.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Nov 2 16:25:31 2016 UTC (3 years ago) by olson
Branch: MAIN
CVS Tags: HEAD
P3 utility scripts.

use Data::Dumper;
use File::Copy;
use File::Basename;
use POSIX;
use strict;
use Proc::ParallelLoop;
use Getopt::Long::Descriptive;
use IPC::Run 'run';
use IO::Handle;
use gjoseqlib;
use SeedUtils;
use Proc::ParallelLoop;

#
# Wishlist:
# PEG ID,
# newly generated annotation
# currently existing annotation in PATRIC
# genome ID
# organism name
# local family identifier
# global family identifier
#  

my($opt, $usage) = describe_options("%c %o output-dir summary-output-basename",
				    ["amr-roles=s", "File of AMR roles"],
				    ["parallel|p=i", "Number of parallel processe", { default => 1 }],
				    ["genome-dir=s", "Directory holding PATRIC genome data", { default => "/vol/patric3/downloads/genomes" }],
				    ["help|h", "Show this help message."],
				    );
print($usage->text), exit 0 if $opt->help;
die($usage->text) unless @ARGV == 2;

my $out_dir = shift;
my $sum_file_base = shift;

my $diff_file = "$sum_file_base.changed";
my $nocall_file = "$sum_file_base.no_call";
my $summary_file = "$sum_file_base.summary";
my $amr_file;

my %amr_roles;

if ($opt->amr_roles)
{
    open(A, "<", $opt->amr_roles) or die "Cannot open AMR roles file " . $opt->amr_roles . ": $!\n";
    while (<A>)
    {
	chomp;
	s/^\s+//;
	s/\s+$//;
	$amr_roles{$_}++;
    }
    close(A);
    $amr_file = "$sum_file_base.amr_features";
    open(AMR, ">", $amr_file) or die "Cannot write $amr_file: $!";
}

-d $out_dir or die "Output directory $out_dir does not exist\n";

open(DIFF, ">", $diff_file) or die "Cannot open $diff_file: $!\n";
open(NOCALL, ">", $nocall_file) or die "Cannot open $nocall_file: $!\n";
open(SUMMARY, ">", $summary_file) or die "Cannot open $summary_file: $!\n";

SUMMARY->autoflush(1);

my $genome_dir = $opt->genome_dir;

my @workbits;

for my $top (<$out_dir/*>)
{
    for my $gfile (<$top/*>)
    {
	my $g = basename($gfile);
	my $prot = "$genome_dir/$g/$g.PATRIC.faa";

	push(@workbits, [$gfile, $g, $prot]);
    }
}

my $n = @workbits;
my $per = ceil($n / $opt->parallel);

print "$n workbits, per=$per\n";

my @work;
my $n = 1;
my @idx;
while (@workbits)
{
    push(@idx, $n);
    my @some = splice(@workbits, 0, $per);
    my $nsome = @some;
    print "Work item $n has $nsome\n";
    push(@work, [$n, [@some]]);
    $n++;
}
my $nwork = @work;
print "NW=$nwork\n";

pareach \@work, sub {
    my($work) = @_;
    my($idx, $worklist) = @$work;

    open(DIFF_L, ">", $diff_file . ".p$idx") or die "Cannot open $diff_file.p$idx: $!\n";
    open(NOCALL_L, ">", $nocall_file . ".p$idx") or die "Cannot open $nocall_file.p$idx: $!\n";
    open(SUMMARY_L, ">", $summary_file . ".p$idx") or die "Cannot open $summary_file.p$idx: $!\n";
    SUMMARY_L->autoflush(1);
    if ($amr_file)
    {
	open(AMR_L, ">", $amr_file . ".p$idx") or die "Cannot write $amr_file.p$idx: $!";
    }
    
    for my $ent (@$worklist)
    {
	my($gfile, $g, $prot) = @$ent;

	eval {
	    
	    my %calls;
	    
	    open(G, "<", $gfile) or die "Cannot open $gfile: $!";
	    while (<G>)
	    {
		chomp;
		my($id, $fn, $score) = split(/\t/);
		$calls{$id} = [$fn, $score];
	    }
	    close(G);
	    
	    open(G, "<", $prot) or die "Cannot open $prot: $!";
	    my $same = my $diff = my $nocall = 0;
	    while (<G>)
	    {
		next unless /^>/;
		my($id, $fid, $pfn, $org) = /^>((fig\|[^|]+).+)\s{3}(.*)\s{3}(\[.*)/;
		if (!$fid)
		{
		    warn "Cannot parse $_\n";
		    next;
		}		
		
		my $call = $calls{$id};
		
		if ($call)
		{
		    my($fn, $score) = @$call;
		    
		    if ($amr_file)
		    {
			my $have_amr = grep { $amr_roles{$_} } SeedUtils::roles_of_function($fn);
			if ($have_amr)
			{
			    if ($fn eq $pfn)
			    {
				print AMR_L "$fid\t$fn\n";
			    }
			    else
			    {
				print AMR_L "$fid\t$fn\t$pfn\n";
			    }
			}
		    }
		    
		    if ($fn ne $pfn)
		    {
			print DIFF_L "$fid\t$pfn\t$fn\t$score\n";
			$diff++;
			
		    }
		    else
		    {
			$same++;
		    }
		}
		else
		{
		    print NOCALL_L "$fid\t$pfn\n";
		    $nocall++;
		}
	    }
	    print SUMMARY_L "$g\t$same\t$diff\t$nocall\n";
	    close(G);
	};
	if ($@)
	{
	    warn "Error processing $gfile, $g, $prot: $@";
	}
    }
    close(DIFF_L);
    close(NOCALL_L);
    close(SUMMARY_L);
    close(AMR_L) if $amr_file;
}, { Max_Workers => $opt->parallel};

for my $i (@idx)
{
    open(D, "<", "$diff_file.p$i") or die "Cannot open $diff_file.p$i: $!";
    copy(\*D, \*DIFF);
    close(D);

    open(D, "<", "$nocall_file.p$i") or die "Cannot open $nocall_file.p$i: $!";
    copy(\*D, \*NOCALL);
    close(D);

    open(D, "<", "$summary_file.p$i") or die "Cannot open $summary_file.p$i: $!";
    copy(\*D, \*SUMMARY);
    close(D);
    if ($amr_file)
    {
	open(D, "<", "$amr_file.p$i") or die "Cannot open $amr_file.p$i: $!";
	copy(\*D, \*AMR);
	close(D);
    }
}
	  

close(DIFF);
close(NOCALL);
close(SUMMARY);
close(AMR) if $amr_file;


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3