[Bio] / FigKernelScripts / get_PSEED_assignment_updates.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_PSEED_assignment_updates.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Mon Feb 14 22:47:40 2011 UTC (9 years ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011
Changes since 1.4: +149 -28 lines
More fixups.

use strict;
use SeedEnv;
use Getopt::Long;
use Parallel::Fork::BossWorker;
use FileHandle;
use Data::Dumper;

my $ff_dir;
my @other_genome_files;
my $output_file;
my $log_file;
my $parallel = 0;
my $force_genome;
my $fams_file;
my $override_file;
my $pseed_file;

my $rc = GetOptions("ff=s" => \$ff_dir,
		    "override=s" => \$override_file,
		    "genomes=s" => \@other_genome_files,
		    "force=s" => \$force_genome,
		    "output=s" => \$output_file,
		    "log=s" => \$log_file,
		    "fams=s" => \$fams_file,
		    "parallel=i" => \$parallel,
		    "pseed-genomes=s" => \$pseed_file,
		   );

if (!$rc || @ARGV != 0)
{
    die "Usage: get_PSEED_assignment_updates [-ff figfam-directory] [-genomes genome-file] [-output output-file] [-log log-file]\n";
}

my %override_fns;
if ($override_file ne '')
{
    if (open(FF,"<", $override_file))
    {
	while (<FF>)
	{
	    chomp;
	    my($id, $func) = split(/\t/);
	    $override_fns{$id} = $func;
	}
	close(FF);
    }
    else
    {
	die "Cannot open override file $override_file: $!";
    }
}

#
# Force to anno seed.
#
delete $ENV{SAS_SERVER};
my $sapO = SAPserver->new();
my $annO;

if ($ff_dir)
{
    #
    # Defined below.
    #
    $annO = AnnoUsingFFDir->new($ff_dir);
}
else
{
    $annO = ANNOserver->new();
}


# $genH has the Annotator SEED genomes
my $genH = $sapO->all_genomes( -complete => 1);

#
# Other genomes we might want to include (e.g. phages)
#

for my $f (@other_genome_files)
{
    open(F, "<", $f) or die "cannot open $f: $!";
    my $n;
    while (<F>)
    {
	if (/(\d+\.\d+)/)
	{
	    $genH->{$1}++;
	    $n++;
	}
    }
    close(F);
    print "Loaded $n genomes from $f\n";
}

my $output_fh = \*STDOUT;
if (defined($output_file))
{
    open($output_fh, ">", $output_file) or die "Cannot open output $output_file: $!";
    $output_fh->autoflush(1);
}

my $log_fh;
if (defined($log_file))
{
    open($log_fh, ">", $log_file) or die "Cannot open log $log_file: $!";
    $log_fh->autoflush(1);
}

my $fams_fh;
if (defined($fams_file))
{
    open($fams_fh, ">", $fams_file) or die "Cannot open fams file $fams_file: $!";
    $fams_fh->autoflush(1);
}


use FIG;
use FIG_Config;
my $fig = new FIG;  # This references the PSEED;

if ($FIG_Config::data !~ m,/vol/pseed/,)
{
    die "$0 must be invoked after sourcing the PSEED environment\n";
}

my $nA = keys(%$genH);
my $nP = $fig->genomes('complete');

if ($nP < (2 * $nA)) { die "nA=$nA nP=$nP" }

#
# Compute work list.
#

my @work;

my @pseed_genomes = $fig->genomes;
@pseed_genomes = ($force_genome) if defined($force_genome);

#
# Support reading the file of jobs that Maulik provides.
#
my %genome_job;
if ($pseed_file ne '')
{
    @pseed_genomes = ();
    open(PS, "<", $pseed_file) or die "Cannot open pseed genomes file $pseed_file: $!";
    while (<PS>)
    {
	chomp;
	my($jobid, undef, $genome) = split(/\t/);

	if ($genome eq '')
	{
	    open(F, "<", "/vol/rast-prod/jobs/$jobid/GENOME_ID") or die "cannot read /vol/rast-prod/jobs/$jobid/GENOME_ID: $!";
	    $genome = <F>;
	    chomp $genome;
	    if ($genome eq '')
	    {
		die "bad /vol/rast-prod/jobs/$jobid/GENOME_ID";
	    }
	}

	if (-d "/vol/pseed/FIGdisk/FIG/Data/Organisms/$genome")
	{
	    warn "$genome already in pseed\n";
	    next;
	}
	$genome_job{$genome} = $jobid;
	push(@pseed_genomes, $genome);
    }
    close(PS);
}
    
foreach my $genome (sort {$a <=> $b } @pseed_genomes)   # for each PSEED genome
{
    if ($genome_job{$genome})
    {
	push(@work, [$genome, "rast", $genome_job{$genome}]);
    }
    elsif ($genH->{$genome})
    {
	push(@work, [$genome, "anno"]);
    }
    else
    {
	push(@work, [$genome, 'non-anno']);
    }
}

if (!$parallel)
{
    for my $ent (@work)
    {
	my ($genome, $kind, $job) = @$ent;

	my @out;
	if ($kind eq 'anno')
	{
	    @out = &process_annotator_seed_genome($sapO,$fig,$genome);
	}
	elsif ($kind eq 'rast')
	{
	    @out = &process_rast_genome($annO,$fig,$genome, $job);
	}
	else
	{
	    @out = &process_non_anno_seed_genome($annO,$fig,$genome);
	}
	for my $ent (@out)
	{
	    my($fid, $old, $new, $fam, $score, $hits, $ovhits, $details) = @$ent;

	    print $output_fh "$fid\t$new\t$fam\n";
	    if ($log_fh)
	    {
		print $log_fh "$fid\t$old\t$new\t$fam\t$score\t$hits\t$ovhits\n";
		if (ref($details))
		{
		    print $log_fh "\t", join("\t", @$_), "\n" for @$details;
		}
	    }
	}
    }
    exit;
}

#
# Else try the boss-worker setup.
#

undef $fig;

my $mgr = Parallel::Fork::BossWorker->new(work_handler => \&handle_work,
					  result_handler => \&handle_result,
					  worker_count => $parallel);
map { $mgr->add_work({ item => $_ }) } @work;
$mgr->process();

my $worker_initialized = 0;

sub handle_work
{
    my($item) = @_;

    if (!$worker_initialized)
    {
	$fig = new FIG;
	$worker_initialized = 1;
    }
    my($genome, $kind, $job) = @{$item->{item}};

    my($out, $fams);
    if ($kind eq 'anno')
    {
	($out, $fams) = &process_annotator_seed_genome($sapO,$fig,$genome);
    }
    elsif ($kind eq 'rast')
    {
	($out, $fams) = &process_rast_genome($annO,$fig,$genome, $job);
    }
    else
    {
	($out, $fams) = &process_non_anno_seed_genome($annO,$fig,$genome);
    }
    return { output => $out, fams => $fams };
}

sub handle_result
{
    my($res) = @_;

    my $list = $res->{output};
    if (ref($list) eq 'ARRAY')
    {
	for my $ent (@$list)
	{
	    my($fid, $old, $new, $fam, $score, $hits, $ovhits, $details) = @$ent;
	    print $output_fh "$fid\t$new\t$fam\n";
	    if ($log_fh)
	    {
		print $log_fh "$fid\t$old\t$new\t$fam\t$score\t$hits\t$ovhits\n";
		if (ref($details))
		{
		    print $log_fh "\t", join("\t", @$_), "\n" for @$details;

		}
	    }
	}
    }

    if (ref(my $fhash = $res->{fams}) eq 'HASH')
    {
	for my $peg (sort { &FIG::by_fig_id($a, $b) } keys %$fhash)
	{
	    my $fam = $fhash->{$peg};
	    print $fams_fh "$peg\t$fam\n" if $fam ne '';
	}
    }
}

sub write_output
{
    my($handle, $txt) = @_;
    if (ref($handle) eq 'ARRAY')
    {
	push(@$handle, $txt);
    }
    else
    {
	print $handle $txt;
    }
}

sub process_annotator_seed_genome {
    my($sapO,$fig,$genome) = @_;

    print STDERR "$genome: processing Annotators SEED genome\n";

    #
    # We call with kmers to determine the updated FIGfam membership.
    #
    
    my $fasta = "$FIG_Config::organisms/$genome/Features/peg/fasta";
    my($kmer_funcs, $kmer_fams, $kmer_score) = call_with_kmers($fasta, $annO);

    my $tmpH = $sapO->all_features( -ids => [$genome], -type => ['peg'] );
    my $pegs = $tmpH->{$genome};
    my $pegH = $sapO->ids_to_functions( -ids => $pegs );

    my @pegs = $fig->all_features($genome,'peg');
    my $fig_funcs = $fig->function_of_bulk(\@pegs);
    my $n = 0;
    my @out;
    foreach my $peg (@pegs)
    {
	my $func = $fig_funcs->{$peg};
	$func = $override_fns{$peg} if exists $override_fns{$peg};

	my $fam = $kmer_fams->{$peg};

	if ($pegH->{$peg} && ((! $func) || ($func ne $pegH->{$peg})))
	{
	    push(@out, [$peg, $func, $pegH->{$peg}, $fam]);
	    $n++;
	}
    }
    print STDERR "$genome: $n updates\n";
    return (\@out, $kmer_fams);
}


sub process_non_anno_seed_genome {
    my($annO,$fig,$genome) = @_;

    print STDERR "$genome: processing non-Annotator SEED genome\n";
    my $fasta = "$FIG_Config::organisms/$genome/Features/peg/fasta";
    my($kmer_funcs, $kmer_fams, $kmer_score, $pegs) = call_with_kmers($fasta, $annO);

    my $fig_funcs = $fig->function_of_bulk([keys %$kmer_funcs]);

    my @out;
    my $n = 0;
    for my $peg (@$pegs)
    {
	my $funcK = $kmer_funcs->{$peg};
	my $funcP = $fig_funcs->{$peg};
	$funcP = $override_fns{$peg} if exists $override_fns{$peg};
	
	if ($funcK && ((! $funcP) || ($funcK ne $funcP)))
	{
	    push(@out, [$peg, $funcP, $funcK, $kmer_fams->{$peg}, @{$kmer_score->{$peg}}]);
	    $n++;
	}
    }
    print STDERR "$genome: $n updates\n";
    return (\@out, $kmer_fams);
}

sub process_rast_genome {
    my($annO,$fig,$genome, $job) = @_;

    print STDERR "$genome: processing RAST genome\n";
    my $fasta = "/vol/rast-prod/jobs/$job/rp/$genome/Features/peg/fasta";
    my($kmer_funcs, $kmer_fams, $kmer_score, $pegs) = call_with_kmers($fasta, $annO);

    my @out;
    my $n = 0;
    for my $peg (@$pegs)
    {
	my $funcK = $kmer_funcs->{$peg};
	
	push(@out, [$peg, '', $funcK, $kmer_fams->{$peg}, @{$kmer_score->{$peg}}]);
	$n++;
    }
    print STDERR "$genome: $n updates\n";
    return (\@out, $kmer_fams);
}

sub call_with_kmers
{
    my($fasta, $annO) = @_;

    my $fh;
    open($fh,"<",$fasta) || die "failed to open $fasta";
    my $handle = $annO->assign_function_to_prot(-input => $fh,
						-kmer => 8,
						-seqHitThreshold => 2,
						-detailed => 0,
						-hitThreshold => 3 );

    my %kmer_funcs;
    my %kmer_fams;
    my %kmer_score;
    my @pegs;
    while (my $result = $handle->get_next)
    {
	my($peg, $funcK, $otu, $score, $nonoverlap_hits, $overlap_hits, $details, $fam) = @$result;

	$kmer_funcs{$peg} = $funcK;
	$kmer_fams{$peg} = $fam;
	$kmer_score{$peg} = [$score, $nonoverlap_hits, $overlap_hits, $details];
	push(@pegs, $peg);
    }
    close($fh);
    return(\%kmer_funcs, \%kmer_fams, \%kmer_score, \@pegs);
}

package AnnoUsingFFDir;
use FIG;
use strict;
use Kmers;
use Data::Dumper;

sub new
{
    my($class, $ff_dir) = @_;

    -d $ff_dir or die "Invalid ff dir $ff_dir";

    my $kmer = 8;
    my $friDB = "$ff_dir/FRI.db";
    my $setIDB = "$ff_dir/setI.db";
    my $table = "$ff_dir/Merged/$kmer/table.binary";

    my $kmers = Kmers->new(-frIdb => $friDB, -setIdb => $setIDB, -table => $table);
    my $self = {
	kmers => $kmers,
    };
    return bless $self, $class;
}

sub assign_function_to_prot
{
    my($self, %args) = @_;

    my $in = delete $args{-input};

    my $handle =  Getter->new($self->{kmers}, $in, \%args);
    return $handle;
}

package Getter;
use FIG;
use strict;

sub new
{
    my($class, $kmers, $inp, $args) = @_;

    my $self = {
	kmers => $kmers,
	input => $inp,
	args => $args,
    };
    return bless $self, $class;
}

sub get_next
{
    my($self) = @_;


    while (my($id, $seqp, $com) = &FIG::read_fasta_record($self->{input}))
    {
	my @res = $self->{kmers}->assign_functions_to_prot_set(-seqs => [[$id, $com, $$seqp]], %{$self->{args}});
	if (@res)
	{
	    return $res[0];
	}
    }

    return undef;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3