[Bio] / FigKernelScripts / find_genes_based_on_neighbors_par.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_genes_based_on_neighbors_par.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Jan 29 19:18:48 2010 UTC (9 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
parallel version of find_genes_based_on_neighbors

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use forks;
use forks::shared;
use Thread::Queue;

use strict;
use warnings;
use Data::Dumper;

use FIG;

use FF;
use FFs;

use NewGenome;
use ToCall;

$0 =~ m/([^\/]+)$/o;
my $self  = $1;
my $usage = "usage: find_genes_based_on_neighbors [-fams=FigfamsData] To_Call_Dir FoundFamilies [old] < closest.N.genomes";

if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
    print STDERR "\n   $usage\n\n";
    exit (0);
}

my ($i, $fam_data, $to_call_dir, $N, $foundF);

# First, let's determine which FigfamsData directory to use

for ($i=0; ($i < @ARGV) && ($ARGV[$i] !~ /^-fams=/); $i++) {}
if ($i < @ARGV) {
    if ($ARGV[$i] =~ /^-fams=(\S+)/) {
	if (-d $1) {
	    $fam_data = $1;
	}
	else {
	    die "Nonexistent FIGfams directory $1\n";
	}
	splice(@ARGV,$i,1);
    }
}

$fam_data = &FIG::get_figfams_data($fam_data);
my $figfams = new FFs($fam_data);

(
 ($to_call_dir = shift @ARGV) &&
 ($foundF      = shift @ARGV) 
)
    || die $usage;

if (open(FOUNDFAMS,"<$foundF")) {
    my %found = map { ($_ =~ /^(\S+)/) ? ($1 => 1) : () } <FOUNDFAMS>;
    close(FOUNDFAMS);
}
open(FOUNDFAMS,">>$foundF") || die "could not append-open $foundF";

my $called_by_file = "$to_call_dir/called_by";
open(CALLED_BY, ">>$called_by_file")
    || die "Could not append-open $called_by_file";

my $to_call;
my $keep_original_calls = 0;
if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
    $keep_original_calls = 1;
    $to_call = ToCall->new($to_call_dir);
}
else {
    $to_call = NewGenome->new($to_call_dir);
}

my %fams_to_use;
my @close_genomes = map { $_ =~ /^(\d+\.\d+)/ ? $1 : () } <STDIN>;
my $closest = $close_genomes[0] || die "No closest genome could be read";

if (not $to_call->get_fids_for_type('orf')) {
    print STDERR "Recalling ORFs\n"    if $ENV{VERBOSE};
    $to_call->recall_orfs;
    my $num_called = (@_ = $to_call->get_fids_for_type('orf'));
    print STDERR "Found $num_called\n" if $ENV{VERBOSE};
    $to_call->export_features("orf");
}

foreach my $genome (@close_genomes) {
    foreach my $family ($figfams->families_in_genome($genome)) {
	$fams_to_use{$family} = 1;
    }
}


my @fams_work = sort keys %fams_to_use;
my $total = @fams_work;
my %fams_found;

my $work_queue = new Thread::Queue(@fams_work);
my $result_queue = new Thread::Queue();

my $procs= $ENV{PROCS};
map { $work_queue->enqueue(undef) } 1..$procs;

my @threads = map { threads->create(\&thread_main, $_) } 1..$procs;

sub thread_main
{
    my($id) = @_;
    print "started thread $id\n";

    while (my $fam_id = $work_queue->dequeue())
    {
	print STDERR "$id Processing family $fam_id\n" if $ENV{VERBOSE};
	
	#...Last arg of &process_famid() says "1 per family"

	&process_famid($id, $fam_id, $fam_data, 0);
    }
    $result_queue->enqueue(undef);
}

my $n_left = $procs;
while ($n_left)
{
    my $dat = $result_queue->dequeue();
    if ($dat)
    {
	process_famid_result($dat, \*FOUNDFAMS, \*CALLED_BY);
    }
    else
    {
	$n_left--;
    }
}

map { $_->join() } @threads;

close(FOUNDFAMS);
close(CALLED_BY);

my $found  = keys %fams_found;
my $missed = $total - $found;


print STDERR "found      = $found\n";
print STDERR "missed     = $missed\n";
print STDERR "total      = $total\n";

$to_call->recall_orfs;
$to_call->export_features;

exit(0);

sub process_famid {
    my ($thread_id, $fam_id, $fam_data, $allow_multiple) = @_;
    
    my $fam;
    eval {
        $fam = new FF($fam_id,$fam_data);
    };
    
    if ($@) {
	warn "Error opening FigFam $fam_id $fam_data:\n$@\n";
	return;
    }
    
    if (not defined($fam)) {
	warn "No family found for $fam_id\n";
	return ();
    }
    print STDERR "thread $thread_id processing $fam_id ", $fam->family_function, "\n";
    
    my @got = ();
    my @rep_seqs = $fam->representatives;
    print STDERR "\t$fam_id has ", (scalar @rep_seqs), " representative"
	. ((@rep_seqs == 1) ? "" : "s")
	, "\n" if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
    
    my ($rep_seq, $orf_seq, $i, $should_be, $sims);
    my (%seen, $orf_id, $orf, @orfs);
    my $done = 0;
    while ((! $done) && ($rep_seq = shift @rep_seqs)) {
	@orfs = $to_call->candidate_orfs(-seq => $rep_seq);
	print STDERR "\t", (scalar @orfs), " ORF".((@orfs == 1) ? "" : "s")
	    , " from rep-seq $rep_seq\n"
	    if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
	
	while ((! $done) && ($orf = shift @orfs)) {
	    $orf_id = $orf->get_fid;
	    if (! $seen{$orf_id}) {
		$seen{$orf_id} = $orf;
		$done = $allow_multiple ? 0 : 1;
	    }
	}
    }
    
    foreach $orf_id (sort keys(%seen)) {
	$orf = $seen{$orf_id};
	if (defined($orf_seq = $orf->seq())) {
	    print "$orf_id $orf_seq\n";
	    my ($should_be, $sims) = $fam->should_be_member($orf_seq,
							    (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1)),
							    1
							    );
	    print "should_be=$should_be\n";
	    if (ref($sims))
	    {
		print "  $_\n" for @$sims;
	    }
	    if ($should_be) {
		my $func = $fam->family_function;
		my $annot = [ qq(RAST),
			      qq($func\nCalled by "$self" using FIGfam $fam_id.)
			      ];
		$result_queue->enqueue([$fam_id, $orf_id, $sims, $func, $annot]);
	    }
	}
    }
}

sub process_famid_result {
    my($res, $found_fh, $called_by_fh) = @_;

    my($fam_id, $orf_id, $sims, $func, $annot)  = @$res;

    if (!$to_call->is_feature($orf_id))
    {
	print "$orf_id already promoted\n";
	return;
    }

    my $orf = NewGenome::ORF->new($to_call, $orf_id);
    my $fid  = $orf->promote_to_peg($sims, $func, $annot);

    my $ret = 0;
		
    if ($fid) {

	print $found_fh     "$fid\t$fam_id\t$func\n";
	print $called_by_fh "$fid\t$self\n";
	print STDERR "Succeeded:\t$orf_id --> $fid\n\n";
	$fams_found{$fam_id}++;
    }
    else {
	print STDERR "Could not promote:\t$orf_id\n\n";
    }

}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3