[Bio] / FigKernelScripts / find_genes_based_on_neighbors.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_genes_based_on_neighbors.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (download) (as text) (annotate)
Thu Nov 5 19:45:28 2009 UTC (10 years, 3 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.23: +6 -5 lines
Switch to FFs and FF instead of FigFams and FigFam

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use strict;
use warnings;

use FIG;
my $fig = new FIG;

use FF;
use FFs;

use NewGenome;
use ToCall;

$0 =~ m/([^\/]+)$/o;
my $self  = $1;
my $usage = "usage: find_genes_based_on_neighbors [-fams=FigfamsData] To_Call_Dir FoundFamilies [old] < closest.N.genomes";

if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
    print STDERR "\n   $usage\n\n";
    exit (0);
}

my ($i, $fam_data, $to_call_dir, $N, $foundF);

# First, let's determine which FigfamsData directory to use

for ($i=0; ($i < @ARGV) && ($ARGV[$i] !~ /^-fams=/); $i++) {}
if ($i < @ARGV) {
    if ($ARGV[$i] =~ /^-fams=(\S+)/) {
	if (-d $1) {
	    $fam_data = $1;
	}
	else {
	    die "Nonexistent FIGfams directory $1\n";
	}
	splice(@ARGV,$i,1);
    }
}

$fam_data = $fig->get_figfams_data($fam_data);
my $figfams = new FFs($fam_data, $fig);

(
 ($to_call_dir = shift @ARGV) &&
 ($foundF      = shift @ARGV) 
)
    || die $usage;

if (open(FOUNDFAMS,"<$foundF")) {
    my %found = map { ($_ =~ /^(\S+)/) ? ($1 => 1) : () } <FOUNDFAMS>;
    close(FOUNDFAMS);
}
open(FOUNDFAMS,">>$foundF") || die "could not append-open $foundF";

my $called_by_file = "$to_call_dir/called_by";
open(CALLED_BY, ">>$called_by_file")
    || die "Could not append-open $called_by_file";

my $to_call;
my $keep_original_calls = 0;
if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
    $keep_original_calls = 1;
    $to_call = ToCall->new($to_call_dir);
}
else {
    $to_call = NewGenome->new($to_call_dir);
}

my %fams_to_use;
my @close_genomes = map { $_ =~ /^(\d+\.\d+)/ ? $1 : () } <STDIN>;
my $closest = $close_genomes[0] || die "No closest genome could be read";

if (not $to_call->get_fids_for_type('orf')) {
    print STDERR "Recalling ORFs\n"    if $ENV{VERBOSE};
    $to_call->recall_orfs;
    my $num_called = (@_ = $to_call->get_fids_for_type('orf'));
    print STDERR "Found $num_called\n" if $ENV{VERBOSE};
    $to_call->export_features("orf");
}

foreach my $genome (@close_genomes) {
    foreach my $family ($figfams->families_in_genome($genome)) {
	$fams_to_use{$family} = 1;
    }
}

my $found  = 0;
my $missed = 0;

my($family, $fam_id);
foreach $fam_id (sort keys(%fams_to_use)) {
    print STDERR "Processing family $fam_id\n" if $ENV{VERBOSE};
    
    #...Last arg of &process_famid() says "1 per family"
    my ($got, $sims) = &process_famid($fig, $fam_id, $fam_data, \*FOUNDFAMS, \*CALLED_BY, 0);
    if ($got) {
	++$found;
    }
    else {
	++$missed;
    }
}

close(FOUNDFAMS);
close(CALLED_BY);

my $total = $found + $missed;

print STDERR "found      = $found\n";
print STDERR "missed     = $missed\n";
print STDERR "total      = $total\n";

$to_call->recall_orfs;
$to_call->export_features;

exit(0);



sub process_famid {
    my ($fig, $fam_id, $fam_data, $found_fh, $called_by_fh, $allow_multiple) = @_;
    
    my $fam;
    eval {
        $fam = new FF($fam_id,$fam_data);
    };
    
    if ($@) {
	warn "Error opening FigFam $fam_id $fam_data:\n$@\n";
	return;
    }
    
    if (not defined($fam)) {
	warn "No family found for $fam_id\n";
	return ();
    }
    print STDERR "processing $fam_id ", $fam->family_function, "\n";
    
    my @got = ();
    my @rep_seqs = $fam->representatives;
    print STDERR "\t$fam_id has ", (scalar @rep_seqs), " representative"
	. ((@rep_seqs == 1) ? "" : "s")
	, "\n" if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
    
    my ($rep_seq, $orf_seq, $i, $should_be, $sims);
    my (%seen, $orf_id, $orf, @orfs);
    my $done = 0;
    while ((! $done) && ($rep_seq = shift @rep_seqs)) {
	@orfs = $to_call->candidate_orfs(-seq => $rep_seq);
	print STDERR "\t", (scalar @orfs), " ORF".((@orfs == 1) ? "" : "s")
	    , " from rep-seq $rep_seq\n"
	    if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
	
	while ((! $done) && ($orf = shift @orfs)) {
	    $orf_id = $orf->get_fid;
	    if (! $seen{$orf_id}) {
		$seen{$orf_id} = $orf;
		$done = $allow_multiple ? 0 : 1;
	    }
	}
    }
    
    foreach $orf_id (sort keys(%seen)) {
	$orf = $seen{$orf_id};
	if (defined($orf_seq = $orf->seq())) {
	    my ($should_be, $sims) = $fam->should_be_member($orf_seq,
							    (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1)),
							    1
							    );
	    if ($should_be) {
		my $func = $fam->family_function;
		my $annot = [ qq(RAST),
			      qq($func\nCalled by "$self" using FIGfam $fam_id.)
			      ];
		my $fid  = $orf->promote_to_peg($sims, $func, $annot);
		
		if ($fid) {
		    push  (@got, [$fid, $func, $sims]);
		    print $found_fh     "$fid\t$fam_id\t$func\n";
		    print $called_by_fh "$fid\t$self\n";
		    print STDERR "Succeeded:\t$orf_id --> $fid\n\n";
		}
		else {
		    print STDERR "Could not promote:\t$orf_id\n\n";
		}
	    }
	    else {
		print STDERR "Failed:\t$orf_id\n\n";
	    }
	    
	    print STDERR map { join("\t", @$_) . "\n" } @$sims if $ENV{VERBOSE};
	}
    }
    
    return @got;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3