[Bio] / FigKernelScripts / find_neighbors_using_figfams.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_neighbors_using_figfams.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Tue Jul 7 22:29:05 2009 UTC (10 years, 5 months ago) by gdpusch
Branch: MAIN
CVS Tags: rast_rel_2009_0925, rast_rel_2009_07_09
Changes since 1.9: +45 -31 lines
Major rewrite to prevent already-found PEGs from being re-called by every stage of RAST.

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use strict;
use warnings;

use FIG;
my $fig = FIG->new();

use FigFams;
use ToCall;
use NewGenome;

$0 =~ m/([^\/]+)$/o;
my $self  = $1;
my $usage = "usage: $self [-fams=FigfamsData] To_Call_Dir N FoundFamilies [old] > closest.N.genomes";

if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
    print STDERR "\n   $usage\n\n";
    exit (0);
}

my ($i, $fam_data, $to_call_dir, $N, $foundF);

#...First, let's determine which FigfamsData directory to use:
for ($i=0; ($i < @ARGV) && ($ARGV[$i] !~ /^-fams=/); $i++) {}
if ($i < @ARGV) {
    if ($ARGV[$i] =~ /^-fams=(\S+)/) {
	if (-d $1) {
	    $fam_data = $1;
	}
	else {
	    die "Nonexistent FIGfams directory $1\n";
	}
	splice(@ARGV,$i,1);
    }
}
	
my $figfams = new FigFams($fig, $fam_data);

# Now pick up the normal parameters
(
 ($to_call_dir = shift @ARGV) && 
 ($N           = shift @ARGV) &&
 ($foundF      = shift @ARGV)
)
    || die "\n   $usage\n\n";


### This is a bit complex.  Normally, one uses this program to call genes
# in a newly-sequenced genome.  In this case the functionality of NewGenome.pm
# is needed.  To call genes in an existing genome (to, for example, see how well
# we are doing) you would use ToCall.pm.

my $to_call;
my $keep_original_calls = 0;
if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
    print STDERR qq(Re-annotating original calls\n) if $ENV{VERBOSE};
    $keep_original_calls = 1;
    $to_call = ToCall->new($to_call_dir);
}
else {
    $to_call = NewGenome->new($to_call_dir);
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Open auxiliary files (unless "Keep Original Calls" selected)
#-----------------------------------------------------------------------
open(FOUNDFAMS, ">>$foundF") || die "Could not append-open $foundF";

my $called_by_file = qq($to_call_dir/called_by);
open(CALLED_BY, ">>$called_by_file")
    || die "Could not append-open $called_by_file";


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Get the current ORF-calls if they exist;
#   else, try to call a set of candidate ORFs. 
#-----------------------------------------------------------------------
my @orf_ids = $to_call->get_fids_for_type('orf');
if (not @orf_ids) {
    $to_call->possible_orfs();
    @orf_ids = $to_call->get_fids_for_type('orf');
}

my (%seq_of, %len_of, $orf_len);
for my $fid (@orf_ids) {
    if (($orf_len = $to_call->get_feature_length($fid)) > 900) {
	$len_of{$fid} = $orf_len;
	$seq_of{$fid} = $to_call->get_feature_sequence($fid);
    }
}


my @peg_ids = ();
my %genomes_hit;
my %weight_of_hits;
foreach my $orf_id (sort { $len_of{$b} <=> $len_of{$a} } keys %len_of) {

    print STDERR "\nChecking $orf_id ($len_of{$orf_id})\n" if $ENV{VERBOSE};
    my ($fam, $sims) = $figfams->place_in_family($seq_of{$orf_id});
    if (not defined($fam)) {
	print STDERR "\tCould not place in family --- skipping\n" if $ENV{VERBOSE};
	next;
    }
    else {
	my $fam_id = $fam->family_id();
	my $func   = $fam->family_function();
	print STDERR "\tplaced in $fam_id ("
	    , (scalar @$sims)
	    , " sims)\t$func\n"
	    if $ENV{VERBOSE};
	
	my $annot = [ qq(RAST),
		      qq($func\nCalled by "$self" using FIGfam $fam_id.)
		      ];
	
	my $orf;
	if ($keep_original_calls) {
	    $orf = &ToCall::PEG::new(   'ToCall::PEG',    $to_call, $orf_id, $seq_of{$orf_id});
	}
	else {
	    $orf = &NewGenome::ORF::new('NewGenome::ORF', $to_call, $orf_id);
	}
	
	my $fid  = $orf->promote_to_peg($sims, $func, $annot);
	
	if ($fid) {
	    print CALLED_BY "$fid\t$self\n";
	    print FOUNDFAMS "$fid\t$fam_id\t$func\n";
	}
	else {
	    die "Could not promote $orf_id";
	}
	
	if ($fid) {
	    push @peg_ids, $fid;
	    
	    for ($i=0; ($i < $N) && ($i < @$sims); ++$i) {
		my $org_2 = &FIG::genome_of($sims->[$i]->id2);
		if (not defined($genomes_hit{$org_2})) {
		    $genomes_hit{$org_2}    = 0;
		    $weight_of_hits{$org_2} = 0;
		}
		
		++$genomes_hit{$org_2};
		
		if ($sims->[$i]->ln2) {
		    $weight_of_hits{$org_2} +=
			($sims->[$i]->bsc() / $sims->[$i]->ln2());
		}
	    }
	}
	else {
	    die "Could not promote $orf_id";
	}
    }
    
    last if (@peg_ids >= $N);
}

my $num_pegs = @peg_ids;
warn "Found $num_pegs PEG candidates\n";

if ($num_pegs == 0) {
    die "Could not find any features of sufficient length and in a FIGfam\n";
}

my @best = sort { $weight_of_hits{$b} <=> $weight_of_hits{$a} } keys(%genomes_hit);
for ($i=0; ($i < $N) && ($i < @best); ++$i) {
    print STDOUT (join(qq(\t), ($best[$i],
				$genomes_hit{$best[$i]},
				$weight_of_hits{$best[$i]},
				$fig->genus_species($best[$i])
				)
		       ),
		  qq(\n)
		  );
}
close(CALLED_BY);
close(FOUNDFAMS);

$to_call->export_features();

exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3