[Bio] / FigKernelScripts / find_neighbors.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_neighbors.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.39 - (download) (as text) (annotate)
Thu Nov 11 17:42:31 2010 UTC (9 years, 3 months ago) by disz
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.38: +1 -1 lines
FigFam 6 digit problem

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

use strict;
use warnings;

use FIG;
my $fig = FIG->new();

use FF;

use ToCall;
use NewGenome;

$0 =~ m/([^\/]+)$/o;
my $self  = $1;
my $usage = "usage:  $self  [-fams=FigfamsData] To_Call_Dir NumNeighbors FoundFamilies [old]  >  closest.N.genomes";

if (@ARGV && ($ARGV[0] =~ m/-h(elp)?/)) {
    print STDERR "\n   $usage\n\n";
    exit (0);
}

my ($fam_data, $to_call_dir, $N, $foundF);

# First, let's determine which FigfamsData directory to use

if (@ARGV && ($ARGV[0] =~ /^-{1,2}fams=(\S+)/)) {
    if (-d $1) {
	$fam_data = $1;
    }
    else {
	die "Nonexistent FIGfams directory $1\n";
    }
    shift @ARGV;
}

$fam_data = &FIG::get_figfams_data($fam_data);
	
# Now pick up the normal parameters
(
 ($to_call_dir = shift @ARGV) && 
 ($N           = shift @ARGV) &&
 ($foundF      = shift @ARGV)
)
    || die "\n   $usage\n\n";


### This is a bit complex.  Normally, one uses this program to call genes
# in a newly-sequenced genome.  In this case the functionality of NewGenome.pm
# is needed.  To call genes in an existing genome (to, for example, see how well
# we are doing) you would use ToCall.pm.

my $to_call;
my $keep_original_calls = 0;
if ((@ARGV > 0) && ($ARGV[0] =~ /^old/i)) {
    $keep_original_calls = 1;
    $to_call = ToCall->new($to_call_dir);
}
else {
    $to_call = NewGenome->new($to_call_dir);
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Open auxiliary files (unless "Keep Original Calls" selected)
#-----------------------------------------------------------------------
open(FOUNDFAMS, ">>$foundF") || die "Could not append-open $foundF";

my $called_by = qq($to_call_dir/called_by);
open(CALLED_BY, ">>$called_by")
    || die "Could not append-open $to_call_dir/called_by";


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Ask the $to_call object to compute itself a set of candidate ORFs:
#-----------------------------------------------------------------------
my @orf_ids = $to_call->get_fids_for_type('orf');
if (not @orf_ids) {
    $to_call->possible_orfs();
    @orf_ids = $to_call->get_fids_for_type('orf');
    print STDERR (qq(\n>>> Found ), (scalar @orf_ids), qq( possible ORFs\n\n)) if $ENV{VERBOSE};
}


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Initialize hash keyed by the "Universal" protein family functions:
#-----------------------------------------------------------------------
my %univ = map { chomp $_; ($_ => 1) } <DATA>;


#...Find the FIGfam IDs corresponding to the "Universal" families:
my @univ = ();
if (-s "$fam_data/family.functions") {
    open(FAM_FUNCS, "<$fam_data/family.functions")
	|| die "Could not read-open $fam_data/family.functions";
    my $entry;
    while (defined($entry = <FAM_FUNCS>)) {
	chomp $entry;
	if ($entry =~ m/^(FIG\d+)\t(.*)/) {
	    my ($fam_id, $func) = ($1, $2);
	    $func =~ s/^FIG\d+:\s+//o;    #...trim off leading FIGfam ID,
	    $func =~ s/\s+\#\.*$//o;        #...and trailing comment...
	    if ($univ{$func}) {
		push @univ, $fam_id;
		print STDERR "Universal $fam_id:\t$func\n" if $ENV{VERBOSE};
	    }
	}
	else {
	    die "Could not parse $fam_data/family.functions entry: $entry";
	}
    }
}
else {
    die "Could not read $fam_data/family.functions";
}

print STDERR
    ( qq(\n>>> Found ),
      (scalar @univ),
      qq( FIGfams matching a "Universal" function in $fam_data/family.functions:\n\n),
      )
    if $ENV{VERBOSE};



#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Foreach "Universal" family, find the genome's best-matching
#   candidate ORF and its sims against that FIGfam.
#   Count the number of times each genome "wins the vote,
#   and add up the "normalized bit-scores" for its hits.
#-----------------------------------------------------------------------
my ($i,$rep,$fam,@rep_seqs,$rep_seq,@orfs,$orf, $orf_seq, @fids,$sims);
my (%genomes_hit, %weight_of_hits);
foreach my $fam_id (@univ) {
    print STDERR "Processing family $fam_id\n" if $ENV{VERBOSE};
    
    #...Last arg of &process_famid() means "Return only one hit per family"
    my @got = &process_famid($fig, $fam_id, $fam_data, \*FOUNDFAMS, \*CALLED_BY, 0);
    
    foreach my $hit (@got) {
	my $sims = $hit->[2];
	for ($i=0; ($i < $N) && ($i < @$sims); ++$i) {
	    my $org_2 = &FIG::genome_of($sims->[$i]->id2);
	    if (not defined($genomes_hit{$org_2})) {
		$genomes_hit{$org_2}    = 0;
		$weight_of_hits{$org_2} = 0;
	    }
	    
	    ++$genomes_hit{$org_2};
	    
	    if ($sims->[$i]->ln2) {
		$weight_of_hits{$org_2} +=
		    ($sims->[$i]->bsc() / $sims->[$i]->ln2());
	    }
	}
    }
}

my @best = sort { $weight_of_hits{$b} <=> $weight_of_hits{$a} } keys(%genomes_hit);
for ($i=0; ($i < $N) && ($i < @best); ++$i) {
    print STDOUT (join(qq(\t), ($best[$i],
				$genomes_hit{$best[$i]},
				$weight_of_hits{$best[$i]},
				$fig->genus_species($best[$i])
				)
		       ),
		  qq(\n)
		  );
}
close(FOUNDFAMS);
close(CALLED_BY);

$to_call->export_features;

exit(0);



sub process_famid {
    my ($fig, $fam_id, $fam_data, $found_fh, $called_by_fh, $allow_multiple) = @_;
    
    my $fam;
    eval {
        $fam = new FF($fam_id,$fam_data);
    };
    
    if ($@) {
	warn "Error opening FigFam $fam_id $fam_data:\n$@\n";
	return ();
    }
    
    if (not defined($fam)) {
	warn "No family found for $fam_id\n";
	return ();
    }
    
    print STDERR (qq(processing $fam_id: ),
		  $fam->family_function,
		  ($ENV{VERBOSE} ? qq(\n) : qq(\t)),
		  );
    
    my @got = ();
    my @rep_seqs = $fam->representatives;
    print STDERR "\t$fam_id has ", (scalar @rep_seqs), " representative"
	. ((@rep_seqs == 1) ? "" : "s")
	, "\n" if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
    
    my ($rep_seq, $orf_seq, $i, $should_be, $sims);
    my (%seen, $orf_id, $orf, @orfs);
    my $done = 0;
    while ((! $done) && ($rep_seq = shift @rep_seqs)) {
	@orfs = $to_call->candidate_orfs(-seq => $rep_seq);
	print STDERR "\t", (scalar @orfs), " ORF".((@orfs == 1) ? "" : "s")
	    , " from rep-seq $rep_seq\n"
	    if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
	
	while ((! $done) && ($orf = shift @orfs)) {
	    $orf_id = $orf->get_fid;
	    if (! $seen{$orf_id}) {
		$seen{$orf_id} = $orf;
#		die Dumper($orf);
		$done = $allow_multiple ? 0 : 1;
	    }
	}
    }
    
    foreach $orf_id (sort keys(%seen)) {
	$orf = $seen{$orf_id};
	if (defined($orf_seq = $orf->seq)) {
	    my ($should_be, $sims) = $fam->should_be_member($orf_seq,
							    (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1)),
							    1
							    );
	    
	    @$sims = grep { defined($_->ln1()) && defined($_->ln2())
				&& ($_->ln1() > 0) && ($_->ln2() > 0)
			    } @$sims;
	    
	    if ($should_be) {
		my $func  = $fam->family_function();
		my $annot = [ qq(RAST),
			     qq($func\nCalled by "$self" using FIGfam $fam_id.)
			     ];
		my $fid   = $orf->promote_to_peg($sims, $func, $annot);
		
		push  (@got, [$fid, $func, $sims]);
		print $found_fh     "$fid\t$fam_id\t$func\n";    #...Write entry in 'found' file...
		print $called_by_fh "$fid\t$self\n";
		
		print STDERR "Succeeded: $orf_id --> $fid\n" if $ENV{VERBOSE};
	    }
	    else {
		print STDERR "Failed: $orf_id\n" if $ENV{VERBOSE};
	    }
	    
	    print STDERR (qq(Sims:\n),
			  map { join(qq(\t), @$_) . "\n" } @$sims
			  ) if $ENV{VERBOSE};
	}
	else {
	    die "Something is wrong --- could not get translation for $orf_id";
	}
    }
    print STDERR (qq(\t--- found ),
		  (scalar @got),
		  ($ENV{VERBOSE} ? qq(\n\n) : qq(\n)),
		  );
    
    return @got;
}

__DATA__
Phenylalanyl-tRNA synthetase beta chain (EC 6.1.1.20)
Prolyl-tRNA synthetase (EC 6.1.1.15)
Phenylalanyl-tRNA synthetase alpha chain (EC 6.1.1.20)
Histidyl-tRNA synthetase (EC 6.1.1.21)
Arginyl-tRNA synthetase (EC 6.1.1.19)
Tryptophanyl-tRNA synthetase (EC 6.1.1.2)
Tyrosyl-tRNA synthetase (EC 6.1.1.1)
Methionyl-tRNA synthetase (EC 6.1.1.10)
Threonyl-tRNA synthetase (EC 6.1.1.3)
Valyl-tRNA synthetase (EC 6.1.1.9)
Preprotein translocase secY subunit (TC 3.A.5.1.1)
DNA primase (EC 2.7.7.-)
Ribosome recycling factor
Translation initiation factor 3
Transcription termination protein NusA
Phosphoglycerate kinase (EC 2.7.2.3)
Triosephosphate isomerase (EC 5.3.1.1)
CTP synthase (EC 6.3.4.2)
DNA-directed RNA polymerase beta subunit (EC 2.7.7.6)
DNA-directed RNA polymerase beta' subunit (EC 2.7.7.6)
LSU ribosomal protein L1p (L10Ae)
LSU ribosomal protein L2p (L8e)
LSU ribosomal protein L3p (L3e)
LSU ribosomal protein L4p (L1e)
LSU ribosomal protein L5p (L11e)
LSU ribosomal protein L6p (L9e)
LSU ribosomal protein L11p (L12e)
LSU ribosomal protein L7/L12 (L23e)
LSU ribosomal protein L13p (L13Ae)
LSU ribosomal protein L14p (L23e)
LSU ribosomal protein L16p (L10e)
LSU ribosomal protein L19p
LSU ribosomal protein L20p
LSU ribosomal protein L27p
SSU ribosomal protein S2p (SAe)
SSU ribosomal protein S3p (S3e)
SSU ribosomal protein S5p (S2e)
SSU ribosomal protein S9p (S16e)
SSU ribosomal protein S10p (S20e)
SSU ribosomal protein S11p (S14e)
SSU ribosomal protein S13p (S18e)
SSU ribosomal protein S19p (S15e)
tmRNA-binding protein SmpB
Translation elongation factor Ts

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3