[Bio] / FigKernelScripts / find_approx_neigh.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_approx_neigh.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (download) (as text) (annotate)
Sat Sep 14 04:26:47 2019 UTC (2 months ago) by gdpusch
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +13 -3 lines
Added some code to ensure that this tool only returns genomes that are installed in the current SEED

########################################################################

use strict;
use warnings;

use SeedEnv;
use ProtSims;
use gjoseqlib;
use Data::Dumper;

use FIG;
my $figO = FIG->new();

#my $sapO = SAPserver->new;

my $verbose = $ENV{VERBOSE};
my $debug   = $ENV{DEBUG};

$| = 1;
my $usage = "usage: find_approx_neigh  GenomeDir [MaxNum]";
my $gdir;



#...Process @ARGV...
my $trouble = 0;
while (@ARGV && ($ARGV[0] =~ m/^-/o)) {
    if    ($ARGV[0] =~ m/^-{1,2}help/o) {
	print STDERR (qq(\n  ), $usage, qq(\n));
	exit(0);
    }
    else {
	$trouble = 1;
	warn qq(Unrecognized switch \'$ARGV[0]\'\n);
	$trouble = 1;
    }
    
    shift @ARGV;
}
die "\n   $usage\n\n" if $trouble;

($gdir =  shift @ARGV)   || die $usage;
$gdir  =~ s/\/$//o;
($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
my $gdir_id = $1;

my $max_num = (@ARGV > 0) ? $ARGV[0] : 30;


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Get SEED genome IDs and bionames...
#-----------------------------------------------------------------------
my $genomesH = {};
%$genomesH = map { my $gs = $figO->genus_species($_);
		   ($_ && $gs) ? ($_ => $gs) : ()
} $figO->genomes();
print STDERR ("Read ", (scalar keys %$genomesH), " genomes\n") if $ENV{DEBUG};


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Get PEG sequences...
#-----------------------------------------------------------------------
my @fasta = ();
if (-d "$gdir/Features/peg")  {
    push @fasta, &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");
}

if ((@fasta < 500) && (-d "$gdir/Features/orf")) {
    push @fasta, &gjoseqlib::read_fasta("$gdir/Features/orf/fasta");
}
die "No translatable features found in OrgDir=$gdir" unless @fasta;

my %id2seqH = map { ($_->[2] && (length($_->[2]) > 30)) ? ($_->[0] => $_->[2]) : () } @fasta;


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Prioritize "Universal" functions...
#-----------------------------------------------------------------------
my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);
my %counts;
my $best  = 0;
my $tuple;
while (($best < 500) && ($tuple = shift @poss_pegs)) {
    my($role,$peg) = @$tuple;
    
    print STDERR ("\nProcessing role='$role'", "\t", "peg=$peg", "\n") if $ENV{DEBUG};
    if ($id2seqH{$peg} && (length($id2seqH{$peg}) > 30)) {
	&compute_hits_and_set_best($tuple, \%id2seqH, \%counts, \$best);
    }
    print STDERR "//\n" if $debug;
}

if ($best == 0) {
    print STDERR "WARNING: $gdir describes a genome without enough RAST-called genes to identify neighbors\n";
    exit(0);
}

my @reference = sort {
    ($counts{$b} <=> $counts{$a})
	|| ($a cmp $b)
} grep { 
    $genomesH->{$_}
} (keys %counts);

if (@reference > $max_num) { $#reference = $max_num-1 }

foreach my $g2 (@reference) {
    if (($g2 ne $gdir_id) && defined($genomesH->{$g2})) {
	print STDOUT (join("\t",($g2, $counts{$g2}, $genomesH->{$g2})), "\n");
    }
}

sub prioritize_pegs_used_to_find_neighbors {
    my($gdir) = @_;
    
    my %func_of;
    my $functions_file;
    if    (-s ($functions_file = "$gdir/proposed_functions")) {
	foreach my $line (&SeedUtils::file_read($functions_file)) {
	    if ($line =~ /^(fig\|\d+\.\d+\.(peg|orf)\.\d+)\t(\S[^\#]+\S)/) {
		$func_of{$1} = $3;
	    }
	}
    }
    
    if ((scalar keys %func_of) < 200) {
	if (-s ($functions_file = "$gdir/proposed_non_ff_functions")) {
	    foreach my $line (&SeedUtils::file_read($functions_file)) {
		if ($line =~ /^(fig\|\d+\.\d+\.(peg|orf)\.\d+)\t(\S[^\#]+\S)/) {
		    if (not defined($func_of{$1})) { $func_of{$1} = $3; }
		}
	    }
	}
    }
    
    if ((scalar keys %func_of) < 200) {
	if (-s ($functions_file = "$gdir/assigned_functions")) {
	    foreach my $line (&SeedUtils::file_read($functions_file)) {
		if ($line =~ /^(fig\|\d+\.\d+\.(peg|orf)\.\d+)\t(\S[^\#]+\S)/) {
		    if (not defined($func_of{$1})) { $func_of{$1} = $3; }
		}
	    }
	}
    }
    
    if ((scalar keys %func_of) == 0) {
	print STDERR "WARNING: $gdir contains no assigned functions\n";
	exit(0);
    }
    
    
    my %by_func;
    foreach my $peg (sort keys %func_of) {
	my $func = $func_of{$peg};
	$func =~ s/\s*\#.*$//;
	push @ { $by_func{$func} }, $peg;
    }
    
    my @synthetases = map { [$_,$by_func{$_}->[0]] }
    grep { @{$by_func{$_}} == 1 }
    grep { $_ =~ /tRNA synthetase/o } sort keys(%by_func);
    
    my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] }
    grep { @{$by_func{$_}} == 1 }
    grep { $_ =~ /ribosomal protein/o } sort keys(%by_func);
    
    my @ok_pegs = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } sort keys(%by_func);
    
    if ($ENV{VERBOSE} || $ENV{DEBUG}) {
	print STDERR (q(Found ),
		      (scalar @synthetases), q( unique synthetases, ),
		      (scalar @ribosomal_proteins), q( unique ribosomal proteins, ),
		      (scalar @ok_pegs), q( PEGs with unique function),
		      qq(\n)
		      );
    }
    
    my @prioritized = ();
    my %seen;
    foreach my $tuple (@synthetases,@ribosomal_proteins,@ok_pegs) {
	if (! $seen{$tuple->[0]}) {
	    $seen{$tuple->[0]} = 1;
	    push(@prioritized,$tuple);
	}
    }
    return @prioritized;
}

sub compute_hits_and_set_best {
    my ($tuple, $id2seqH, $counts, $bestP) = @_;
    
    my ($role, $peg) = @$tuple;
    my $pattyfam_pegs  = &pattyfam_pegs_for_role($role);
    # die Dumper($pattyfam_pegs);
    
    if (@$pattyfam_pegs > 0) {
	my @sims         = &ProtSims::blastP([[$peg, '', $id2seqH->{$peg}]], $pattyfam_pegs, 10);
	
	for (my $i=0; (($i < @sims) && ($i < 50)); ++$i) {
	    my $g2 = &SeedUtils::genome_of($sims[$i]->id2);
	    $counts->{$g2} += 50 - $i;
	    if ($counts->{$g2} > $$bestP) { $$bestP = $counts->{$g2} }
	}
    }
}

sub pattyfam_pegs_for_role {
    my ($role) = @_;
    
    my $res  = $figO->pattyfams_for_function($role);
    #die Dumper($res);
    
    #...The below grep shouldn't be necessary, since FIG::pattyfams_for function()
    # is _supposed_ to only return genomes that are within the local SEED,
    # but I'm feeling paranoid... :-(
    my %fids = map { my $x = $_->[0]; 
		     my $y = $figO->genome_of($x);
		     # die Dumper($x,$y);
		     $figO->is_genome($y) ? ($x => 1) : () } @$res;
    #die Dumper(\%fids);
    
    #...FID order shouldn't matter for the sequence fetch, but again I'm feeling paranoid... :-(
    my @fids =  sort { &FIG::by_fig_id($a,$b) } keys %fids;
    print STDERR ('Found ', scalar(@fids), " PATtyFam PEGs with role='", $role, "'\n") if $debug;
    #die Dumper($res, \%fids, \@fids);
    
    $res = $figO->get_translation_bulk(\@fids);
    return [map { [$_,'',$res->{$_}] } sort { &FIG::by_fig_id($a,$b) } keys %$res];
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3