[Bio] / FigKernelScripts / get_neighbors_and_corr_to_ref.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_neighbors_and_corr_to_ref.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Sun Dec 13 01:11:40 2009 UTC (10 years, 5 months ago) by overbeek
Branch: MAIN
Changes since 1.1: +1 -1 lines
minor fix

########################################################################

use SeedHTML;
use strict;
use SeedEnv;
use ProtSims;
use gjoseqlib;

my $usage = "usage: get_neighbors_and_corr_to_ref GenomeDir";
my $gdir;

($gdir   =  shift @ARGV)
    || die $usage;

my @fasta = &gjoseqlib::read_fasta("$gdir/Features/peg/fasta");
my %id2seqH = map { $_->[0] => $_->[2] } @fasta;

&SeedUtils::verify_dir("$gdir/CorrToReferenceGenomes");
my @poss_pegs = &prioritize_pegs_used_to_find_neighbors($gdir);

my %counts;
my $best  = 0;
my $tuple;
while (($best < 500) && ($tuple = shift @poss_pegs))
{
    &compute_hits_and_set_best($tuple,\%id2seqH,\%counts,\$best);
}
if ($best == 0) { die "$gdir describes a genome without enough RAST-called genes to identify neighbors" }
my @reference = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
if (@reference > 30) { $#reference = 29 }

my $sapO = SAPserver->new;
my $genomesH  = $sapO->all_genomes(-complete => 1);
open(CLOSE,">$gdir/closest.genomes") || die "could not open closest.genomes";
foreach my $g2 (@reference)
{
    &generate_correspondence_table($g2,$gdir);
    print CLOSE join("\t",($g2,$genomesH->{$g2})),"\n";
}
close(CLOSE);

sub generate_correspondence_table {
    my($g2,$gdir) = @_;

    ($gdir =~ /(\d+\.\d+)$/) || die "Invalid Genome Directory: $gdir";
    my $g1 = $1;
    system "svr_corresponding_genes -d $gdir $g1 $g2 > $gdir/CorrToReferenceGenomes/$g2";
}

sub prioritize_pegs_used_to_find_neighbors {
    my($gdir) = @_;

    my %by_func;
    (-s "$gdir/assigned_functions") || die "$gdir contains no assigned_functions";

    foreach my $line (`cat $gdir/assigned_functions`)
    {
	if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S[^\#]+\S)/)
	{
	    my($peg,$func) = ($1,$2);
	    $func =~ s/\s*\#$//;
	    push(@{$by_func{$2}},$1);
	}
    }
    my @synthetases        = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /tRNA synthetase/ }   keys(%by_func);
    my @ribosomal_proteins = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 } grep { $_ =~ /ribosomal protein/ } keys(%by_func);
    my @ok_pegs            = map {[$_,$by_func{$_}->[0]] } grep { @{$by_func{$_}} == 1 }                                    keys(%by_func);
    my @prioritized = ();
    my %seen;
    foreach my $tuple (@synthetases,@ribosomal_proteins,@ok_pegs)
    {
	if (! $seen{$tuple->[0]})
	{
	    $seen{$tuple->[0]} = 1;
	    push(@prioritized,$tuple);
	}
    }
    return @prioritized;
}

sub compute_hits_and_set_best {
    my($tuple,$id2seqH,$counts,$bestP) = @_;

    my($role,$peg) = @$tuple;
    my $figfam_pegs = &figfam_pegs_for_role($role);

    my @sims        = &ProtSims::blastP([[$peg,'',$id2seqH->{$peg}]],$figfam_pegs,10);
    my $i;
    for ($i=0; (($i < @sims) && ($i < 50)); $i++)
    {
	my $g2 = &SeedUtils::genome_of($sims[$i]->id2);
	$counts->{$g2} += 50 - $i;
	if ($counts->{$g2} > $$bestP) { $$bestP = $counts->{$g2} }
    }
}

#### MUST update ###
sub figfam_pegs_for_role {
    my($role) = @_;

    my %figfams;
    foreach $_ (`cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/family.functions`)
    {
	if ((index($_,$role) >= 0) && ($_ =~ /^(FIG\d{6})/))
	{
	    $figfams{$1} = 1;
	}
    }
    my $sapO = SAPserver->new;
    my $genomesH  = $sapO->all_genomes(-complete => 1);
    my @ids =  grep { $genomesH->{&SeedUtils::genome_of($_)} }
               map { (($_ =~ /^(\S+)\t(\S+)/) && $figfams{$1}) ? $2 : () } 
              `cat /Users/rossoverbeek/FIGdisk/FIG/Data/FigfamsData/families.2c`;
    my $idsH = $sapO->ids_to_sequences(-ids => \@ids, -protein => 1);
    return [map { my $seq = $idsH->{$_}; $seq ? [$_,'',$seq] : () } keys(%$idsH)];
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3