[Bio] / FigKernelScripts / FFB2_make_FF_index.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_make_FF_index.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Jun 23 21:16:51 2010 UTC (9 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011
Changes since 1.1: +83 -14 lines
Rollup of figfam update fixes

use FIG_Config;
use FIG;
use Proc::ParallelLoop;
use Data::Dumper;
use strict;

my $usage = "usage: make_FF_index FigfamRelDir IndexFile BlastDir";
my($ff_dir,$indexF,$blastDir);

(
 ($ff_dir = shift @ARGV) && (-s "$ff_dir/families.2c") && (-s "$ff_dir/family.functions") &&
 ($indexF = shift @ARGV) &&
 ($blastDir = shift @ARGV)
)
    || die $usage;

if (-d $blastDir) 
{
    die "You have a version of $blastDir; delete it before trying to rebuild it";
}

my %funcH;
open(FUNCS,"<$ff_dir/family.functions") || die "could not open $ff_dir/family.functions";
open(PEGS,"<$ff_dir/families.2c")       || die "could not open $ff_dir/families.2c";
open(INDEX,">$indexF")                  || die "could not open $indexF";

my %ff_to_pegs;
my %genomes;
while (defined($_ = <PEGS>))
{
    chomp;

    if (my($ff, $peg, $genome) = /^(FIG\d+)\t(fig\|(\d+\.\d+)\S+)/)
    {
	push(@{$ff_to_pegs{$ff}},$peg);
	$genomes{$genome}->{$peg} = 1;
    }
}
close(PEGS);

#
# Pull all translations for the genomes we see.
#

my %translations;
for my $genome (sort { $a <=> $b } keys %genomes)
{
    my $phash = $genomes{$genome};
    my $fa = "$FIG_Config::organisms/$genome/Features/peg/fasta";
    open(FA, "<", $fa) or die "Cannot open $fa: $!";
    while (my($id, $seqp) = &FIG::read_fasta_record(\*FA))
    {
	if ($phash->{$id})
	{
	    $translations{$id} = $seqp;
	}
    }
    close(FA);
}
print "Translations laoded\n";

while (defined($_ = <FUNCS>))
{
    chomp;
    my($ff,$func) = split(/\t/,$_);
    push(@{$funcH{$func}},$ff);
}
close(FUNCS);

my $indexI = 1;

my @buckets;
my $next_bucket = 0;
my $num_buckets = 5;

my %index;
my @func_list = sort { @{$funcH{$b}} <=> @{$funcH{$a}} } keys(%funcH);
foreach my $func (@func_list)
{
    my $subI = $indexI % 1000;
    my @ffs = sort @{$funcH{$func}};
    foreach my $ff (@ffs)
    {
	print INDEX join("\t",($indexI,$ff,$func)),"\n";
    }
    $index{$func} = $indexI;
    $indexI++;

    push(@{$buckets[$next_bucket]}, $func);
    $next_bucket = ($next_bucket + 1) % $num_buckets;
}
close(INDEX);

#foreach my $func (@func_list){

pareach \@buckets, sub {
    my $bucket= shift;
#    my $fig = new FIG;
    for my $func (@$bucket)
    {
	my $indexI = $index{$func};
	my $subI = $indexI % 1000;
	&FIG::verify_dir("$blastDir/$subI");
	my $fasta = "$blastDir/$subI/$indexI.fasta";
	my $fh;
	open($fh,">", $fasta) || die "could not open $blastDir/$subI/$indexI.fasta";
	my @ffs = sort @{$funcH{$func}};
	foreach my $ff (@ffs)
	{
	    foreach my $peg (sort { &FIG::by_fig_id($a,$b) } @{$ff_to_pegs{$ff}})
	    {
		#	    my $seq = $fig->get_translation($peg);
		my $seqp = $translations{$peg};
		if ($seqp)
		{
		    my $seq = $$seqp;
		    my $len = length($seq);
		    print $fh ">${ff}:${len}:$peg\n$seq\n";
		}
	    }
	}
	close($fh);
	if (-s $fasta == 0)
	{
	    warn "Zero-length fasta written for index $indexI: $func\n";
	}
	my $rc = system("$FIG_Config::ext_bin/formatdb -pT -i $fasta");
	if ($rc != 0)
	{
	    warn "formatdb failed on $fasta\n";
	}
    }
}
    
    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3