[Bio] / FigKernelScripts / build_anno_clearinghouse.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_anno_clearinghouse.pl

Parent Directory | Revision Log


Revision 1.6 - (download) (as text) (annotate)
Mon Feb 18 22:32:20 2008 UTC (11 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.5: +7 -2 lines
Add -no-duplicates support for build_anno_clearinghouse and build_nr.

use strict;
use NRTools;
use FIG;
use Cwd 'abs_path';

#
# Build an instance of an annotation clearinghouse.
#
# Use the NR files in the given NR directory and the seed data in the SEED directory
# to build an NR and peg.synonyms in the target directory.
#
# The DB_File indices of peg.synonyms are built by build_nr.
#
# Glimpse indexing of the NR data is currently disabled.
#

#
# Command-line handling.
#
# Options (all must precede the positional arguments):
#   -no-duplicates   run build_nr with -no-duplicates instead of -skip-duplicates
#   -no-copy-nr      use the NR directory in place instead of copying it
#                    into the target directory
#   -sort-size size  value forwarded to build_nr's -sort-size option
#                    (defaults to "1G")
#
# Exactly three positional arguments must remain after the options.
#
my $usage = "build_anno_clearinghouse [-no-duplicates] [-no-copy-nr] [-sort-size size] NR-directory SEED-directory target-directory";

my $copy_nr = 1;
my $sort_size = "1G";
my $no_dups;

while ((@ARGV > 0) && ($ARGV[0] =~ /^-/))
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-no-copy-nr/i)
    {
        $copy_nr = 0;
    }
    elsif ($arg =~ /^-sort-size/)
    {
        # The size is the next word. If it was left out, a positional
        # argument is consumed instead and the count check below fails
        # with the usage message.
        $sort_size = shift @ARGV;
    }
    elsif ($arg =~ /^-no-duplicates/)
    {
        $no_dups++;
    }
    else
    {
        die $usage;
    }
}

@ARGV == 3 or die  $usage;

my $dir_nr = shift;
my $dir_seed = shift;
my $dir_target = shift;

# Presumably creates the target directory when it is missing; confirm
# against FIG::verify_dir.
&FIG::verify_dir($dir_target);

#
# Copy NR directories into our data directory.
#
# $local_nr_dir names the directory that is scanned for NR data further
# down: the NR directory given on the command line when -no-copy-nr was
# used, otherwise the copy made under the target directory.
#

my $local_nr_dir = $dir_nr;
if ($copy_nr)
{
    # Assign to the variable declared above. Declaring a second "my"
    # variable here would hide it inside this block, and the scan below
    # would keep reading the original NR directory instead of the copy.
    $local_nr_dir = abs_path("$dir_target/NR");
    &FIG::verify_dir($local_nr_dir);

    #my $cmd= "cp -rp $dir_nr/* $local_nr_dir";
    # The command is run through the shell so that the "*" is expanded.
    my $cmd  = "rsync -arv --exclude=nohup.out $dir_nr/* $local_nr_dir";
    print "Copy NR dirs: $cmd\n";
    my $rc = system($cmd);
    $rc == 0 or die "Error running $cmd: $! $rc\n";
}

# Filled in by scan_NR_dir and scan_seed_dir. The code below relies on each
# value being a hash reference with at least a fasta_path entry; the seed
# assignment pass later also reads type and path.
my %NR_files;

#
# Scan inputs.
#

print "Scan NR\n";
scan_NR_dir(\%NR_files, $local_nr_dir);
#print Dumper(\%NR_files);
print "Scan SEED\n";
scan_seed_dir(\%NR_files, $dir_seed);
#scan_seed_dir(\%NR_files, $dir_seed, { limit => 10 });

#
# Write nr.sources.
#
# One fasta path per line, in sorted key order. This file is handed to
# build_nr later on, so a short or failed write must not go unnoticed.
#

open(my $sources_fh, ">", "$dir_target/nr.sources") or die "Cannot write $dir_target/nr.sources: $!\n";

for my $key (sort keys %NR_files)
{
    my($v) = $NR_files{$key};

    print $sources_fh "$v->{fasta_path}\n";
}

# Buffered write errors only show up at close time.
close($sources_fh) or die "Error closing $dir_target/nr.sources: $!\n";

#
# Create assignments file for seed orgs.
#
# For every entry of type seed_org that has an assigned_functions file,
# copy its peg assignment lines into to_index/seed.assigned_functions.
# Each file is read back to front (via tac) and only the first line seen
# for a given peg is kept, so the line that appears last in the original
# file is the one that is written.
#

&FIG::verify_dir("$dir_target/to_index");

my $seed_af_file = "$dir_target/to_index/seed.assigned_functions";
open(my $seed_af_fh, ">", $seed_af_file) or die "Cannot write $seed_af_file: $!";
for my $ent (values %NR_files)
{
    next unless $ent->{type} eq 'seed_org';

    my $af = "$ent->{path}/assigned_functions";
    next unless -f $af;

    # Pegs already written for this organism.
    my %seen;

    # List-form pipe open: the path goes to tac as a single argument and
    # is never interpreted by a shell.
    open(my $tac_fh, "-|", "tac", $af) or die "cannot open tac $af pipe: $!\n";
    while (my $line = <$tac_fh>)
    {
        if ($line =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t/)
        {
            my $peg = $1;
            next if $seen{$peg};
            print $seed_af_fh $line;
            $seen{$peg}++;
        }
    }

    # close on a pipe waits for the child and reports a non-zero exit.
    close($tac_fh) or die "tac $af failed: $! $?\n";
}

# Buffered write errors only show up at close time.
close($seed_af_fh) or die "Error closing $seed_af_file: $!\n";

#
# Construct glimpse indexes of assignments
#

#
# We're not doing keyword searches any more.
#
# The block below is switched off with a constant-false condition and is
# kept for reference only. When it was active it opened a pipe (handle G)
# to glimpseindex and wrote the names of the assignment files to it, one
# per line. The handle was deliberately left open here; the matching
# close lives in the second disabled block after the build_nr run.
#

if (0)
{
    print "Building Glimpse index\n";
    # Start glimpseindex with its output directed at the to_index directory.
    open(G, "|$FIG_Config::ext_bin/glimpseindex -F -o -n 100 -E -H $dir_target/to_index -M 40") or die "cannot open glimpseindex: $!";
    # Unbuffered, so each file name reaches the child as soon as it is printed.
    G->autoflush(1);
    print G "$dir_target/to_index/seed.assigned_functions\n";
    
    # Add the assigned_functions file of every entry of type NR.
    for my $ent (values %NR_files)
    {
	next unless $ent->{type}  eq 'NR';
	
	print G "$ent->{path}/assigned_functions\n";
    }
    #
    # Let this run in parallel while we build nr
    #
    # close(G) or die "glimpseindex failed: $! $?\n";
}

#
# Build the NR
#
# Runs the build_nr program from $FIG_Config::bin. All output files are
# placed in the target directory. The arguments are passed as a list, so
# no shell is involved; the meaning of the individual options and of the
# five trailing positional arguments is defined by build_nr itself.
#
print "Building NR\n";

# -no-duplicates on our command line selects the same-named build_nr
# option; otherwise build_nr is told to skip duplicates.
my $dup_option = "-skip-duplicates";
$dup_option = "-no-duplicates" if $no_dups;

my @nargs;
push(@nargs, "-singleton-file", "$dir_target/singletons");
push(@nargs, "-singleton-index", "$dir_target/singleton.index");
push(@nargs, "-index", "$dir_target/peg.synonyms.index");
push(@nargs, "-emit-singleton", "-rev-block-sort", $dup_option);
push(@nargs, "-sort-size", $sort_size);
push(@nargs,
     "$dir_target/nr.sources",
     "/dev/null",
     "/dev/null",
     "$dir_target/nr",
     "$dir_target/peg.synonyms");

print "Args: @nargs\n";

my $rc = system("$FIG_Config::bin/build_nr", @nargs);
$rc == 0 or die "build_nr @nargs failed with rc=$rc\n";

# Disabled together with the glimpse indexing block earlier in this script.
# When active, this waited for the glimpseindex child started there (handle G)
# to finish and then made the to_index directory readable by group and others.
if (0)
{
    close(G) or die "glimpseindex failed: $! $?\n";

    system("chmod", "-R", "go+r", "$dir_target/to_index");
}

#
# Index pegsyns
#
# (build_nr now does this)
#
#print "Indexing\n";
#my @pargs = ("$dir_target/peg.synonyms", "$dir_target/peg.synonyms.index");
#$rc = system("$FIG_Config::bin/index-pegsyn", @pargs);
#if ($rc != 0)
#{
#    die "index-pegsyn @pargs failed with rc=$rc\n";
#}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3