[Bio] / FigKernelScripts / p3x-create-split-nr-for-fams.pl Repository:
ViewVC logotype

View of /FigKernelScripts/p3x-create-split-nr-for-fams.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Nov 2 16:25:31 2016 UTC (3 years ago) by olson
Branch: MAIN
CVS Tags: HEAD
P3 utility scripts.

#
# Read the reduced families file to get a list of ids.
# Read the genus.data peg.synonym files to pull the MD5s for them and then the
# fasta data.
# Write to files in the given NR dir.
#

use strict;
use gjoseqlib;
use Getopt::Long::Descriptive;

my($opt, $usage) = describe_options("%c %o family-dir fam-file nr-dir",
				    ["max-nr-file-size=s" => "Max size of file to create in NR dir", { default => 1_000_000_000 }],
				    ["genus=s" => "Use only this genus directory"],
				    ["help|h" => "Show this help message"]);
print($usage->text), exit 0 if $opt->help;
die($usage->text) if @ARGV != 3;

my $fam_dir = shift;
my $fam_file = shift;
my $nr_dir = shift;

-d $fam_dir or die "Family directory $fam_dir does not exist\n";
-d $nr_dir or die "NR directory $nr_dir does not exist\n";

my $in;
open($in, "<", $fam_file) or die "Cannot read $fam_file: $!\n";

my %need;

#
# Read families file to determine list of pegs needed.
#

while (<$in>)
{
    my($fid) = /^[^\t]+\t[^\t]+\t[^\t]+\t(fig[^\t]+)/;
    $need{$fid} = 1;
}
close($in);

#
# For each genus dir, read the peg.synonyms to determine the MD5
# to pull data for, then read the nr to pull and write sequence data.
#

my $out_size = 0;
my $out_idx = 1;
my $out_file = sprintf("$nr_dir/nr.%04d", $out_idx);
open(NR_OUT, ">", $out_file) or die "Cannot write $out_file: $!";

for my $nrdir (<$fam_dir/genus.data/*/nr>)
{
    my($genus) = $nrdir =~ m,/genus.data/([^/]+),;
    next if $opt->genus && $opt->genus ne $genus;
    my %needmd5;
    open(PS, "<", "$nrdir/peg.synonyms") or die "Cannot read $nrdir/peg.synonyms: $!";
    while (<PS>)
    {
	my($md5, $peg) = /^([^,]+),\d+\t(fig[^,]+)/;
	$needmd5{$md5} = $peg if $need{$peg};
    }
    close(PS);

    open(NR, "<", "$nrdir/nr") or die "Cannot read $nrdir/nr:$ !";
    while (my($id, $def, $seq) = read_next_fasta_seq(\*NR))
    {
	if (my $peg = $needmd5{$id})
	{
	    print_alignment_as_fasta(\*NR_OUT, [$peg, '', $seq]);
	    $out_size += length($seq);
	    if ($out_size >= $opt->max_nr_file_size)
	    {
		close(NR_OUT);
		$out_idx++;
		my $out_file = sprintf("$nr_dir/nr.%04d", $out_idx);
		open(NR_OUT, ">", $out_file) or die "Cannot write $out_file: $!";
		$out_size = 0;
	    }
	}
    }
    close(NR);
}
close(NR_OUT);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3