[Bio] / FigKernelScripts / build_family_dir_from_genera.pl Repository:
ViewVC logotype

View of /FigKernelScripts/build_family_dir_from_genera.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Oct 18 18:08:47 2018 UTC (13 months ago) by olson
Branch: MAIN
CVS Tags: HEAD
Changes since 1.2: +41 -3 lines
Checkpoint before viruses change.

#
# Build the data directory for a family computation based on the list
# of genera pulled using the PATRIC data api.
#
#
# NEED to pull genus mapping data at this time as well.
#
#" http://localhost:9006/solr/taxonomy/select?q=taxon_rank%3Agenus&fl=taxon_name%2Ctaxon_id&wt=csv&csv.separator=%09&rows=1000000&sort=taxon_name%20asc
    

use strict;
use File::Path 'make_path';
use File::Slurp;
use LWP::UserAgent;
use Getopt::Long::Descriptive;
use Data::Dumper;
use URI;
use JSON::XS;
use DBI;
use pl;
#use Proc::ParallelLoop;
use IPC::Run 'run';
use gjoseqlib;
use DB_File;

my($opt, $usage) = describe_options("%c %o data-dir",
				    ['rank=s', "Use the given taxon rank for grouping (defaults to genus)",
				 		{ default => 'genus' }],
				    ['genus|g=s@', 'Limit to the given genus. May be repeated for multiple genera.'],
				    ['genomes=s', 'Limit to the genomes in this file'],
				    ['bad-genome|b=s@', "A bad genome. Don't use it.", { default => ['340189.4'] }],
				    ["min-genomes|m=i", "Minimum number of genomes required in a genus to build families", { default => 4 }],
				    ["solr-url|d=s", "Solr API url", { default => 'https://www.patricbrc.org/api' }],
				    ["parallel|p=i", "Run with this many procs", { default => 1 }],
				    ["genome-dir=s", "Directory holding PATRIC genome data", { default => "/vol/patric3/downloads/genomes" }],
				    ["help|h", "Show this help message"],
				    );

print($usage->text), exit if $opt->help;
die($usage->text) if @ARGV != 1;

my $base_data_dir = shift;


my %gnames;
my @genome_data = get_genomes($opt, \%gnames);
# die Dumper(\@genome_data);
# exit;
pareach \@genome_data, sub {
    my($ent) = @_;
    my($genus, $genome_ids) = @$ent;

    my $data_dir = "$base_data_dir/$genus";
    die "Data directory $data_dir already exists\n" if -d $data_dir;

    my $seqs_dir = "$data_dir/Seqs";

    make_path($data_dir, $seqs_dir);

    open(GFILE, ">", "$data_dir/genomes") or die "Cannot write $data_dir/genomes: $!";
    open(GNAME, ">", "$data_dir/genome.names") or die "Cannot write $data_dir/genome.names: $!";
    open(SOURCES, ">", "$data_dir/sources") or die "Cannot write $data_dir/sources: $!";

    my %seq_len;
    my $tied = tie %seq_len, 'DB_File', "$data_dir/seq_len.db", O_RDWR | O_CREAT, 0664, $DB_BTREE;
    $tied or die "Cannot create $data_dir/seq_len.db: $!";
    
    my @missed;
    for my $gid (@$genome_ids)
    {
	my $prots = $opt->genome_dir . "/$gid/$gid.PATRIC.faa";
	if (!open(P, "<", $prots))
	{
	    warn "Cannot open prots file $prots: $!";
	    next;
	}
	open(SEQS, ">", "$seqs_dir/$gid") or die "Cannot write $seqs_dir/$gid: $!";
	my $skip;
	while (my($id, $def, $seq) = read_next_fasta_seq(\*P))
	{
	    if ($id =~ /^(fig\|[^|]+)/)
	    {
		if ($seq =~ /X{10}/)
		{
		    warn "Skipping bad sequence $id from $prots at $.\n";
		}
		else
		{
		    print_alignment_as_fasta(\*SEQS, [$1, '', $seq]);
		    $seq_len{$1} = length($seq);
		}
	    }
	    else
	    {
		warn "Cannot parse $id from $prots at $.\n";
	    }
	}
	close(SEQS);
	close(P);

	$tied->sync();
	print GFILE "$gid\n";
	print GNAME "$gid\t$gnames{$gid}\n";
	print SOURCES "$seqs_dir/$gid\n";
    }

    untie %seq_len;
    undef $tied;
    close(GFILE);
    close(GNAME);
    close(SOURCES);

    mkdir("$data_dir/nr") or die "cannot mkdir $data_dir/nr: $!";
    my $rc = system("build_nr_md5", "$data_dir/sources", "$data_dir/nr/nr",
		    "$data_dir/nr/peg.synonyms", "$data_dir/nr/nr-len.btree", "$data_dir/nr/figids");
    if ($rc != 0)
    {
	warn "build_nr failed: $rc\n";
    }
    
    if (@missed)
    {
	print STDERR "Missed the following genomes:\n";
	print STDERR "  $_\n" foreach @missed;
    }
}, { Max_Workers => $opt->parallel };

sub get_genomes
{
    my($opt, $gnames) = @_;

    # curl 'https://www.patricbrc.org/api/genome/?q=taxon_lineage_ids:33882&rows=250&start=1&http_content-type=application/solrquery+x-www-form-urlencoded&http_accept=application/solr+json'

    # curl 'http://macleod.vbi.vt.edu:8080/solr/genome/select?q=!genus:%22%22&fl=genus,genome_id,genome_name&wt=csv&csv.separator=%09&rows=100000&sort=genus+asc'

    my $ua = LWP::UserAgent->new;

    my $start = 0;
    my $block = 25000;
    my @out;
    my $lg;
    my $cur;

    my $rank = $opt->rank;
    my $have_bad = $opt->bad_genome;
    my %bad_genome;
    if ($have_bad)
    {
	$bad_genome{$_} = 1 foreach @{$opt->bad_genome};
    }
    # print Dumper(\%bad_genome);

    my $limit_genomes;
    if ($opt->genomes)
    {
	$limit_genomes = {};
	open(G, "<", $opt->genomes) or die "Cannot read " . $opt->genomes . ": $!";
	while (<G>)
	{
	    if (/(\d+\.\d+)/)
	    {
		$limit_genomes->{$1} = 1;
	    }
	    else
	    {
		die "Cannot parse line $. in " . $opt->genomes;
	    }
	}
	close(G);
    }

    my %by_genus;
    #
    # We hardcode a policy change here where anything in viruses gets
    # stuffed in the Viruses directory.
    #
    # To make this workable, collect up all the data and postprocess.
    #

    while (1)
    {
	my $q = make_query(q => "$rank:*",
			   fl => "$rank,genome_id,genome_name,domain,kingdom",
			   rows => $block,
			   start => $start,
			   sort => "$rank asc",
			  );

	my $url = $opt->solr_url . "/genome/?$q";
	
	print STDERR "$url\n";
	my $res = $ua->get($url,
			   'Content-type' => 'application/solrquery+x-www-form-urlencoded',
			   'Accept' => 'application/solr+json',
			  );
	if (!$res->is_success)
	{
	    die "Failed: " . $res->status_line;
	}

	my $range = $res->header('content-range');
	print "Range: $range\n";
	my($tstart, $tstop, $tlast) = $range =~ m,(\d+)-(\d+)/(\d+),;

	my $r = $res->content;
	my $data = decode_json($r);

	# print Dumper($data);

	my $items = $data->{response}->{docs};

	my $limit_genera = defined($opt->genus);

	for my $item (@$items)
	{
	    my($genus, $gid, $name, $kingdom) = @$item{$rank, qw(genome_id genome_name kingdom)};

	    if ($kingdom eq 'Viruses')
	    {
		XXXXX
	    }
	    else
	    {
	    next if $limit_genomes && !$limit_genomes->{$gid};

	    $gnames->{$gid} = $name;
	    if ($have_bad && $bad_genome{$gid})
	    {
		print STDERR "Skipping bad genome $gid\n";
		print STDERR Dumper($item);
		next;
	    }
	    if ($limit_genera)
	    {
		my $ok;
		for my $x (@{$opt->genus})
		{
		    if ($genus =~ /^$x$/)
		    {
			$ok = 1;
			last;
		    }
		}
		next unless $ok;
	    }
	    
	    next if $genus eq '$genus';
	    next if $genus eq '""';
	    next if $genus eq '';
	    if ($genus ne $lg)
	    {
		if ($cur && @$cur >= $opt->min_genomes)
		{
		    push(@out, [$lg, $cur]);
		}
		$cur = [];
		$lg = $genus;
	    }
	    # print Dumper(OK => $item);
	    push(@$cur, $gid);
	}

	if ($tstop < $tlast)
	{
	    $start = $tstop;
	}
	else
	{
	    last;
	}
    }
    if ($cur && @$cur >= $opt->min_genomes)
    {
	push(@out, [$lg, $cur]);
    }
    return @out;
}

sub make_query
{
    my(@list) = @_;

    my @q;
    while (@list)
    {
	my($k, $v) = splice(@list, 0, 2);
	push(@q, "$k=$v");
    }
    return join("&", @q);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3