[Bio] / FigKernelScripts / get_merged_families_2.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_merged_families_2.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri Jun 19 16:53:46 2015 UTC (4 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: HEAD
Global merge code.

use strict;
use Data::Dumper;
use Getopt::Long::Descriptive;
use File::Path 'make_path';
use File::Slurp;
use Statistics::Descriptive;
use File::Basename;
use gjoseqlib;
use IPC::Run;
use Proc::ParallelLoop;

my($opt, $usage) = describe_options("%c %o kmer-dir merge-dir html-dir",
				    ['inflation|I=s@', 'Run MCL with this inflation parameter', { default => [2.0]} ],
				    ["parallel|p=i", "Run with this many procs", { default => 1 }],
				    ["help|h", "Show this help message"]);

print($usage->text), exit if $opt->help;
die($usage->text) if @ARGV != 3;

my $kmer_dir = shift;
my $merge_dir = shift;
my $html_dir =shift;
my $mcl = '/scratch/olson/FuncGroups/bin/mcl';

my @work;

make_path($html_dir);

my($fn_to_id, $id_to_fn) = read_function_index($kmer_dir);

my @functions;
for my $dist (<$merge_dir/family.dist/*>)
{
    my $f = basename($dist);
    push(@functions, $f);
}

for my $inflation (@{$opt->inflation})
{
    my $mcl_dir = "$merge_dir/mcl/$inflation";
    make_path($mcl_dir);
    
    for my $dist (<$merge_dir/family.dist/*>)
    {
	my $f = basename($dist);
	my $fn = $id_to_fn->[$f];
	if ($fn eq 'hypothetical protein')
	{
	    # we will deal with these separately
	    next;
	}
	if (! -s "$mcl_dir/$f")
	{
	    push(@work, [$f, $fn, $inflation, $dist, "$mcl_dir/$f", "$html_dir/$inflation.$f.html"]);
	}
    }
}

my %peginfo;
open(PM, "<", "$merge_dir/peg.map") or die "Cannot open $merge_dir/peg.map: $!";
while (<PM>)
{
    chomp;
    my @a = split(/\t/);
    $peginfo{$a[0]} = \@a;
}
close(PM);

my $n = @work;
print "Starting $n work items\n";

pareach \@work, sub {
    my $work = shift;
    process(@$work);
}, { Max_Workers => $opt->parallel };

open(I, ">", "$html_dir/index.html") or die "cannot write $html_dir/index.html: $!";

print I "<table>\n";
for my $inflation (@{$opt->inflation})
{
    my $mcl_dir = "$merge_dir/mcl/$inflation";
    
    for my $f (@functions)
    {
	my $mcl = "$mcl_dir/$f";
	my $fun = $id_to_fn->[$f];
	open(F, "<", $mcl) or next;

	my $link = "$inflation.$f.html";
	print I "<tr><th colspan=6><a href='$link' target='_blank'>$fun inflation=$inflation</a></th></tr>\n";

	my $fidx = 1;
	print  I "<tr><th>family</th><th># genera</th><th># genomes</th><th># lfams</th><th># pegs</th></tr>\n";
	while (<F>)
	{
	    chomp;
	    my(%genus, %genome, %lfam );
	    my @pegs = split(/\t/);
	    for my $peg (@pegs)
	    {
		my $pi = $peginfo{$peg};
		defined($pi) or die "No peginfo for '$peg'";
		my(undef, $genus, $lfam, $fidx, $fn, $genome) = @{$peginfo{$peg}};
		$genus{$genus}++;
		$genome{$genome}++;
		$lfam{$lfam}++;
	    }
	    my $ngenus = scalar keys %genus;
	    my $ngenome = scalar keys %genome;
	    my $nlfam = scalar keys %lfam;
	    my $npegs = scalar @pegs;
	    print I "<tr><td>$fidx</td><td>$ngenus</td><td>$ngenome</td><td>$nlfam</td><td>$npegs</td></tr>\n";

	    $fidx++;
	}
    }
}
print I "</table>\n";
close(I);    


sub process
{
    my($fun, $function_name, $inflation, $dist_file, $mcl_file, $html_file) = @_;

    print "Process @_\n";
    my $ok = IPC::Run::run(["cut", "-f", "1,2,4", $dist_file],
			       '|',
			   [$mcl, "-", "-o", $mcl_file, "--abc", "-I", $inflation], "2>", "$mcl_file.mcl.out");
    $ok or die "mcl failed with $?\n";

    #
    # Postprocess to generate webpage
    #

    open(my $fh, ">", $html_file) or die "Cannot open $html_file: $!";
    print $fh "<title>$inflation $function_name</title>\n";
    print $fh "<h1>$function_name</h1>\n";
    print $fh "<h1>Inflation=$inflation</h1>\n";

    open(my $m, "<", $mcl_file) or die "Cannot open $mcl_file: $!";

    my @all_pegs;
    my $fidx = 1;
    print $fh "<table>\n";
    
    while (<$m>)
    {
	chomp;

	print $fh "<tr><th colspan=3>Family $fidx</td></tr>\n";
	print $fh "<tr><th>peg</th><th>lfam</th><th>genome</th></tr>\n";
	my @pegs = split(/\t/);
	for my $peg (@pegs)
	{
	    my(undef, $genus, $lfam, $fidx, $fn, $genome) = @{$peginfo{$peg}};
	    print $fh "<tr><td>$peg</td><td>$lfam</td><td>$genome</td></tr>\n";
	}

	push(@all_pegs, @pegs);
	$fidx++;
    }
    print $fh "</table>";

    my $s = join("&", map { "feature=$_" } @all_pegs);
    my $n = @all_pegs;
    print $fh "<a target='_blank' href='http://pseed.theseed.org/seedviewer.cgi?page=Regions&$s'>compared region: $n pegs</a>\n";

    close($fh);
}

sub read_function_index
{
    my($dir) = @_;

    my $idx = {};
    my $from_id = [];

    open(F, "<", "$dir/function.index") or die "Cannot open $dir/function.index:$ !";
    while (<F>)
    {
	chomp;
	my($i, $fun) = split(/\t/);
	$idx->{$fun} = $i;
	$from_id->[$i] = $fun;
    }
    close(F);
    
    return($idx, $from_id);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3