[Bio] / FigKernelScripts / generate_kernel_css.pl Repository:
ViewVC logotype

View of /FigKernelScripts/generate_kernel_css.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.4: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use strict;

use FIG;
my $fig = new FIG;

&make_sure("formatdb","blastall");

my $usage = "usage: generate_kernel_css Directory ParmFile [PegToStartAt] < coupling.tuples";

my($outputD,$parmF);
(
 ($outputD  = shift @ARGV) &&
 ($parmF    = shift @ARGV)
)
    || die $usage;

my $parms = &read_parameters($parmF);
if ($ENV{'VERBOSE'}) { &print_parms($parms) }

&FIG::verify_dir($outputD);

my $minPCHs = $parms->{'minPCHs'} ? $parms->{'minPCHs'} : 4;

my %conn;
while (defined($_ = <STDIN>))
{
    if (($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(fig\|\d+\.\d+\.peg\.\d+)\t(\d+)/) &&
	($3 >= $minPCHs))
    {
	my($peg1,$peg2) = ($1,$2);
	if ((@ARGV == 0) || (&FIG::by_fig_id($peg1,$ARGV[0]) > -1))
	{
	    push(@{$conn{$peg1}},$peg2);
	}
    }
}

my @pegs = sort { &FIG::by_fig_id($a,$b) } keys(%conn);

&make_css($fig,$outputD,$parms,\@pegs,\%conn);

sub make_css {
    my($fig,$outputD,$parms,$pegs,$conn) = @_;

    my $min_sz        = $parms->{'min_sz'}  ? $parms->{'min_sz'}  : 3;
    my $minNsub       = $parms->{'minNsub'} ? $parms->{'minNsub'} : 2;
    my $min_sig       = $parms->{'min_sig'} ? $parms->{'min_sig'} : 2;
    my $existing_subs = $parms->{'subsystems'};
    my $max_psc       = $parms->{'max_psc'} ? $parms->{'max_psc'} : 1.0e-20;

    my($peg,%seen,$critical);
    foreach $peg (@$pegs)
    {
	if (! $seen{$peg})
	{
	    my @cluster = &cluster_connected($peg,\%conn,\%seen);
	    if ($ENV{'VERBOSE'}) { print &Dumper(['cluster:',\@cluster]) }

	    if ((@cluster >= $min_sz) && 
		($critical = &check_subsys_and_sigs($fig,$minNsub,$existing_subs,$min_sig,$max_psc,\@cluster)))
	    {
		if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["critical",$critical]) } 
		my $css;
		if ($css = &build_css($fig,$parms,$critical,\@cluster))
		{
		    &write_css($fig,$parms,$css,\@cluster,$critical,$outputD);
		}
	    }
	}
    }
}

sub check_subsys_and_sigs {
    my($fig,$minNsub,$existing_subs,$min_sig,$max_psc,$cluster) = @_;
    my($genome,$peg,$pseq,@close,@subs,$i,$col);

    my $critical = [];
    my $no_sub   = 0;
    my $sigs     = 0;
    for ($col=0; ($col < @$cluster); $col++)
    {
	$peg = $cluster->[$col];
#	@subs = map { $_ =~ s/ /_/g; $_ } $fig->peg_to_subsystems($peg);
#	for ($i=0; ($i < @subs) && (! $existing_subs->{$subs[$i]}); $i++) {}
#	if ($i == @subs)
#	{
#	    if ($ENV{'VERBOSE'}) { print STDERR "not in subsystem $peg\n" }
#	    $no_sub++;
#
#	    $genome = &FIG::genome_of($peg);
#	    $pseq = $fig->get_translation($peg);
#	    @close = &similar_prots($pseq,"$FIG_Config::organisms/$genome/Features/peg/fasta",$max_psc);
#	    if (@close == 1)
#	    {
#		$sigs++;
#		push(@$critical,$col);
#	    }
#	}

	    
#################
	$genome = &FIG::genome_of($peg);
	$pseq = $fig->get_translation($peg);
	@close = &similar_prots($pseq,"$FIG_Config::organisms/$genome/Features/peg/fasta",$max_psc);
	if (@close == 1)
	{
	    $sigs++;
	    push(@$critical,$col);
	}
	$no_sub = $minNsub;
#################
    }
    if ($ENV{'VERBOSE'}) { print STDERR "no_sub=$no_sub minNsub=$minNsub  sigs=$sigs min_sig=$min_sig\n" }
    return (($no_sub >= $minNsub) && ($sigs >= $min_sig)) ? $critical : undef;
}


sub cluster_connected {
    my($peg,$conn,$seen) = @_;

    my @cluster = ($peg);
    $seen->{$peg} = 1;
    
    my($i,$peg1,$peg2,$connected);
    for ($i=0; ($i < @cluster); $i++)
    {
	$peg1 = $cluster[$i];
	$connected = $conn->{$peg1};
	foreach $peg2 (@$connected)
	{
	    if (! $seen->{$peg2})
	    {
		push(@cluster,$peg2);
		$seen->{$peg2} = 1;
	    }
	}
    }
    return sort { &FIG::by_fig_id($a,$b) } @cluster;
}

sub enough_not_in_subsys {
    my($fig,$minNsub,$cluster,$existing_subs) = @_;
    my($peg,@subs,$i);

    my $n = 0;
    foreach $peg (@$cluster)
    {
	@subs = map { $_ =~ s/ /_/g; $_ } $fig->peg_to_subsystems($peg);
	for ($i=0; ($i < @subs) && (! $existing_subs->{$subs[$i]}); $i++) {}
	if ($i == @subs)
	{
	    print STDERR "not in subsystem $peg\n";
	    $n++;
	}
    }
    if ($ENV{'VERBOSE'}) { print STDERR "enough_not_in_subsys: n=$n minNsub=$minNsub\n" }
    return ($n >= $minNsub);
}

sub enough_sig {
    my($fig,$min_sig,$max_psc,$cluster) = @_;
    my($genome,$n,$peg,$pseq,@close);

    $genome = &FIG::genome_of($cluster->[0]);
    $n = 0;
    foreach $peg (@$cluster)
    {
	$pseq = $fig->get_translation($peg);
	@close = &similar_prots($pseq,"$FIG_Config::organisms/$genome/Features/peg/fasta",$max_psc);
	if (@close == 1)
	{
	    $n++;
	}
    }
    if ($ENV{'VERBOSE'}) { print STDERR "enough_sig: n=$n min_sig=$min_sig\n" }
    return ($n >= $min_sig);
}

sub similar_prots {
    my($pseq,$file,$max_psc) = @_;

    if ((! -s "$file.psq") || ((-M $file) < (-M "$file.psq")))
    {
	&FIG::run("formatdb -i $file -p T");
    }
    open(TMP,">/tmp/query$$") || die "coule not open /tmp/query$$";
    print TMP ">query\n$pseq\n";
    close(TMP);
    my @sims  = grep { $_->[6] <= $max_psc }
                map { $_ =~ /^(\S+)\t(\S+)\t(\S+\t){4}(\S+)\t(\S+)\t(\S+)\t(\S+)\s+(\S+)/;
                      [$1,$2,$4,$5,$6,$7,$8] }
                `blastall -i /tmp/query$$ -d $file -FF -m 8 -p blastp`;
    return @sims;
}

sub read_parameters {
    my($parmF) = @_;

    my $parms = {};
    $parms->{'subsystems'} = {};

    my $p;
    foreach $p (`cat $parmF`)
    {
	chop $p;
	my($key,$value) = split(/\t/,$p);
	if ($key eq "subsystems")
	{
	    my $s;
	    foreach $s (`cat $value`)
	    {
		if ($s =~ /^\s*(\S[^\t]*\S)/)
		{
		    $parms->{'subsystems'}->{$1} = 1;
		}
	    }
	}
	else
	{
	    $parms->{$key} = $value;
	}
    }
    return $parms;
}

sub print_parms {
    my($parms) = @_;
    my($key,@subs,$n);

    print STDERR "PARAMTERS\n\n";
    foreach $key (sort keys(%$parms))
    {
	if ($key ne "subsystems")
	{
	    print STDERR "\t$key\t$parms->{$key}\n";
	}
	else
	{
	    @subs = keys(%{$parms->{'subsystems'}});
	    $n = @subs;
	    print STDERR "\t$key\t$n subsystems\n";
	}
    }
    print STDERR "\n";
}

sub make_sure {
    my(@progs) = @_;

    my $prog;
    foreach $prog (@progs)
    {
	my @tmp = `which $prog`;
	if ($tmp[0] =~ /no $prog/)
	{
	    print STDERR $tmp[0];
	    exit(1);
	}
    }
}

sub build_css {
    my($fig,$parms,$critical,$cluster) = @_;
    my($ncols,$row,$ss,$col1,$col2,@couples,$tuple,$peg1,$peg2,$peg1b,$peg2b);
    my($ev,$ev_tuple,$genome2,%ssH);

    my $min_PCHs      = $parms->{'minPCHs'} ? $parms->{'minPCHs'} : 4;
    my $max_psc       = $parms->{'max_psc'} ? $parms->{'max_psc'} : 1.0e-20;
    my $min_sig       = $parms->{'min_sig'} ? $parms->{'min_sig'} : 2;

    $ncols = @$cluster;
    $row = &format_row(&FIG::genome_of($cluster->[0]),[map { [$_] } @$cluster]);
    $ss  = [$row];

    for ($col1=0; ($col1 < @$cluster); $col1++)
    {
	$peg1 = $cluster->[$col1];
	my @coupled = $fig->coupling_and_evidence($peg1,5000,$max_psc,$min_PCHs,1);
	if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["coupled:", \@coupled]) }

	foreach $tuple (@coupled)
	{
	    (undef,$peg2,$ev) = @$tuple;
	    for ($col2=0; ($col2 < @$cluster) && ($cluster->[$col2] ne $peg2); $col2++) {}
	    if ($col2 < @$cluster)
	    {
		foreach $ev_tuple (@$ev)
		{
		    ($peg1b,$peg2b) = @$ev_tuple;
		    $genome2 = &FIG::genome_of($peg1b);
		    $ssH{$genome2}->{$col1}->{$peg1b} = 1;
		    $ssH{$genome2}->{$col2}->{$peg2b} = 1;
		}
	    }
	}
    }

    my($i,$genome);
    foreach $genome (sort { $a <=> $b } keys(%ssH))
    {
	if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["checking $genome",$ssH{$genome},$critical]) }
	my $ok = 0;
	for ($i=0; ($i < @$critical); $i++)
	{
	    if ($ssH{$genome}->{$critical->[$i]} && &no_sub_sig($fig,$parms,$ssH{$genome}->{$critical->[$i]}))
	    {
		$ok++;
	    }
	}

	if ($ok >= $min_sig)
	{
	    my $cols = [];
	    for ($i=0; ($i < @$cluster); $i++)
	    {
		$_ = $ssH{$genome}->{$i};
		push(@$cols, $_ ? [keys(%$_)] : []);
	    }
	    push(@$ss,&format_row($genome,$cols));
	}
    }
    return (@$ss > 4) ? $ss : undef;
}

sub format_row {
    my($genome,$cols) = @_;

    return [$genome,0,$cols];
}

sub write_css {
    my($fig,$parms,$css,$cluster,$critical,$outputD) = @_;

    my $name = &compose($css->[0]->[2]);
#    print STDERR &Dumper($name,$css);

    open(CSS,">$outputD/$name") || die "aborted";
    my $min_sig       = $parms->{'min_sig'} ? $parms->{'min_sig'} : 2;
    print CSS join("\t",@$critical),"\n$min_sig\n//\n";

    my($tuple,$col);
    foreach $tuple (@$css)
    {
	my($genome,$variant,$cols) = @$tuple;
	print CSS "$genome\t$variant";
	foreach $col (@$cols)
	{
	    my @ids = map { $_ =~ /^fig\|\d+\.\d+\.peg\.(\d+)$/; $1 } @$col;
	    print CSS "\t",join(",",@ids);
	}
	print CSS "\n";
    }
    close(CSS);
}

sub no_sub_sig {
    my($fig,$parms,$pegs) = @_;

    my(@pegs) = keys(%$pegs);
    if (@pegs != 1) { return 0 }

    my $existing_subs = $parms->{'subsystems'};
    my @subs = map { $_ =~ s/ /_/g; $_ } $fig->peg_to_subsystems($pegs[0]);

    my $i;
    for ($i=0; ($i < @subs) && (! $existing_subs->{$subs[$i]}); $i++) {}
    if ($i == @subs)
    {
	my $max_psc       = $parms->{'max_psc'} ? $parms->{'max_psc'} : 1.0e-20;
	my $genome = &FIG::genome_of($pegs[0]);

	my $pseq = $fig->get_translation($pegs[0]);
	my @close = &similar_prots($pseq,"$FIG_Config::organisms/$genome/Features/peg/fasta",$max_psc);
	if (@close == 1)
	{
	    return 1;
	}
    }
    return 0;
}

sub compose {
    my($cols) = @_;
    
    my @all_pegs = sort { $a <=> $b } 
                  map { $_ =~ /(\d+)$/; $1 } 
                  map { @$_ } @$cols;
    return join("-",($all_pegs[0],"..." . scalar @all_pegs . "...",$all_pegs[$#all_pegs]));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3