[Bio] / FigKernelScripts / find_maximal_clusters.pl Repository:
ViewVC logotype

View of /FigKernelScripts/find_maximal_clusters.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use FIG;
my $fig = new FIG;

$usage = "usage: find_maximal_clusters Genome [Exclude] > ClusterData";

(
 ($genome = shift @ARGV) && (-d "$FIG_Config::organisms/$genome")
)
    || die $usage;

if ((@ARGV > 0) && open(EXCLUDE,"<$ARGV[0]"))
{
    while (defined($_ = <EXCLUDE>))
    {
	while ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/g)
	{
	    $exclude{$1} = 1;
	}
    }
    close(EXCLUDE);
}

foreach $peg (sort { &FIG::by_fig_id($a,$b) } $fig->all_features($genome,"peg"))
{
    if ((! $exclude{$peg}) && (! $processed{$peg}))
    {
	if ($ENV{'VERBOSE'}) { print STDERR "processing $peg\n" }
	$cluster = &in_cluster($peg);
	if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["produced cluster",$cluster]) }

	if (@$cluster > 1)
	{
	    $stack     = [$peg];        # pegs to be expanded to clusters
	    $in_sets   = {};            # pegs are in "similarity sets"
	    $next_set  = 0;             # sets are numbered 0,1,2...
	    $clusters  = {};            # a cluster is a sorted list of similarity sets (e.g., "0,2,3,8")
	                                #    This hash takes the similarity set name to [Genome-Beg...N...End]
	                                #    That is, to a list of pointers to specific clusters in genomes.
	    &add_cluster_to_sets($cluster,$in_sets,\$next_set);
	    $must_cluster = &abstract_name($cluster,$in_sets);
	    print "cluster\t",join(",",@$cluster),"\n";

	    &process($fig,$stack,$clusters,$in_sets,\$next_set,[split(/,/,$must_cluster)]);
	    foreach $peg1 (@$cluster)
	    {
		$processed{$peg1} = 1;
	    }
	    &write_output($clusters);
	}
    }
}

sub write_output {
    my($clusters) = @_;
    my(@clusters,$cluster,$specificH,$abstract,$cluster1,@remaining);

    if ($ENV{'VERBOSE'}) { print STDERR "writing ",&Dumper($clusters) }

    @clusters = ();
    foreach $cluster (keys(%$clusters))
    {
	$abstract  = [split(/,/,$cluster)];
	$specificH = $clusters->{$cluster};
	foreach $specific (keys(%$specificH))
	{
	    push(@clusters,[$abstract,$specific,$clusters->{$cluster}->{$specific}]);
	}
    }

    @clusters = sort { @{$b->[0]} <=> @{$a->[0]} } @clusters;
    if ($ENV{'VERBOSE'}) { print STDERR "writing expanded ",&Dumper(\@clusters) }

    while ($cluster = shift @clusters)
    {
	&write_output1($cluster,"maximal");
	@remaining = ();
	foreach $cluster1 (@clusters)
	{
	    if (&subsumes($cluster->[0],$cluster1->[0]))
	    {
		&write_output1($cluster1,"subsumed");
	    }
	    else
	    {
		push(@remaining,$cluster1);
	    }
	}
	@clusters = @remaining;
    }
    print "//\n";
}

sub subsumes {
    my($cluster1,$cluster2) = @_;
    my $i;

    my %set1 = map { $_ => 1 } @$cluster1;
    for ($i=0; ($i < @$cluster2) && $set1{$cluster2->[$i]}; $i++) {}
    return ($i == @$cluster2);
}

sub write_output1 {
    my($cluster,$type) = @_;

    print "$type\t",join(",",@{$cluster->[0]}),"\t$cluster->[1]\t",join(",",@{$cluster->[2]}),"\n";
}

sub process {
    my($fig,$stack,$clusters,$in_sets,$next_setP,$must_contain) = @_;
    my($peg1,$peg2,$peg3,$peg4,$cluster,$abstract_cluster,$specific_cluster,$tuple);

    my $stacked = {};
    while ($peg1 = pop @$stack)
    {
	if ($ENV{'VERBOSE'}) { print STDERR "    popped $peg1 leaving ",scalar @$stack,"\n"; }
	if ((! $expanded{$peg1}) && ($cluster = &in_cluster($peg1)))
	{
	    if ($ENV{'VERBOSE'}) { print STDERR &Dumper(["expanded_cluster",$cluster]) }
	    &add_cluster_to_sets($cluster,$in_sets,$next_setP);
	    $abstract_cluster = &abstract_name($cluster,$in_sets);
	    if ($ENV{'VERBOSE'}) { print STDERR "    abstract=$abstract_cluster\n" }

	    if (&subsumes([split(/,/,$abstract_cluster)],$must_contain))
	    {
		if ($ENV{'VERBOSE'}) { print STDERR "    subsumed original cluster\n" }
		$specific_cluster = &specific_name($cluster);
		if ($ENV{'VERBOSE'}) { print STDERR "    specific=$specific_cluster\n" }
		$clusters->{$abstract_cluster}->{$specific_cluster} = $cluster;

		foreach $peg2 (@$cluster)
		{
		    foreach $tuple ($fig->coupling_evidence($peg1,$peg2))
		    {
			($peg3,$peg4) = @$tuple;
			&add_to_sets($peg3,$in_sets,$peg1,$stack,$stacked);
			&add_to_sets($peg4,$in_sets,$peg2,$stack,$stacked);
		    }
		}
		foreach $peg2 (@$cluster)
		{
		    $expanded{$peg2} = 1;
		}
	    }
	}
    }
}
		
sub add_to_sets {
    my($peg_to_add,$in_sets,$peg,$stack,$stacked) = @_;
    my($set);

    foreach $set (keys(%{$in_sets->{$peg}}))
    {
	$in_sets->{$peg_to_add}->{$set} = 1;
	if ($ENV{'VERBOSE'}) { print STDERR "    added $peg_to_add to $set\n" }
    }
    if (! $stacked->{$peg_to_add})
    {
	push(@$stack,$peg_to_add);
	$stacked->{$peg_to_add} = 1;
	if ($ENV{'VERBOSE'}) { print STDERR "    stacked $peg_to_add\n" }
    }
}

sub in_cluster {
    my($peg) = @_;
    my(%seen,$i,$peg1);

    my @cluster = ($peg);
    $seen{$peg} = 1;
    for ($i=0; ($i < @cluster); $i++)
    {
	foreach $peg1 (map { $_->[0] } grep { $_->[1] >= 4 } $fig->coupled_to($cluster[$i]))
	{
	    if (! $seen{$peg1})
	    {
		push(@cluster,$peg1);
		$seen{$peg1} = 1;
	    }
	}
    }
    @cluster = sort { &FIG::by_fig_id($a,$b) } @cluster;
    return (@cluster > 1) ? \@cluster : undef;
}

sub abstract_name {
    my($cluster,$in_sets) = @_;

    return join(",",sort { $a <=> $b } map { keys(%{$in_sets->{$_}}) } @$cluster);
}

sub specific_name {
    my ($cluster) = @_;
    my($genome,$beg,$end,$n);

    if ($cluster->[0] =~ /fig\|(\d+\.\d+)\.peg\.(\d+)/) 
    {
	$genome = $1;
	$beg = $2;
	if ($cluster->[$#{$cluster}] =~ /fig\|\d+\.\d+\.peg\.(\d+)/)
	{
	    $end = $1;
	    $n = @$cluster;
	    return "$genome-$beg..$n..$end";
	}
    }
    print STDERR &Dumper($cluster);
    die "could not get specific name";
}
    
sub add_cluster_to_sets {
    my($cluster,$in_sets,$next_setP) = @_;

    my $peg2;
    foreach $peg2 (@$cluster)
    {
	if (! $in_sets->{$peg2})
	{
	    if ($ENV{'VERBOSE'}) { print STDERR "    placed $peg2 into $$next_setP\n" }
	    $in_sets->{$peg2}->{$$next_setP} = 1;
	    $$next_setP++;
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3