[Bio] / FigKernelPackages / Clustering.pm Repository:
ViewVC logotype

View of /FigKernelPackages/Clustering.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Sun Feb 21 17:47:01 2010 UTC (10 years ago) by overbeek
Branch: MAIN
Changes since 1.2: +1 -0 lines
min_dist is same as single-linkage

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package Clustering;

use Carp;
use Data::Dumper;
use tree_utilities;

# $connections->{$object1} ->{$object2} is the distance between $object1 and $object2, if it is defined (undef
# is equivalent to infinity)
#
sub cluster {
    my($connections,$max_dist,$dist_func_ref,$things) = @_;

    if (! ref($dist_func_ref))
    {
	if    ($dist_func_ref eq "avg_dist")            { $dist_func_ref = \&avg_dist }
	elsif ($dist_func_ref eq "max_dist")            { $dist_func_ref = \&max_dist }
	elsif ($dist_func_ref eq "single_linkage_dist") { $dist_func_ref = \&single_linkage_dist }
	elsif ($dist_func_ref eq "min_dist")            { $dist_func_ref = \&single_linkage_dist }
	else { confess "Could not resolve the distance function" }
    }
    my @clusters = defined($things) ? @$things :
                                      map { [$_] } keys(%$connections);
    my @trees    = map { [$_->[0],0,[undef]] } @clusters;

    my ($cI,$cJ,$d) = &closest($connections,\@clusters,$max_dist,$dist_func_ref);
    while (defined($cI))
    {
	my $treeI   = $trees[$cI];
	my $treeJ   = $trees[$cJ];
	my $parent  = ['',0,[0,$treeI,$treeJ]];
	$treeI->[2]->[0] = $treeJ->[2]->[0] = $parent;
	$treeI->[1] = $treeJ->[1] = $d/2;
	$trees[$cI] = $parent;
	splice(@trees,$cJ,1);

	push(@{$clusters[$cI]},@{$clusters[$cJ]});
	splice(@clusters,$cJ,1);
	($cI,$cJ,$d) = &closest($connections,\@clusters,$max_dist,$dist_func_ref);
    }
    return (\@clusters,\@trees);
}

sub closest {
    my($connections,$clusters,$max_dist,$dist_func_ref) = @_;

    my($i,$j,$best,$bestI,$bestJ);
    for ($i=0; ($i < (@$clusters - 1)); $i++)
    {
	for ($j=$i+1; ($j < @$clusters); $j++)
	{
	    my $dist = &$dist_func_ref($connections,$clusters->[$i],$clusters->[$j]);
	    if (defined($dist) && ($dist <= $max_dist))
	    {
		if ((! defined($best)) || ($best > $dist))
		{
		    $bestI = $i;
		    $bestJ = $j;
		    $best  = $dist;
		}
	    }
	}
    }
    return ($bestI,$bestJ,$best);
}
    
sub single_linkage_dist {
    my($connections,$clust1,$clust2) = @_;

    my $best;
    foreach my $x (@$clust1)
    {
	foreach my $y (@$clust2)
	{
	    my $dist = $connections->{$x}->{$y};
	    if ((! defined($best)) || (defined($dist) && ($dist < $best)))
	    {
		$best = $dist;
	    }
	}
    }
    return $best;
}

sub max_dist {
    my($connections,$clust1,$clust2) = @_;

    my $best;
    foreach my $x (@$clust1)
    {
	foreach my $y (@$clust2)
	{
	    my $dist = $connections->{$x}->{$y};
	    if ((! defined($best)) || (defined($dist) && ($dist > $best)))
	    {
		$best = $dist;
	    }
	}
    }
    return $best;
}

sub avg_dist {
    my($connections,$clust1,$clust2) = @_;

    my $sum = 0;
    my $n   = 0;
    foreach my $x (@$clust1)
    {
	foreach my $y (@$clust2)
	{
	    my $dist = $connections->{$x}->{$y};
	    $n++;
	    $sum += $dist;
	}
    }
    return $n ? ($sum/$n) : undef;
}

1

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3