# Source code for infpy.madbayes.dpmeans

#
# Copyright John Reid 2013
#


"""
The DP-means algorithm by `Kulis et al.`_

.. _Kulis et al.: http://arxiv.org/abs/1111.0352    
"""

import logging, numpy
from itertools import count
from numpy import newaxis


def plot_clusters(x, z, format_cycler=None):
    """Scatter plot the 2-D points ``x``, one marker/colour style per cluster.

    :param x: (N, 2) array of points.
    :param z: length-N array of cluster indicators.
    :param format_cycler: optional callable mapping a cluster's position to
        scatter keyword arguments; when omitted a default marker/colour
        cycler is created from the cookbook utilities.
    """
    import pylab as P
    from cookbook.pylab_utils import create_format_cycler, simple_marker_styles, simple_colours

    if not format_cycler:
        format_cycler = create_format_cycler(marker=simple_marker_styles,
                                             color=simple_colours)
    # One scatter call per distinct cluster label, cycling plot styles.
    for style_idx, cluster in enumerate(set(z)):
        members = x[z == cluster]
        P.scatter(members[:, 0], members[:, 1], **format_cycler(style_idx))
def dpmeans(x, lambda_, progress_plots=False):
    r"""
    The DP-means algorithm by `Kulis et al.`_

    .. _Kulis et al.: http://arxiv.org/abs/1111.0352

    :parameters:
        - :math:`x` : input data, a sequence of length :math:`N`
        - :math:`\lambda` : cluster penalty parameter
        - progress_plots : Scatter plot clusters at every iteration

    :returns:
        Cluster indicator variables

    Algorithm:

    #. Initialise:

        * Number of clusters :math:`K=1`
        * Global cluster mean :math:`\mu_1 = \frac{1}{n} \sum_n x_n`
        * Cluster indicator variables :math:`z_n = 0 \quad \forall n`

    #. Repeat until convergence:

        * For each point :math:`n`:

            - Compute distance to each cluster :math:`d_{nk} = ||x_n - \mu_k||^2`
            - If :math:`\min d_{nk} > \lambda` then set
              :math:`K=K+1, z_n=K, \mu_k=x_n`
            - Otherwise set :math:`z_n = \arg\!\min_k d_{nk}`

        * For each cluster :math:`k`, compute
          :math:`\mu_k = \frac{1}{|\{n: z_n = k\}|}\sum_{n: z_n = k} x_n`
    """
    if progress_plots:
        # Plotting dependencies are only needed (and imported) on demand.
        import pylab as P
        from cookbook.pylab_utils import pylab_context_ioff, \
            create_format_cycler, simple_marker_styles, simple_colours
        format_cycler = create_format_cycler(marker=simple_marker_styles,
                                             color=simple_colours)
    N = len(x)
    logging.info('Got %d data', N)
    # Work with squared distances throughout to avoid square roots.
    lambda2 = lambda_ ** 2
    # BUG FIX: numpy.int was deprecated and removed (NumPy 1.24);
    # the builtin int dtype is equivalent.
    z = numpy.zeros(N, dtype=int)  # initialise cluster indicators
    last_z = None
    for i in count(1):
        logging.info('Iteration %d: have %d cluster(s)', i, int(z.max() + 1))
        # calculate cluster means from the current assignments
        mu = [numpy.mean(x[z == k], axis=0) for k in range(int(z.max() + 1))]
        for n, xn in enumerate(x):
            # squared distance from this point to every cluster mean
            d2 = numpy.array([((xn - muk) ** 2).sum() for muk in mu])
            closest_k = d2.argmin()
            if d2[closest_k] > lambda2:
                # Too far from every cluster: open a new one at this point.
                # BUG FIX: the point must be assigned to the new cluster
                # (z_n = K in the docstring); the original never set z[n]
                # here, so new clusters could never retain any members.
                z[n] = len(mu)
                mu.append(xn)
            else:
                z[n] = closest_k
        # make cluster indices contiguous from 0 (clusters can be emptied)
        cluster_map = dict((c, k) for k, c in enumerate(set(z)))
        if len(cluster_map) < int(z.max() + 1):
            logging.warning('Reducing cluster indices')
            for n, c in enumerate(z):
                z[n] = cluster_map[c]
        # converged when no indicator changed since the last iteration
        if last_z is not None and (z == last_z).all():
            break
        last_z = z.copy()
        if progress_plots:
            with pylab_context_ioff():
                P.figure()
                plot_clusters(x, z, format_cycler)
                P.savefig('dpmeans-%04d.png' % i)
                P.close()
    num_clusters = len(set(z))
    logging.info('Have %d cluster(s)', num_clusters)
    return z