Source code for nipy.algorithms.clustering.utils

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
#from _clustering import *
#from _clustering import __doc__
from __future__ import print_function, absolute_import

import numpy as np


[docs]def kmeans(X, nbclusters=2, Labels=None, maxiter=300, delta=0.0001, verbose=0, ninit=1): """ kmeans clustering algorithm Parameters ---------- X: array of shape (n,p): n = number of items, p = dimension data array nbclusters (int), the number of desired clusters Labels = None array of shape (n) prior Labels. if None or inadequate a random initilization is performed. maxiter=300 (int), the maximum number of iterations before convergence delta: float, optional, the relative increment in the results before declaring convergence. verbose: verbosity mode, optionall ninit: int, optional, number of random initalizations Returns ------- Centers: array of shape (nbclusters, p), the centroids of the resulting clusters Labels : array of size n, the discrete labels of the input items J (float): the final value of the inertia criterion """ nbitems = X.shape[0] if nbitems < 1: if verbose: raise ValueError(" I need at least one item to cluster") if np.size(X.shape) > 2: if verbose: raise ValueError("Please enter a two-dimensional array \ for clustering") if np.size(X.shape) == 1: X = np.reshape(X, (nbitems, 1)) X = X.astype('d') nbclusters = int(nbclusters) if nbclusters < 1: if verbose: print(" cannot compute less than 1 cluster") nbclusters = 1 if nbclusters > nbitems: if verbose: print(" cannot find more clusters than items") nbclusters = nbitems if (ninit < 1) & verbose: print("making at least one iteration") ninit = np.maximum(int(ninit), 1) if Labels is not None: if np.size(Labels) == nbitems: Labels = Labels.astype(np.int) OK = (Labels.min() > -1) & (Labels.max() < nbclusters + 1) if OK: maxiter = int(maxiter) if maxiter > 0: delta = float(delta) if delta < 0: if verbose: print("incorrect stopping criterion - ignored") delta = 0.0001 else: pass else: if verbose: print("incorrect number of iterations - ignored") maxiter = 300 else: if verbose: print("incorrect labelling - ignored") else: if verbose: print("incompatible number of labels provided - ignored") Centers, labels, J = _kmeans(X, nbclusters, Labels, maxiter, delta, ninit) return Centers, labels, J
def _MStep(x, z, k): """Computation of cluster centers/means Parameters ---------- x array of shape (n,p) where n = number of samples, p = data dimension z, array of shape (x.shape[0]) current assignment k, int, number of desired clusters Returns ------- centers, array of shape (k,p) the resulting centers """ dim = x.shape[1] centers = np.repeat(np.reshape(x.mean(0), (1, dim)), k, 0) for q in range(k): if np.sum(z == q) == 0: pass else: centers[q] = np.mean(x[z == q], 0) return centers def _EStep(x, centers): """ Computation of the input-to-cluster assignment Parameters ---------- x array of shape (n,p) n = number of items, p = data dimension centers, array of shape (k,p) the cluster centers Returns ------- z vector of shape(n), the resulting assignment """ nbitem = x.shape[0] z = - np.ones(nbitem).astype(np.int) mindist = np.inf * np.ones(nbitem) k = centers.shape[0] for q in range(k): dist = np.sum((x - centers[q]) ** 2, 1) z[dist < mindist] = q mindist = np.minimum(dist, mindist) J = mindist.sum() return z, J
[docs]def voronoi(x, centers): """ Assignment of data items to nearest cluster center Parameters ---------- x array of shape (n,p) n = number of items, p = data dimension centers, array of shape (k, p) the cluster centers Returns ------- z vector of shape(n), the resulting assignment """ if np.size(x) == x.shape[0]: x = np.reshape(x, (np.size(x), 1)) if np.size(centers) == centers.shape[0]: centers = np.reshape(centers, (np.size(centers), 1)) if x.shape[1] != centers.shape[1]: raise ValueError("Inconsistent dimensions for x and centers") return _EStep(x, centers)[0]
def _kmeans(X, nbclusters=2, Labels=None, maxiter=300, delta=1.e-4, ninit=1, verbose=0): """ kmeans clustering algorithm Parameters ---------- X: array of shape (n,p): n = number of items, p = dimension data array nbclusters (int), the number of desired clusters Labels: array of shape (n) prior Labels, optional if None or inadequate a random initilization is performed. maxiter: int, optional the maximum number of iterations before convergence delta: float, optional the relative increment in the results before declaring convergence. verbose=0: verboseity mode Returns ------- Centers: array of shape (nbclusters, p), the centroids of the resulting clusters Labels: array of size n, the discrete labels of the input items J, float, the final value of the inertia criterion """ # fixme: do the checks nbitem = X.shape[0] vdata = np.mean(np.var(X, 0)) bJ = np.inf for it in range(ninit): # init if Labels is None: seeds = np.argsort(np.random.rand(nbitem))[:nbclusters] centers = X[seeds] else: centers = _MStep(X, Labels, nbclusters) centers_old = centers.copy() # iterations for i in range(maxiter): z, J = _EStep(X, centers) centers = _MStep(X, z, nbclusters) if verbose: print(i, J) if np.sum((centers_old - centers) ** 2) < delta * vdata: if verbose: print(i) break centers_old = centers.copy() if J < bJ: bJ = J centers_output = centers.copy() z_output = z.copy() else: centers_output = centers z_output = z return centers_output, z_output, bJ