Source code for nipy.algorithms.clustering.hierarchical_clustering

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
"""
These routines perform some hierrachical agglomerative clustering
of some input data. The following alternatives are proposed:
- Distance based average-link
- Similarity-based average-link
- Distance based maximum-link
- Ward's algorithm under graph constraints
- Ward's algorithm without graph constraints

In this latest version, the results are returned in a 'WeightedForest'
structure, which gives access to the clustering hierarchy, facilitates
the plot of the result etc.

For back-compatibility, *_segment versions of the algorithms have been
appended, with the old API (except the qmax parameter, which now
represents the number of wanted clusters)

Author : Bertrand Thirion,Pamela Guevara, 2006-2009
"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division

#---------------------------------------------------------------------------
# ------ Routines for Agglomerative Hierarchical Clustering ----------------
# --------------------------------------------------------------------------

import numpy as np
from warnings import warn

from ..graph.graph import WeightedGraph
from ..graph.forest import Forest


class WeightedForest(Forest):
    """ This is a weighted Forest structure, i.e. a tree
    - each node has one parent and children
    (hierarchical structure)
    - some of the nodes can be viewed as leaves, others as roots
    - the edges within a tree are associated with a weight:
    +1 from child to parent
    -1 from parent to child
    - additionally, the nodes have a value, called 'height',
    which is especially useful for dendrograms

    Members
    -------
    V : (int, > 0) the number of vertices
    E : (int) the number of edges
    parents : array of shape (self.V), the parent array
    edges : array of shape (self.E, 2), representing pairwise neighbors
    weights : array of shape (self.E), +1/-1 for ascending/descending links
    children : list of arrays that represents the children of any node
    height : array of shape (self.V)
    """

    def __init__(self, V, parents=None, height=None):
        """
        Parameters
        ----------
        V : int
            the number of vertices of the graph
        parents : array of shape (V), optional
            the parents of the graph;
            by default, the parents are set to range(V), i.e. each
            node is its own parent, and each node is a tree
        height : array of shape (V), optional
            the height of the nodes
        """
        V = int(V)
        if V < 1:
            raise ValueError('cannot create graphs with no vertex')
        self.V = int(V)

        # define the parents
        if parents is None:
            self.parents = np.arange(self.V)
        else:
            if np.size(parents) != V:
                raise ValueError('Incorrect size for parents')
            # valid parent indices run from 0 to V - 1
            if parents.max() >= self.V:
                raise ValueError('Incorrect value for parents')
            self.parents = np.reshape(parents, self.V)

        self.define_graph_attributes()

        if self.check() == 0:
            raise ValueError('The proposed structure is not a forest')
        self.children = []

        if height is None:
            height = np.zeros(self.V)
        else:
            if np.size(height) != V:
                raise ValueError('Incorrect size for height')
        self.height = np.reshape(height, self.V)

    def set_height(self, height=None):
        """Set the height array."""
        if height is None:
            height = np.zeros(self.V)

        if np.size(height) != self.V:
            raise ValueError('Incorrect size for height')

        self.height = np.reshape(height, self.V)

    def get_height(self):
        """Get the height array."""
        return self.height

    def check_compatible_height(self):
        """Check that height[parents[i]] >= height[i] for all nodes."""
        OK = True
        for i in range(self.V):
            if self.height[self.parents[i]] < self.height[i]:
                OK = False
        return OK

    def plot(self, ax=None):
        """Plot the dendrogram associated with self;
        the rank of the data in the dendrogram is returned

        Parameters
        ----------
        ax : axis handle, optional

        Returns
        -------
        ax, the axis handle
        """
        import matplotlib.pylab as mp
        if not self.check_compatible_height():
            raise ValueError('cannot plot myself in my current state')

        n = np.sum(self.isleaf())

        # 1. find a permutation of the leaves that makes it nice
        aux = _label(self.parents)
        temp = np.zeros(self.V)
        rank = np.arange(self.V)
        temp[:n] = np.argsort(aux[:n])
        for i in range(n):
            rank[int(temp[i])] = i

        # 2. derive the abscissa in the dendrogram
        idx = np.zeros(self.V)
        temp = np.argsort(rank[:n])
        for i in range(n):
            idx[temp[i]] = i
        for i in range(n, self.V):
            j = np.nonzero(self.parents == i)[0]
            idx[i] = np.mean(idx[j])

        # 3. plot
        if ax is None:
            mp.figure()
            ax = mp.subplot(1, 1, 1)

        for i in range(self.V):
            h1 = self.height[i]
            h2 = self.height[self.parents[i]]
            mp.plot([idx[i], idx[i]], [h1, h2], 'k')

        ch = self.get_children()
        for i in range(self.V):
            if np.size(ch[i]) > 0:
                lidx = idx[ch[i]]
                m = lidx.min()
                M = lidx.max()
                h = self.height[i]
                mp.plot([m, M], [h, h], 'k')

        cM = 1.05 * self.height.max() - 0.05 * self.height.min()
        cm = 1.05 * self.height.min() - 0.05 * self.height.max()
        mp.axis([-1, idx.max() + 1, cm, cM])
        return ax

    def partition(self, threshold):
        """ Partition the tree according to a cut criterion."""
        valid = self.height < threshold
        f = self.subforest(valid)
        u = f.cc()
        return u[f.isleaf()]

    def split(self, k):
        """ Same as partition, but the number of components
        is supplied instead of a threshold.
        """
        k = int(k)
        if k > self.V:
            k = self.V
        nbcc = self.cc().max() + 1

        if k <= nbcc:
            u = self.cc()
            return u[self.isleaf()]

        sh = np.sort(self.height)
        th = sh[nbcc - k]
        u = self.partition(th)
        return u
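
    # Illustrative sketch (not part of the original API): the difference
    # between partition() and split() on a small forest where node 4 is the
    # root, node 3 is internal and nodes 0-2 are leaves; heights are
    # arbitrary but increase from children to parents.
    #
    #     >>> parents = np.array([3, 3, 4, 4, 4])
    #     >>> height = np.array([0., 0., 0., 1., 2.])
    #     >>> wf = WeightedForest(5, parents, height)
    #     >>> wf.partition(1.5)    # cut below height 1.5: leaves 0, 1 merge
    #     >>> wf.split(2)          # ask for exactly 2 leaf clusters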

    def plot_height(self):
        """Plot the height of the non-leaf nodes."""
        import matplotlib.pylab as mp
        mp.figure()
        sh = np.sort(self.height[self.isleaf() == False])
        n = np.sum(self.isleaf() == False)
        mp.bar(np.arange(n), sh)

    def list_of_subtrees(self):
        """ Returns the list of all non-trivial subtrees in the graph

        Caveat: this function assumes that the vertices are sorted in
        such a way that parent[i] > i for all i.
        Only the leaves are listed, not the subtrees themselves.
        """
        lst = []
        n = np.sum(self.isleaf())
        for i in range(self.V):
            lst.append(np.array([], np.int))
        for i in range(n):
            lst[i] = np.array([i], np.int)
        for i in range(self.V - 1):
            j = self.parents[i]
            lst[j] = np.hstack((lst[i], lst[j]))

        return lst[n:self.V]
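

# Illustrative sketch (not part of the original module): with the same toy
# forest as in the comment above, the height array is non-decreasing from
# children to parents, which check_compatible_height() verifies and plot()
# requires:
#
#     >>> import numpy as np
#     >>> parents = np.array([3, 3, 4, 4, 4])
#     >>> wf = WeightedForest(5, parents, np.array([0., 0., 0., 1., 2.]))
#     >>> wf.check_compatible_height()
#     True
#     >>> wf.list_of_subtrees()    # leaves below internal nodes 3 and 4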

#--------------------------------------------------------------------------
#------------- Average link clustering on a graph -------------------------
#--------------------------------------------------------------------------

def fusion(K, pop, i, j, k):
    """ Modifies the graph K to merge nodes i and j into node k

    The resulting similarity values are weighted averages, where pop[i]
    and pop[j] yield the relative weights.
    This is used in average_link_slow (deprecated).
    """
    fi = float(pop[i]) / (pop[k])
    fj = 1.0 - fi
    #
    # replace i by k
    #
    idxi = np.nonzero(K.edges[:, 0] == i)
    K.weights[idxi] = K.weights[idxi] * fi
    K.edges[idxi, 0] = k
    idxi = np.nonzero(K.edges[:, 1] == i)
    K.weights[idxi] = K.weights[idxi] * fi
    K.edges[idxi, 1] = k
    #
    # replace j by k
    #
    idxj = np.nonzero(K.edges[:, 0] == j)
    K.weights[idxj] = K.weights[idxj] * fj
    K.edges[idxj, 0] = k
    idxj = np.nonzero(K.edges[:, 1] == j)
    K.weights[idxj] = K.weights[idxj] * fj
    K.edges[idxj, 1] = k
    #
    # sum/remove double edges
    #
    # left side
    idxk = np.nonzero(K.edges[:, 0] == k)[0]
    corr = K.edges[idxk, 1]
    scorr = np.sort(corr)
    acorr = np.argsort(corr)
    for a in range(np.size(scorr) - 1):
        if scorr[a] == scorr[a + 1]:
            i1 = idxk[acorr[a]]
            i2 = idxk[acorr[a + 1]]
            K.weights[i1] = K.weights[i1] + K.weights[i2]
            K.weights[i2] = - np.inf
            K.edges[i2] = -1

    # right side
    idxk = np.nonzero(K.edges[:, 1] == k)[0]
    corr = K.edges[idxk, 0]
    scorr = np.sort(corr)
    acorr = np.argsort(corr)
    for a in range(np.size(scorr) - 1):
        if scorr[a] == scorr[a + 1]:
            i1 = idxk[acorr[a]]
            i2 = idxk[acorr[a + 1]]
            K.weights[i1] = K.weights[i1] + K.weights[i2]
            K.weights[i2] = - np.inf
            K.edges[i2] = - 1
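

# Worked numbers for fusion() (illustrative): if cluster i has pop[i] = 2
# members and cluster j has pop[j] = 1, the merged cluster k has pop[k] = 3,
# so fi = 2 / 3 and fj = 1 / 3. An edge of weight 0.9 incident to i and an
# edge of weight 0.3 incident to j that point to the same third node are
# rescaled to 0.6 and 0.1, then summed into a single edge of weight 0.7:
# the population-weighted average similarity.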

#--------------------------------------------------------------------------
#------------- Ward's algorithm with graph constraints ---------------------
#--------------------------------------------------------------------------


def _inertia_(i, j, Features):
    """ Compute the variance of the set which is
    the concatenation of Features[i] and Features[j]
    """
    if np.size(np.shape(Features[i])) < 2:
        print(i, np.shape(Features[i]), Features[i])
    if np.size(np.shape(Features[j])) < 2:
        print(j, np.shape(Features[j]), Features[j])
    if np.shape(Features[i])[1] != np.shape(Features[j])[1]:
        print(i, j, np.shape(Features[i]), np.shape(Features[j]))
    localset = np.vstack((Features[i], Features[j]))
    return np.var(localset, 0).sum()


def _inertia(i, j, Features):
    """ Compute the inertia (sum of squared deviations from the mean)
    of the set which is the concatenation of Features[i] and Features[j],
    using the moments (counts, sums, sums of squares) stored in Features
    """
    n = Features[0][i] + Features[0][j]
    s = Features[1][i] + Features[1][j]
    q = Features[2][i] + Features[2][j]
    return np.sum(q - (s ** 2 / n))


def _initial_inertia(K, Features, seeds=None):
    """ Compute the variance associated with each
    edge-related pair of vertices.
    The result is written in K.weights.
    If seeds is provided (seeds != None),
    this is done only for vertices adjacent to the seeds.
    """
    if seeds is None:
        for e in range(K.E):
            i = K.edges[e, 0]
            j = K.edges[e, 1]
            ESS = _inertia(i, j, Features)
            K.weights[e] = ESS
    else:
        aux = np.zeros(K.V).astype('bool')
        aux[seeds] = 1
        for e in range(K.E):
            i = K.edges[e, 0]
            j = K.edges[e, 1]
            if (aux[i] or aux[j]):
                K.weights[e] = _inertia(i, j, Features)
            else:
                K.weights[e] = np.inf


def _auxiliary_graph(G, Features):
    """ Prepare a graph with twice the number of vertices;
    this graph will contain the connectivity information
    along the merges.
    """
    K = WeightedGraph(2 * G.V - 1)
    K.E = G.E
    K.edges = G.edges.copy()
    K.weights = np.ones(K.E)
    K.symmeterize()
    if K.E > 0:
        valid = K.edges[:, 0] < K.edges[:, 1]
        K.remove_edges(valid)
    #K.remove_trivial_edges()
    _initial_inertia(K, Features)
    return K


def _remap(K, i, j, k, Features, linc, rinc):
    """Modifies the graph K to merge nodes i and j into node k;
    the graph weights are modified accordingly

    Parameters
    ----------
    K : graph instance
        the existing graphical model
    i, j, k : int
        indexes of the nodes to be merged and of the parent, respectively
    Features : list of node-per-node features
    linc : list of K.V lists
        the left incidence structure (edges leaving each vertex)
    rinc : list of K.V lists
        the right incidence structure (edges entering each vertex)
    """
    # -------
    # replace i by k
    # -------
    idxi = np.array(linc[i]).astype(np.int)
    if np.size(idxi) > 1:
        for l in idxi:
            K.weights[l] = _inertia(k, K.edges[l, 1], Features)
    elif np.size(idxi) == 1:
        K.weights[idxi] = _inertia(k, K.edges[idxi, 1], Features)
    if np.size(idxi) > 0:
        K.edges[idxi, 0] = k

    idxi = np.array(rinc[i]).astype(np.int)
    if np.size(idxi) > 1:
        for l in idxi:
            K.weights[l] = _inertia(K.edges[l, 0], k, Features)
    elif np.size(idxi) == 1:
        K.weights[idxi] = _inertia(K.edges[idxi, 0], k, Features)
    if np.size(idxi) > 0:
        K.edges[idxi, 1] = k

    # ------
    # replace j by k
    # ------
    idxj = np.array(linc[j]).astype(np.int)
    if np.size(idxj) > 1:
        for l in idxj:
            K.weights[l] = _inertia(k, K.edges[l, 1], Features)
    elif np.size(idxj) == 1:
        K.weights[idxj] = _inertia(k, K.edges[idxj, 1], Features)
    if np.size(idxj) > 0:
        K.edges[idxj, 0] = k

    idxj = np.array(rinc[j]).astype(np.int)
    if np.size(idxj) > 1:
        for l in idxj:
            K.weights[l] = _inertia(k, K.edges[l, 0], Features)
    elif np.size(idxj) == 1:
        K.weights[idxj] = _inertia(k, K.edges[idxj, 0], Features)
    if np.size(idxj) > 0:
        K.edges[idxj, 1] = k

    # ------
    # update linc, rinc
    # ------
    # keep only the surviving edges (those not marked as removed)
    lidxk = [L for L in list(linc[j]) + list(linc[i])
             if K.edges[L, 1] != -1]
    linc[k] = lidxk
    linc[i] = []
    linc[j] = []
    ridxk = [L for L in list(rinc[j]) + list(rinc[i])
             if K.edges[L, 0] != -1]
    rinc[k] = ridxk
    rinc[i] = []
    rinc[j] = []

    # ------
    # remove double edges
    # ------
    # left side
    idxk = np.array(linc[k]).astype(np.int)
    if np.size(idxk) > 0:
        corr = K.edges[idxk, 1]
        scorr = np.sort(corr)
        acorr = np.argsort(corr)
        for a in range(np.size(scorr) - 1):
            if scorr[a] == scorr[a + 1]:
                i2 = idxk[acorr[a + 1]]
                K.weights[i2] = np.inf
                rinc[K.edges[i2, 1]].remove(i2)
                K.edges[i2] = - 1
                linc[k].remove(i2)
    # right side
    idxk = np.array(rinc[k]).astype(np.int)
    if np.size(idxk) > 0:
        corr = K.edges[idxk, 0]
        scorr = np.sort(corr)
        acorr = np.argsort(corr)
        for a in range(np.size(scorr) - 1):
            if scorr[a] == scorr[a + 1]:
                i2 = idxk[acorr[a + 1]]
                K.weights[i2] = np.inf
                linc[K.edges[i2, 0]].remove(i2)
                K.edges[i2] = - 1
                rinc[k].remove(i2)
    return linc, rinc
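

# Worked numbers for _inertia() (illustrative): merging two singleton 1D
# clusters holding the values 0. and 2. gives n = 2, s = 0. + 2. = 2. and
# q = 0. + 4. = 4., hence an inertia of q - s ** 2 / n = 4. - 2. = 2., i.e.
# (0. - 1.) ** 2 + (2. - 1.) ** 2, the sum of squared deviations from the
# merged mean. The merge loops below always pick the edge minimizing this
# cost.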


def ward_quick(G, feature, verbose=False):
    """ Agglomerative function based on a topology-defining graph
    and a feature matrix.

    Parameters
    ----------
    G : graph instance
        topology-defining graph
    feature : array of shape (G.V, dim_feature)
        some vectorial information related to the graph vertices
    verbose : bool, optional
        If True, print diagnostic information

    Returns
    -------
    t : WeightedForest instance
        structure that represents the dendrogram of the data

    Notes
    -----
    Presumably a quicker version of `ward`.
    A Euclidean distance is used in the feature space.
    Caveat : only approximate
    """
    warn('Function ward_quick from '
         'nipy.algorithms.clustering.hierarchical_clustering is '
         'deprecated and will be removed',
         FutureWarning,
         stacklevel=2)
    # basic check
    if feature.ndim == 1:
        feature = np.reshape(feature, (-1, 1))

    if feature.shape[0] != G.V:
        raise ValueError(
            "Incompatible dimension for the feature matrix and the graph")

    Features = [np.ones(2 * G.V), np.zeros((2 * G.V, feature.shape[1])),
                np.zeros((2 * G.V, feature.shape[1]))]
    Features[1][:G.V] = feature
    Features[2][:G.V] = feature ** 2

    n = G.V
    nbcc = G.cc().max() + 1

    # prepare a graph with twice the number of vertices
    K = _auxiliary_graph(G, Features)

    parent = np.arange(2 * n - nbcc).astype(np.int)
    height = np.zeros(2 * n - nbcc)
    linc = K.left_incidence()
    rinc = K.right_incidence()

    # iteratively merge clusters
    q = 0
    while q < n - nbcc:
        # 1. find the lightest edges
        aux = np.zeros(2 * n)
        ape = np.nonzero(K.weights < np.inf)
        ape = np.reshape(ape, np.size(ape))
        idx = np.argsort(K.weights[ape])

        for e in range(n - nbcc - q):
            i, j = K.edges[ape[idx[e]], 0], K.edges[ape[idx[e]], 1]
            if (aux[i] == 1) or (aux[j] == 1):
                break
            aux[i] = 1
            aux[j] = 1

        emax = np.maximum(e, 1)

        for e in range(emax):
            m = ape[idx[e]]
            cost = K.weights[m]
            k = q + n
            i = K.edges[m, 0]
            j = K.edges[m, 1]
            height[k] = cost
            if verbose:
                print(q, i, j, m, cost)

            # 2. remove the current edge
            K.edges[m] = -1
            K.weights[m] = np.inf
            linc[i].remove(m)
            rinc[j].remove(m)

            ml = linc[j]
            if np.sum(K.edges[ml, 1] == i) > 0:
                m = ml[int(np.flatnonzero(K.edges[ml, 1] == i))]
                K.edges[m] = -1
                K.weights[m] = np.inf
                linc[j].remove(m)
                rinc[i].remove(m)

            # 3. merge the edges with third-party edges
            parent[i] = k
            parent[j] = k
            for p in range(3):
                Features[p][k] = Features[p][i] + Features[p][j]

            linc, rinc = _remap(K, i, j, k, Features, linc, rinc)
            q += 1

    # build a tree to encode the results
    t = WeightedForest(2 * n - nbcc, parent, height)
    return t
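

# Illustrative sketch (not part of the original module): running ward_quick
# on a four-vertex chain graph with well-separated 1D features, assuming the
# WeightedGraph(V, edges, weights) constructor from
# nipy.algorithms.graph.graph. The edge list is given in both directions so
# that the graph is symmetric.
#
#     >>> import numpy as np
#     >>> edges = np.array([[0, 1], [1, 0], [1, 2], [2, 1], [2, 3], [3, 2]])
#     >>> g = WeightedGraph(4, edges, np.ones(6))
#     >>> feature = np.array([[0.], [0.1], [10.], [10.1]])
#     >>> tree = ward_quick(g, feature)
#     >>> tree.split(2)    # expected grouping: {0, 1} vs {2, 3}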


def ward_field_segment(F, stop=-1, qmax=-1, verbose=False):
    """Agglomerative function based on a field structure

    Parameters
    ----------
    F : the input field (graph + feature)
    stop : float, optional
        the stopping criterion. If stop == -1,
        then no stopping criterion is used
    qmax : int, optional
        the maximum number of desired clusters
        (in the limit of the stopping criterion)
    verbose : bool, optional
        If True, print diagnostic information

    Returns
    -------
    u : array of shape (F.V)
        labelling of the graph vertices according to the criterion
    cost : array of shape (F.V - 1)
        the cost of each merge step during the clustering procedure

    Notes
    -----
    See ward_quick_segment for more information

    Caveat : only approximate
    """
    u, cost = ward_quick_segment(F, F.field, stop, qmax, verbose)
    return u, cost


def ward_quick_segment(G, feature, stop=-1, qmax=1, verbose=False):
    """ Agglomerative function based on a topology-defining graph
    and a feature matrix.

    Parameters
    ----------
    G : nipy.algorithms.graph.graph.WeightedGraph instance
        the input graph (a topological graph essentially)
    feature : array of shape (G.V, dim_feature)
        vectorial information related to the graph vertices
    stop : int or float, optional
        the stopping criterion. If stop == -1, then no stopping
        criterion is used
    qmax : int, optional
        the maximum number of desired clusters
        (in the limit of the stopping criterion)
    verbose : bool, optional
        If True, print diagnostic information

    Returns
    -------
    u : array of shape (G.V)
        labelling of the graph vertices according to the criterion
    cost : array of shape (G.V - 1)
        the cost of each merge step during the clustering procedure

    Notes
    -----
    Presumably a quicker version of `ward_segment`.
    A Euclidean distance is used in the feature space.
    Caveat : only approximate
    """
    # basic check
    if feature.ndim == 1:
        feature = np.reshape(feature, (-1, 1))

    if feature.shape[0] != G.V:
        raise ValueError(
            "Incompatible dimension for the feature matrix and the graph")

    n = G.V
    if stop == - 1:
        stop = np.inf
    qmax = int(np.minimum(qmax, n - 1))
    t = ward_quick(G, feature, verbose)
    if verbose:
        t.plot()

    u1 = np.zeros(n, np.int)
    u2 = np.zeros(n, np.int)
    if stop >= 0:
        u1 = t.partition(stop)
    if qmax > 0:
        u2 = t.split(qmax)

    if u1.max() < u2.max():
        u = u2
    else:
        u = u1

    cost = t.get_height()
    cost = cost[t.isleaf() == False]
    return u, cost


def ward_segment(G, feature, stop=-1, qmax=1, verbose=False):
    """ Agglomerative function based on a topology-defining graph
    and a feature matrix.

    Parameters
    ----------
    G : graph object
        the input graph (a topological graph essentially)
    feature : array of shape (G.V, dim_feature)
        some vectorial information related to the graph vertices
    stop : int or float, optional
        the stopping criterion. If stop == -1, then no stopping
        criterion is used
    qmax : int, optional
        the maximum number of desired clusters
        (in the limit of the stopping criterion)
    verbose : bool, optional
        If True, print diagnostic information

    Returns
    -------
    u : array of shape (G.V)
        labelling of the graph vertices according to the criterion
    cost : array of shape (G.V - 1)
        the cost of each merge step during the clustering procedure

    Notes
    -----
    A Euclidean distance is used in the feature space.

    Caveat : when the number of connected components in G (nbcc)
    is greater than qmax, u contains nbcc values, not qmax!
    """
    # basic check
    if feature.ndim == 1:
        feature = np.reshape(feature, (-1, 1))

    if feature.shape[0] != G.V:
        raise ValueError(
            "Incompatible dimension for the feature matrix and the graph")

    # prepare a graph with twice the number of vertices
    n = G.V
    if qmax == -1:
        qmax = n - 1
    if stop == -1:
        stop = np.inf
    qmax = int(np.minimum(qmax, n - 1))

    t = ward(G, feature, verbose)
    u1 = np.zeros(n, np.int)
    u2 = np.zeros(n, np.int)
    if stop >= 0:
        u1 = t.partition(stop)
    if qmax > 0:
        u2 = t.split(qmax)

    if u1.max() < u2.max():
        u = u2
    else:
        u = u1

    cost = t.get_height()
    cost = cost[t.isleaf() == False]
    return u, cost
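

# Illustrative sketch (not part of the original module): the back-compatible
# *_segment API returns a labelling directly instead of a WeightedForest.
# Reusing the chain graph g and the feature array defined after ward_quick
# above:
#
#     >>> u, cost = ward_segment(g, feature, qmax=2)
#     >>> u.max() + 1    # number of clusters obtained (2 for this toy data)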


def ward(G, feature, verbose=False):
    """ Agglomerative function based on a topology-defining graph
    and a feature matrix.

    Parameters
    ----------
    G : graph
        the input graph (a topological graph essentially)
    feature : array of shape (G.V, dim_feature)
        vectorial information related to the graph vertices
    verbose : bool, optional
        If True, print diagnostic information

    Returns
    -------
    t : ``WeightedForest`` instance
        structure that represents the dendrogram

    Notes
    -----
    When G has more than 1 connected component, t is no longer a tree.
    This case is handled cleanly now.
    """
    warn('Function ward from '
         'nipy.algorithms.clustering.hierarchical_clustering is '
         'deprecated and will be removed',
         FutureWarning,
         stacklevel=2)
    # basic check
    if feature.ndim == 1:
        feature = np.reshape(feature, (-1, 1))

    if feature.shape[0] != G.V:
        raise ValueError(
            "Incompatible dimension for the feature matrix and the graph")

    Features = [np.ones(2 * G.V), np.zeros((2 * G.V, feature.shape[1])),
                np.zeros((2 * G.V, feature.shape[1]))]
    Features[1][:G.V] = feature
    Features[2][:G.V] = feature ** 2

    # prepare a graph with twice the number of vertices
    # this graph will contain the connectivity information
    # along the merges.
    n = G.V
    nbcc = G.cc().max() + 1
    K = _auxiliary_graph(G, Features)

    # prepare some variables that are useful to speed up the algorithm
    parent = np.arange(2 * n - nbcc).astype(np.int)
    height = np.zeros(2 * n - nbcc)
    linc = K.left_incidence()
    rinc = K.right_incidence()

    # iteratively merge clusters
    for q in range(n - nbcc):
        # 1. find the lightest edge
        m = (K.weights).argmin()
        cost = K.weights[m]
        k = q + n
        i = K.edges[m, 0]
        j = K.edges[m, 1]
        height[k] = cost
        if verbose:
            print(q, i, j, m, cost)

        # 2. remove the current edge
        K.edges[m] = - 1
        K.weights[m] = np.inf
        linc[i].remove(m)
        rinc[j].remove(m)

        ml = linc[j]
        if np.sum(K.edges[ml, 1] == i) > 0:
            m = ml[int(np.flatnonzero(K.edges[ml, 1] == i))]
            K.edges[m] = -1
            K.weights[m] = np.inf
            linc[j].remove(m)
            rinc[i].remove(m)

        # 3. merge the edges with third-party edges
        parent[i] = k
        parent[j] = k
        for p in range(3):
            Features[p][k] = Features[p][i] + Features[p][j]

        linc, rinc = _remap(K, i, j, k, Features, linc, rinc)

    # build a tree to encode the results
    t = WeightedForest(2 * n - nbcc, parent, height)
    return t
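

# Illustrative sketch (not part of the original module): the exact variant.
# With the same chain graph and features as above, ward() returns a
# WeightedForest whose heights are the merge costs, so the dendrogram can be
# inspected or plotted directly:
#
#     >>> tree = ward(g, feature)
#     >>> tree.get_height()[tree.isleaf() == False]    # the merge costs
#     >>> tree.plot()    # requires matplotlib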


#--------------------------------------------------------------------------
#----------------------- Visualization -------------------------------------
#--------------------------------------------------------------------------


def _label_(f, parent, left, labelled):
    temp = np.nonzero(parent == f)

    if np.size(temp) > 0:
        i = temp[0][np.nonzero(left[temp[0]] == 1)]
        j = temp[0][np.nonzero(left[temp[0]] == 0)]
        labelled = _label_(i, parent, left, labelled)
        labelled[f] = labelled.max() + 1
        labelled = _label_(j, parent, left, labelled)

    if labelled[f] < 0:
        labelled[f] = labelled.max() + 1

    return labelled


def _label(parent):
    # find the root
    root = np.nonzero(parent == np.arange(np.size(parent)))[0]

    # define left
    left = np.zeros(np.size(parent))
    for f in range(np.size(parent)):
        temp = np.nonzero(parent == f)
        if np.size(temp) > 0:
            left[temp[0][0]] = 1

    left[root] = .5

    # define labelled
    labelled = - np.ones(np.size(parent))

    # compute labelled
    for j in range(np.size(root)):
        labelled = _label_(root[j], parent, left, labelled)

    return labelled
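

# Illustrative trace (not part of the original module): for the single-root
# forest parents = np.array([2, 2, 2]), node 0 is marked 'left' and node 1
# 'right', so _label returns array([0., 2., 1.]): an in-order numbering that
# plot() uses to order the leaves along the dendrogram abscissa.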