#### Implements the Maximum Clusterability Divisive Clustering (MCDC) algorithm of #### Hofmeyr and Pavlidis (2015), IEEE SSCI CIDM ### function mcdc generates a divisive hierarchical clustering model using hyperplanes which maximise the ### variance ratio clusterability measure across them ## arguments: # X = dataset (matrix). each row is a datum. required # K = number of clusters to extract (integer). required # split.index = determines the order in which clusters are split (in decreasing # order of splitting indices). can be a function(v, X, P) of projection # vector v, data matrix X and list of parameters P. can also be one of # "size" (split the largest cluster), "fval" (split the cluster with # the maximum variance ratio value), or "Fdist" (indices determined by the non-central # F-distribution. See SSCI paper for details. slight difference from the paper is that # when the data size is above 2000 cluster size is used instead. This is because the naive # estimation of the model degrees of freedom has been seen to be unreliable when the number # of data is large). optional, default is "Fdist" # v0 = initial projection direction(s). can be a matrix # in which each column is an initialisation to try. # can be a function of the data matrix (or subset # thereof corresponding to the cluster being split) which returns # a matrix in which each column is an initialisation. # optional, default is the vector joining the means of a 2-means solution # minsize = minimum cluster size. Can be either integer or a function f(X) returning an integer. Default is 1. Throughout the projection pursuit no cuts which result in a cluster smaller # than minsize are allowed. This is achieved by considering only partitions in (v%*%X)[minsize:(n-minsize+1)]. # verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the # projected data during each optimisation. verb == 2 adds to these plots information # about the function value, and quality of split (if labels are supplied). # verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2. # optional, default is 3 # labels = vector of class labels. Only used for producing plots, not in the allocation of # data to clusters. optional, default is NULL (plots do not indicate true class membership # when true labels are unknown) # maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 15 # ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-5 ## output is a named list containing # $cluster = cluster assignment vector # $model = matrix containing the would-be location of each node (depth and position at depth) within a complete tree # $Nodes = the clustering model. unnamed list each element of which is a named list containing the details of the associated node # $data = the data matrix passed to mcdc() # $method = "MCDC" (used for plotting and model modification functions) # $args = list of (functional) arguments passed to ncutdc mcdc <- function(X, K, v0 = NULL, split.index = NULL, minsize = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){ if(is.data.frame(X)) X <- as.matrix(X) if(is.null(verb)) verb = 0 # set parameters for clustering and optimisation n <- nrow(X) d <- ncol(X) if(is.null(split.index) || split.index=='Fdist'){ if(n<2000){ split.index <- function(v, X, P){ if(nrow(rbind(c(), X))<=2) return(-Inf) VR <- f_mc(v, X, P) n <-nrow(X) d <- ncol(X) al <- min(n, d+1) beta <- max(0, n-d-1) if(beta==0) return(0) pf(VR*beta/al, al, beta, ncp = n) + 1e-30*sqrt(n)*VR } } else split.index <- function(v, X, P) nrow(X) } else if(split.index=='size') split.index <- function(v, X, P) nrow(X) else if(split.index=='fval') split.index <- function(v, X, P) f_mc(v, X, P) else if(!is.function(split.index)) stop('split.index must be a function of projection vector, data matrix and parameter list P with elements P$nmin') # obtain clusters and cluster hierarchy # split_indices used to select the order to partition nodes/clusters split_indices <- numeric(2*K-1) - Inf # ixs stores the data associated with each node in the model ixs <- list(1:n) # tree stores the location (depth, breadth) in the model of each node tree <- matrix(0, (2*K-1), 2) tree[1,] <- c(1, 1) # Parent stores the parent node number of each node (The parent of the root node is 0) Parent <- numeric(2*K-1) # stores hyperplane separators for each node, v and b vs <- matrix(0, (2*K-1), d) bs <- numeric(2*K-1) # stores the parameters used in each optimisation pars <- list() VRS <- numeric(2*K-1) # determine the optimal hyperplane(s) at the root node and select that with the maximum variance ratio c.split <- mch(X, v0, minsize, verb, labels, maxit, ftol) # store the results in the above discussed objects split_indices[1] <- split.index(c.split$v, X, c.split$params) pass <- list(which(c.split$cluster==2)) vs[1,] <- c.split$v bs[1] <- c.split$b pars[[1]] <- c.split$params VRS[1] <- c.split$fval # repeatedly apply binary partitions until the desired number of clusters results while(length(ixs)<(2*K-1)){ # select the leaf with the greatest split index id <- which.max(split_indices) split_indices[id] <- -Inf n.clust <- length(ixs) ixs[[n.clust+1]] <- ixs[[id]][pass[[id]]] ixs[[n.clust+2]] <- ixs[[id]][-pass[[id]]] c.split <- mch(X[ixs[[n.clust+1]],], v0, minsize, verb, labels[ixs[[n.clust+1]]], maxit, ftol) split_indices[n.clust+1] <- split.index(c.split$v, X[ixs[[n.clust+1]],], c.split$params) pass[[n.clust+1]] <- which(c.split$cluster==2) tree[n.clust+1,] <- c(tree[id,1] + 1, 2*tree[id,2]-1) vs[n.clust+1,] <- c.split$v bs[n.clust+1] <- c.split$b VRS[n.clust+1] <- c.split$fval pars[[n.clust+1]] <- c.split$params Parent[n.clust+1] <- id c.split <- mch(X[ixs[[n.clust+2]],], v0, minsize, verb, labels[ixs[[n.clust+2]]], maxit, ftol) split_indices[n.clust+2] <- split.index(c.split$v, X[ixs[[n.clust+2]],], c.split$params) pass[[n.clust+2]] <- which(c.split$cluster==2) tree[n.clust+2,] <- c(tree[id,1] + 1, 2*tree[id,2]) vs[n.clust+2,] <- c.split$v bs[n.clust+2] <- c.split$b VRS[n.clust+2] <- c.split$fval pars[[n.clust+2]] <- c.split$params Parent[n.clust+2] <- id } # determine cluster assignment vector asgn <- numeric(n) + 1 for(i in 1:(K-1)) asgn[ixs[[2*i]]] <- i+1 # find the actual location of each node in the hierarchy loci <- tree for(i in 1:max(tree[,1])){ rows <- which(tree[,1]==i) loci[rows,2] <- rank(tree[rows,2]) } # store the details of all hyperplanes used in the hierarchical model Nodes <- list() for(i in 1:length(ixs)) Nodes[[i]] <- list(ixs = ixs[[i]], v = vs[i,], b = bs[i], params = pars[[i]], fval = VRS[i], node = tree[i,], location = loci[i,]) output <- list(cluster = asgn, model = tree, Parent = Parent, Nodes = Nodes, data = X, method = 'MCDC', args = list(v0 = v0, split.index = split.index, minsize = minsize, maxit = maxit, ftol = ftol)) class(output) <- 'ppci_cluster_solution' output } ### function f_mc evaluates the projection index for mcdc ## arguments: # v = projection vector # X = data matrix # P = list of parameters containing (at least) # $nmin = minimum cluster size ## output is a scalar, the variance ratio clusterability of the optimal partition by a hyperplane orthogonal to v f_mc <- function(v, X, P){ # compute the projected points and sort them in increasing order p <- sort(X%*%v/norm_vec(v)) CS <- cumsum(p) n <- length(p) # find the variance ratio at each point and return the maximum V <- sum((p-CS[n]/n)^2)/(n-1) ixs <- P$nmin:(n-P$nmin) bc <- (n-ixs)/ixs*((CS[n]-CS[ixs])/(n-ixs)-CS[n]/n)^2 max(bc/(V-bc)) } ### function df_mc evaluates the gradient of the projection index for mcdc ### the gradient is valid when the optimal partition is unique and the projected ### point at the optimum is unique. This is a.e. w.r.t the lebesuge measure ## arguments: # v = projection vector # X = data matrix # P = list of parameters containing (at least) # $nmin = minimum cluster size ## output is a vector, the gradient of the variance ratio of the optimal hyperplane ## orthogonal to v df_mc <- function(v, X, P){ # compute the projected points and their ordering p <- X%*%v/norm_vec(v) o <- order(p) CS <- cumsum(p[o]) n <- length(p) # determine the location of the optimal split V <- sum((p-CS[n]/n)^2)/(n-1) ixs <- P$nmin:(n-P$nmin) bc <- (n-ixs)/ixs*((CS[n]-CS[ixs])/(n-ixs)-CS[n]/n)^2 VRS <- bc/(V-bc) b <- p[o][ixs][which.max(VRS)] # compute the gradient (which differs depending which side of the split each projected point lies, ix1 vs ix2) ix1 <- which(p<=b) ix2 <- which(p>b) m1 <- mean(p[ix1]) m2 <- mean(p[ix2]) m <- mean(p) bc <- length(ix1)/n*(m1-m)^2+length(ix2)/n*(m2-m)^2 dp1 <- (bc*(2/n*(m-m1)-2/n*(p[ix1]-m1))+2/n*(m1-m)*V)/(V-bc)^2 dp2 <- (bc*(2/n*(m-m2)-2/n*(p[ix2]-m2))+2/n*(m2-m)*V)/(V-bc)^2 dp <- numeric(n) dp[ix1] <- dp1 dp[ix2] <- dp2 nv <- norm_vec(v) dv <- (X/nv-((X)%*%v)%*%t(v)/nv^3) dp%*%dv } ### function mc_b finds the location of the optimal hyperplane orthogonal to v. That is, ### the value of b which makes H(v, b) an optimal hyperplane mc_b <- function(v, X, P){ # follows essentially the same procedure as evaluating the projection index p <- X%*%v/norm_vec(v) o <- order(p) CS <- cumsum(p[o]) n <- length(p) V <- sum((p-CS[n]/n)^2)/(n-1) ixs <- P$nmin:(n-P$nmin) bc <- (n-ixs)/ixs*((CS[n]-CS[ixs])/(n-ixs)-CS[n]/n)^2 VRS <- bc/(V-bc) w <- which.max(VRS) (p[o][ixs][w] + p[o][ixs][w+1])/2 } ### function mcpp performs projection pursuit based on variance ratio objective. The function ### acts as a gateway to the optimisation function ppclust.optim, providing appropriate arguments ### for mcdc ## arguments: # v = initial projection vector # X = data matrix # P = list of paramateters containing (at least) # $nmin = minimum cluster size # verb = verbosity level. See details in paper or at function mcdc/mch # maxit = maximum number of iterations in optimisation # ftol = relative tolerance level for convergence of gradient based optimisation ## output is the optimal projection vector mcpp <- function(v, X, P, verb, labels, maxit, ftol){ v <- ppclust.optim(v, f_mc, df_mc, X, P, mc_b, verbosity = verb, labels = labels, method = 'MCDC', maxit = maxit, ftol = ftol)$par return(v/norm_vec(v)) } ### function mch() finds maximum variance ratio hyperplanes ## arguments: # X = dataset (matrix). each row is a datum. required # v0 = initial projection direction(s). can be a matrix # in which each column is an initialisation to try. # can be a function of the data matrix (or subset # thereof corresponding to the cluster being split) which returns # a matrix in which each column is an initialisation. # optional, default is the vector joining the means of a 2-means clustering # verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the # projected data during each optimisation. verb == 2 adds to these plots information # about the function value, relative depth and quality of split (if labels are supplied). # verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2. # optional, default is 3 # labels = vector of class labels. Only used for producing plots, not in the allocation of # data to clusters. optional, default is NULL (plots do not indicate true class membership # when true labels are unknown) # maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 15 # ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-5 ## output is a list of lists, the i-th stores the details of the optimal hyperplane ## arising from the initialisation at v0[,i]. Each element has contains # $cluster = the cluster assignment vector # $v = the optimal projection vector # $b = the value of b making H(v, b) the optimal hyperplane # fval = the variance ratio across H(v, b) # params = list of parameters used to find H(v, b) mch <- function(X, v0 = NULL, minsize = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){ if(is.data.frame(X)) X <- as.matrix(X) params = list() if(is.null(minsize)) params$nmin <- 1 else if(is.function(minsize)) params$nmin <- minsize(X) else if(is.numeric(minsize) && length(minsize)==1) params$nmin <- minsize else stop('minsize must be a positive integer or a function of the data being split') # if labels are supplied, ensure they are integers 1:K (K the number of classes) if(!is.null(labels)){ lab_new <- numeric(length(labels)) u <- unique(labels) for(i in 1:length(u)) lab_new[which(labels==u[i])] = i labels <- lab_new } # if there are fewer than 2*minsize data, do not split if(is.vector(X)) X <- matrix(X, nrow = 1) n <- nrow(X) if(n<(2*params$nmin)){ return(list(cluster = numeric(nrow(X)) + 1, v = numeric(ncol(X))+1/sqrt(ncol(X)), b = 0, params = list(nmin = 1), fval = 0, method = 'MCDC', data = X, fitted = X[,1:2])) } if(is.null(verb)) verb = 0 # set up parameters for optimisation if(is.null(maxit)) maxit <- 50 if(is.null(ftol)) ftol <- 1e-8 if(is.null(v0)){ km <- kmeans(X, 2, nstart = 10) E <- cbind(c(), km$centers[1,]-km$centers[2,]) } else if (is.function(v0)) E <- cbind(c(), v0(X)) else E <- cbind(c(), v0) hyperplanes <- list() for(i in 1:ncol(E)){ v <- mcpp(E[,i], X, params, verb, labels, maxit, ftol) b <- mc_b(v, X, params) fval <- f_mc(v, X, params) pass <- X%*%v2) v2 <- rARPACK::eigs_sym(cov(X-X%*%v%*%t(v)), 1)$vectors else v2 <- eigen(cov(X-X%*%v%*%t(v)))$vectors[,1] hyperplanes[[i]] <- list(cluster = pass + 1, v = v, b = b, params = params, fval = fval, method = 'MCDC', data = X, fitted = X%*%cbind(v, v2)) class(hyperplanes[[i]]) <- 'ppci_hyperplane_solution' } best_sol <- which.max(unlist(lapply(hyperplanes, function(l) l$fval))) output <- hyperplanes[[best_sol]] output$alternatives <- output[-best_sol] output } ### function norm_vec computes the euclidean norm of a vector. This function is used by all methods in the package ## arguments: # v = numeric vector ## output is a scalar, the euclidean norm of the vector norm_vec <- function(v) sqrt(sum(v^2))