#### Implements the NCutH method of D. Hofmeyr (2016), "Clustering by minimum cut hyperplanes," IEEE TPAMI. #### The algorithm generates a divisive hierarchical clustering model using minimum normalised cut hyperplanes. #### Differences in terminology from the above paper (for consistency with the package) are that here #### each hyperplane is referred to as a NCutH, and the function to find such a hyperplane is ncuth. #### The complete divisive clustering algorithm is herein referred to as NCutDC (function ncutdc), for #### Normalised Cut Divisive Clustering. ### function ncutdc generates a complete clustering model using minimum normalised cut hyperplanes. ## arguments: # X = dataset (matrix). each row is a datum. required # K = number of clusters to extract (integer). required # split.index = determines the order in which clusters are split (in decreasing # order of splitting indices). can be a function(v, X, P) of projection # vector v, data matrix X and list of parameters P. can also be one of # "size" (split the largest cluster), or "fval" (split the cluster with # the minimum normalised cut value). optional, default is "fval" # v0 = initial projection direction(s). can be a matrix # in which each column is an initialisation to try. # can be a function of the data matrix (or subset # thereof corresponding to the cluster being split) which returns # a matrix in which each column is an initialisation. # optional, default is the first principal component # s = used to compute scale parameter. a function f(X) of the data (or subset thereof) being split which returns a numeric in which # case scale = f(X). default is scale = 100*eigen(cov(X))$values[1]^.5*nrow(X)^(-0.2). # minsize = minimum cluster size. Can be either integer or a function f(X) returning an integer. Default is 1. Throughout the projection pursuit no cuts which result in a cluster smaller # than minsize are allowed. This is achieved by considering only partitions in (v%*%X)[minsize:(n-minsize+1)]. # verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the # projected data during each optimisation. verb == 2 adds to these plots information # about the function value, and quality of split (if labels are supplied). # verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2. # optional, default is 3 # labels = vector of class labels. Only used for producing plots, not in the allocation of # data to clusters. optional, default is NULL (plots do not indicate true class membership # when true labels are unknown) # maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 50 # ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-8 ## output is a named list containing # $cluster = cluster assignment vector # $model = matrix containing the would-be location of each node (depth and position at depth) within a complete tree # $Nodes = the clustering model. unnamed list each element of which is a named list containing the details of the associated node # $data = the data matrix passed to mddc() # $method = "NCutH" (used for plotting and model modification functions) # $args = list of (functional) arguments passed to ncutdc ncutdc <- function(X, K, split.index = NULL, v0 = NULL, s = NULL, minsize = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){ if(is.null(verb)) verb = 0 # set parameters for clustering and optimisation if(is.null(split.index) || split.index=="fval") split.index <- function(v, X, P){ if(nrow(rbind(c(), X))<=2) return(-Inf) 1/(1+f_ncut(v, X, P)) } else if(split.index=="size") split.index <- function(v, X, P) nrow(X) else if(!is.function(split.index)) stop('split.index must be one of "size" or "fval" or, a function of projection vector, data matrix and parameter list P with elements P$s and P$nmin') n <- nrow(X) d <- ncol(X) # obtain clusters and return cluster assignment vector split_indices <- numeric(2*K-1) - Inf # ixs represents a list of cluster indices (for each node in the hierarchy model structure) ixs <- list(1:n) # tree stores the location (depth, breadth) in the model of each node tree <- matrix(0, (2*K-1), 2) tree[1,] <- c(1, 1) # Parent stores the parent node number of each node (The parent of the root node is 0) Parent <- numeric(2*K-1) # stores hyperplane separators for each node, v and b vs <- matrix(0, (2*K-1), d) bs <- numeric(2*K-1) # stores the parameters used in each optimisation pars <- list() # stores the normalised cut values on each hyperplane ncuts <- numeric(2*K-1) # find the first hyperplane. if multiple initialisations were used, select the one with the minimum ncut value c.split <- ncuth(X, v0, s, minsize, verb, labels, maxit, ftol) ix.opt <- which.min(unlist(lapply(c.split, function(sol) sol$fval))) c.split <- c.split[[ix.opt]] # store the results in the above discussed objects split_indices[1] <- split.index(c.split$v, X, c.split$params) pass <- list(which(c.split$cluster==2)) vs[1,] <- c.split$v bs[1] <- c.split$b pars[[1]] <- c.split$params ncuts[1] <- c.split$fval # repeat the divisive procedure until the correct number of clusters results while(length(ixs)<(2*K-1)){ # split the leaf with the maximum split index id <- which.max(split_indices) split_indices[id] <- -Inf n.clust <- length(ixs) ixs[[n.clust+1]] <- ixs[[id]][pass[[id]]] ixs[[n.clust+2]] <- ixs[[id]][-pass[[id]]] c.split <- ncuth(X[ixs[[n.clust+1]],], v0, s, minsize, verb, labels[ixs[[n.clust+1]]], maxit, ftol) ix.opt <- which.min(unlist(lapply(c.split, function(sol) sol$fval))) c.split <- c.split[[ix.opt]] split_indices[n.clust+1] <- split.index(c.split$v, X[ixs[[n.clust+1]],], c.split$params) pass[[n.clust+1]] <- which(c.split$cluster==2) tree[n.clust+1,] <- c(tree[id,1] + 1, 2*tree[id,2]-1) vs[n.clust+1,] <- c.split$v bs[n.clust+1] <- c.split$b ncuts[n.clust+1] <- c.split$fval pars[[n.clust+1]] <- c.split$params Parent[n.clust+1] <- id c.split <- ncuth(X[ixs[[n.clust+2]],], v0, s, minsize, verb, labels[ixs[[n.clust+2]]], maxit, ftol) ix.opt <- which.min(unlist(lapply(c.split, function(sol) sol$fval))) c.split <- c.split[[ix.opt]] split_indices[n.clust+2] <- split.index(c.split$v, X[ixs[[n.clust+2]],], c.split$params) pass[[n.clust+2]] <- which(c.split$cluster==2) tree[n.clust+2,] <- c(tree[id,1] + 1, 2*tree[id,2]) vs[n.clust+2,] <- c.split$v bs[n.clust+2] <- c.split$b ncuts[n.clust+2] <- c.split$fval pars[[n.clust+2]] <- c.split$params Parent[n.clust+2] <- id } # create cluster assignment vector for the complete hierarchy asgn <- numeric(n) for(i in 1:(K-1)) asgn[ixs[[2*i]]] <- i # determine actual locations of the nodes (not their would-be location within a complete tree) loci <- tree for(i in 1:max(tree[,1])){ rows <- which(tree[,1]==i) loci[rows,2] <- rank(tree[rows,2]) } # create the detailed cluster hierarchy Nodes <- list() for(i in 1:length(ixs)) Nodes[[i]] <- list(ixs = ixs[[i]], v = vs[i,], b = bs[i], params = pars[[i]], fval = ncuts[i], node = tree[i,], location = loci[i,]) list(cluster = asgn, model = tree, Parent = Parent, Nodes = Nodes, data = X, method = 'NCutH', args = list(v0 = v0, s = s, split.index = split.index, minsize = minsize, maxit = maxit, ftol = ftol)) } ### function ncuth() finds minimum normalised cut hyperplanes ## arguments: # X = dataset (matrix). each row is a datum. required # v0 = initial projection direction(s). can be a matrix # in which each column is an initialisation to try. # can be a function of the data matrix (or subset # thereof corresponding to the cluster being split) which returns # a matrix in which each column is an initialisation. # optional, default is the first principal component # s = positive numeric scaling parameter (sigma). optional, default is s = 100*eigen(cov(X))$values[1]^.5*nrow(X)^(-0.2) # verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the # projected data during each optimisation. verb == 2 adds to these plots information # about the function value, relative depth and quality of split (if labels are supplied). # verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2. # optional, default is 3 # labels = vector of class labels. Only used for producing plots, not in the allocation of # data to clusters. optional, default is NULL (plots do not indicate true class membership # when true labels are unknown) # maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 15 # ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-5 ## output is a list of lists, the i-th stores the details of the optimal hyperplane ## arising from the initialisation at v0[,i]. Each element has contains # $cluster = the cluster assignment vector # $v = the optimal projection vector # $b = the value of b making H(v, b) the optimal hyperplane # fval = the normalised cut across H(v, b) # params = list of parameters used to find H(v, b) ncuth <- function(X, v0 = NULL, s = NULL, minsize = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){ params = list() if(is.null(minsize)) params$nmin <- 1 else if(is.function(minsize)) params$nmin <- minsize(X) else if(is.numeric(minsize) && length(minsize)==1) params$nmin <- minsize else stop('minsize must be a positive integer or a function of the data being split') if(is.vector(X)) n <- 1 else n <- nrow(X) # if the data contain fewer than 2*nmin points then don't split if(n<(2*params$nmin)) return(list(list(cluster = numeric(n)+1, v = numeric(ncol(rbind(c(),X)))+1, b = 0, params = list(s = 100, nmin = params$nmin), fval = Inf, method = 'NCutH'))) # if labels are supplied, ensure they are integers 1:K (K the number of classes) if(!is.null(labels)){ lab_new <- numeric(length(labels)) u <- unique(labels) for(i in 1:length(u)) lab_new[which(labels==u[i])] = i labels <- lab_new } # set up parameters for optimisation if(is.null(verb)) verb = 0 if(is.null(maxit)) maxit <- 50 if(is.null(ftol)) ftol <- 1e-8 if(is.null(s)){ if(ncol(X)>2) params$s <- rARPACK::eigs_sym(cov(X), 1)$values[1]^.5*100/nrow(X)^.2 else params$s <- eigen(cov(X))$values[1]^.5*100/nrow(X)^.2 } else if(is.numeric(s)) params$s <- s else if(is.function(s)) params$s <- s(X) else stop('s must be numeric or a function of the data being split') if(is.null(v0)){ if(ncol(X)>2) E <- matrix(rARPACK::eigs_sym(cov(X), 1)$vectors, ncol = 1) else E <- matrix(eigen(cov(X))$vectors[,1], ncol = 1) } else if (is.function(v0)) E <- cbind(c(), v0(X)) else if (is.vector(v0)) E <- matrix(v0, ncol = 1) else E <- cbind(c(), v0) # store details of the hyperplanes arising from each initialisation output <- list() for(i in 1:ncol(E)){ v <- ncutpp(E[,i], X, params, verb, labels, maxit, ftol) b <- ncut_b(v, X, params) fval <- f_ncut(v, X, params) pass <- X%*%v2){ ds[1] <- (CSEki[n]-CSEki[w])*Ek[1]*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*((2*Ek[1]*(CSEki[w]-CSEki[1])+Ek[1]*(CSEki[n]-CSEki[w]))/Deg[w]^2+Ek[1]*(CSEki[n]-CSEki[w])/(Deg[n]-Deg[w])^2) ds[2:(w-1)] <- (CSEki[n]-CSEki[w])*Ek[2:(w-1)]*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*((2*Ek[2:(w-1)]*(CSEki[w]-CSEki[2:(w-1)])-2*Eki[2:(w-1)]*CSEk[1:(w-2)]+Ek[2:(w-1)]*(CSEki[n]-CSEki[w]))/Deg[w]^2+Ek[2:(w-1)]*(CSEki[n]-CSEki[w])/(Deg[n]-Deg[w])^2) ds[w] <- (CSEki[n]-CSEki[w])*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*((CSEki[n]-CSEki[w]-2*CSEk[w-1])/Deg[w]^2+(CSEki[n]-CSEki[w])/(Deg[n]-Deg[w])^2) ds[(w+1):n] <- -CSEk[w]*Eki[(w+1):n]*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*(-Eki[(w+1):n]*CSEk[w]/Deg[w]^2+(2*Ek[(w+1):n]*(CSEki[n]-CSEki[(w+1):n])-2*Eki[(w+1):n]*(CSEk[w:(n-1)]-CSEk[w])-Eki[(w+1):n]*CSEk[w])/(Deg[n]-Deg[w])^2) } else{ ds <- sapply(1:n, function(l){ if(l==1){ d1 <- (CSEki[n]-CSEki[w])*Ek[l] d2 <- 2*Ek[l]*(CSEki[w]-CSEki[l])+Ek[l]*(CSEki[n]-CSEki[w]) d3 <- Ek[l]*(CSEki[n]-CSEki[w]) d1*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*(d2/Deg[w]^2+d3/(Deg[n]-Deg[w])^2) } else if(lw){ d1 <- -CSEk[w]*Eki[l] d2 <- -Eki[l]*CSEk[w] d3 <- 2*Ek[l]*(CSEki[n]-CSEki[l])-2*Eki[l]*(CSEk[l-1]-CSEk[w])-Eki[l]*CSEk[w] d1*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*(d2/Deg[w]^2+d3/(Deg[n]-Deg[w])^2) } else{ d1 <- CSEki[n]-CSEki[w] d2 <- CSEki[n]-CSEki[w]-2*CSEk[w-1] d3 <- CSEki[n] - CSEki[w] d1*(1/Deg[w]+1/(Deg[n]-Deg[w]))-Cut[w-P$nmin+1]*(d2/Deg[w]^2+d3/(Deg[n]-Deg[w])^2) } }) } nv <- norm_vec(v) dv <- (X[o,]/nv-((X[o,])%*%v)%*%t(v)/nv^3)/P$s ds%*%dv } ### function ncut_b determines the location of the optimal hyperplane orthogonal to projection vector v ### That is, the value of b making H(v, b) an optimal hyperplane ## arguments # v = projection vector # X = data matrix # P = list of parameters containing (at least) # $s = scaling constant # $nmin = minimum cluster size ## output is a scalar. the location of the optimal hyperplane orthogonal to v ncut_b <- function(v, X, P){ # functionality follows closely that of f_ncut x <- X%*%v/norm_vec(v)/P$s srt <- sort(x) n <- length(x) EG <- exp(srt[1]-srt) CSEG <- cumsum(EG) CSiEG <- cumsum(1/EG) Cut <- (CSEG[n]-CSEG[P$nmin:(n-P$nmin)])*CSiEG[P$nmin:(n-P$nmin)] Deg <- ((cumsum(EG*CSiEG) + cumsum((CSEG[n]-CSEG)/EG))) w <- which.min((Cut*(1/Deg[P$nmin:(n-P$nmin)]+1/(Deg[n]-Deg[P$nmin:(n-P$nmin)]))))+P$nmin-1 (srt[w] + srt[w+1])/2*P$s } ### function ncutpp performs projection pursuit based on the normalised cut objective to find the optimal ### solution. Acts as a gateway to ppclust.optim, providing appropriate arguments for the ncut objective ## arguments: # v = initial projection vector # X = data matrix # P = list of parameters containing (at least) # $s = scaling parameter # $nmin = minimum cluster size # verb = verbosity level. see details in paper or at function ncutdc or ncuth # labels = vector of class labels used only in plotting for verb> 0 # maxit = maximum number of iterations for optimisation # ftol = relative tolerance level for the location of an optimum ncutpp <- function(v, X, P, verb, labels, maxit, ftol){ v_new <- ppclust.optim(v, f_ncut, df_ncut, X, P, ncut_b, verbosity = verb, labels = labels, method = 'NCutH', maxit = maxit, ftol = ftol)$par v_new/norm_vec(v_new) }