#### Implements the MDPP algorithm of Pavlidis et. al (2016) to find minimum density hyperplanes (MDHs). #### In addition, multiple MDHs can be combined to produce a complete clustering using a divisive hierarchical #### algorithm. ### function mddc() generates a complete clustering model using MDHs ## arguments: # X = dataset (matrix). each row is a datum. required # K = number of clusters to extract (integer). required # split.index = determines the order in which clusters are split (in decreasing # order of splitting indices). can be a function(v, X, P) of projection # vector v, data matrix X and list of parameters P. can also be one of # "size" (split the largest cluster), "fval" (split the cluster with # the minimum density hyperplane), or "rdepth" (split the cluster with # the greatest relative depth). optional, default is "size" # v0 = initial projection direction(s). can be a matrix # in which each column is an initialisation to try. # can be a function of the data matrix (or subset # thereof corresponding to the cluster being split) which returns # a matrix in which each column is an initialisation. # optional, default is the first principal component # bandwidth = used to compute bandwidth parameter (h). a function f(X) of # the data (or subset) being pslit. optional, default is bandwidth(X) = 0.9*eigen(cov(X))$values[1]^.5/nrow(X)^.2 # alphamax = maximum width of constraint F(v) = [mu - alphamax*sd, mu+alphamax*sd] # optional, default is 0.9 # verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the # projected data during each optimisation. verb == 2 adds to these plots information # about the function value, relative depth and quality of split (if labels are supplied). # verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2. # optional, default is 0 # labels = vector of class labels. Only used for producing plots, not in the allocation of # data to clusters. optional, default is NULL (plots do not indicate true class membership # when true labels are unknown) # maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 15 # ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-5 ## output is a named list containing # $cluster = cluster assignment vector # $model = matrix containing the would-be location of each node (depth and position at depth) within a complete tree # $Nodes = the clustering model. unnamed list each element of which is a named list containing the details of the associated node # $data = the data matrix passed to mddc() # $method = "MDH" (used for plotting and model modification functions) # $args = list of (functional) arguments passed to mddc mddc <- function(X, K, minsize = NULL, split.index = NULL, v0 = NULL, bandwidth = NULL, alphamin = NULL, alphamax = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){ if(is.null(verb)) verb = 0 # set parameters for clustering and optimisation if(is.null(split.index)) split.index <- function(v, X, P) nrow(X) else if(is.character(split.index)){ if(split.index=='size') split.index <- function(v, X, P) nrow(X) else if(split.index=='fval') split.index <- function(v, X, P) 1/f_md(v, X, P) else if(split.index=='rdepth') split.index <- function(v, X, P) md_reldepth(v, X, P) else stop('split.index must be one of "size", "fval", or "rdepth"') } else if(!is.function(split.index)) stop('split.index must be one of "size", "fval", or "rdepth" or a function f(v, X, P) of projection vector v, data X and list of parameters P') n <- nrow(X) d <- ncol(X) # obtain clusters and return cluster assignment vector split_indices <- numeric(2*K-1) - Inf # ixs represents a list of cluster indices (for each node in the hierarchy model structure) ixs <- list(1:n) # tree stores the location (depth, breadth) in the model of each node tree <- matrix(0, (2*K-1), 2) tree[1,] <- c(1, 1) # Parent stores the parent node number of each node (The parent of the root node is 0) Parent <- numeric(2*K-1) # stores hyperplane separators for each node, v and b vs <- matrix(0, (2*K-1), d) bs <- numeric(2*K-1) # stores the parameters used in each optimisation pars <- list() # store the integrated density on each hyperplane and the corresponding relative depth respectively dens <- numeric(2*K-1) rel.deps <- numeric(2*K-1) # compute the split(s) at the root node, and choose the solution with the maximum relative depth c.split <- mdh(X, v0, minsize, bandwidth, alphamin, alphamax, verb, labels, maxit, ftol) ix.opt <- which.max(unlist(lapply(c.split, function(sol) sol$rel.dep))) c.split <- c.split[[ix.opt]] if(c.split$rel.dep>0) split_indices[1] <- split.index(c.split$v, X, c.split$params) else split.indices[1] <- -Inf pass <- list(which(c.split$cluster==2)) vs[1,] <- c.split$v bs[1] <- c.split$b rel.deps[1] <- c.split$rel.dep dens[1] <- c.split$fval pars[[1]] <- c.split$params # repeat binary partitions until the desired number of clusters have been generated while(length(ixs)<(2*K-1) && max(split_indices) > -Inf){ id <- which.max(split_indices) split_indices[id] <- -Inf n.clust <- length(ixs) ixs[[n.clust+1]] <- ixs[[id]][pass[[id]]] ixs[[n.clust+2]] <- ixs[[id]][-pass[[id]]] c.split <- mdh(X[ixs[[n.clust+1]],], v0, minsize, bandwidth, alphamin, alphamax, verb, labels[ixs[[n.clust+1]]], maxit, ftol) ix.opt <- which.max(unlist(lapply(c.split, function(sol) sol$rel.dep))) c.split <- c.split[[ix.opt]] if(c.split$rel.dep==0) split_indices[n.clust+1] <- -Inf else split_indices[n.clust+1] <- split.index(c.split$v, X[ixs[[n.clust+1]],], c.split$params) pass[[n.clust+1]] <- which(c.split$cluster==2) vs[n.clust+1,] <- c.split$v bs[n.clust+1] <- c.split$b rel.deps[n.clust+1] <- c.split$rel.dep dens[n.clust+1] <- c.split$fval pars[[n.clust+1]] <- c.split$params tree[n.clust+1,] <- c(tree[id,1] + 1, 2*tree[id,2]-1) Parent[n.clust+1] <- id c.split <- mdh(X[ixs[[n.clust+2]],], v0, minsize, bandwidth, alphamin, alphamax, verb, labels[ixs[[n.clust+2]]], maxit, ftol) ix.opt <- which.max(unlist(lapply(c.split, function(sol) sol$rel.dep))) c.split <- c.split[[ix.opt]] if(c.split$rel.dep==0) split_indices[n.clust+2] <- -Inf else split_indices[n.clust+2] <- split.index(c.split$v, X[ixs[[n.clust+2]],], c.split$params) pass[[n.clust+2]] <- which(c.split$cluster==2) vs[n.clust+2,] <-c.split$v bs[n.clust+2] <- c.split$b rel.deps[n.clust+2] <- c.split$rel.dep dens[n.clust+2] <- c.split$fval pars[[n.clust+2]] <- c.split$params tree[n.clust+2,] <- c(tree[id,1] + 1, 2*tree[id,2]) Parent[n.clust+2] <- id } # if fewer than K clusters found, reset K if(length(ixs)<(2*K-1)){ K <- (length(ixs)+1)/2 split_indices <- split_indices[1:length(ixs)] tree <- tree[1:length(ixs),] vs <- vs[1:length(ixs),] bs <- bs[1:length(ixs)] dens <- dens[1:length(ixs)] rel.deps <- rel.deps[1:length(ixs)] Parent <- Parent[1:length(ixs)] } # create cluster assignment vector for the complete hierarchy asgn <- numeric(n) for(i in 1:(K-1)) asgn[ixs[[2*i]]] <- i # determine actual locations of the nodes (not their would-be location within a complete tree) loci <- tree for(i in 1:max(tree[,1])){ rows <- which(tree[,1]==i) loci[rows,2] <- rank(tree[rows,2]) } # create the detailed cluster hierarchy Nodes <- list() for(i in 1:length(ixs)) Nodes[[i]] <- list(ixs = ixs[[i]], v = vs[i,], b = bs[i], params = pars[[i]], fval = dens[i], rel.dep = rel.deps[i], node = tree[i,], location = loci[i,]) list(cluster = asgn, model = tree, Parent = Parent, Nodes = Nodes, data = X, method = 'MDH', args = list(v0 = v0, minsize = minsize, bandwidth = bandwidth, split.index = split.index, alphamin = alphamin, alphamax = alphamax, maxit = maxit, ftol = ftol)) } ### function f_md() evaluates the projection index for MDPP. Assumes the data have zero mean ## arguments: # v = projection vector # X = data matrix # P = list of parameters including (at least) # $h (bandwidth) # $alpha (constraint width. larger values allow hyperplanes further from the mean of the data) # $C (constant affecting the slope of the penalty) ## output is a scalar, the value of the density on the (best) hyperplane orthogonal to v f_md <- function(v, X, P){ # find density of data projected along v/||v|| p <- X%*%v/norm_vec(v) n <- length(p) # if alpha is zero then hyperplane passes through origin if(P$alpha==0){ return(sum(1/sqrt(2*pi)*exp(-p^2/2/P$h^2)/n/P$h)) } # otherwise find the minimum density hyperplane along v within [-alpha*sigma, alpha*sigma] s <- sd(p) den <- density(p, bw = P$h, from = -P$alpha*s-1, to = P$alpha*s+1, n = 100) pen <- den$y + P$C*((den$x<(-P$alpha*s))*(-P$alpha*s-den$x)^2 + (den$x>(P$alpha*s))*(den$x-P$alpha*s)^2) mins <- which(apply(rbind(pen[1:98], pen[2:99], pen[3:100]), 2, function(x) x[2]<=min(x)))+1 bs <- den$x[mins] fs <- pen[mins] # use bisection to refine the location of all local minima for(i in 1:length(mins)){ hi <- den$x[mins[i]+1] lo <- den$x[mins[i]-1] repeat{ mid <- (hi+lo)/2 dmid <- dkde(mid, p, P$h, P$alpha, P$C, s) if(abs(dmid)<1e-8 || (hi-mid)<1e-10){ bs[i] <- mid fs[i] <- 1/sqrt(2*pi)/n/P$h*sum(exp(-(p-mid)^2/2/P$h^2)) if(bs[i]>(P$alpha*s)) fs[i] <- fs[i] + P$C*(bs[i]-P$alpha*s)^2 if(bs[i]<(-P$alpha*s)) fs[i] <- fs[i] + P$C*(-bs[i]-P$alpha*s)^2 break } else if(dmid<0) lo <- mid else hi <- mid } } # return minimum of the minima after refinement min(fs) } ### function df_md() evaluates the gradient of the projection index for MDPP. Assumes data have zero mean ## arguments: # v = projection vector # X = data matrix # P = list of parameters including (at least) # $h (bandwidth) # $alpha (constraint width. larger values allow hyperplanes further from the mean of the data) # $C (constant affecting the slope of the penalty) # $COV (covariance matrix of data) ## output is a vector. the gradient of the projection index at v df_md = function(v, X, P){ # first steps are as in evaluation of the projection index, to find the location of the optimum p <- X%*%v/norm_vec(v) n <- length(p) nv <- norm_vec(v) if(P$alpha==0){ c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-p^2/2/P$h^2)*(-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3)) } s <- sd(p) den <- density(p, bw = P$h, from = -P$alpha*s-1, to = P$alpha*s+1, n = 100) pen <- den$y + P$C*((den$x<(-P$alpha*s))*(-P$alpha*s-den$x)^2 + (den$x>(P$alpha*s))*(den$x-P$alpha*s)^2) mins <- which(apply(rbind(pen[1:98], pen[2:99], pen[3:100]), 2, function(x) x[2]<=min(x)))+1 bs <- den$x[mins] fs <- pen[mins] for(i in 1:length(mins)){ hi <- den$x[mins[i]+1] lo <- den$x[mins[i]-1] repeat{ mid <- (hi+lo)/2 dmid <- dkde(mid, p, P$h, P$alpha, P$C, s) if(abs(dmid)<1e-8 || (hi-mid)<1e-10){ bs[i] <- mid fs[i] <- 1/sqrt(2*pi)/n/P$h*sum(exp(-(p-mid)^2/2/P$h^2)) if(bs[i]>(P$alpha*s)) fs[i] <- fs[i] + P$C*(bs[i]-P$alpha*s)^2 if(bs[i]<(-P$alpha*s)) fs[i] <- fs[i] + P$C*(-bs[i]-P$alpha*s)^2 break } else if(dmid<0) lo <- mid else hi <- mid } } w <- which.min(fs) b <- bs[w] # compute the gradient of the the density on hyperplane H(v, b) if(b<(-P$alpha*s)){ ds <- 1/nv^2*(P$COV%*%v/s-s*v) dpen <- c(-P$C*2*(-b-P$alpha*s)*P$alpha*ds) c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-(p-b)^2/2/P$h^2)*(b-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3)) + dpen } else if(b>(P$alpha*s)){ ds <- 1/nv^2*(P$COV%*%v/s-s*v) dpen <- -c(P$C*2*(b-P$alpha*s)*P$alpha*ds) c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-(p-b)^2/2/P$h^2)*(b-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3)) + dpen } else{ c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-(p-b)^2/2/P$h^2)*(b-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3)) } } ### function dkde() evaluates the gradient of the (penalised) kernel density estimator of the univariate ### projected dataset. Used in bisection to find the minimum density hyperplane along projection ## arguments: # pt = point at which to evaluate the gradient # xs = projected data points # bw = bandwidth # al = alpha parameter determining width of constraint # K = constant affecting the slope of the penalty # s = standard deviation of xs ## output is a scalar, the gradient of the projected density at pt dkde <- function(pt, xs, bw, al, K, s){ if(pt<(-al*s)) 1/sqrt(2*pi)/length(xs)/bw^3*sum(exp(-(xs-pt)^2/2/bw^2)*(xs-pt)) - 2*K*(-al*s-pt) else if(pt>(al*s)) 1/sqrt(2*pi)/length(xs)/bw^3*sum(exp(-(xs-pt)^2/2/bw^2)*(xs-pt)) + 2*K*(pt-al*s) else 1/sqrt(2*pi)/length(xs)/bw^3*sum(exp(-(xs-pt)^2/2/bw^2)*(xs-pt)) } ### function is_minim() checks if the optimal hyperplane along v is at a local minimum of the density ## arguments: # v = projection vector # X = data matrix # P = list of parameters including (at least) # $h (bandwidth) # $alpha (constraint width. larger values allow hyperplanes further from the mean of the data) ## output is logical, is the best hyperplane orthogonal to v at a minimum of the density is_minim <- function(v, X, P){ # compute the projected density along v/||v|| p <- X%*%v/norm_vec(v) s <- sd(p) d <- density(p, bw = P$h, from = -P$alpha*s, to = P$alpha*s, n = 100) # if the minimum lies at one of the boundaries then it is not a local # minimum of the density bix <- which.min(d$y) if(bix%in%c(1, 100) || min(sum(pd$x[bix]))1e-7) X <- t(t(X)-mns) # set up parameters for optimisation if(is.null(alphamin)) alphamin <- 0 if(is.null(alphamax)) alphamax <- 1 params$COV <- cov(X) if(is.null(bandwidth)){ if(ncol(X)>2) params$h <- 0.9*rARPACK::eigs_sym(params$COV, 1)$values[1]^.5/n^0.2 else params$h <- 0.9*eigen(params$COV)$values[1]^.5/n^0.2 } else if(is.numeric(bandwidth)) params$h <- bandwidth else if(is.function(bandwidth)) params$h <- bandwidth(X) else stop('bandwidth must be numeric or a function of the data being split') params$C <- 100*exp(-0.5)/sqrt(2*pi)/params$h^2 if(is.null(v0)){ if(ncol(X)>2) E <- matrix(rARPACK::eigs_sym(params$COV, 1)$vectors, ncol = 1) else E <- matrix(eigen(params$COV)$vectors[,1], ncol = 1) } else if(is.function(v0)) E <- v0(X) else if(is.vector(v0)) E <- matrix(v0, ncol = 1) else E <- v0 output <- list() # find the mdh arising from each column of E (v0) for(i in 1:ncol(E)){ v <- mdpp(E[,i], X, params, alphamin, alphamax, verb, labels, maxit, ftol) params$alpha <- v$alpha v <- v$v b <- md_b(v, X, params) pass <- X%*%v