1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
#### Implements the MDPP algorithm of Pavlidis et. al (2016) to find minimum density hyperplanes (MDHs).
#### In addition, multiple MDHs can be combined to produce a complete clustering using a divisive hierarchical
#### algorithm.



### function mddc() generates a complete clustering model using MDHs
## arguments:
# X = dataset (matrix). each row is a datum. required
# K = number of clusters to extract (integer). required
# split.index = determines the order in which clusters are split (in decreasing
#       order of splitting indices). can be a function(v, X, P) of projection
#       vector v, data matrix X and list of parameters P. can also be one of
#       "size" (split the largest cluster), "fval" (split the cluster with
#       the minimum density hyperplane), or "rdepth" (split the cluster with
#       the greatest relative depth). optional, default is "size"
# v0 = initial projection direction(s). can be a matrix
#       in which each column is an initialisation to try.
#       can be a function of the data matrix (or subset
#       thereof corresponding to the cluster being split) which returns
#       a matrix in which each column is an initialisation.
#       optional, default is the first principal component
# bandwidth = used to compute bandwidth parameter (h). a function f(X) of
#       the data (or subset) being pslit. optional, default is bandwidth(X) = 0.9*eigen(cov(X))$values[1]^.5/nrow(X)^.2
# alphamax = maximum width of constraint F(v) = [mu - alphamax*sd, mu+alphamax*sd]
#       optional, default is 0.9
# verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the
#         projected data during each optimisation. verb == 2 adds to these plots information
#         about the function value, relative depth and quality of split (if labels are supplied).
#         verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2.
#         optional, default is 0
# labels = vector of class labels. Only used for producing plots, not in the allocation of
#         data to clusters. optional, default is NULL (plots do not indicate true class membership
#         when true labels are unknown)
# maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 15
# ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-5

## output is a named list containing
# $cluster = cluster assignment vector
# $model = matrix containing the would-be location of each node (depth and position at depth) within a complete tree
# $Nodes = the clustering model. unnamed list each element of which is a named list containing the details of the associated node
# $data = the data matrix passed to mddc()
# $method = "MDH" (used for plotting and model modification functions)
# $args = list of (functional) arguments passed to mddc

mddc <- function(X, K, minsize = NULL, split.index = NULL, v0 = NULL, bandwidth = NULL, alphamin = NULL, alphamax = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){

  if(is.null(verb)) verb = 0
  # set parameters for clustering and optimisation

  if(is.null(split.index)) split.index <- function(v, X, P) nrow(X)
  else if(is.character(split.index)){
    if(split.index=='size') split.index <- function(v, X, P) nrow(X)
    else if(split.index=='fval') split.index <- function(v, X, P) 1/f_md(v, X, P)
    else if(split.index=='rdepth') split.index <- function(v, X, P) md_reldepth(v, X, P)
    else stop('split.index must be one of "size", "fval", or "rdepth"')
  }
  else if(!is.function(split.index)) stop('split.index must be one of "size", "fval", or "rdepth" or a function f(v, X, P) of projection vector v, data X and list of parameters P')

  n <- nrow(X)
  d <- ncol(X)

  # obtain clusters and return cluster assignment vector

  split_indices <- numeric(2*K-1) - Inf

  # ixs represents a list of cluster indices (for each node in the hierarchy model structure)
  ixs <- list(1:n)

  # tree stores the location (depth, breadth) in the model of each node

  tree <- matrix(0, (2*K-1), 2)
  tree[1,] <- c(1, 1)

  # Parent stores the parent node number of each node (The parent of the root node is 0)

  Parent <- numeric(2*K-1)


  # stores hyperplane separators for each node, v and b

  vs <- matrix(0, (2*K-1), d)
  bs <- numeric(2*K-1)

  # stores the parameters used in each optimisation

  pars <- list()

  # store the integrated density on each hyperplane and the corresponding relative depth respectively
  dens <- numeric(2*K-1)
  rel.deps <- numeric(2*K-1)

  # compute the split(s) at the root node, and choose the solution with the maximum relative depth

  c.split <- mdh(X, v0, minsize, bandwidth, alphamin, alphamax, verb, labels, maxit, ftol)
  ix.opt <- which.max(unlist(lapply(c.split, function(sol) sol$rel.dep)))
  c.split <- c.split[[ix.opt]]

  if(c.split$rel.dep>0) split_indices[1] <- split.index(c.split$v, X, c.split$params)
  else split.indices[1] <- -Inf

  pass <- list(which(c.split$cluster==2))

  vs[1,] <- c.split$v

  bs[1] <- c.split$b

  rel.deps[1] <- c.split$rel.dep

  dens[1] <- c.split$fval

  pars[[1]] <- c.split$params


  # repeat binary partitions until the desired number of clusters have been generated

  while(length(ixs)<(2*K-1) && max(split_indices) > -Inf){
    id <- which.max(split_indices)

    split_indices[id] <- -Inf

    n.clust <- length(ixs)

    ixs[[n.clust+1]] <- ixs[[id]][pass[[id]]]

    ixs[[n.clust+2]] <- ixs[[id]][-pass[[id]]]

    c.split <- mdh(X[ixs[[n.clust+1]],], v0, minsize, bandwidth, alphamin, alphamax, verb, labels[ixs[[n.clust+1]]], maxit, ftol)
    ix.opt <- which.max(unlist(lapply(c.split, function(sol) sol$rel.dep)))
    c.split <- c.split[[ix.opt]]

    if(c.split$rel.dep==0) split_indices[n.clust+1] <- -Inf
    else split_indices[n.clust+1] <- split.index(c.split$v, X[ixs[[n.clust+1]],], c.split$params)

    pass[[n.clust+1]] <- which(c.split$cluster==2)

    vs[n.clust+1,] <- c.split$v

    bs[n.clust+1] <- c.split$b

    rel.deps[n.clust+1] <- c.split$rel.dep

    dens[n.clust+1] <- c.split$fval

    pars[[n.clust+1]] <- c.split$params

    tree[n.clust+1,] <- c(tree[id,1] + 1, 2*tree[id,2]-1)

    Parent[n.clust+1] <- id

    c.split <- mdh(X[ixs[[n.clust+2]],], v0, minsize, bandwidth, alphamin, alphamax, verb, labels[ixs[[n.clust+2]]], maxit, ftol)
    ix.opt <- which.max(unlist(lapply(c.split, function(sol) sol$rel.dep)))
    c.split <- c.split[[ix.opt]]

    if(c.split$rel.dep==0) split_indices[n.clust+2] <- -Inf
    else split_indices[n.clust+2] <- split.index(c.split$v, X[ixs[[n.clust+2]],], c.split$params)

    pass[[n.clust+2]] <- which(c.split$cluster==2)

    vs[n.clust+2,] <-c.split$v

    bs[n.clust+2] <- c.split$b

    rel.deps[n.clust+2] <- c.split$rel.dep

    dens[n.clust+2] <- c.split$fval

    pars[[n.clust+2]] <- c.split$params

    tree[n.clust+2,] <- c(tree[id,1] + 1, 2*tree[id,2])

    Parent[n.clust+2] <- id

  }

  # if fewer than K clusters found, reset K

  if(length(ixs)<(2*K-1)){
    K <- (length(ixs)+1)/2
    split_indices <- split_indices[1:length(ixs)]
    tree <- tree[1:length(ixs),]
    vs <- vs[1:length(ixs),]
    bs <- bs[1:length(ixs)]
    dens <- dens[1:length(ixs)]
    rel.deps <- rel.deps[1:length(ixs)]
    Parent <- Parent[1:length(ixs)]
  }


  # create cluster assignment vector for the complete hierarchy

  asgn <- numeric(n)
  for(i in 1:(K-1)) asgn[ixs[[2*i]]] <- i

  # determine actual locations of the nodes (not their would-be location within a complete tree)

  loci <- tree
  for(i in 1:max(tree[,1])){
    rows <- which(tree[,1]==i)
    loci[rows,2] <- rank(tree[rows,2])
  }

  # create the detailed cluster hierarchy

  Nodes <- list()
  for(i in 1:length(ixs)) Nodes[[i]] <- list(ixs = ixs[[i]], v = vs[i,], b = bs[i], params = pars[[i]], fval = dens[i], rel.dep = rel.deps[i], node = tree[i,], location = loci[i,])

  list(cluster = asgn, model = tree, Parent = Parent, Nodes = Nodes, data = X, method = 'MDH', args = list(v0 = v0, minsize = minsize, bandwidth = bandwidth, split.index = split.index, alphamin = alphamin, alphamax = alphamax, maxit = maxit, ftol = ftol))

}


### function f_md() evaluates the projection index for MDPP. Assumes the data have zero mean
## arguments:
# v = projection vector
# X = data matrix
# P = list of parameters including (at least)
#   $h (bandwidth)
#   $alpha (constraint width. larger values allow hyperplanes further from the mean of the data)
#   $C (constant affecting the slope of the penalty)

## output is a scalar, the value of the density on the (best) hyperplane orthogonal to v

f_md <- function(v, X, P){

  # find density of data projected along v/||v||

  p <- X%*%v/norm_vec(v)
  n <- length(p)

  # if alpha is zero then hyperplane passes through origin

  if(P$alpha==0){
    return(sum(1/sqrt(2*pi)*exp(-p^2/2/P$h^2)/n/P$h))
  }

  # otherwise find the minimum density hyperplane along v within [-alpha*sigma, alpha*sigma]

  s <- sd(p)
  den <- density(p, bw = P$h, from = -P$alpha*s-1, to = P$alpha*s+1, n = 100)
  pen <- den$y + P$C*((den$x<(-P$alpha*s))*(-P$alpha*s-den$x)^2 + (den$x>(P$alpha*s))*(den$x-P$alpha*s)^2)
  mins <- which(apply(rbind(pen[1:98], pen[2:99], pen[3:100]), 2, function(x) x[2]<=min(x)))+1
  bs <- den$x[mins]
  fs <- pen[mins]

  # use bisection to refine the location of all local minima

  for(i in 1:length(mins)){
    hi <- den$x[mins[i]+1]
    lo <- den$x[mins[i]-1]
    repeat{
      mid <- (hi+lo)/2
      dmid <- dkde(mid, p, P$h, P$alpha, P$C, s)
      if(abs(dmid)<1e-8 || (hi-mid)<1e-10){
        bs[i] <- mid
        fs[i] <- 1/sqrt(2*pi)/n/P$h*sum(exp(-(p-mid)^2/2/P$h^2))
        if(bs[i]>(P$alpha*s)) fs[i] <- fs[i] + P$C*(bs[i]-P$alpha*s)^2
        if(bs[i]<(-P$alpha*s)) fs[i] <- fs[i] + P$C*(-bs[i]-P$alpha*s)^2
        break
      }
      else if(dmid<0) lo <- mid
      else hi <- mid
    }
  }

  # return minimum of the minima after refinement

  min(fs)
}

### function df_md() evaluates the gradient of the projection index for MDPP. Assumes data have zero mean
## arguments:
# v = projection vector
# X = data matrix
# P = list of parameters including (at least)
#   $h (bandwidth)
#   $alpha (constraint width. larger values allow hyperplanes further from the mean of the data)
#   $C (constant affecting the slope of the penalty)
#   $COV (covariance matrix of data)

## output is a vector. the gradient of the projection index at v

df_md = function(v, X, P){

  # first steps are as in evaluation of the projection index, to find the location of the optimum

  p <- X%*%v/norm_vec(v)
  n <- length(p)

  nv <- norm_vec(v)

  if(P$alpha==0){
    c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-p^2/2/P$h^2)*(-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3))
  }

  s <- sd(p)
  den <- density(p, bw = P$h, from = -P$alpha*s-1, to = P$alpha*s+1, n = 100)
  pen <- den$y + P$C*((den$x<(-P$alpha*s))*(-P$alpha*s-den$x)^2 + (den$x>(P$alpha*s))*(den$x-P$alpha*s)^2)
  mins <- which(apply(rbind(pen[1:98], pen[2:99], pen[3:100]), 2, function(x) x[2]<=min(x)))+1
  bs <- den$x[mins]
  fs <- pen[mins]
  for(i in 1:length(mins)){
    hi <- den$x[mins[i]+1]
    lo <- den$x[mins[i]-1]
    repeat{
      mid <- (hi+lo)/2
      dmid <- dkde(mid, p, P$h, P$alpha, P$C, s)
      if(abs(dmid)<1e-8 || (hi-mid)<1e-10){
        bs[i] <- mid
        fs[i] <- 1/sqrt(2*pi)/n/P$h*sum(exp(-(p-mid)^2/2/P$h^2))
        if(bs[i]>(P$alpha*s)) fs[i] <- fs[i] + P$C*(bs[i]-P$alpha*s)^2
        if(bs[i]<(-P$alpha*s)) fs[i] <- fs[i] + P$C*(-bs[i]-P$alpha*s)^2
        break
      }
      else if(dmid<0) lo <- mid
      else hi <- mid
    }
  }

  w <- which.min(fs)
  b <- bs[w]

  # compute the gradient of the the density on hyperplane H(v, b)

  if(b<(-P$alpha*s)){
    ds <- 1/nv^2*(P$COV%*%v/s-s*v)
    dpen <- c(-P$C*2*(-b-P$alpha*s)*P$alpha*ds)
    c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-(p-b)^2/2/P$h^2)*(b-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3)) + dpen
  }
  else if(b>(P$alpha*s)){
    ds <- 1/nv^2*(P$COV%*%v/s-s*v)
    dpen <- -c(P$C*2*(b-P$alpha*s)*P$alpha*ds)
    c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-(p-b)^2/2/P$h^2)*(b-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3)) + dpen
  }
  else{
    c(t(1/sqrt(2*pi)/n/P$h^3*(exp(-(p-b)^2/2/P$h^2)*(b-p)))%*%(X/nv-(X%*%v)%*%t(v)/nv^3))
  }
}

### function dkde() evaluates the gradient of the (penalised) kernel density estimator of the univariate
### projected dataset. Used in bisection to find the minimum density hyperplane along projection
## arguments:
# pt = point at which to evaluate the gradient
# xs = projected data points
# bw = bandwidth
# al = alpha parameter determining width of constraint
# K = constant affecting the slope of the penalty
# s = standard deviation of xs

## output is a scalar, the gradient of the projected density at pt

dkde <- function(pt, xs, bw, al, K, s){
  if(pt<(-al*s)) 1/sqrt(2*pi)/length(xs)/bw^3*sum(exp(-(xs-pt)^2/2/bw^2)*(xs-pt)) - 2*K*(-al*s-pt)
  else if(pt>(al*s)) 1/sqrt(2*pi)/length(xs)/bw^3*sum(exp(-(xs-pt)^2/2/bw^2)*(xs-pt)) + 2*K*(pt-al*s)
  else 1/sqrt(2*pi)/length(xs)/bw^3*sum(exp(-(xs-pt)^2/2/bw^2)*(xs-pt))
}

### function is_minim() checks if the optimal hyperplane along v is at a local minimum of the density
## arguments:
# v = projection vector
# X = data matrix
# P = list of parameters including (at least)
#   $h (bandwidth)
#   $alpha (constraint width. larger values allow hyperplanes further from the mean of the data)

## output is logical, is the best hyperplane orthogonal to v at a minimum of the density

is_minim <- function(v, X, P){

  # compute the projected density along v/||v||

  p <- X%*%v/norm_vec(v)
  s <- sd(p)
  d <- density(p, bw = P$h, from = -P$alpha*s, to = P$alpha*s, n = 100)

  # if the minimum lies at one of the boundaries then it is not a local
  # minimum of the density

  bix <- which.min(d$y)

  if(bix%in%c(1, 100) || min(sum(p<d$x[bix]), sum(p>d$x[bix]))<P$nmin) return(FALSE)
  else return(TRUE)
}


### function mdpp() performs projection pursuit for finding minimal density hyperplanes
## arguments:
# v = initial projection vector
# X = data matrix
# P = list of parameters including (at least)
#   $h (bandwidth)
#   $alpha (constraint width. larger values allow hyperplanes further from the mean of the data)
#   $C (constant affecting the slope of the penalty)
#   $COV (covariance matrix of data)
# alphamin = initial constraint on the distance of hyperplane to the mean of the data
# alphamax = maximum allowable distance of hyperplane from the mean of the data
# verb = verbosity level. For values greater than 0 plots are produced to illustrate the progress of the algorithm
# labels = vector of class labels. used only in the plotting of progress
# maxit = maximum number of iterations in BFGS for each value of alpha
# ftol = tolerance for termination of BFGS based on function value changes

## output is a list containing the optimal projection vector and the corresponding value of alpha

mdpp <- function(v, X, P, alphamin, alphamax, verb, labels, maxit, ftol){

  # initialise with alpha = 0

  P$alpha <- alphamin
  v_opt <- v
  al_opt <- alphamin

  # increase alpha and store the solution for the largest value of alpha which is a local minimum

  while(P$alpha <= alphamax){
    v <- ppclust.optim(v, f_md, df_md, X, P, md_b, verbosity = verb, labels = labels, method = 'MDH', maxit = maxit, ftol = ftol)$par
    if(is_minim(v, X, P)){
      v_opt <- v
      al_opt <- P$alpha
    }
    P$alpha <- P$alpha + 0.1
  }
  list(v = v_opt/norm_vec(v_opt), alpha = al_opt)
}

### function md_b() determines the location of the optimal hyperplane along v
## arguments:
# v = projection vector
# X = data matrix
# P = list of parameters including (at least)
#   $h (bandwidth)
#   $alpha (constraint width. larger values allow hyperplanes further from the mean of the data)

## output is a scalar, the value of b making H(v, b) the minimum density hyperlpane orthogonal to v

md_b <- function(v, X, P){
  p <- X%*%v/norm_vec(v)
  n <- length(p)
  s <- sd(p)
  den <- density(p, bw = P$h, from = -P$alpha*s, to = P$alpha*s, n = 1000)
  w <- which.min(den$y)
  den$x[w[ceiling(length(w)/2)]]
}

### function mdh() determines the minimum density hyperplane
## arguments:
# X = dataset (matrix). each row is a datum. required
# v0 = initial projection direction(s). can be a matrix
#       in which each column is an initialisation to try.
#       can be a function of the data matrix (or subset
#       thereof corresponding to the cluster being split) which returns
#       a matrix in which each column is an initialisation.
#       optional, default is the first principal component
# bandwidth = positive numeric bandwidth parameter used in kernel density estimation (h).
#       optional, default is bandwidth = 0.9*eigen(cov(X))$values[1]^.5/nrow(X)^.2
# alphamin = initial width of constraint F(v) = [mu-alpha*sd, mu+alpha*sd]
# alphamax = maximum width of constraint F(v) = [mu - alpha*sd, mu+alpha*sd]
#       optional, default is 0.9
# verb = verbosity level. verb == 0 produces no output. verb == 1 produces plots of the
#         projected data during each optimisation. verb == 2 adds to these plots information
#         about the function value, relative depth and quality of split (if labels are supplied).
#         verb == 3 creates a folder in the working directory and saves all plots produced for verb == 2.
#         optional, default is 3
# labels = vector of class labels. Only used for producing plots, not in the allocation of
#         data to clusters. optional, default is NULL (plots do not indicate true class membership
#         when true labels are unknown)
# maxit = maximum number of BFGS iterations for each value of alpha. optional, default is 15
# ftol = tolerance level for function value improvements in BFGS. optional, default is 1e-5

## output is a list of lists, the i-th stores the details of the minimum density hyperplane
## arising from the initialisation at v0[,i]. Each mdh has contains
# $cluster = the cluster assignment vector
# $v = the optimal projection vector
# $b = the value of b making H(v, b) the mdh
# rel.dep = the relative depth of H(v, b)
# fval = the density on H(v, b)
# params = list of parameters used to find H(v, b)

mdh <- function(X, v0 = NULL, minsize = NULL, bandwidth = NULL, alphamin = NULL, alphamax = NULL, verb = NULL, labels = NULL, maxit = NULL, ftol = NULL){

  params <- list()

  if(is.null(minsize)) params$nmin <- 1
  else if(is.numeric(minsize)) params$nmin <- minsize
  else stop('minsize must be integer.')

  # if the data contain fewer than 2*minsize points, do not split

  if(is.vector(X)) n <- 1
  else n <- nrow(X)
  if(n<(2*params$nmin)) return(list(list(cluster = numeric(n), v = numeric(ncol(rbind(c(), X))) + 1, b = NA, rel.dep = 0, fval = Inf, params = list(alpha=0,K=1000,h=1))))



  # if labels are supplied, ensure they are integers 1:K (K the number of classes)

  if(!is.null(labels)){
    lab_new <- numeric(length(labels))
    u <- unique(labels)
    for(i in 1:length(u)) lab_new[which(labels==u[i])] = i
    labels <- lab_new
  }

  if(is.null(verb)) verb <- 0

  if(is.null(maxit)) maxit <- 50

  if(is.null(ftol)) ftol <- 1e-8

  # if the data do not have zero mean, center them

  mns <- colMeans(X)
  if(max(abs(mns))>1e-7) X <- t(t(X)-mns)

  # set up parameters for optimisation

  if(is.null(alphamin)) alphamin <- 0

  if(is.null(alphamax)) alphamax <- 1

  params$COV <- cov(X)

  if(is.null(bandwidth)){
    if(ncol(X)>2) params$h <- 0.9*rARPACK::eigs_sym(params$COV, 1)$values[1]^.5/n^0.2
    else params$h <- 0.9*eigen(params$COV)$values[1]^.5/n^0.2
  }
  else if(is.numeric(bandwidth)) params$h <- bandwidth
  else if(is.function(bandwidth)) params$h <- bandwidth(X)
  else stop('bandwidth must be numeric or a function of the data being split')

  params$C <- 100*exp(-0.5)/sqrt(2*pi)/params$h^2

  if(is.null(v0)){
    if(ncol(X)>2) E <- matrix(rARPACK::eigs_sym(params$COV, 1)$vectors, ncol = 1)
    else E <- matrix(eigen(params$COV)$vectors[,1], ncol = 1)
  }
  else if(is.function(v0)) E <- v0(X)
  else if(is.vector(v0)) E <- matrix(v0, ncol = 1)
  else E <- v0

  output <- list()

  # find the mdh arising from each column of E (v0)

  for(i in 1:ncol(E)){

    v <- mdpp(E[,i], X, params, alphamin, alphamax, verb, labels, maxit, ftol)

    params$alpha <- v$alpha

    v <- v$v

    b <- md_b(v, X, params)

    pass <- X%*%v<b

    if(is_minim(v, X, params)){
      fval <- f_md(v, X, params)
      depth <- md_reldepth(v, X, params)
    }
    else{
      fval <- Inf
      depth <- 0
    }

    output[[i]] <- list(cluster = pass+1, v = v, b = b + (mns%*%v)[1], rel.dep = depth, fval = fval, params = params, method = 'MDH')

  }

  output
}

### function md_reldepth() computes the relative depth of the best hyperplane orthogonal to v
## arguments:
# v = projection vector
# X = data matrix
# P = list of parameters including (at least)
#   $h (bandwidth)
#   $alpha (constraint width. larger values allow hyperplanes further from the mean of the data)

## output is a scalar, the relative depth of the hypeprlane

md_reldepth <- function(v, X, P){

  # compute the projected density on v/||v|| and find the minimiser

  p <- X%*%v/norm_vec(v)
  n <- length(p)
  s <- sd(p)
  den <- density(p, bw = P$h, from = -P$alpha*s, to = P$alpha*s, n = 100)

  xmin <- den$x[which.min(den$y)]

  denmin <- min(den$y)+1e-10

  # compute the density to the left and right of the minimiser and find the relative depth

  den.left <- density(p, bw = P$h, to = xmin, n = 100)

  den.right <- density(p, bw = P$h, from = xmin, n = 100)

  (min(max(den.left$y), max(den.right$y))-denmin)/denmin
}