npMSL.R
## nonparametric algorithm for Smoothed Likelihood Maximization
## implementing block structure
#npMSL.old <- function(x, mu0, blockid = 1:ncol(x),
# bw=bw.nrd0(as.vector(as.matrix(x))), samebw = TRUE,
# h=bw, eps=1e-8,
# maxiter=500, ngrid=200, verb = TRUE){
# bw <- h # h is alternative bandwidth argument, for backward compatibility
# x <- as.matrix(x)
# n <- nrow(x) # number of subjects
# r <- ncol(x) # number of measurements in each subject
# u <- match(blockid, unique(blockid))
# B <- max(u) # nb of blocks
# BlS <- rep(0,B) # block sizes = C_ell in JCGS paper
# for (ell in 1:B) {
# BlS[ell] <- sum(u == ell)}
# if (is.matrix(mu0)) # mu0=centers
# m <- dim(mu0)[1] else m <- mu0 # mu0=number of clusters
# if(!samebw && !is.matrix(bw)) {
# bw <- matrix(bw, nrow=max(u), ncol=m)
# }
# z.hat <- matrix(0, nrow = n, ncol = m)
# tt0 <- proc.time() # for total time
## Initial Values
# if(m == 1) z.hat <- matrix(1, nrow = n, ncol = m) else {
# kmeans <- kmeans(x, mu0)
# for(j in 1:m)
# z.hat[kmeans$cluster==j, j] <- 1
# }
# iter <- 0
# finished <- FALSE
# lambda <- matrix(0, nrow = maxiter, ncol = m)
# loglik <- NULL
# total_udfl <- 0; total_nan <- 0 # for internal checks
# tmp <- 1:n
# xtra <- (max(x)-min(x))/10
# grid <- seq(min(x)-xtra, max(x)+xtra, length=ngrid)
# f stored on a ngrid by m by B array
# f_{g,j,ell} = f_{j ell}(u_g)
# f <- array(1/m/diff(grid[1:2]), c(ngrid, m, B))
# this f was not normalized for being uniform over grid
# Delta <- diff(grid[1:2])
# f <- array(1/((ngrid-1)*Delta), c(ngrid, m, B))
# oldloglik <- -Inf
# while (!finished) {
# iter <- iter + 1
# bw.old <- bw
# t0 <- proc.time()
# nb_udfl=0; # nb of underflows log(0) cancelled in nems_Estep_v3.c
# nb_nan=0; # nb of nonzero K()*log(0) cancelled in nems_Estep_v3.c
## Note: Enter loop assuming E-step is done -- i.e., z.hat in place
## M-Step
# lambda[iter, ] <- colMeans(z.hat)
## density estimation M-step
# z=.C("npMSL_Mstep", as.integer(ngrid), as.integer(n),
# as.integer(m), as.integer(r),
# as.integer(B), as.integer(BlS), as.integer(u),
# as.double(bw), as.double(x), as.double(grid),
# new.f=as.double(f),
# as.double(lambda[iter,]),
# as.double(z.hat),
# PACKAGE="mixtools")
# f <- array(z$new.f, c(ngrid, m, B)) # check sum(f == 0)
## print(apply(f,2:3,sum) * Delta)
## print(max(abs(f-f2)))
## browser()
## E-step (for next iteration)
# z=.C("npMSL_Estep", as.integer(ngrid), as.integer(n),
# as.integer(m), as.integer(r),
# as.integer(B), as.integer(u),
# as.double(bw),
# as.double(x), as.double(grid), f=as.double(f),
# as.double(lambda[iter,]), post=as.double(z.hat),
# loglik = double(1),
# nb_udfl = as.integer(nb_udfl), nb_nan = as.integer(nb_nan),
# PACKAGE="mixtools")
# nb_udfl = z$nb_udfl; nb_nan = z$nb_nan;
# total_udfl <- total_udfl + nb_udfl
# total_nan <- total_nan + nb_nan
# z.hat <- matrix(z$post, n, m)
# if (sum(is.nan(z.hat)) > 0) cat("Error!! NaN in z.hat") # obsolete ?
# loglik <- z$loglik
# loglikchange <- loglik - oldloglik
# oldloglik <- loglik
# finished <- iter >= maxiter
# if (iter>1 && max(abs(lambda[iter, ] - lambda[iter-1, ])) < eps)
# finished <- TRUE
# if (verb) {
# t1 <- proc.time()
# cat("iteration", iter, ": lambda ", round(lambda[iter, ], 4))
# cat(" obj change", round(loglikchange,4))
# cat(" time", (t1 - t0)[3])
# if (nb_udfl > 0) cat("average underflows=", nb_udfl/(n*m*r)," ")
# if (nb_nan >0) cat("average NaNs=", nb_nan/(n*m*r))
# Note: these average mean nb of nan over ngrid convolution
# cat("\n")
# }
# }
## f <- array(z$f, c(ngrid, m, r)) # obsolete in block version
# if (verb) {
# tt1 <- proc.time()
# cat("lambda ", round(lambda[iter, ], 4))
# cat(", total time", (tt1 - tt0)[3], "s\n")
# }
# return(structure(list(data = x, posteriors = z.hat,
# lambda = lambda[1:iter,], bandwidth = bw,
# blockid = u, lambdahat = lambda[iter,], f=f,
# grid = grid, loglik = loglik,
# meanUdfl = total_udfl/(n*m*r*iter),# average underflow
# meanNaN = total_nan/(n*m*r*iter)), # average nan
# class="npEM")) # define a "NEMS" class ?
#}
################################################################
################################################################
# development of version with adaptive bandwidth
################################################################
################################################################
npMSL <- function(x, mu0, blockid = 1:ncol(x),
bw=bw.nrd0(as.vector(as.matrix(x))), samebw = TRUE,
h=bw, eps=1e-8,
maxiter=500, ngrid=200, verb = TRUE){
bw <- h # h is alternative bandwidth argument, for backward compatibility
x <- as.matrix(x)
n <- nrow(x) # number of subjects
r <- ncol(x) # number of measurements in each subject
u <- match(blockid, unique(blockid))
B <- max(u) # nb of blocks
BlS <- rep(0,B) # block sizes = C_ell in JCGS paper
for (ell in 1:B) {
BlS[ell] <- sum(u == ell)}
if (is.matrix(mu0)) # mu0=centers
m <- dim(mu0)[1] else m <- mu0 # mu0=number of clusters
if(!samebw && !is.matrix(bw)) { # create initial bandwidth matrix
bw <- matrix(bw, nrow=max(u), ncol=m)
}
z.hat <- matrix(0, nrow = n, ncol = m)
tt0 <- proc.time() # for total time
## Initial Values
if(m == 1) z.hat <- matrix(1, nrow = n, ncol = m) else {
kmeans <- kmeans(x, mu0)
for(j in 1:m)
z.hat[kmeans$cluster==j, j] <- 1
}
iter <- 0
finished <- FALSE
lambda <- matrix(0, nrow = maxiter, ncol = m)
loglik <- NULL
total_udfl <- 0; total_nan <- 0 # eventual NaN and underflow in C code
tmp <- 1:n
xtra <- (max(x)-min(x))/10
grid <- seq(min(x)-xtra, max(x)+xtra, length=ngrid)
# f stored on a ngrid by m by B array
# f_{g,j,ell} = f_{j ell}(u_g)
# f <- array(1/m/diff(grid[1:2]), c(ngrid, m, B))
# this f was not normalized for being uniform over grid
Delta <- diff(grid[1:2])
f <- array(1/((ngrid-1)*Delta), c(ngrid, m, B))
oldloglik <- -Inf
orderx <- xx <- list() # preparation for adaptive bandwidth
for(k in 1:B) {
xx[[k]] <- as.vector(x[, u==k])
if (!samebw) {
orderx[[k]] = order(xx[[k]]) # only needed for IQR calculation for bw
}
}
CftEstep <- ifelse(samebw, "npMSL_Estep", "npMSL_Estep_bw")
# CftEstep <- "npMSL_Estep_bw" # temporary, for testing only the M-step
CftMstep <- ifelse(samebw, "npMSL_Mstep", "npMSL_Mstep_bw")
while (!finished) {
iter <- iter + 1
bw.old <- bw # is this needed?
t0 <- proc.time()
nb_udfl=0; # nb of underflows log(0) cancelled in nems_Estep.c
nb_nan=0; # nb of nonzero K()*log(0) cancelled in nems_Estep.c
## Note: Enter loop assuming E-step is done -- i.e., z.hat in place
## M-Step
lambda[iter, ] <- colMeans(z.hat)
## density estimation in M-step: WKDE-step
cs <- colSums(z.hat)
z.tmp <- sweep(z.hat, 2, cs, "/")
z.tmp[, cs==0] <- 1/NROW(z.tmp) # Just in case
## adaptive bandwidth update
for (ell in 1:B) {
r2 <- BlS[ell] # block size = nb of coordinates
if (!samebw) {
wts <- apply(z.tmp, 2, function(z) rep(z/r2, r2))
variances <- colSums(wts*outer(xx[[ell]], colSums(wts*xx[[ell]]),
'-')^2)
iqr <- apply(as.matrix(wts[orderx[[ell]],]), 2, wIQR,
xx[[ell]][orderx[[ell]]],
already.sorted=TRUE, already.normalized=TRUE)
h <- bw[ell, ] <- 0.9 * pmin(sqrt(variances), iqr/1.34) *
pmax(1,r2*n*lambda[iter, ])^(-1/5)
# Note: Doesn't allow "sample size" < 1.
# browser()
}
}
z=.C(CftMstep, as.integer(ngrid), as.integer(n),
as.integer(m), as.integer(r),
as.integer(B), as.integer(BlS), as.integer(u),
as.double(bw), as.double(x), as.double(grid),
new.f=as.double(f),
as.double(lambda[iter,]),
as.double(z.hat))
f <- array(z$new.f, c(ngrid, m, B)) # check sum(f == 0)
# print(apply(f,2:3,sum) * Delta)
# print(max(abs(f-f2)))
# browser()
## E-step (for next iteration)
z=.C(CftEstep, as.integer(ngrid), as.integer(n),
as.integer(m), as.integer(r),
as.integer(B), as.integer(u),
as.double(bw),
as.double(x), as.double(grid), f=as.double(f),
as.double(lambda[iter,]), post=as.double(z.hat),
loglik = double(1),
nb_udfl = as.integer(nb_udfl), nb_nan = as.integer(nb_nan))
nb_udfl = z$nb_udfl; nb_nan = z$nb_nan;
total_udfl <- total_udfl + nb_udfl
total_nan <- total_nan + nb_nan
z.hat <- matrix(z$post, n, m)
if (sum(is.nan(z.hat)) > 0) cat("Error!! NaN in z.hat") # obsolete ?
loglik <- z$loglik
loglikchange <- loglik - oldloglik
oldloglik <- loglik
finished <- iter >= maxiter
if (iter>1 && max(abs(lambda[iter, ] - lambda[iter-1, ])) < eps)
finished <- TRUE
if (verb) {
t1 <- proc.time()
cat("iteration", iter, ": lambda ", round(lambda[iter, ], 4))
cat(" obj change", round(loglikchange,4))
cat(" time", (t1 - t0)[3])
if ((nb_udfl > 0) || (nb_nan >0)) cat("\n ")
if (nb_udfl > 0) cat("average underflows=", round(nb_udfl/(n*m*r),3)," ")
if (nb_nan >0) cat("average NaNs=", round(nb_nan/(n*m*r),3))
# Note: these average mean nb of nan over ngrid convolution
cat("\n")
}
}
# f <- array(z$f, c(ngrid, m, r)) # obsolete in block version
if (!samebw) {
rownames(bw) <- paste("block", 1:max(u))
colnames(bw) <- paste("component", 1:m)
}
if (verb) {
tt1 <- proc.time()
cat("lambda ", round(lambda[iter, ], 4))
cat(", total time", (tt1 - tt0)[3], "s\n")
}
return(structure(list(data = x, posteriors = z.hat,
lambda = lambda[1:iter,], bandwidth = bw,
blockid = u, lambdahat = lambda[iter,], f=f,
grid = grid, loglik = loglik,
meanUdfl = total_udfl/(n*m*r*iter),# average underflow
meanNaN = total_nan/(n*m*r*iter)), # average NaN's
class="npEM")) # define a "npMSL" class ?
}