zcpglm.R
#######################################################
## Zero-inflated compound Poisson GLM ##
## Author: Wayne Zhang, actuary_zhang@hotmail.com ##
#######################################################
zcpglm <- function(formula, link = "log", data, weights, offset,
subset, na.action = NULL, contrasts = NULL,
control = list(), optimizer = "nlminb") {
call <- match.call()
if (missing(data))
data <- environment(formula)
# get formula for the two parts: zero and tweedie
tmp <- strsplit(paste(deparse(formula), sep = "", collapse = ""), "\\|\\|")[[1]]
if (length(tmp) == 1) {
warning("formula for the zero part is missing: default to ~1")
tmp <- c(tmp, "1")
}
ft <- as.formula(tmp[1])
fz <- as.formula(paste(formula[[2]], "~", tmp[2]))
# change environment so that model.frame will work (necessary for offset())!!!
environment(ft) <- environment(fz) <- environment(formula)
# generate model frame for each part
call2 <- call
call2$formula <- ft # tweedie
frt <- cpglm.mf(call2, contrasts)
call2$formula <- fz # zero
frz <- cpglm.mf(call2, contrasts)
fr <- list(Y = frt$Y, Xz = frz$X, Xt = frt$X,
offz = frz$off, offt = frt$off, wts = frt$wts)
link.power <- make.link.power(link)
control <- do.call("cplm.control", control)
# estimate parameters
ans <- zcpglm.optim(fr, link.power, control, optimizer)
# return result
ans@formula <- formula
ans@call <- call
ans@na.action <- na.action
ans@contrasts <- contrasts
ans@model.frame <- list(zero = frz$mf, tweedie = frt$mf)
return(ans)
}
# zero-inflated Tweedie using the Newton-Raphson method (EM is too slow)
zcpglm.optim <- function(fr, link.power = 0,
control = list(), optimizer = "nlminb"){
# dimensions
nbz <- ncol(fr$Xz)
nbt <- ncol(fr$Xt)
nb <- nbz + nbt
# link inverse functions
tw <- tweedie(link.power = link.power)
logit <- binomial()
eps <- 0.001 #constant in numerical gradient/hessian
yp <- fr$Y > 0
# statistics used in marginal loglikelihood and gradient
llik_common <- function(parm){
# extract parameters
betaz <- parm[1:nbz]
betat <- parm[(nbz + 1): nb]
phi <- exp(parm[(nb + 1)]) #phi is on log scale
p <- parm[(nb + 2)]
etaz <- fr$Xz %*% betaz + fr$offz
q <- as.numeric(logit$linkinv(etaz))
etat <- fr$Xt %*% betat + fr$offt
mu <- tw$linkinv(etat)
ly <- as.numeric(log(dtweedie(y = fr$Y, mu = mu,
phi = phi, power = p)))
ml <- q * as.integer(fr$Y == 0) + exp(log(1 - q) + ly)
ml[ml == 0] <- .Machine$double.eps
llik <- rep(NA, length(ly))
llik[!yp] <- log(ml[!yp])
llik[yp] <- (log(1 - q) + ly)[yp]
# dtweedie could result in zero!!!
return(list(etaz = etaz, etat = etat, q = q, mu = mu,
fy = exp(ly), ml = ml, llik = llik))
}
# marginal loglikelihood f(y) used in Newton-Raphson
llik_zcpglm <- function(parm){
ll <- llik_common(parm)
##FIXME: the weight is probably wrong?
- sum(fr$wts * ll$llik)
}
# gradient: analytical expressions for regression parameters
grad_zcpglm <- function(parm){
ll <- llik_common(parm)
gr <- rep(NA, nb + 2)
gr[1:nbz] <- - colSums(fr$wts * as.numeric(logit$mu.eta(ll$etaz)) *
(as.integer(fr$Y == 0) - ll$fy) / ll$ml * fr$Xz)
gr[(nbz + 1):nb] <- - colSums(fr$wts * (1 - ll$q) * ll$fy * tw$mu.eta(ll$etat)
* (fr$Y - ll$mu) / (ll$ml * ll$mu^parm[nb + 2]) * fr$Xt) / exp(parm[nb + 1])
llik_phip <- function(pm)
llik_zcpglm(c(parm[1:nb], pm))
gr[(nb + 1):(nb + 2)] <- grad(parm[-(1:nb)], llik_phip)
gr
}
# generate starting values
p <- 1.5
fitt <- glm.fit(fr$Xt, fr$Y, weights = fr$wts,
offset = fr$offt, family = tweedie(var.power = p,
link.power = link.power))
phi <- sum((fitt$weights * fitt$residuals^2)) / fitt$df.residual
pt0 <- exp(-fitt$fitted.values^(2 - p) / (phi * (2 - p)))
yz <- 1 - as.integer(as.logical(fr$Y)) - pt0 * (fr$Y == 0)
fitz <- glm.fit(fr$Xz, yz, weights = fr$wts,
offset = fr$offz, family = quasibinomial())
parm <- as.numeric(c(fitz$coefficients, fitt$coefficients, log(phi), p))
# run optimization
opt_ans <- cplm_optim(parm, llik_zcpglm, gr = grad_zcpglm,
lower = c(rep(-Inf, nb + 1), control$bound.p[1]),
upper = c(rep(Inf, nb + 1), control$bound.p[2]),
control = control, optimizer = optimizer)
if (opt_ans$convergence) warning(opt_ans$message)
# compute hessian matrix for regression parameters
# (optim also computes those for phi and p)
grad_zcpglm2 <- function(parm){
ll <- llik_common(parm)
gr <- rep(NA, nb)
gr[1:nbz] <- - colSums(fr$wts * as.numeric(logit$mu.eta(ll$etaz)) *
(as.integer(fr$Y == 0) - ll$fy) / ll$ml * fr$Xz)
gr[(nbz + 1):nb] <- - colSums(fr$wts * (1 - ll$q) * ll$fy * tw$mu.eta(ll$etat)
* (fr$Y - ll$mu) / (ll$ml * ll$mu^parm[nb + 2]) * fr$Xt) / exp(parm[nb + 1])
gr
}
parm <- opt_ans$par
hn <- matrix(0, nb, nb)
for (i in 1:nb){
parm[i] <- parm[i] - eps
g1 <- grad_zcpglm2(parm)
parm[i] <- parm[i] + 2 * eps
g2 <- grad_zcpglm2(parm)
hn[i,] <- (g2 - g1) / ( 2 * eps)
parm[i] <- parm[i] - eps
}
# extract stats for output
nmz <- dimnames(fr$Xz)[[2]]
nmt <- dimnames(fr$Xt)[[2]]
betaz <- opt_ans$par[1:nbz]
betat <- opt_ans$par[(nbz + 1):nb]
names(betaz) <- nmz
names(betat) <- nmt
q <- logit$linkinv(fr$Xz %*% betaz + fr$offz)
mu <- tw$linkinv(fr$Xt %*% betat + fr$offt)
Yhat <- as.numeric((1 - q) * mu)
nmv <- c(paste("zero_", nmz, sep = ""), paste("tw_", nmt, sep = ""))
vc <- solve(hn)
dimnames(vc) <- list(nmv, nmv)
# return results
out <- new("zcpglm",
coefficients = list(zero = betaz, tweedie = betat),
residuals = as.numeric(sqrt(fr$wts) * (fr$Y - Yhat)),
fitted.values = Yhat, call = call("foo"),
df.residual = as.integer(nrow(fr$Xt) - nb),
formula = ~ 1, control = control, contrasts = NULL,
p = opt_ans$par[nb + 2], phi = exp(opt_ans$par[nb + 1]),
converged = as.logical(ifelse(opt_ans$convergence, 0, 1)),
link.power= link.power, na.action = NULL,
model.frame = list(), llik = - opt_ans$value,
offset = list(zero = fr$offz, tweedie = fr$offt),
prior.weights = fr$wts, y = fr$Y,
inits = NULL, vcov = vc)
return(out)
}