Revision a7a30ac2a6f34e1d7710938b4547e6b5b702d881 authored by Wayne Zhang on 16 August 2012, 00:00:00 UTC, committed by Gabor Csardi on 16 August 2012, 00:00:00 UTC
1 parent 0aa535d
Raw File
zcpglm.R
#######################################################
##    Zero-inflated compound Poisson GLM             ##
## Author: Wayne Zhang, actuary_zhang@hotmail.com    ##
#######################################################


zcpglm <- function(formula, link = "log", data, weights, offset, 
                  subset, na.action = NULL, contrasts = NULL, 
                  control = list(), optimizer = "nlminb") {

  call <- match.call()  
  if (missing(data)) 
    data <- environment(formula) 
  
  # get formula for the two parts: zero and tweedie 
  tmp <- strsplit(paste(deparse(formula), sep = "", collapse = ""), "\\|\\|")[[1]]
  if (length(tmp) == 1) {
    warning("formula for the zero part is missing: default to ~1")
    tmp <- c(tmp, "1")
  }
  ft <- as.formula(tmp[1])
  fz <- as.formula(paste(formula[[2]], "~", tmp[2]))
  # change environment so that model.frame will work (necessary for offset())!!!
  environment(ft) <- environment(fz) <- environment(formula)
  
  # generate model frame for each part
  call2 <- call 
  call2$formula <- ft # tweedie              
  frt <- cpglm.mf(call2, contrasts)
  call2$formula <- fz # zero
  frz <- cpglm.mf(call2, contrasts)
  fr <- list(Y = frt$Y, Xz = frz$X, Xt = frt$X, 
          offz = frz$off, offt = frt$off, wts = frt$wts)
  link.power <- make.link.power(link)
  control <- do.call("cplm.control", control)
  
  # estimate parameters 
  ans <- zcpglm.optim(fr, link.power, control, optimizer)  
  
  # return result
  ans@formula <- formula 
  ans@call <- call
  ans@na.action <- na.action
  ans@contrasts <- contrasts
  ans@model.frame <- list(zero = frz$mf, tweedie = frt$mf)
  return(ans)
}


# zero-inflated Tweedie using the Newton-Raphson method (EM is too slow)
zcpglm.optim <- function(fr, link.power = 0, 
                         control = list(), optimizer = "nlminb"){
  
  # dimensions
  nbz <- ncol(fr$Xz)
  nbt <- ncol(fr$Xt)
  nb <- nbz + nbt
  # link inverse functions
  tw <- tweedie(link.power = link.power)                 
  logit <- binomial()
  eps <- 0.001 #constant in numerical gradient/hessian
  yp <- fr$Y > 0 
  
  # statistics used in marginal loglikelihood and gradient
  llik_common <- function(parm){
    # extract parameters
    betaz <- parm[1:nbz]
    betat <- parm[(nbz + 1): nb]
    phi <- exp(parm[(nb + 1)]) #phi is on log scale
    p <- parm[(nb + 2)]
    etaz <- fr$Xz %*% betaz + fr$offz
    q <- as.numeric(logit$linkinv(etaz))
    etat <- fr$Xt %*% betat + fr$offt
    mu <- tw$linkinv(etat)
    ly <- as.numeric(log(dtweedie(y = fr$Y, mu = mu, 
                                  phi = phi, power = p)))
    ml <- q * as.integer(fr$Y == 0) + exp(log(1 - q) + ly) 
    ml[ml == 0] <- .Machine$double.eps
    llik <- rep(NA, length(ly))
    llik[!yp] <- log(ml[!yp])
    llik[yp] <- (log(1 - q) + ly)[yp]
    
    # dtweedie could result in zero!!!
    return(list(etaz = etaz, etat = etat, q = q, mu = mu,
                fy = exp(ly), ml = ml, llik = llik))
  }
  
  # marginal loglikelihood f(y) used in Newton-Raphson 
  llik_zcpglm <- function(parm){
    ll <- llik_common(parm)
    ##FIXME: the weight is probably wrong?
    - sum(fr$wts * ll$llik)
  }
  
  # gradient: analytical expressions for regression parameters
  grad_zcpglm <- function(parm){
    ll <- llik_common(parm)
    gr <- rep(NA, nb + 2)  
    gr[1:nbz] <- - colSums(fr$wts * as.numeric(logit$mu.eta(ll$etaz)) * 
        (as.integer(fr$Y == 0) - ll$fy) / ll$ml * fr$Xz)
    gr[(nbz + 1):nb] <- - colSums(fr$wts * (1 - ll$q) * ll$fy * tw$mu.eta(ll$etat)
       * (fr$Y - ll$mu) / (ll$ml * ll$mu^parm[nb + 2]) * fr$Xt) / exp(parm[nb + 1])
    llik_phip <- function(pm)
      llik_zcpglm(c(parm[1:nb], pm))
    gr[(nb + 1):(nb + 2)] <- grad(parm[-(1:nb)], llik_phip)
    gr
  }
 
  # generate starting values
  p <- 1.5   
  fitt <- glm.fit(fr$Xt, fr$Y, weights = fr$wts, 
            offset = fr$offt, family = tweedie(var.power = p, 
            link.power = link.power))
  phi <- sum((fitt$weights * fitt$residuals^2)) / fitt$df.residual
  pt0 <- exp(-fitt$fitted.values^(2 - p) / (phi * (2 - p)))
  yz <- 1 - as.integer(as.logical(fr$Y)) - pt0 * (fr$Y == 0)
  fitz <- glm.fit(fr$Xz, yz, weights = fr$wts, 
            offset = fr$offz, family = quasibinomial())
  parm <- as.numeric(c(fitz$coefficients, fitt$coefficients, log(phi), p))
  
  # run optimization 
  opt_ans <- cplm_optim(parm, llik_zcpglm, gr = grad_zcpglm, 
                      lower = c(rep(-Inf, nb + 1), control$bound.p[1]),
                      upper = c(rep(Inf, nb + 1), control$bound.p[2]),
                      control = control, optimizer = optimizer)
  if (opt_ans$convergence) warning(opt_ans$message)
  
  # compute hessian matrix for regression parameters  
  # (optim also computes those for phi and p)
  grad_zcpglm2 <- function(parm){
    ll <- llik_common(parm)
    gr <- rep(NA, nb)  
    gr[1:nbz] <- - colSums(fr$wts * as.numeric(logit$mu.eta(ll$etaz)) * 
        (as.integer(fr$Y == 0) - ll$fy) / ll$ml * fr$Xz)
    gr[(nbz + 1):nb] <- - colSums(fr$wts * (1 - ll$q) * ll$fy * tw$mu.eta(ll$etat)
       * (fr$Y - ll$mu) / (ll$ml * ll$mu^parm[nb + 2]) * fr$Xt) / exp(parm[nb + 1])
    gr
  }
  parm <- opt_ans$par
  hn <- matrix(0, nb, nb)
  for (i in 1:nb){
    parm[i] <- parm[i] - eps
    g1 <- grad_zcpglm2(parm)
    parm[i] <- parm[i] + 2 * eps
    g2 <- grad_zcpglm2(parm)
    hn[i,] <- (g2 - g1) / ( 2 * eps)
    parm[i] <- parm[i] - eps
  }
  
  # extract stats for output                                   
  nmz <- dimnames(fr$Xz)[[2]]
  nmt <- dimnames(fr$Xt)[[2]]
  betaz <- opt_ans$par[1:nbz]
  betat <- opt_ans$par[(nbz + 1):nb]
  names(betaz) <- nmz 
  names(betat) <- nmt
  q <- logit$linkinv(fr$Xz %*% betaz + fr$offz)
  mu <- tw$linkinv(fr$Xt %*% betat + fr$offt)
  Yhat <- as.numeric((1 - q) * mu)
  nmv <- c(paste("zero_", nmz, sep = ""), paste("tw_", nmt, sep = ""))
  vc <- solve(hn)
  dimnames(vc) <- list(nmv, nmv)
  
  # return results
  out <- new("zcpglm", 
             coefficients = list(zero = betaz, tweedie = betat),
             residuals = as.numeric(sqrt(fr$wts) * (fr$Y - Yhat)),
             fitted.values = Yhat, call = call("foo"),
             df.residual = as.integer(nrow(fr$Xt) - nb),             
             formula = ~ 1, control = control, contrasts = NULL,
             p = opt_ans$par[nb + 2], phi = exp(opt_ans$par[nb + 1]), 
             converged = as.logical(ifelse(opt_ans$convergence, 0, 1)),
             link.power= link.power,  na.action = NULL,
             model.frame = list(), llik = - opt_ans$value,
             offset = list(zero = fr$offz, tweedie = fr$offt), 
             prior.weights = fr$wts, y = fr$Y,
             inits = NULL, vcov = vc)
  return(out)  
}               

back to top