####################################################### ## compound Poisson GLM ## ## Author: Wayne Zhang, actuary_zhang@hotmail.com ## ####################################################### cpglm <- function(formula, link = "log", data, weights, offset, subset, na.action = NULL, contrasts = NULL, control = list(), chunksize = 0, optimizer = "nlminb", ...) { call <- match.call() if (missing(data)) data <- environment(formula) # use bigglm for big data sets if (chunksize) { ans <- cpglm.profile.bigglm(call, data) } else { fr <- cpglm.mf(call, contrasts) link.power <- make.link.power(link) control <- do.call("cplm.control", control) ans <- cpglm.profile(fr, link.power, control, optimizer) } ans@formula <- formula ans@call <- call ans@na.action <- na.action ans@contrasts <- contrasts return(ans) } # function to implement the profile likelihood approach # Use cpglm.profile.bigglm for big data sets cpglm.profile <- function(fr, link.power = 0, control = list(), optimizer = "nlminb"){ # compute the profile loglikelihood: parm = c(log(phi), p) llik_profile <- function(parm){ fit2 <- cpglm.fit(fr, p = parm[2], link.power) 0.5 * dtweedie.nlogl(fr$Y, fit2$fitted.values, exp(parm[1]) / fr$wts, parm[2]) } # generate starting values for p and phi init <- cpglm.init(fr, link.power) parm <- c(log(init$phi), init$p) # optimize the profile loglikelihood (cplm_optim is a wrapper) opt_ans <- cplm_optim(parm, llik_profile, gr = NULL, lower = c(-Inf, control$bound.p[1]), upper = c(Inf, control$bound.p[2]), control = control, optimizer = optimizer) if (opt_ans$convergence) warning(opt_ans$message) # mle estimates p.max <- opt_ans$par[2] phi.max <- exp(opt_ans$par[1]) # fit glm using the optimized index parameter fit <- cpglm.fit(fr, p.max, link.power) # return results out <- new("cpglm", coefficients = fit$coefficients, residuals = fit$residuals, fitted.values = fit$fitted.values, weights = fit$weights, linear.predictors = fit$linear.predictors, df.residual = as.integer(fit$df.residual), deviance = fit$deviance, call = call("foo"), aic = 2 * (opt_ans$value + fit$rank), formula = ~ 1, control = control, contrasts = NULL, p = p.max, phi = phi.max, iter = fit$iter, converged = fit$converged, link.power= link.power, model.frame = fr$mf, na.action = NULL, offset = fr$offset, prior.weights = fit$prior.weights, y = fr$Y, inits = NULL, vcov = summary.glm(fit)$cov.scaled) return(out) } # function to implement the profile likelihood approach for big data sets cpglm.profile.bigglm <- function(call, data){ # reconstruct the call if (is.null(control <- call$control)) control <- list() if (is.null(link <- call$link)) link <- "log" if (is.null(optimizer <- call$optimizer)) optimizer <- "nlminb" control <- do.call("cplm.control", eval(control)) link.power <- make.link.power(eval(link)) mc <- match(c("link", "control", "family"), names(call), 0L) call <- call[-c(1, mc[mc>0])] pstart <- 1.5 call$family <- tweedie(var.power = pstart, link.power = link.power) # default maxit in bigglm: the default seems not to be enough call$maxit <- ifelse(is.null(call$maxit), 50, call$maxit) call$formula <- eval(call$formula) Y <- eval(call$formula[[2]], data, parent.frame()) # profile loglikelihood llik_profile <- function(parm){ call$family <- tweedie(var.power = parm[2], link.power = link.power) fit2 <- do.call("bigglm", as.list(call)) fs <- fitted(fit2, data) 0.5 * dtweedie.nlogl(Y, fs$fitted.values, exp(parm[1]) / fs$prior.weights, parm[2]) } # generate starting values for phi fit <- do.call("bigglm", as.list(call)) phistart <- fit$qr$ss / fit$df.resid parm <- c(log(phistart), pstart) # optimize the profiled loglikelihood opt_ans <- cplm_optim(parm, llik_profile, gr = NULL, lower = c(-Inf, control$bound.p[1]), upper = c(Inf, control$bound.p[2]), control = control, optimizer = optimizer) if (opt_ans$convergence) warning(opt_ans$message) p.max <- opt_ans$par[2] phi.max <- exp(opt_ans$par[1]) # fit glm using the optimized index parameter call$family <- tweedie(var.power = p.max, link.power = link.power) fit <- do.call("bigglm", as.list(call)) fs <- fitted(fit, data) # return results out <- new("cpglm", coefficients = coef(fit), residuals = fs$residuals, fitted.values = fs$fitted.values, linear.predictors = fs$linear.predictors, weights = fs$weights, df.residual = as.integer(fit$df.resid), aic = 2 * (opt_ans$value + fit$n - fit$df.resid), deviance = fit$deviance, call = call("foo"), formula = ~ 1, control = control, contrasts = NULL, p = p.max, phi = phi.max, iter = fit$iterations, converged = fit$converged, link.power= link.power, model.frame = as.data.frame(NULL), na.action = NULL, offset = fs$offset, prior.weights = fs$prior.weights, y = Y, inits = NULL, vcov = vcov(fit)) return(out) }