Skip to main content
  • Home
  • Development
  • Documentation
  • Donate
  • Operational login
  • Browse the archive

swh logo
SoftwareHeritage
Software
Heritage
Archive
Features
  • Search

  • Downloads

  • Save code now

  • Add forge now

  • Help

Revision ae99bafc369d70069d8b0dba6c4ca3bcb30e0800 authored by Wayne Zhang on 10 August 2011, 00:00:00 UTC, committed by Gabor Csardi on 10 August 2011, 00:00:00 UTC
version 0.1-1
0 parent
  • Files
  • Changes
  • 97f8ca8
  • /
  • R
  • /
  • cpglm.R
Raw File Download
Permalinks

To reference or cite the objects present in the Software Heritage archive, permalinks based on SoftWare Hash IDentifiers (SWHIDs) must be used.
Select below a type of object currently browsed in order to display its associated SWHID and permalink.

  • revision
  • directory
  • content
revision badge
swh:1:rev:ae99bafc369d70069d8b0dba6c4ca3bcb30e0800
directory badge Iframe embedding
swh:1:dir:11bcdc05d09a5204a48ea39467507532a4fc3c6a
content badge Iframe embedding
swh:1:cnt:61ad8587bffdaa73475e3d2ec16501ffe365d03f
Citations

This interface enables to generate software citations, provided that the root directory of browsed objects contains a citation.cff or codemeta.json file.
Select below a type of object currently browsed in order to generate citations for them.

  • revision
  • directory
  • content
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
Generate software citation in BibTex format (requires biblatex-software package)
Generating citation ...
cpglm.R
#######################################################
## MCEM method for fitting compound Poisson GLM
#######################################################

cpglm <- function(formula, link = "log", data, weights, subset, 
    na.action,betastart=NULL, phistart=NULL, pstart=NULL, offset, 
    contrasts = NULL,   control=list(),
    ... ) {
  
  call <- match.call()  
  if (missing(data)) 
    data <- environment(formula)   
  mf <- match.call(expand.dots = FALSE)
  m <- match(c("formula", "data", "subset", "weights",
               "na.action", "offset"), names(mf), 0L)
  mf <- mf[c(1L, m)]
  mf$drop.unused.levels <- TRUE
  mf[[1L]] <- as.name("model.frame")
  mf <- eval(mf, parent.frame())
  mt <- attr(mf, "terms")
  Y <- model.response(mf, "any")
  X <- if (!is.empty.model(mt)) 
        model.matrix(mt, mf, contrasts)
  weights <- as.vector(model.weights(mf))
  offset <- as.vector(model.offset(mf))
  if (!is.null(weights)) 
        stop("'weights' is not implemented yet")
  if (!is.null(offset)) {
    if (length(offset) != NROW(Y)) 
      stop(gettextf("number of 'offset' is %d should equal %d (number of observations)", 
                length(offset), NROW(Y)), domain = NA)
    }
  if (!is.null(betastart)){
    if (length(betastart) != ncol(X))
      stop(gettextf("number of 'betastart' is %d should equal %d (number of mean parameters)", 
                length(betastart), ncol(X)), domain = NA)
    }
  if (!is.null(phistart) && phistart<=0)
    stop("value of 'phistart' should be greater than 0")
  if (!is.null(pstart) && (pstart<=1 || pstart>=2))
    stop("value of 'pstart' should be between 1 and 2")   
  link.power <- make.link.power(link)

  cpfit <- .cpglm.fit(X,Y,weights=weights,offset=offset,
                     link.power=link.power,contrasts=contrasts,
                     betastart=betastart,phistart=phistart,pstart=pstart,
                      control=control)
  ans <- new("cpglm", 
             coefficients=cpfit$coefficients, 
             residuals=cpfit$residuals,
             fitted.values=cpfit$fitted.values,
             weights=cpfit$weights,
             df.residual=cpfit$df.residual,
             df.null=cpfit$df.null,
             y=cpfit$y,
             call=call,
             formula=formula,
             data=data,
             offset=cpfit$offset,
             control=cpfit$control,
             contrasts=contrasts,
             theta=cpfit$theta,
             vcov=cpfit$vcov,
             iter=cpfit$iter)  
  return(ans)
}


# internal function to run the MCEM 
.cpglm.fit <- function(X,Y,weights=NULL,offset=NULL,
                      link.power=0,contrasts=NULL,
                      betastart,phistart,pstart,
                      control=list()){
    # set control options                        
    control <- do.call("cpglm.control", control)
 
    X <- as.matrix(X)          
    # get names
    xnames <- dimnames(X)[[2L]]
    ynames <- if (is.matrix(Y)) 
        rownames(Y) else 
        names(Y)
    
    # default weights and offsets if NULL    
    nobs <- NROW(Y)
    #FIXME: fix weights now. does not handle prior weights
    weights <- rep.int(1, nobs)
    if (is.null(offset)) 
        offset <- rep.int(0, nobs)          
    # generating starting values if necessary
    if (is.null(pstart)) 
      pstart <- 1.5
    if (is.null(betastart) || is.null(phistart)) {
      fit.start <- glm(Y~-1+X,weights=weights,offset=offset,
                  family=tweedie(var.power=pstart,link.power=link.power))
      if (is.null(betastart))
        betastart <- as.numeric(fit.start$coefficients)
      if (is.null(phistart))
        phistart <- sum(residuals(fit.start,"pearson")^2)/df.residual(fit.start)
    }
    
    # index of positive y's     
    ygt0 <- as.numeric(which(Y>0L) )     
    npobs <- length(ygt0)    
    nbeta <- length(betastart)
    sz <- control$init.size 
    sr <- control$sample.iter
    C <- qchisq(1-control$alpha, nbeta+2)
    
    # input list to be used in sampling or likelihood calculation
    input <- list(X=X,Y=Y,ygt0=ygt0,beta=betastart,
                  phi=phistart,p=pstart,k=ygt0[1L],
                  link.power=link.power,offset=offset)         
         
    # start iterations:
    diff <- 10
    iter <- 0
    theta <- as.numeric(unlist(input[c("beta","phi","p")]))                  
    # weight for each sample
    w <- rep.int(1,sz)
      	
    while (diff>control$epsilon2){
      iter <- iter + 1       
    ######################################################### 
    ###########            The E Step             ###########
    #########################################################        
    # sample the t's using rejection sampling if needed
    if (iter <=sr) {
		sim.t <- sample.t.rej(sz, input)
    } else                    
    # use importance sampling if iter > sample.iter
    {   
      w <- import.weight(theta,input,sim.t,sr) 
    }  
              
    ######################################################### 
    ###########            The M Step             ###########
    #########################################################            
    #1. compute beta given p
    fit <- glm.fit(x = X, y = Y, weights = weights,offset = offset, 
		family = tweedie(var.power=input$p,link.power=input$link.power))
    input$beta <- as.numeric(fit$coefficients)
    mu <- as.numeric(fit$fitted.values)
    
    #2. update phi and p
    nb <- optim.phi.p(mu,sim.t,w,bd=control$bound.p,input)
    input$phi <- nb[1] 
    input$p <- nb[2] 
 	
    # store the history of theta
    theta <- rbind(theta,as.numeric(unlist(input[c("beta","phi","p")])))                    

    # print out iteration info if necessary  
    if (control$verbose) {
      cat("Iteration:",iter,"\n")
      parm <- round(theta[iter+1,],4)
      names(parm) <- c(xnames,"phi","p")
      cat(parm,"\n")
    }
    if (iter >= control$maxit) {
      warning("maximum iteration limit reached!\n")
      break
    }
    # update diff
    diff <- max(abs(theta[iter+1,]-theta[iter,])/
    			(abs(theta[iter,])+control$epsilon1))
    
    # determine if sample size needs to be increased
    if (!control$fixed.size & iter <=sr){		
    	T <- tstat.intv.cpglm(theta,input, sim.t,w)  
    	if (iter <=sr &  T>C) 	
	    	sz <- min(sz + round(sz/control$k), control$max.size)
	cat("sample size:", sz, "\n")     
	}
  }

    # fit the glm using the converged parameters  
    fit <- glm(Y~-1+ X, weights = weights,offset = offset,
          family=tweedie(var.power=input$p,link.power=input$link.power))

    if (control$verbose) {
      cat("Algorithm has converged\n")
      cat("Computing hessian matrix\n")      
    }  

    # compute the vcov matrix  
    vcov <- var.theta.cpglm(theta,input, sim.t,w)

    # get output  
    names(fit$coefficients)<- xnames    
    ans <- c(fit[c("coefficients","residuals","fitted.values",
                   "df.residual", "df.null","y")],
                   list(theta=theta,vcov=vcov,iter=iter,
                        control=control,offset=offset,
                        weights=weights))
    return(ans)

}
  
# function to simulate T's from the posterior distribution
sample.t.rej <- function(n,input){
  out <- .Call("sampleTRej",n=as.integer(n),
		Y=as.numeric(input$Y[input$ygt0]),
		phi=input$phi,
		p=input$p )
  return(out)				
}

# function to compute the weight in importance sampling
import.weight <- function(theta,input,sim.t,sr){
  ix <- c(ncol(theta)-1, ncol(theta))      
  w <- .Call("importWeight",nc=as.integer(ncol(sim.t)),
      			Y=input$Y[input$ygt0], 
      			sT=sim.t,parm=list("old"=theta[sr,ix],
                                   "new"=theta[nrow(theta),ix]))  
}  

# function to maximize over phi and p
optim.phi.p <- function(mu,sim.t,w,bd=c(1.01,1.99),input){
  out <- .Call("optimNbGlm",Y=input$Y[input$ygt0], 
	mu=mu, ygt0=as.integer(input$ygt0), p= input$p, phi=input$phi,  
	simT=sim.t, w=w, bd=bd)
  return(out)				
}

# function to take inverse of a matrix using svd 
svd.inv <- function(x){
	sx <- svd(x)
	return(sx$v%*% diag(1/sx$d)%*%t(sx$u))	
}

# function to compute approximate normal confidence interval
# used for determining whether to increase sample size
tstat.intv.cpglm <- function(theta,input,sim.t,w){
	nt <- nrow(theta)
	out <- .Call("hessGlmEst",X=input$X,Y=input$Y[input$ygt0],
            ygt0= as.integer(input$ygt0),theta=theta[nt,],
		simT=sim.t,w=w,linkpower=as.double(input$link.power), 
		n=as.integer(nrow(input$X)),offset=as.numeric(input$offset))
	# compute approximate covariance matrix	
	hI.inv <- svd.inv(out$HQ)
	var.app <- hI.inv %*% out$VQ %*% t(hI.inv)  
	mu.app <- theta[nt-1,]-theta[nt,]
	# compute test stat
	return(t(mu.app)%*%svd.inv(var.app)%*%mu.app)
}

               
# function to compute covariance matrix
var.theta.cpglm <- function(theta,input,sim.t,w){
	out <- .Call("hessGlmEst",X=input$X,Y=input$Y[input$ygt0],
            ygt0= as.integer(input$ygt0),theta=theta[nrow(theta),],
		simT=sim.t,w=w,linkpower=as.double(input$link.power), 
		n=as.integer(nrow(input$X)),offset=as.numeric(input$offset))
	# compute information matrix 
	IM <- -out[[1]]-out[[2]]	
	# invert information matrix
	svdI <- svd(IM)              
	var.theta <- svdI$v%*% diag(1/svdI$d)%*%t(svdI$u)  
	return(var.theta)
}

# function to compute the link.power needed in tweedie
make.link.power <- function(link) {   
  #link.temp<- substitute(link)
  #if (!is.character(link.temp))
  #      link.temp <- deparse(link.temp)
  link.temp<- link
  okLinks <- c("log", "identity", "sqrt","inverse")
  if (link.temp %in% okLinks) 
      switch(link.temp,log=0, identity=1, sqrt=0.5, inverse=2) else
      stop("invalid link function!")
}

# control options intializer
cpglm.control <- function(init.size=100L,
                       sample.iter=50L,
                       max.size=10000L,
                       maxit=200,
                       epsilon1=1e-03,
                       epsilon2=1e-04,
                       alpha =0.25,
                       k=5,                       
                       bound.p=c(1.01,1.99),
                       fixed.size=TRUE,   
                       verbose=TRUE){
  if (!is.numeric(init.size) || init.size <= 0)
        stop("value of sample.size should be an integer and >0")
  if (!is.numeric(sample.iter) || sample.iter <= 0)
        stop("value of sample.iter should be an integer and >0")   
  if (!is.numeric(epsilon1) || epsilon1 <= 0) 
        stop("value of 'epsilon1' must be > 0")
  if (!is.numeric(epsilon2) || epsilon2 <= 0) 
        stop("value of 'epsilon2' must be > 0") 
  if (!is.numeric(alpha) || alpha <= 0 || alpha>=1) 
        stop("value of 'alpha' must be between 0 and 1")               
  if (!is.numeric(k) || k <= 0) 
        stop("value of 'k' must be > 0")         
  if (!is.numeric(maxit) || maxit <= 0) 
        stop("value of 'maxit' must be > 0")
  if (min(bound.p)<1 || max(bound.p)>2)
        stop("value of 'bound.p' must be between 1 and 2")
  if (!is.numeric(fixed.size) && !is.logical(fixed.size))
        stop("'fixed.size' must be logical or numeric")
  if (!is.numeric(verbose) && !is.logical(verbose))
        stop("'verbose' must be logical or numeric")
  
  bound.p <- sort(bound.p)
  fixed.size <- as.logical(fixed.size)
  verbose <- as.logical(verbose)
  
    list(init.size=init.size,
         sample.iter=sample.iter,
         maxit=maxit,
         epsilon1 = epsilon1,
         epsilon2=epsilon2,
         alpha=alpha,
         k=k,
         fixed.size=fixed.size,
         max.size=max.size,
         bound.p=bound.p,
         verbose=verbose)  
}






The diff you're trying to view is too large. Only the first 1000 changed files have been loaded.
Showing with 0 additions and 0 deletions (0 / 0 diffs computed)
swh spinner

Computing file changes ...

Software Heritage — Copyright (C) 2015–2025, The Software Heritage developers. License: GNU AGPLv3+.
The source code of Software Heritage itself is available on our development forge.
The source code files archived by Software Heritage are available under their own copyright and licenses.
Terms of use: Archive access, API— Contact— JavaScript license information— Web API

back to top