Content - 8b84740de46572faae8f0277ace1d3e6101bcd1b - dd9e837/R/gini.R

gini.R
#######################################################
##    The Gini index from an ordered Lorenz curve    ##
## Author: Wayne Zhang, actuary_zhang@hotmail.com    ##
#######################################################

# class defining slots common to all derived classes 
setClass("gini", 
  representation(
    call = "call",
    gini = "matrix",
    sd = "matrix",
    lorenz = "list")
)

# compute gini indices etc for model selections
gini <- function(loss, score, base = NULL, data, ...){
  call <- match.call()  
  
  if (missing(loss) || missing(score))
    stop("loss and score must be specified")
  if (!is.character(loss) || !is.character(score) ||
     (!is.null(base) && !is.character(base)))
    stop("Arguments 'loss', 'score' and 'base' must be characters!")
  # check base 
  if (length(base) > 1) {
    base <- base[1]
    warning("Only the first base premium is used!")    
  }    
  # check score 
  if (length(base) && (pos <- match(base, score, nomatch = 0))){
    score <- score[-pos]
    warning(paste(base, "is removed from score!"))
  }
  
  # get loss and standardize so that mean is 1
  y <- data[, loss] / mean(data[, loss])
  sc <- data[, score]
  
  # when the base premium is specified
  if (!is.null(base)) {
    P <- data[, base] / mean(data[, base])
    ans <- do_gini(y, P, sc)
    dimnames(ans$gini) <- dimnames(ans$sd) <- list(base, score)
    ans$lorenz <- list(ans$lorenz)
    names(ans$lorenz) <- base
  } else { # model comparison using each score as a base
    p <- length(score)
    gi <- sd <- matrix(0, p, p)
    lrz <- vector("list", p)    
    for (i in 1:p){
      P <- sc[, i] / mean(sc[, i])
      tmp <- do_gini(y, P, sc[, -i, drop = FALSE])
      gi[i, -i] <- tmp$gini
      sd[i, -i] <- tmp$sd
      lrz[[i]] <- tmp$lorenz      
    }    
    dimnames(gi) <- dimnames(sd) <- list(score, score)
    names(lrz) <- score
    ans <- list(gini = gi, sd = sd, lorenz = lrz)
  }
  out <- new("gini", call = call, gini = ans$gini, 
             sd = ans$sd, lorenz = ans$lorenz)    
  return (out)
}

# compute the gini index, std errs, and the graph of the lorenz curve
do_gini <- function(y, P, S){    
  p <- ncol(S) 
  gi <- sd <- matrix(NA, 1, p)
  ni <- 50
  lrz <- matrix(0, ni, p)
  n <- length(y)
  for (i in 1:p){
    # order by relativity
    yP <- cbind(y, P)[order(S[, i] / P), ]
    # distribution function    
    DF <- apply(yP / colSums(yP), 2, cumsum)
    DF <- rbind(c(0, 0), DF)
    # compute the gini index
    gi[1, i] <- 1 - sum(diff(DF[, 2]) * (DF[2:(n + 1), 1] + DF[1:n, 1]))    
    
    # compute the standard errors
    h1 <- 0.5 * (yP[, 2] * DF[-1, 1] + yP[, 1] * (1 - DF[-1, 2]))    
    mh1 <- mean(h1)
    mh <- 0.5 * (1 - gi[1, i])
    vh <- var(h1)  
    vhy <- cov(h1, yP[, 1])
    vhP <- cov(h1, yP[, 2])
    vy <- var(y)
    vP <- var(P)
    vyP <- cov(yP[, 1], yP[, 2])
    v <-  4 * (4 * vh + mh^2 * (vy + vP) - 4 * mh * (vhy  + vhP)  + 2 * mh^2 * vyP)      
    sd[1, i] <- sqrt(v / n) 
    
    # compute the interpolations used for plots
    lrz[, i] <- approx(DF[, 2:1], n = ni)[[2]]
  }
  #add cdf for premiums (this is the same across models)
  lrz <- cbind(seq(0, 1, length = ni), lrz) 
  dimnames(lrz)[[2]] <- c(".P.", dimnames(S)[[2]]) 
  # scale by 100
  return(list(gini = gi * 100, sd = sd *100, lorenz = lrz * 100))
}


setMethod("show", 
  signature(object = "gini"),
  function(object){
    digits <- max(3, getOption("digits") - 3)
    gi <- format(object@gini, digits = digits)
    sd <- format(object@sd, digits = digits)
    cat("\nCall:\n", paste(deparse(object@call), sep = "\n", collapse = "\n"), 
          "\n\n", sep = "")
    cat("Gini indices:\n")
    print.default(gi, print.gap = 2,  quote = FALSE)
    cat("\n")
    cat("Standard errors:\n")
    print.default(sd, print.gap = 2,  quote = FALSE) 
    cat("\n")
    if (nrow(object@gini) > 1) {
      bm <- names(which.min(apply(object@gini, 1, max)))
    } else {
      bm <- colnames(object@gini)[which.max(object@gini[1, ])]
    }
    cat(paste("The selected score is ", bm, ".\n", sep = ""))  
  }  
)

setMethod("plot", 
  signature(x = "gini", y = "missing"),
  function(x, y, overlay = TRUE, ...) {
    lrz <- lapply(x@lorenz, as.data.frame)
    pd <- lapply(1:length(lrz), function(t){
            lrz[[t]]$Base <- rep(names(lrz)[t], nrow(lrz[[t]]))
            melt(lrz[[t]], c("Base", ".P.")) 
            })
    pd <- do.call("rbind", pd)
    names(pd) <- c("Base", "Premium", "Score", "Loss")
    pd$Score <- factor(pd$Score, levels = eval(x@call$score))
    pp <- ggplot(pd, aes(Premium, Loss))
    pp <- pp + geom_line(aes(color = Score, linetype = Score)) + 
           geom_line(aes(Premium, Premium), color = gray(0.35))
    if (overlay)
      pp <- pp + facet_grid(Base~.) else 
      pp <- pp + facet_grid(Base~Score) 
    print(pp)
  }
)