Content - 5e983d434948febd546b7e573e29671175425fc0 - b9bb880/R/Transformations.R

swh:1:snp:813359ba77493c9d5dd1abad9a1f53490a8abf57

Tip revision: ac414ce8dc062c13b47c28a5cdcd0ee8f98f3778 authored by Torsten Hothorn on 09 October 2007, 00:00:00 UTC
version 0.6-7
Tip revision: ac414ce
Transformations.R

### compute average scores, see Hajek, Sidak, Sen (page 131ff)
average_scores <- function(s, x) {
    dup <- x[duplicated(x)]
    for (d in dup)
        s[x == d] <- mean(s[x == d])
    return(s)
} 

### identity transformation
id_trafo <- function(x) x

### Ansari-Bradley
ansari_trafo <- function(x, ties.method = c("mid-ranks", "average-scores")) {
    ties.method <- match.arg(ties.method)
    scores <- switch(ties.method, 
        "mid-ranks" = {
            r <- rank(x)
            pmin(r, length(x) - r + 1)
        },
        "average-scores" = {
            r <- rank(x, ties.method = "random")
            s <- pmin(r, length(x) - r + 1)
            average_scores(s, x)
        }
    )
    return(scores)
}

### Fligner
fligner_trafo <- function(x, ties.method = c("mid-ranks", "average-scores")) {
    ties.method <- match.arg(ties.method)
    scores <- switch(ties.method, 
        "mid-ranks" = {
            qnorm((1 + rank(abs(x))/(length(x) + 1))/2)
        },
        "average-scores" = {
            r <- rank(abs(x), ties.method = "random")
            s <- qnorm((1 + r/(length(x) + 1))/2)
            average_scores(s, x)
        }
    )
    return(scores)
}

### Normal Scores (van der Waerden)
normal_trafo <- function(x, ties.method = c("mid-ranks", "average-scores")) {
    ties.method <- match.arg(ties.method)
    scores <- switch(ties.method,
        "mid-ranks" = { 
            qnorm(rank(x)/(length(x) + 1))
        },
        "average-scores" = {
            r <- rank(x, ties.method = "random")
            s <- qnorm(r/(length(x) + 1))
            average_scores(s, x)
        }
    )
    return(scores)
}
 
### Median Scores
median_trafo <- function(x)
    as.numeric(x <= median(x))

### Conover & Salsburg (1988)
consal_trafo <- function(x, ties.method = c("mid-ranks", "average-scores")) {
    ties.method <- match.arg(ties.method)
    scores <- switch(ties.method,
        "mid-ranks" = { 
            (rank(x)/(length(x) + 1))^4
        },
        "average-scores" = {
            r <- rank(x, ties.method = "random")
            s <- (r/(length(x) + 1))^4
            average_scores(s, x)
        }
    )
    return(scores)
}

### maximally selected (rank, chi^2, whatsoever) statistics
### ordered x
maxstat_trafo <- function(x, minprob = 0.1, maxprob = 1 - minprob) {
    qx <- quantile(x, prob = c(minprob, maxprob), type = 1)
    ux <- sort(unique(x))
    ux <- ux[ux < max(x)]  
    if (mean(x <= qx[2]) <= maxprob) {
        cutpoints <- ux[ux >= qx[1] & ux <= qx[2]]
    } else {
        cutpoints <- ux[ux >= qx[1] & ux < qx[2]]
    }
    cm <- .Call("R_maxstattrafo", as.double(x), as.double(cutpoints),
                PACKAGE = "coin")
    colnames(cm) <- paste("x <= ", round(cutpoints, 3), sep = "")
    rownames(cm) <- 1:nrow(cm)
    cm
}

### compute index matrix of all 2^(nlevel - 1) possible splits
### code translated from package `tree'
fsplits <- function(nlevel) {

    mi <- 2^(nlevel - 1)
    index <- matrix(0, nrow = mi, ncol = nlevel)
    index[,1] <- 1

    for (i in 0:(mi-1)) {
        ii <- i
        for (l in 2:nlevel) {
            index[(i + 1), l] <- (ii %% 2)
            ii <- ii %/% 2
        }
    }
    storage.mode(index) <- "logical"
    index[-nrow(index),, drop = FALSE]
}

### set up transformation g(x) for all possible binary splits 
### in an unordered x
fmaxstat_trafo <- function(x, minprob = 0.1, maxprob = 0.9) {

    sp <- fsplits(nlevels(x))
    lev <- levels(x)
    tr <- matrix(0, nrow = length(x), ncol = nrow(sp))
    cn <- vector(mode = "character", length = nrow(sp))
    for (i in 1:nrow(sp)) {
        tr[ ,i] <- x %in% lev[sp[i, ]]
        cn[i] <- paste("{", paste(lev[sp[i, ]], collapse = ", "), "} vs. {",
                       paste(lev[!sp[i, ]], collapse = ", "), "}", sep = "")
    }
    rownames(tr) <- 1:length(x)
    colnames(tr) <- cn
    tr <- tr[, colMeans(tr) >= minprob & colMeans(tr) <= maxprob]
    tr
}

### logrank scores; with two different methods of handling
### ties
logrank_trafo <- function(x, ties.method = c("logrank", "HL")) {
    ties.method <- match.arg(ties.method)
    time <- x[,1]
    event <- x[,2]
    n <- length(time)
    ot <- order(time, event)
    rt <- rank(time, ties.method = "max")
    mt <- rank(time, ties.method = "min") - 1
    fact <- switch(ties.method, "logrank" = event / (n - mt),
                                "HL" = event/(n - rt + 1)
                  )
    event - cumsum(fact[ot])[rt]
}

### factor handling
f_trafo <- function(x) {
    ### temporarily define `na.pass' as na.action
    opt <- options()
    on.exit(options(opt))
    options(na.action = na.pass)
    ### remove unused levels
    x <- x[, drop = TRUE]
    if (nlevels(x) == 1) {
        stop("Can't deal with factors containing only one level")
    } else {
        ### construct design matrix _without_ intercept
        mm <- model.matrix(~ x - 1)
    }
    colnames(mm) <- levels(x)
    ### the two-sample situations
    if (ncol(mm) == 2) mm <- mm[,-2,drop = FALSE]
    return(mm)
}

### ordered factors
of_trafo <- function(x) {
    scores <- attr(x, "scores")
    if (is.null(scores)) scores <- 1:nlevels(x)
    return(matrix(scores[x], ncol = 1))
}

### transformation function
trafo <- function(data, numeric_trafo = id_trafo, factor_trafo = f_trafo, 
                  ordered_trafo = of_trafo, surv_trafo = logrank_trafo, 
                  var_trafo = NULL, block = NULL) {

    if (!(is.data.frame(data) || is.list(data)))
        stop(sQuote("data"), " is not a data.frame or list")

    if (!is.null(block)) {
        if (!is.factor(block) || length(block) != nrow(data))
            stop(sQuote("block"), " is not a factor with ", 
                 nrow(data), " elements")

        ### need to check dimension of matrix returned by 
        ### user supplied functions
        ret <- trafo(data, numeric_trafo, factor_trafo, surv_trafo)

        ### apply trafo to each block separately
        for (lev in levels(block)) {
            ret[block == lev, ] <- trafo(data[block == lev, ,drop = FALSE], 
                numeric_trafo, factor_trafo, ordered_trafo, surv_trafo)
        }
        return(ret)
    }

    if (!is.null(var_trafo)) {
        if (!is.list(var_trafo)) stop(sQuote("var_trafo"), " is not a list")
        if (!all(names(var_trafo) %in% names(data)))
            stop("variable(s) ", 
                 names(var_trafo)[!(names(var_trafo) %in% names(data))], 
                 " not found in ", sQuote("var_trafo"))
    }

    ### compute transformations for each variable
    tr <- vector(mode = "list", length = length(data))
    names(tr) <- names(data)
    for (nm in names(data)) {
        x <- data[[nm]]
        if (nm %in% names(var_trafo)) {
            tr[[nm]] <- as.matrix(var_trafo[[nm]](x))
            next()
        }
        if (class(x)[1] == "AsIs") {
            if (length(class(x)) == 1) {
                x <- as.numeric(x)
            } else {
                class(x) <- class(x)[-1]
            }
        }
        if (is.ordered(x)) {
            tr[[nm]] <- as.matrix(ordered_trafo(x))
            next()
        }
        if (is.factor(x) || is.logical(x)) {
            tr[[nm]] <- as.matrix(factor_trafo(x))
            next()
        }
        if (inherits(x, "Surv")) {
            tr[[nm]] <- as.matrix(surv_trafo(x))
            next()
        }
        if (is.numeric(x)) {
            tr[[nm]] <- as.matrix(numeric_trafo(x))
            next()
        }
        if (is.null(tr[[nm]]))
            stop("data class ", class(x), " is not supported")
    }

    ### set up a matrix of transformations
    ### when more than one factor is in play, factor names 
    ### _and_ colnames of the corresponding rows are combined by `.'
    RET <- c()
    assignvar <- c()
    cn <- c()
    for (i in 1:length(tr)) {
        if (nrow(tr[[i]]) != nrow(data)) 
            stop("Transformation of variable ", names(tr)[i], 
                 " are not of length / nrow", nrow(data))
        RET <- cbind(RET, tr[[i]])
        if (is.null(colnames(tr[[i]]))) {
            cn <- c(cn, rep("", ncol(tr[[i]])))	
        } else {
            cn <- c(cn, paste(ifelse(length(tr) > 1, ".", ""), 
                              colnames(tr[[i]]), sep = ""))
        }
        assignvar <- c(assignvar, rep(i, ncol(tr[[i]])))
    }
    attr(RET, "assign") <- assignvar
    if (length(tr) > 1) {
        colnames(RET) <- paste(rep(names(tr), tabulate(assignvar)), 
                               cn, sep = "")
    } else {
        colnames(RET) <- cn
    }
    return(RET)
}