swh:1:snp:1f7eae87bb9a78532242cb7060d401b54a54ddee
Tip revision: 4fbb09ab94eb59bfa4196e2a4898f4e30c2845ab authored by Manuela Hummel on 21 January 2019, 08:10:22 UTC
version 2.3.1
version 2.3.1
Tip revision: 4fbb09a
similarities.R
## fast calculation of Goodman&Kruskal's gamma
myGKgamma <- function(x, y){
Hmisc::rcorr.cens(as.numeric(x), as.numeric(y), outx=TRUE)["Dxy"] # outx=T is important to leave out tied observations
}
# -> "fast" for continuous vs categorical
# -> faster for categorical vs categorical: GoodmanKruskalGamma from package DescTools
## calculate chi^2 test p-value; relevant lines taken from chisq.test()
chisq <- function(x, group){
OK <- complete.cases(x, group)
x <- factor(x[OK])
y <- factor(group[OK])
x <- table(x, y)
n <- sum(x)
nr <- as.integer(nrow(x))
nc <- as.integer(ncol(x))
sr <- rowSums(x)
sc <- colSums(x)
E <- outer(sr, sc, "*")/n
STATISTIC <- sum((abs(x - E))^2/E)
PARAMETER <- (nr - 1L) * (nc - 1L)
PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
return(PVAL)
}
## calculate new measure for association btw. ranked (continuous or ordinal) variable A and categorical (K>2) variable B
assoc.rank.cat <- function(x, a, alpha=0.05){
# x: continuous or ordinal vector
# a: factor (not ordinal)
# alpha: significance level for Kruskal-Wallis test
# test if there might be any association
p <- kruskal.test(x, a)$p.value
# only go on with optimizing ordering of a in case of a significant result
if(p < alpha){
# order levels of a variable according to mean ranks of x
a <- factor(as.numeric(a))
r <- rank(x, na.last="keep")
m <- tapply(r, a, median, na.rm=TRUE)
a <- ordered(a, levels=order(m))
}
# continuous x -> Spearman
if(data.class(x) == "numeric")
res <- abs(cor(x, as.numeric(a), method="spearman", use="complete.obs"))
# ordinal x -> Goodman & Kruskal's gamma
if(data.class(x) == "ordered"){
res <- abs(DescTools::GoodmanKruskalGamma(x, a))
}
return(res)
}
## function to "impose an order" in categorical (K>2) variables by "diagonalization" of cross table
assoc.cat.cat <- function(a, b, alpha=0.05){
# a, b: factors (not ordinal) with >2 categories
# alpha: significance level for Kruskal-Wallis test
# when both variables are binary, of course no reordering is necessary
ka <- length(table(a))
kb <- length(table(b))
if(ka > 2 | kb > 2){
# test if there might be any association
p <- chisq(a, b)
# only go on with diagonalization of cross table in case of a significant result
if(p < alpha){
tab <- table(a, b)
# get "optimal" ordering of categories (large frequencies in the diagonal)
tab.new <- extracat::optile(tab)
# reorder categories correspondingly
a <- ordered(a, levels=rownames(tab.new))
b <- ordered(b, levels=colnames(tab.new))
}
}
# calculate Goodman & Kruskal's gamma on "optimized" cross table
abs(DescTools::GoodmanKruskalGamma(a, b))
}