https://github.com/cran/CluMix
Raw File
Tip revision: dd414a45be07f9c033c980461d041111b898f0a7 authored by Manuela Hummel on 29 December 2016, 10:52:10 UTC
version 1.3.1
Tip revision: dd414a4
similarities.R

## fast calculation of Goodman&Kruskal's gamma
myGKgamma <- function(x, y){
  
  Hmisc::rcorr.cens(as.numeric(x), as.numeric(y), outx=TRUE)["Dxy"]  # outx=T is important to leave out tied observations
}
# -> "fast" for continuous vs categorical
# -> faster for categorical vs categorical: GoodmanKruskalGamma from package DescTools


## calculate chi^2 test p-value; relevant lines taken from chisq.test()
chisq <- function(x, group){
  OK <- complete.cases(x, group)
  x <- factor(x[OK])
  y <- factor(group[OK])
  x <- table(x, y)
  n <- sum(x)
  nr <- as.integer(nrow(x))
  nc <- as.integer(ncol(x))
  sr <- rowSums(x)
  sc <- colSums(x)
  E <- outer(sr, sc, "*")/n
  
  STATISTIC <- sum((abs(x - E))^2/E)
  PARAMETER <- (nr - 1L) * (nc - 1L)
  PVAL <- pchisq(STATISTIC, PARAMETER, lower.tail = FALSE)
  return(PVAL)
}



## calculate new measure for association btw. ranked (continuous or ordinal) variable A and categorical (K>2) variable B
assoc.rank.cat <- function(x, a, alpha=0.05){
  # x: continuous or ordinal vector
  # a: factor (not ordinal)
  # alpha: significance level for Kruskal-Wallis test
  
  # test if there might be any association
  p <- kruskal.test(x, a)$p.value 
  
  # only go on with optimizing ordering of a in case of a significant result
  if(p < alpha){ 
    
    # order levels of a variable according to mean ranks of x
    a <- factor(as.numeric(a))
    r <- rank(x, na.last="keep")
    m <- tapply(r, a, median, na.rm=TRUE) 
    a <- ordered(a, levels=order(m))
  }
  
  # continuous x -> Spearman
  if(data.class(x) == "numeric")
    res <- abs(cor(x, as.numeric(a), method="spearman", use="complete.obs"))
  
  # ordinal x -> Goodman & Kruskal's gamma
  if(data.class(x) == "ordered"){
    res <- abs(DescTools::GoodmanKruskalGamma(x, a))
  }
  
  return(res)
}



## function to "impose an order" in categorical (K>2) variables by "diagonalization" of cross table
assoc.cat.cat <- function(a, b, alpha=0.05){
  # a, b: factors (not ordinal) with >2 categories
  # alpha: significance level for Kruskal-Wallis test
  
  # when both variables are binary, of course no reordering is necessary
  ka <- length(table(a))
  kb <- length(table(b))
  if(ka > 2 | kb > 2){
    
    # test if there might be any association
    p <- chisq(a, b)
    
    # only go on with diagonalization of cross table in case of a significant result
    if(p < alpha){ 
      tab <- table(a, b)
      
      # get "optimal" ordering of categories (large frequencies in the diagonal)
      tab.new <- extracat::optile(tab)
      
      # reorder categories correspondingly
      a <- ordered(a, levels=rownames(tab.new))
      b <- ordered(b, levels=colnames(tab.new))  
    }
  }
  
  # calculate Goodman & Kruskal's gamma on "optimized" cross table
  abs(DescTools::GoodmanKruskalGamma(a, b))  
}



back to top