#' Imputation of missing values in compositional data using knn methods #' #' This function offers several k-nearest neighbor methods for the imputation #' of missing values in compositional data. #' #' The Aitchison \code{metric} should be chosen when dealing with compositional #' data, the Euclidean \code{metric} otherwise. #' #' If \code{primitive} \eqn{==} FALSE, a sequential search for the #' \eqn{k}-nearest neighbors is applied for every missing value where all #' information corresponding to the non-missing cells plus the information in #' the variable to be imputed plus some additional information is available. If #' \code{primitive} \eqn{==} TRUE, a search of the \eqn{k}-nearest neighbors #' among observations is applied where in addition to the variable to be #' imputed any further cells are non-missing. #' #' If \code{normknn} is TRUE (prefered option) the imputed cells from a nearest #' neighbor method are adjusted with special adjustment factors (more details #' can be found online (see the references)). #' #' @param x data frame or matrix #' @param method method (at the moment, only \dQuote{knn} can be used) #' @param k number of nearest neighbors chosen for imputation #' @param metric \dQuote{Aichison} or \dQuote{Euclidean} #' @param agg \dQuote{median} or \dQuote{mean}, for the aggregation of the #' nearest neighbors #' @param primitive if TRUE, a more enhanced search for the $k$-nearest #' neighbors is obtained (see details) #' @param normknn An adjustment of the imputed values is performed if TRUE #' @param das depricated. if TRUE, the definition of the Aitchison distance, #' based on simple logratios of the compositional part, is used (Aitchison, #' 2000) to calculate distances between observations. if FALSE, a version #' using the clr transformation is used. #' @param adj either \sQuote{median} (default) or \sQuote{sum} can be chosen #' for the adjustment of the nearest neighbors, see Hron et al., 2010. #' @return \item{xOrig }{Original data frame or matrix} \item{xImp }{Imputed #' data} \item{w }{Amount of imputed values} \item{wind }{Index of the missing #' values in the data} \item{metric }{Metric used} #' @author Matthias Templ #' @export #' @seealso \code{\link{impCoda}} #' @references Aitchison, J., Barcelo-Vidal, C., Martin-Fernandez, J.A., #' Pawlowsky-Glahn, V. (2000) Logratio analysis and compositional distance, #' \emph{Mathematical Geology}, 32(3), 271-275. #' #' Hron, K., Templ, M., Filzmoser, P. (2010) Imputation of missing values #' for compositional data using classical and robust methods #' \emph{Computational Statistics and Data Analysis}, 54 (12), #' 3095-3107. #' @keywords manip multivariate #' @examples #' #' data(expenditures) #' x <- expenditures #' x[1,3] #' x[1,3] <- NA #' xi <- impKNNa(x)$xImp #' xi[1,3] #' `impKNNa` <- function(x, method="knn", k=3, metric="Aitchison", agg="median", primitive=FALSE, normknn=TRUE, das=FALSE, adj="median"){ ### Coda Version 2! ### Nearest Neighbor imputation algorithm of Missing values ### based on distance matrices. ### ### Program by Matthias Templ, Oktober 2008 ### R Version: 2.7.1 ### Copyright : TU-Wien, 2008 ### # TODO: help file mit adj ergaenzen x <- as.matrix(x) class(x) <- "matrix" N <- dim(x)[1] P <- dim(x)[2] if(any(rowSums(is.na(x)) == P)) stop("One or more observations constist of only NA's") xOrig <- xmiss <- x w <- is.na(x) w2 <- !is.na(x) if(metric=="Euclidean" & primitive==FALSE){ findknn <- function(x, i, j){ ## find knn m1 <- which(!is.na(x[i,])) mi <- which(!is.na(x[,j,drop=FALSE]) & !is.na(rowSums(x[,m1,drop=FALSE])) ) d <- rowSums(t((t(x[mi,m1,drop=FALSE]) - x[i,m1])^2)) names(d) <- mi as.numeric(names(which(d <= quantile(d, k/N)))) } } if(metric=="Euclidean" & primitive==TRUE){ findknn <- function(x, i, j){ ## find knn m1 <- which(!is.na(x[i,])) mi <- which(!is.na(x[,j,drop=FALSE]) ) d <- rowSums(t((t(x[mi,m1,drop=FALSE]) - x[i,m1])^2),na.rm=TRUE) names(d) <- mi as.numeric(names(which(d <= quantile(d, k/N)))) } } if(metric=="Aitchison" & primitive==FALSE){ findknn <- function(x, i, j){ m1 <- which(!is.na(x[i,])) mi <- which(c(!is.na(x[,j,drop=FALSE])) & c(!is.na(rowSums(x[,m1,drop=FALSE])))) xclr <- cenLR(rbind(x[mi, m1, drop=FALSE], x[i, m1]))$x.clr d <- rowSums(t(abs(t(xclr[-nrow(xclr),]) - c(xclr[nrow(xclr),])))) names(d) <- mi wA <- which(d <= quantile(d, k/length(d))) ### new change for zeros paper: # print(is.numeric(xclr)) #if(is.numeric(xclr)) xclr <- matrix(xclr, ncol=length(xclr)) #d <- rowSums(t(abs(t(xclr[-nrow(xclr),,drop=FALSE]) - c(xclr[nrow(xclr),,drop=FALSE]))), na.rm=TRUE) #names(d) <- mi #print(d) #wA <- which(d <= quantile(d, k/length(d))) ### end new change w <- as.numeric(names(wA)) list(knn=w, m1=m1) } } if(metric=="Aitchison" & primitive==TRUE & das == TRUE){ findknn <- function(x, i, j){ m1 <- which(!is.na(x[i,])) mi <- which(c(!is.na(x[,j,drop=FALSE])) ) da <- function(x,y){ d <- 0 p <- length(x) for(i in 1:(p-1)){ for(j in (i+1):p){ d <- d + (log(x[i]/x[j]) - log(y[i]/y[j]))^2 } } d=d/p d } ref <- x[i,m1] d <- apply(x[mi, m1, drop=FALSE], 1, function(z){da(x=z, y=ref)}) names(d) <- mi wA <- which(d <= quantile(d, k/length(d))) w <- as.numeric(names(wA)) list(knn=w, m1=m1) } } if(metric=="Aitchison" & primitive==FALSE & das == TRUE){ findknn <- function(x, i, j){ m1 <- which(!is.na(x[i,])) mi <- which(c(!is.na(x[,j,drop=FALSE])) & c(!is.na(rowSums(x[,m1,drop=FALSE])))) da <- function(x,y){ d <- 0 p <- length(x) for(i in 1:(p-1)){ for(j in (i+1):p){ d <- d + (log(x[i]/x[j]) - log(y[i]/y[j]))^2 } } d=d/p d } ref <- x[i,m1] d <- apply(x[mi, m1, drop=FALSE], 1, function(z){da(x=z, y=ref)}) names(d) <- mi wA <- which(d <= quantile(d, k/length(d))) w <- as.numeric(names(wA)) list(knn=w, m1=m1) } } if(metric=="Aitchison" & primitive==TRUE){ findknn <- function(x, i, j){ m1 <- which(!is.na(x[i,])) a <- c(!is.na(x[,j,drop=FALSE])) b <- ifelse(rowSums(is.na(x[,-j])) == 0, TRUE, FALSE) mi <- which(a & b) xclr <- cenLR(rbind(x[mi, m1, drop=FALSE], x[i, m1]))$x.clr d <- rowSums(t(abs(t(xclr[-nrow(xclr),]) - c(xclr[nrow(xclr),]))), na.rm=TRUE) #d[d == 0] <- NA ## dirty names(d) <- mi wA <- which(d <= quantile(d, k/length(d), na.rm=TRUE)) w <- as.numeric(names(wA)) list(knn=w, m1=m1) } } if(metric=="Euclidean"){ imp <- function(x, i, j){ ## workhorse: do the imputation knn <- findknn(x, i, j) get(agg)(x[knn,j], na.rm=TRUE) #median } } if(metric=="Aitchison" & normknn == FALSE){ imp <- function(x, i, j){ ## workhorse: do the imputation knn <- findknn(x, i, j) ### median, normalization: #fac <- knn$xclriSum/knn$medKnnSum #median(x[knn$knn, j])#*fac # fac anders berechnen medSum <- get(adj)(apply(x[knn$knn, knn$m1, drop=FALSE], 2, function(x,...){get(agg)(x, na.rm=TRUE)} ), na.rm=TRUE) xSum <- get(adj)(x[i, knn$m1, drop=FALSE], na.rm=TRUE) fac <- xSum/medSum get(agg)(x[knn$knn, j], na.rm=TRUE)*fac #median } } if(metric=="Aitchison" & normknn == TRUE){ imp <- function(x, i, j){ knn <- findknn(x, i, j) ##normalization: #xSum <- sum(x[i, knn$m1, drop=FALSE], na.rm=TRUE) xSum <- get(adj)(x[i, knn$m1, drop=FALSE], na.rm=TRUE) if( length(knn$knn) == 1){ #knnSum <- sum(x[knn$knn, knn$m1], na.rm=TRUE) knnSum <- get(adj)(x[knn$knn, knn$m1], na.rm=TRUE) } else{ #knnSum <- rowSums(x[knn$knn, knn$m1, drop=FALSE], na.rm=TRUE) knnSum <- apply(x[knn$knn, knn$m1, drop=FALSE], 1, function(x,...){get(agg)(x, na.rm=TRUE)} ) } fac <- xSum/knnSum get(agg)(x[knn$knn, j] * fac, na.rm=TRUE) ## statt Mean das geom. Mittel? } } #if(metric == "Aitchison" & ratios == TRUE){ # knn <- findknn(x, i, j) # p <- ncol(x) # y <- apply(apply(x[knn$knn, , drop = FALSE], 1, function(x){x[2:p]/x[1]}),1,median) # x[i,j] <- 1 # x[i,-j] <- y #} #if(metric=="Aitchison" & unweighted == TRUE){ # imp <- function(x, i, j){ # knn <- findknn(x, i, j) # x[i,j] <- get(agg)(x[knn$knn, j]) # } #} for(i in 1:N){ for(j in 1:P){ if(is.na(x[i,j])){ x[i,j] <- imp(xmiss, i, j) } } } w <- is.na(xOrig) colnames(x) <- colnames(xOrig) res <- list(xOrig=xOrig, xImp=x, criteria=NULL, iter=NULL, w=length(which(w)), wind=w, metric=metric) class(res) <- "imp" res }