#' EM-based replacement of rounded zeros in compositional data #' #' Parametric replacement of rounded zeros for compositional data using #' classical and robust methods based on ilr coordinates with a special #' choice of balances. #' #' Statistical analysis of compositional data including zeros runs into #' problems, because log-ratios cannot be applied. Usually, rounded zeros are #' considerer as missing not at random missing values. #' #' The algorithm iteratively imputes parts with rounded zeros whereas in each #' step (1) compositional data are expressed in pivot coordinates (2) tobit regression is #' applied (3) the rounded zeros are replaced by the expected values (4) the #' corresponding inverse ilr mapping is applied. After all parts are #' imputed, the algorithm starts again until the imputations do not change. #' #' @aliases imputeBDLs print.replaced checkData adjustImputed #' @param x data.frame or matrix #' @param maxit maximum number of iterations #' @param eps convergency criteria #' @param method either "lm", "lmrob" or "pls" #' @param dl Detection limit for each variable. zero for variables with #' variables that have no detection limit problems. #' @param variation, if TRUE those predictors are chosen in each step, who's variation is lowest to the predictor. #' @param nPred, if determined and variation equals TRUE, it fixes the number of predictors #' @param nComp if determined, it fixes the number of pls components. If #' \dQuote{boot}, the number of pls components are estimated using a #' bootstraped cross validation approach. #' @param bruteforce sets imputed values above the detection limit to the #' detection limit. Replacement above the detection limit are only exeptionally #' occur due to numerical instabilities. The default is FALSE! #' @param noisemethod adding noise to imputed values. Experimental #' @param noise TRUE to activate noise (experimental) #' @param R number of bootstrap samples for the determination of pls #' components. Only important for method \dQuote{pls}. #' @param correction normal or density #' @param verbose additional print output during calculations. #' @param test an internal test situation (this parameter will be deleted soon) #' @importFrom cvTools cvFit #' @importFrom zCompositions multRepl #' @importFrom fpc pamk #' @import pls #' @return \item{x }{imputed data} \item{criteria }{change between last and #' second last iteration} \item{iter }{number of iterations} \item{maxit #' }{maximum number of iterations} \item{wind}{index of zeros} #' \item{nComp}{number of components for method pls} \item{method}{chosen #' method} #' @author Matthias Templ, method subPLS from Jiajia Chen #' @references Templ, M., Hron, K., Filzmoser, P., Gardlo, A. (2016). #' Imputation of rounded zeros for high-dimensional compositional data. #' \emph{Chemometrics and Intelligent Laboratory Systems}, 155, 183-190. #' #' Chen, J., Zhang, X., Hron, K., Templ, M., Li, S. (2018). #' Regression imputation with Q-mode clustering for rounded zero replacement in high-dimensional compositional data. #' \emph{Journal of Applied Statistics}, 45 (11), 2067-2080. #' @seealso \code{\link{imputeBDLs}} #' @keywords manip multivariate #' @export #' @importFrom MASS rlm #' @examples #' #' p <- 10 #' n <- 50 #' k <- 2 #' T <- matrix(rnorm(n*k), ncol=k) #' B <- matrix(runif(p*k,-1,1),ncol=k) #' X <- T %*% t(B) #' E <- matrix(rnorm(n*p, 0,0.1), ncol=p) #' XE <- X + E #' data <- data.frame(pivotCoordInv(XE)) #' col <- ncol(data) #' row <- nrow(data) #' DL <- matrix(rep(0),ncol=col,nrow=1) #' for(j in seq(1,col,2)) #' {DL[j] <- quantile(data[,j],probs=0.06,na.rm=FALSE)} #' #' for(j in 1:col){ #' data[data[,j] dl[i] check[i] <- any(critvals) if(check[i]){ x[which(x[indexNA[,i],i] > dl[i]),i] <- dl[i] } } if(any(check) & verbose){ message(paste(sum(critvals), "/", sum(w), "replaced values has been corrected")) } x } ## check if data are fine checkData(x, dl) ## some specific checks stopifnot((method %in% c("lm", "MM", "lmrob", "pls", "subPLS"))) if(method=="pls" & ncol(x)<5) stop("too less variables/parts for method pls") if(!(correction %in% c("normal","density"))){ stop("correction method must be normal or density") } if(method == "pls" & variation){ stop("if variation is TRUE then pls is not supported.") } ## how to deal with user input on nComp if(is.null(nComp)){ pre <- FALSE nC <- NULL } else if(nComp=="boot"){ nC <- integer(ncol(x)) pre <- TRUE } else if(length(nComp) == ncol(x)){ nC <- nComp pre <- FALSE } else { pre <- FALSE } ## store some important values n <- nrow(x) d <- ncol(x) rs <- rowSums(x, na.rm = TRUE) if(method == "subPLS"){ w <- x == 0 indexFinalCheck <- x == 0 mulzero <- zCompositions::multRepl(x,label=0,dl=dl) dd <- as.dist(robCompositions::variation(mulzero)) g <- fpc::pamk(dd)$nc pos <- as.matrix(cutree(hclust(dd, method="ward.D"),g)) indNA <- apply(x,2,function(x){any(x==0)}) gz <- intersect(order(-table(pos[indNA])),which(table(pos)>1)) fun.PLS <- function(x,data,dl,pos,b,R) { data_b <- data[,pos==b] data_nb <- data[,pos!=b] col1 <- ncol(data_b) row <- nrow(data_b) u <- cbind(cenLR(data_b)$x,cenLR(data_nb)$x) u1 <- u[,1:col1] n <- bootnComp(u[,-(1:col1),drop=FALSE],as.matrix(u1),R,plotting=FALSE)$res # form <- paste(paste(colnames(u1), collapse = "+"), "~", paste(colnames(u), collapse = "+")) # form <- as.formula(form) # pls <- mvr(form, data=cbind(u1, u), method="simpls", # ncomp=n) pls <- mvr(as.matrix(u1)~ as.matrix(u[,-(1:col1)]),ncomp=n,method="simpls") # pls <- mvr(u1~ u[,-(1:col1)],ncomp=n,method="simpls") mean <- matrix(predict(pls,ncomp=n),ncol=col1) sigma <- cov(u1-mean) sig_b <- x[,pos==b]==0 cz <- which(colSums(sig_b)!=0) rz <- which(rowSums(sig_b)!=0) lj <- dl[pos==b] for(j in cz) { linjie <- as.matrix(cenLR(cbind(rep(lj[j],row),data_b[,-j,drop=FALSE]))$x[,1,drop=FALSE]) u1[sig_b[,j],j] <- mean[sig_b[,j],j]-sqrt(sigma[j,j])*(dnorm((linjie[sig_b[,j]]-mean[sig_b[,j],j])/sqrt(sigma[j,j]))/pnorm((linjie[sig_b[,j]]-mean[sig_b[,j],j])/sqrt(sigma[j,j]))) } for(i in rz) {if(rowSums(sig_b)[i]= eps){ xold <- impute for (m in 1:length(gz)) {impute <- fun.PLS(x,impute,dl,pos,gz[m],R)} it <- it+1 criteria <- sum(((xold-impute)/impute)^2, na.rm = TRUE) } impute <- checkDL(impute, dl, indexFinalCheck) res <- list(x=impute, criteria=criteria, iter=it, maxit=maxit, wind=w, nComp=nC, nPred=nPred, variation=variation, method=method, dl=dl) class(res) <- "replaced" return(res) } # if(method == "pls_clust"){ # indexFinalCheck <- x == 0 # mulzero <- zCompositions::multRepl(x,label=0,dl=dl) # dd <- as.dist(robCompositions::variation(mulzero)) # g <- fpc::pamk(dd)$nc # pos <- as.matrix(cutree(hclust(dd, method="ward.D"),g)) # indNA <- apply(x,2,function(x){any(x==0)}) # gz <- intersect(order(-table(pos[indNA])),which(table(pos)>1)) # # fun.PLS <- function(x,data,dl,pos,b,R) # { # data_b <- data[,pos==b] # data_nb <- data[,pos!=b] # col1 <- ncol(data_b) # row <- nrow(data_b) # u <- cbind(cenLR(data_b)$x.clr, cenLR(data_nb)$x.clr) # u1 <- u[,1:col1] # n <- bootnComp(u[,-(1:col1),drop=FALSE],u1,R,plotting=FALSE)$res # pls <- mvr(u1~ u[,-(1:col1)],ncomp=n,method="simpls") # mean <- matrix(predict(pls,ncomp=n),ncol=col1) # sigma <- cov(u1-mean) # sig_b <- x[,pos==b]==0 # cz <- which(colSums(sig_b)!=0) # rz <- which(rowSums(sig_b)!=0) # lj <- dl[pos==b] # for(j in cz) # { # linjie <- clr(cbind(rep(lj[j],row),data_b[,-j,drop=FALSE]))[,1,drop=FALSE] # u1[sig_b[,j],j] <- mean[sig_b[,j],j]-sqrt(sigma[j,j])*(dnorm((linjie[sig_b[,j]]-mean[sig_b[,j],j])/sqrt(sigma[j,j]))/pnorm((linjie[sig_b[,j]]-mean[sig_b[,j],j])/sqrt(sigma[j,j]))) # } # for(i in rz) # {if(rowSums(sig_b)[i]= eps){ # xold <- impute # for (m in 1:length(gz)) # {impute <- fun.PLS(x,impute,dl,pos,gz[m],R)} # it <- it+1 # criteria <- sum(((xold-impute)/impute)^2, na.rm = TRUE) # } # impute <- checkDL(impute, dl, indexFinalCheck) # # res <- list(x=impute, criteria=criteria, iter=it, # maxit=maxit, wind=w, nComp=nC, nPred=nPred, # variation=variation, # method=method, dl=dl) # class(res) <- "replaced" # return(res) # } ## set zeros to NA for easier handling x[x == 0] <- NA x[x < 0] <- NA indexFinalCheck <- is.na(x) ## sort variables of x based on ## decreasing number of zeros in the variables cn <- colnames(x) wcol <- - abs(apply(x, 2, function(x) sum(is.na(x)))) o <- order(wcol) x <- x[,o] if(verbose) cat("number of variables with zeros:\n", sum(wcol != 0)) ## --> now work in revised order of variables ## dl must also be in correct order dlordered <- dl[o] ################# ## index of missings / non-missings w <- is.na(x) wn <- !is.na(x) xcheck <- x w2 <- is.na(x) ## initialisation indNA <- apply(x, 2, function(x){any(is.na(x))}) # print(indNA) for(i in 1:length(dl)){ ind <- is.na(x[,i]) # if(length(ind) > 0) x[ind,i] <- dl[i]*runif(sum(ind),1/3,2/3) if(length(ind) > 0) x[ind,i] <- dlordered[i] *2/3 } xOrig <- x ## nPred if variation == TRUE and cv for number of predictors ## evaluate the vector containing the amount of predictors if(!is.null(nPred) & length(nPred) == 1){ nPred <- rep(nPred, ncol(x)) } if(!is.null(nPred) & length(nPred) > 1){ stop("nPred must be NULL or a vector of length 1.") } if(is.null(nPred) & variation){ ptmcv <- proc.time() if(verbose) cat("\n cross validation to estimate number of predictors\n") ii <- 1 if(verbose) pb <- txtProgressBar(min = 0, max = sum(indNA), style = 3) nPred <- numeric(nrow(x)) rtmspe <- NULL for(i in which(indNA)){ xneworder <- cbind(x[, i, drop=FALSE], x[, -i, drop=FALSE]) rv <- variation(x, method = "Pairwise")[1,] cve <- numeric() for(np in seq(3, min(c(27,ncol(x),floor(nrow(x)/2))), 3)){ s <- sort(rv)[np] cols <- which(rv <= s)[1:np] xn <- xneworder[, cols] if(test) xilr <- data.frame(isomLRp(xn)) else xilr <- data.frame(pivotCoord(xn)) colnames(xilr)[1] <- "Y" call <- call(method, formula = Y ~ .) # perform cross-validation cve[np] <- suppressWarnings(cvFit(call, data = xilr, y = xilr$Y, cost = cvTools::rtmspe, K = 5, R = 1, costArgs = list(trim = 0.1), seed = 1234)$cv) } nPred[i] <- which.min(cve) ## update progress bar if(verbose) setTxtProgressBar(pb, ii); ii <- ii + 1 } if(verbose) close(pb) ptmcv <- proc.time() - ptmcv } ############################### ### start the iteration ### ############################### if(verbose) cat("\n start the iteration:") it <- 1; criteria <- 99999999 while(it <= maxit & criteria >= eps){ if(verbose) cat("\n iteration", it, "; criteria =", criteria) xold <- x for(i in which(indNA)){ if(verbose) cat("\n replacement on (sorted) part", i) ## sort data columns first ## i'th positions must be first xneworder <- cbind(x[, i, drop=FALSE], x[, -i, drop=FALSE]) ## if based on variation matrix: if(variation){ orig <- xneworder rv <- variation(x, method = "Pairwise")[1,] s <- sort(rv)[nPred[i]] cols <- which(rv <= s)[1:nPred[i]] xneworder <- xneworder[, cols] } ## factors for preserving abs values: # fac <- multis(xneworder) ## detection limit in ilr-space forphi <- cbind(rep(dlordered[i], n), xneworder[,-1,drop=FALSE]) if(any(is.na(forphi))) break() if(test) phi <- isomLRp(forphi)[,1] else phi <- pivotCoord(forphi)[,1] # part <- cbind(x[,i,drop=FALSE], x[,-i,drop=FALSE]) xneworder[xneworder < 2*.Machine$double.eps] <- 2*.Machine$double.eps if(test) xilr <- data.frame(isomLRp(xneworder)) else xilr <- data.frame(pivotCoord(xneworder)) # c1 <- colnames(xilr)[1] # colnames(xilr)[1] <- "V1" response <- as.matrix(xilr[, 1, drop=FALSE]) predictors <- as.matrix(xilr[, -1, drop=FALSE]) if(method=="lm"){ reg1 <- lm(response ~ predictors) yhat <- predict(reg1, newdata=data.frame(predictors)) } else if(method=="MM" | method=="lmrob"){ reg1 <- MASS::rlm(response ~ predictors, method="MM",maxit = 100)#rlm(V1 ~ ., data=xilr2, method="MM",maxit = 100) yhat <- predict(reg1, newdata=data.frame(predictors)) } else if(method=="pls"){ if(it == 1 & pre){ ## evaluate ncomp. nC[i] <- bootnComp(xilr[, 2:ncol(xilr), drop=FALSE], y=xilr[, 1], R, plotting=FALSE)$res #$res2 } if(verbose) cat(" ; ncomp:", nC[i]) reg1 <- mvr(as.matrix(response) ~ as.matrix(predictors), ncomp=nC[i], method="simpls") yhat <- predict(reg1, newdata=data.frame(predictors), ncomp=nC[i]) } # s <- sqrt(sum(reg1$res^2)/abs(nrow(xilr)-ncol(xilr))) ## quick and dirty: abs() s <- sqrt(sum(reg1$res^2)/nrow(xilr)) yhat <- as.numeric(yhat) ex <- (phi - yhat)/s if(correction=="normal"){ yhat2sel <- ifelse(dnorm(ex[w[, i]]) > .Machine$double.eps, yhat[w[, i]] - s*dnorm(ex[w[, i]])/pnorm(ex[w[, i]]), yhat[w[, i]]) } else if(correction=="density"){ stop("correction equals density is no longer be supported") # den <- density(ex[w[,i]]) # distr <- sROC::kCDF(ex[w,i]) } if(any(is.na(yhat)) || any(yhat=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # check if we are under the DL: if(any(yhat2sel >= phi[w[, i]])){ yhat2sel <- ifelse(yhat2sel > phi[w[, i]], phi[w[, i]], yhat2sel) } xilr[w[, i], 1] <- yhat2sel if(test) xinv <- isomLRInvp(xilr) else xinv <- pivotCoordInv(xilr) ## if variation: if(variation == TRUE){ xneworder <- adjustImputed(xinv, xneworder, w2[, cols]) orig[, cols] <- xneworder #* fac xinv <- orig } ## reordering of xOrig if(i %in% 2:(d-1)){ xinv <- cbind(xinv[,2:i], xinv[,c(1,(i+1):d)]) } if(i == d){ xinv <- cbind(xinv[,2:d], xinv[,1]) } if(!variation){ x <- adjustImputed(xinv, xOrig, w2) } else { x <- xinv } # x <- adjust3(xinv, xOrig, w2) # ## quick and dirty: # x[!w] <- xOrig[!w] } it <- it + 1 criteria <- sum( ((xold - x)/x)^2, na.rm=TRUE) ## DIRTY: (na.rm=TRUE) if(verbose & criteria != 0) cat("\n iteration", it, "; criteria =", criteria) } #### add random error ### if(noise){ for(i in which(indNA)){ if(verbose) cat("\n add noise on variable", i) # add error terms inderr <- w[,i] if(noisemethod == "residuals") { error <- sample(residuals( reg1 )[inderr], size=wcol[i], replace=TRUE) reg1$res[inderr] <- error } else { mu <- median(residuals( reg1 )[inderr]) sigma <- mad(residuals( reg1 )[inderr]) error <- rnorm(wcol[i], mean=mu, sd=sigma) reg1$res[inderr] <- error } # return realizations yhat[inderr] <- yhat[inderr] + error s <- sqrt(sum(reg1$res^2)/nrow(xilr)) ## quick and dirty: abs() ex <- (phi - yhat)/s yhat2sel <- ifelse(dnorm(ex[w[, i]]) > .Machine$double.eps, yhat[w[, i]] - s*dnorm(ex[w[, i]])/pnorm(ex[w[, i]]), yhat[w[, i]]) if(any(is.na(yhat)) || any(yhat=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # check if we are under the DL: if(any(yhat2sel >= phi[w[, i]])){ yhat2sel <- ifelse(yhat2sel > phi[w[, i]], phi[w[, i]], yhat2sel) } xilr[w[, i], 1] <- yhat2sel if(test) xinv <- isomLRInvp(xilr) else xinv <- pivotCoordInv(xilr) ## reordering of xOrig if(i %in% 2:(d-1)){ xinv <- cbind(xinv[,2:i], xinv[,c(1,(i+1):d)]) } if(i == d){ xinv <- cbind(xinv[,2:d], xinv[,1]) } # x <- adjust2(xinv, xOrig, w) # ## quick and dirty: # x[!w] <- xOrig[!w] } } ### end add random error ### # x <- adjust3(x, xOrig, w) # x[!w] <- xOrig[!w] x <- x[,order(o)] ## checked: reordering is OK! colnames(x) <- cn if(verbose){ message(paste(sum(w)), "values below detection limit has been imputed \n below the corresponding detection limits") } x <- checkDL(x, dl, indexFinalCheck) res <- list(x=x, criteria=criteria, iter=it, maxit=maxit, wind=w, nComp=nC, nPred=nPred, variation=variation, method=method, dl=dl) class(res) <- "replaced" return(res) } #' @rdname imputeBDLs #' @export #' @param xImp imputed data set #' @param xOrig original data set #' @param wind index matrix of rounded zeros adjustImputed <- function(xImp, xOrig, wind){ ## aim: ## (1) ratios must be preserved ## (2) do not change original values ## (3) adapt imputations xneu <- xImp s1 <- rowSums(xOrig, na.rm = TRUE) ## per row: consider rowsums of imputed data sumPrevious <- sumAfter <- numeric(nrow(xImp)) for (i in 1:nrow(xImp)) { if(any(wind[i, ]) & !all(wind[i,])){ sumPrevious[i] <- sum(xOrig[i, !wind[i, ]]) sumAfter[i] <- sum(xImp[i, !wind[i,]]) } else{ sumPrevious[i] <- sumAfter[i] <- 1 } } # how much is rowsum increased by imputation: fac <- sumPrevious/sumAfter # # decrese rowsums of orig. # s1[i] <- s1[i]/fac # } ## for non-zeros overwrite them: xneu[!wind] <- xOrig[!wind] ## adjust zeros: for(i in 1:nrow(xImp)){ if(any(wind[i,])){ xneu[i,wind[i,]] <- fac[i]*xneu[i,wind[i,]] } } return(xneu) } #' @rdname imputeBDLs #' @export `checkData` <- function(x, dl){ if(any(is.na(x))) stop("your data includes missing values.\n Use impKNNa() or impCoda() to impute them first.") if( is.vector(x) ) stop("x must be a matrix or data frame") ## check if only numeric variables are in x: cl <- lapply(x, class) if(!all(cl %in% "numeric")) stop("some of your variables are not of class numeric.") if( length(dl) < ncol(x)) stop(paste("dl has to be a vector of ", ncol(x))) if(any(is.na(x))) stop("missing values are not allowed. \n Use impKNNa or impCoda to impute them first.") # pre <- TRUE # if(length(nComp) != ncol(x) & nComp!="boot") stop("nComp must be NULL, boot or of length ncol(x)") # } else if(nComp == "boot"){# # pre <- TRUE # } else { # pre <- FALSE # } ################# ## zeros to NA: # check if values are in (0, dl[i]): check <- logical(ncol(x)) for(i in 1:ncol(x)){ # check[i] <- any(x[,i] < dl[i] & x[,i] != 0) x[x[,i] < dl[i],i] <- 0 } # if(any(check)){warning("values below detection limit have been set to zero and will be imputed")} check2 <- any(x < 0) if(check2){warning("values below 0 set have been set to zero and will be imputed")} x[x == 0] <- NA x[x < 0] <- NA indexFinalCheck <- is.na(x) ## check if rows consists of only zeros: checkRows <- unlist(apply(x, 1, function(x) all(is.na(x)))) if(any(checkRows)){ w <- which(checkRows) cat("\n--------\n") message("Rows with only zeros are not allowed") message("Remove this rows before running the algorithm") cat("\n--------\n") stop(paste("Following rows with only zeros:", w)) } ## check if cols consists of only zeros: checkCols <- unlist(apply(x, 2, function(x) all(is.na(x)))) if(any(checkCols)){ w <- which(checkCols) cat("\n--------\n") message("Cols with only zeros are not allowed") message("Remove this columns before running the algorithm") cat("\n--------\n") stop(paste("\n Following cols with only zeros:", colnames(x)[w])) } indNA <- apply(x, 2, function(x){any(is.na(x))}) ## check if for any variable with zeros, ## the detection limit should be larger than 0: if(any(dl[indNA]==0)){ w <- which(dl[indNA]==0) invalidCol <- colnames(x)[w] for(i in 1:length(invalidCol)){ cat("-------\n") cat(paste("Error: variable/part", invalidCol[i], "has detection limit 0 but includes zeros")) cat("\n-------\n") } stop(paste("Set detection limits larger than 0 for variables/parts \n including zeros")) } } #' @rdname imputeBDLs #' @method print replaced #' @export #' @param ... further arguments passed through the print function print.replaced <- function(x, ...){ message(paste("\n", sum(x$w), "values below detection limit were imputed \n below their corresponding detection limits.\n")) } #' Bootstrap to find optimal number of components #' #' Combined bootstrap and cross validation procedure to find optimal number of #' PLS components #' #' Heavily used internally in function impRZilr. #' #' @param X predictors as a matrix #' @param y response #' @param R number of bootstrap replicates #' @param plotting if TRUE, a diagnostic plot is drawn for each bootstrap #' replicate #' @return Including other information in a list, the optimal number of #' components #' @author Matthias Templ #' @seealso \code{\link{impRZilr}} #' @keywords manip #' @export #' @examples #' #' ## we refer to impRZilr() #' bootnComp <- function(X, y, R=99, plotting=FALSE){ ind <- 1:nrow(X) d <- matrix(, ncol=R, nrow=nrow(X))#nrow(X)) nc <- integer(R) for(i in 1:R){ bootind <- sample(ind) # XX <- X # yy <- y if(is.null(ncol(y))){ ds <- cbind(X[bootind,], as.numeric(y[bootind])) colnames(ds)[ncol(ds)] <- "V1" res1 <- mvr(V1~., data=data.frame(ds), method="simpls", validation="CV") } else { ds <- cbind(X[bootind,], as.matrix(y[bootind,])) form <- paste(paste(colnames(ds[, (ncol(X)+1):ncol(ds)]), collapse = "+"), "~", paste(colnames(ds[, 1:ncol(X)]), collapse = "+")) form <- as.formula(form) res1 <- mvr(form, data=data.frame(ds), method="simpls", validation="CV") } d[1:res1$ncomp, i] <- res1$validation$PRESS nc[i] <- which.min(res1$validation$PRESS) # d[1:reg1$ncomp,i] <- as.numeric(apply(reg1$validation$pred, 3, # function(x) sum(((y - x)^2)) ) ) } d <- na.omit(d) sdev <- apply(d, 1, sd, na.rm=TRUE) means <- apply(d, 1, mean, na.rm=TRUE) mi <- which.min(means) r <- round(ncol(X)/20) mi2 <- which.min(means[r:length(means)])+r-1 w <- means < min(means) + sdev[mi] threshold <- min(means) + sdev[mi] sdev <- sdev means <- means mi <- mi means2 <- means means2[!w] <- 999999999999999 res <- which.min(means2) mi3 <- which.max(w) # minsd <- means - sdev > means[mi] # check <- means # check[!minsd] <- 99999999 if(plotting) plot(means, type="l") # res <- which.min(check) list(res3=mi3, res2=mi2, res=res, means=means) } bootnCompHD <- function(X,y, R=99, plotting=FALSE){ ind <- 1:nrow(X) d <- matrix(, ncol=R, nrow=nrow(X))#nrow(X)) for(i in 1:R){ bootind <- sample(ind) XX <- X yy <- y ds <- cbind(X[bootind,], as.numeric(y[bootind])) colnames(ds)[ncol(ds)] <- "V1" reg1 <- mvr(V1~., data=data.frame(ds), method="simpls", validation="CV") d[1:reg1$ncomp,i] <- as.numeric(apply(reg1$validation$pred, 3, function(x) sum(((y - x)^2)) ) ) } d <- na.omit(d) sdev <- apply(d, 1, mad, na.rm=TRUE) means <- apply(d, 1, median, na.rm=TRUE) mi <- which.min(means) if(plotting) plot(means, type="l", col="blue", ylab="squared total prediction error", xlab="number of components") themean <- mean(means) thesd <- sd(means) abovethreshold <- themean - sdev > means check <- means check[!abovethreshold] <- 9999999999 res <- which.min(check) # minsd <- means - sdev > means[mi] # check <- means # check[!minsd] <- 99999999 # res <- which.min(check) if(plotting){ abline(v=res, lwd=3) abline(h=mi, col="red") # abline(h=means-sdev, lty=3) } list(res=res, mean=means) } #checkIfValuesUnderDL <- function(x, dl, wind){ # check <- logical(ncol(x)) # for(i in 1:ncol(x)){ # check[i] <- any(x[,i] > dl[i]) # } # return(check) #} ## test adjust2: adjust2 <- function (xImp, xOrig, wind){ ## aim: do not change original values ## adapt imputations xneu = xImp s1 <- rowSums(xOrig, na.rm = TRUE) ## per row: consider rowsums of imputed data ## example: ## wind: F F T F F ## ganz orig: 3 5 NA 8 10 (sum=26) ## orig(init): 3 5 6.5 8 10 (sum=32.5) ## imp: 3 5 7 8 10 (sum=33) ## s: 26 ## s2: 7 ## fac: 26/(26+7) ## s1: 32.5/(26/(26+7)) = 41.25 for (i in 1:nrow(xImp)) { if(any(wind[i,])) s <- sum(xImp[i, !wind[i, ]]) else s <- 1 if(any(wind[i,])) s2 <- sum(xImp[i, wind[i, ]]) else s2 <- 0 # how much is rowsum increased by imputation: fac <- s/(s + s2) # decrese rowsums of orig. s1[i] <- s1[i]/fac } ## impS: 41.25/33 impS <- s1/rowSums(xImp) for (i in 1:ncol(xImp)) { xneu[, i] <- xImp[, i] * impS } xImp <- xneu return(xImp) } adjust3 <- function(xImp, xOrig, wind){ xOrigSum <- rowSums(xOrig) # sum imputed without former zeros: xImpSum <- numeric(ncol(xOrig)) for(i in 1:nrow(xOrig)){ xImpSum[i] <- sum(xImp[i,!wind[i,]]) fac <- xOrigSum[i] / xImpSum[i] xImp[i,wind[i,]] <- xImp[i,wind[i,]] * fac } xImp[!wind] <- xOrig[!wind] xImp } # ### switch function to automatically select methods #getM <- function(xReg, ndata, type, index,mixedTF,mixedConstant,factors,step,robust,noise,noise.factor=1,force=FALSE, robMethod="MM") { # switch(type, # numeric = useLM(xReg, ndata, index, mixedTF,mixedConstant,factors,step,robust,noise,noise.factor,force,robMethod), # factor = useMN(xReg, ndata, index,factors,step,robust), # bin = useB(xReg, ndata, index,factors,step,robust), # count = useGLMcount(xReg, ndata, index, factors, step, robust) # ) #} # # # #f <- function(){ ## x <- constSum(x) ## dl <- dl/sum(dl) # ## initialisation: ## x[is.na(x)] <- 0.001 # for(i in 1:length(dl)){ # ind <- is.na(x[,i]) # #PF# if(length(ind) > 0) x[ind,i] <- dl[i]/3*2 # if(length(ind) > 0) x[ind,i] <- dl[i]*runif(sum(ind),1/3,2/3) # } # ## x <- constSum(x) # # ## parameters: # it=0 # criteria <- 10000000 # error <- rep(0, ncol(x)) # nComp <- normalerror <- numeric(ncol(x)) # if(noisemethod=="residuals") residuals <- matrix(,ncol=ncol(x), nrow=nrow(x)) # nComp[nComp==0] <- NA # criteria <- 1e+07 # sigma <- mu <- rep(0, ncol(x)) # # ########################################### # ### start the iteration # if(verbose) cat("\n start the iteration:") # while(it <= maxit & criteria >= eps){ # xold <- x # it=it+1 # for(i in which(indNA)){ # if(verbose) cat("\n column", i) # ## change the first column with that one with the highest amount of NAs # ## in the step # if(wcol[indM[i]] > 0){ # xNA=x[,indM[i]] # x1=x[,1] # x[,1]=xNA # x[,indM[i]]=x1 # # if(method == "roundedZero"){ # xilr <- pivotCoord(x) # phi <- pivotCoord(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]))[,1] # TODO: phi auserhalb der Schleife! # ## --> x hat sich geaendert aber dl nicht. # xilr2 <- data.frame(xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # reg1 = lm(V1 ~ ., data=xilr2) # yhat2 <- predict(reg1, new.data=xilr2[,-i]) # if(bruteforce){ # xilr2[w[, indM[i]], 1] <- ifelse(yhat2[w[, indM[i]]] <= phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2[w[, indM[i]]] ) # } else { # s <- sqrt(sum(reg1$res^2)/(nrow(xilr2)-ncol(xilr2))) # ex <- (phi - yhat2)/s # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } # xilr2[w[, indM[i]], 1] <- yhat2sel # } # } # if (method == "pls") { # xilr = pivotCoord(x) # ttt <- cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]) # phi <- pivotCoord(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]))[,1] ## if(verbose) cat("\n phi", phi, "in iteration", it) # xilr2 <- data.frame(xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # if(it == 1){ ## evaluate ncomp. # test <- xilr2[,!(colnames(xilr2) == "V1")] # testy <- xilr2[,"V1"] # X <- xilr2[,!(colnames(xilr2) == "V1")] # y <- xilr2[,"V1"] # nComp[i] <- bootnComp(xilr2[,!(colnames(xilr2) == "V1")],y=xilr2[,"V1"], R) # } # reg1 <- mvr(V1~.,ncomp=nComp[i], data = xilr2, method="simpls") # yhat2 <- as.numeric(predict(reg1, new.data=xilr2[,-i], ncomp=nComp[i]) ) ## if(noisemethod=="residuals") residuals[,i] <- reg1$residuals[,,nComp[i]] ## if(noisemethod=="normal"){ ## mu[i] <- median(residuals(reg1)[,,nComp[i]]) ## sigma[i] <- mad(residuals(reg1)[,,nComp[i]]) * noiseeffect ## } # colnames(xilr2)[1] <- c1 ## fit=reg1$fitted.values[,,nComp[i]] # s <- sqrt(sum(reg1$residuals[,,nComp[i]]^2)/abs(nrow(xilr2)-ncol(xilr2))) # ex <- as.numeric((phi - yhat2)/s ) # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of NaN or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } ## if(verbose) cat("\n yhat2sel at iteration", it, "on column", i ,"is", yhat2sel) # xilr2[w[, indM[i]], 1] <- yhat2sel # # # } # if(method == "roundedZeroRobust"){ # xilr <- pivotCoord(x) # x[x < .Machine$double.eps] <- 0.00000000001 ## TODO: better solution # phi <- pivotCoord(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]))[,1] # TODO: phi auserhalb der Schleife! # xilr2 <- data.frame(xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # reg1 = rlm(V1 ~ ., data=xilr2, method="MM",maxit = 100) ## reg1 = lmrob(V1 ~ ., data=xilr2) # yhat2 <- predict(reg1, new.data=xilr2[,-i]) # if(bruteforce){ # xilr2[w[, indM[i]], 1] <- ifelse(yhat2[w[, indM[i]]] <= phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2[w[, indM[i]]] ) # } else { # s <- reg1$s # #PF# s <- IQR(reg1$resid)/1.349 # ex <- (phi - yhat2)/s # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } # xilr2[w[, indM[i]], 1] <- yhat2sel # } # } # # xilr <- xilr2 # x <- pivotCoordInv(xilr) # # ## return the order of columns: # xNA=x[,1] # x1=x[,indM[i]] # x[,1]=x1 # x[,indM[i]]=xNA # x <- adjust2(x, xcheck, w) # print(x[1:2,1:2]) # if(verbose) cat("\n iteration", it, "column", i, "value", x[2,1]) # } # # # } # # # criteria <- sum( ((xold - x)/x)^2, na.rm=TRUE) ## DIRTY: (na.rm=TRUE) # colnames(x) <- colnames(xcheck) # # } # # res <- list(xOrig=xcheck, xImp=x[,order(o)], criteria=criteria, iter=it, nComp=nComp, # maxit=maxit, w=length(which(w)), wind=w) # class(res) <- "imp" ## res <- adjust(res) # invisible(res) #} # # # ################################################################################## # #`impRZilr` <- #function(x, maxit=10, eps=0.1, method="roundedZero", # dl=rep(0.05, ncol(x)), bruteforce=FALSE, noise = TRUE, noisemethod="residuals", noiseeffect=1, R=10, # verbose=FALSE){ # # # if( is.vector(x) ) stop("x must be a matrix or data frame") # stopifnot((method %in% c("ltsReg", "ltsReg2", "classical", "lm", "roundedZero","roundedZeroRobust","pls"))) # if( length(dl) < ncol(x)) stop(paste("dl has to be a vector of ", ncol(x))) # # ################# # ## zeros to NA: # x[x==0] <- NA # # ################# # ## index of missings / non-missings # w <- is.na(x) # wn <- !is.na(x) # w2 <- apply(x, 1, function(x){ # length(which(is.na(x))) # }) # indNA <- apply(x, 2, function(x){any(is.na(x))}) # # ################# # ## sort the columns of the data according to the amount of missings in the variables # wcol <- apply(x, 2, function(x) length(which(is.na(x)))) # indM <- sort(wcol, index.return=TRUE, decreasing=TRUE)$ix # cn <- colnames(x) # xcheck <- x # # ################ # ## detection limit in ilr-space ## for(i in which(indNA)){ ## ## } # # ## x <- constSum(x) ## dl <- dl/sum(dl) # ## initialisation: ## x[is.na(x)] <- 0.001 # for(i in 1:length(dl)){ # ind <- is.na(x[,i]) # #PF# if(length(ind) > 0) x[ind,i] <- dl[i]/3*2 # if(length(ind) > 0) x[ind,i] <- dl[i]*runif(sum(ind),1/3,2/3) # } # ## x <- constSum(x) # # ## parameters: # it=0 # criteria <- 10000000 # error <- rep(0, ncol(x)) # nComp <- normalerror <- numeric(ncol(x)) # if(noisemethod=="residuals") residuals <- matrix(,ncol=ncol(x), nrow=nrow(x)) # nComp[nComp==0] <- NA # criteria <- 1e+07 # sigma <- mu <- rep(0, ncol(x)) # # ########################################### # ### start the iteration # if(verbose) cat("\n start the iteration:") # while(it <= maxit & criteria >= eps){ # xold <- x # it=it+1 # for(i in which(indNA)){ # if(verbose) cat("\n column", i) # ## change the first column with that one with the highest amount of NAs # ## in the step # if(wcol[indM[i]] > 0){ # xNA=x[,indM[i]] # x1=x[,1] # x[,1]=xNA # x[,indM[i]]=x1 # # if(method == "roundedZero"){ # xilr <- pivotCoord(x) # phi <- pivotCoord(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]))[,1] # TODO: phi auserhalb der Schleife! # ## --> x hat sich geaendert aber dl nicht. # xilr2 <- data.frame(xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # reg1 = lm(V1 ~ ., data=xilr2) # yhat2 <- predict(reg1, new.data=xilr2[,-i]) # if(bruteforce){ # xilr2[w[, indM[i]], 1] <- ifelse(yhat2[w[, indM[i]]] <= phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2[w[, indM[i]]] ) # } else { # s <- sqrt(sum(reg1$res^2)/(nrow(xilr2)-ncol(xilr2))) # ex <- (phi - yhat2)/s # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } # xilr2[w[, indM[i]], 1] <- yhat2sel # } # } # if (method == "pls") { # xilr = pivotCoord(x) # ttt <- cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]) # phi <- pivotCoord(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]))[,1] ## if(verbose) cat("\n phi", phi, "in iteration", it) # xilr2 <- data.frame(xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # if(it == 1){ ## evaluate ncomp. # test <- xilr2[,!(colnames(xilr2) == "V1")] # testy <- xilr2[,"V1"] # X <- xilr2[,!(colnames(xilr2) == "V1")] # y <- xilr2[,"V1"] # nComp[i] <- bootnComp(xilr2[,!(colnames(xilr2) == "V1")],y=xilr2[,"V1"], R) # } # reg1 <- mvr(V1~.,ncomp=nComp[i], data = xilr2, method="simpls") # yhat2 <- as.numeric(predict(reg1, new.data=xilr2[,-i], ncomp=nComp[i]) ) ## if(noisemethod=="residuals") residuals[,i] <- reg1$residuals[,,nComp[i]] ## if(noisemethod=="normal"){ ## mu[i] <- median(residuals(reg1)[,,nComp[i]]) ## sigma[i] <- mad(residuals(reg1)[,,nComp[i]]) * noiseeffect ## } # colnames(xilr2)[1] <- c1 ## fit=reg1$fitted.values[,,nComp[i]] # s <- sqrt(sum(reg1$residuals[,,nComp[i]]^2)/abs(nrow(xilr2)-ncol(xilr2))) # ex <- as.numeric((phi - yhat2)/s ) # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of NaN or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } ## if(verbose) cat("\n yhat2sel at iteration", it, "on column", i ,"is", yhat2sel) # xilr2[w[, indM[i]], 1] <- yhat2sel # # # } # if(method == "roundedZeroRobust"){ # xilr <- pivotCoord(x) # x[x < .Machine$double.eps] <- 0.00000000001 ## TODO: better solution # phi <- pivotCoord(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]))[,1] # TODO: phi auserhalb der Schleife! # xilr2 <- data.frame(xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # reg1 = rlm(V1 ~ ., data=xilr2, method="MM",maxit = 100) ## reg1 = lmrob(V1 ~ ., data=xilr2) # yhat2 <- predict(reg1, new.data=xilr2[,-i]) # if(bruteforce){ # xilr2[w[, indM[i]], 1] <- ifelse(yhat2[w[, indM[i]]] <= phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2[w[, indM[i]]] ) # } else { # s <- reg1$s # #PF# s <- IQR(reg1$resid)/1.349 # ex <- (phi - yhat2)/s # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } # xilr2[w[, indM[i]], 1] <- yhat2sel # } # } # # xilr <- xilr2 # x <- pivotCoordInv(xilr) # # ## return the order of columns: # xNA=x[,1] # x1=x[,indM[i]] # x[,1]=x1 # x[,indM[i]]=xNA # x <- adjust2(x, xcheck, w) # print(x[1:2,1:2]) # if(verbose) cat("\n iteration", it, "column", i, "value", x[2,1]) # } # # # } # # # criteria <- sum( ((xold - x)/x)^2, na.rm=TRUE) ## DIRTY: (na.rm=TRUE) # colnames(x) <- colnames(xcheck) # # } # # res <- list(xOrig=xcheck, xImp=x, criteria=criteria, iter=it, nComp=nComp, # maxit=maxit, w=length(which(w)), wind=w) # class(res) <- "imp" ## res <- adjust(res) # invisible(res) #} # `impRZilr` <- # function(x, maxit=10, eps=0.1, method="roundedZero", # dl=rep(0.05, ncol(x)), bruteforce=FALSE){ # # `ilrM` <- # function(x, info=TRUE){ # x.ilr=matrix(NA,nrow=nrow(x),ncol=ncol(x)-1) # D=ncol(x) # for (i in 1:ncol(x.ilr)){ # x.ilr[,i]=sqrt((D-i)/(D-i+1))*log(((apply(as.matrix(x[,(i+1):D,drop=FALSE]),1,prod))^(1/(D-i)))/(x[,i])) # } # # invisible(-x.ilr) # if(info) {res <- list(xilr=-x.ilr, # xOrig=x) # class(res) <- "ilrTransform" # } else { # res <- -x.ilr # } # res # } # `invilrM` <- # function(x.ilr){ # if(class(x.ilr) =="ilrTransform" ){ # fac <- rowSums(x.ilr$xOrig) # x.ilr <- x.ilr$xilr # y <- matrix(0,nrow=nrow(x.ilr),ncol=ncol(x.ilr)+1) # D=ncol(x.ilr)+1 # y[,1]=-sqrt((D-1)/D)*x.ilr[,1] # for (i in 2:ncol(y)){ # for (j in 1:(i-1)){ # y[,i]=y[,i]+x.ilr[,j]/sqrt((D-j+1)*(D-j)) # } # } # for (i in 2:(ncol(y)-1)){ # y[,i]=y[,i]-sqrt((D-i)/(D-i+1))*x.ilr[,i] # } # yexp=exp(-y) # x.back=yexp/apply(yexp,1,sum) * fac # * rowSums(derOriginaldaten) # invisible(x.back) # } else { # y=matrix(0,nrow=nrow(x.ilr),ncol=ncol(x.ilr)+1) # D=ncol(x.ilr)+1 # y[,1]=-sqrt((D-1)/D)*x.ilr[,1] # for (i in 2:ncol(y)){ # for (j in 1:(i-1)){ # y[,i]=y[,i]+x.ilr[,j]/sqrt((D-j+1)*(D-j)) # } # } # for (i in 2:(ncol(y)-1)){ # y[,i]=y[,i]-sqrt((D-i)/(D-i+1))*x.ilr[,i] # } # yexp=exp(-y) # x.back=yexp/apply(yexp,1,sum) # * rowSums(derOriginaldaten) # invisible(x.back) # #return(yexp) # } # x.back # } # # # # if( is.vector(x) ) stop("x must be a matrix or data frame") # stopifnot((method %in% c("ltsReg", "ltsReg2", "classical", "lm", "roundedZero","roundedZeroRobust"))) # if( length(dl) < ncol(x)) stop(paste("dl has to be a vector of ", ncol(x))) # # # ## zeros to NA: # x[x==0] <- NA # # ##index of missings / non-missings # w <- is.na(x) # wn <- !is.na(x) # w2 <- apply(x, 1, function(x){ # length(which(is.na(x))) # }) # # # ##sort the columns of the data according to the amount of missings in the variables # wcol <- apply(x,2,function(x) length(which(is.na(x)))) # indM <- sort(wcol, index.return=TRUE, decreasing=TRUE)$ix # cn <- colnames(x) # xcheck <- x # # ## initialisation: # # x[is.na(x)] <- 0.001 # for(i in 1:length(dl)){ # ind <- is.na(x[,i]) # #PF# if(length(ind) > 0) x[ind,i] <- dl[i]/3*2 # if(length(ind) > 0) x[ind,i] <- dl[i]*runif(sum(ind),1/3,2/3) # } # # # x <- constSum(x) # # ## parameters: # it=0 # criteria <- 10000000 # error <- rep(0, ncol(x)) # # ########################################### # ### start the iteration # while(it <= maxit & criteria >= eps){ # xold <- x # it=it+1 # for(i in 1:ncol(x)){ # ## change the first column with that one with the highest amount of NAs # ## in the step # if(wcol[indM[i]] > 0){ # xNA=x[,indM[i]] # x1=x[,1] # x[,1]=xNA # x[,indM[i]]=x1 # # if(method == "roundedZero"){ # xilr <- ilrM(x) # phi <- ilrM(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]), info=FALSE)[,1] # TODO: phi auserhalb der Schleife! # ## --> x hat sich geaendert aber dl nicht. # xilr2 <- data.frame(xilr$xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # reg1 = lm(V1 ~ ., data=xilr2) # yhat2 <- predict(reg1, new.data=xilr2[,-i]) # if(bruteforce){ # xilr2[w[, indM[i]], 1] <- ifelse(yhat2[w[, indM[i]]] <= phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2[w[, indM[i]]] ) # } else { # # s <- sd(reg1$res, na.rm=TRUE) # s <- sqrt(sum(reg1$res^2)/(nrow(xilr2)-ncol(xilr2))) # ex <- (phi - yhat2)/s # ##################################################### # # CHANGED PF: # #PF# if(all(dnorm(ex) > 5 * .Machine$double.eps)) yhat2 <- yhat2 - s*dnorm(ex)/pnorm(ex) # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } # xilr2[w[, indM[i]], 1] <- yhat2sel # ##################################################### # } # } # if(method == "roundedZeroRobust"){ # xilr <- ilrM(x) # x[x < .Machine$double.eps] <- 0.00000000001 ## TODO: better solution # phi <- ilrM(cbind(rep(dl[indM[i]], nrow(x)), x[,-1,drop=FALSE]), info=FALSE)[,1] # TODO: phi auserhalb der Schleife! # xilr2 <- data.frame(xilr$xilr) # c1 <- colnames(xilr2)[1] # colnames(xilr2)[1] <- "V1" # reg1 = rlm(V1 ~ ., data=xilr2, method="MM",maxit = 100) # # reg1 = lmrob(V1 ~ ., data=xilr2) # yhat2 <- predict(reg1, new.data=xilr2[,-i]) # if(bruteforce){ # xilr2[w[, indM[i]], 1] <- ifelse(yhat2[w[, indM[i]]] <= phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2[w[, indM[i]]] ) # } else { # # s <- mad(reg1$res, na.rm=TRUE) # s <- reg1$s # #PF# s <- IQR(reg1$resid)/1.349 # ex <- (phi - yhat2)/s # ##################################################### # # CHANGED PF: # #PF# if(all(dnorm(ex) > 5 * .Machine$double.eps)) yhat2 <- yhat2 - s*dnorm(ex)/pnorm(ex) # yhat2sel <- ifelse(dnorm(ex[w[, indM[i]]]) > .Machine$double.eps, # yhat2[w[, indM[i]]] - s*dnorm(ex[w[, indM[i]]])/pnorm(ex[w[, indM[i]]]), # yhat2[w[, indM[i]]]) # if(any(is.na(yhat2)) || any(yhat2=="Inf")) stop("Problems in ilr because of infinite or NA estimates") # # check if we are under the DL: # if(any(yhat2sel >= phi[w[, indM[i]]])){ # yhat2sel <- ifelse(yhat2sel > phi[w[, indM[i]]], phi[w[, indM[i]]], yhat2sel) # } # xilr2[w[, indM[i]], 1] <- yhat2sel # ##################################################### # } # } # # xilr$xilr <- xilr2 # x=invilrM(xilr) # ## return the order of columns: # xNA=x[,1] # x1=x[,indM[i]] # x[,1]=x1 # x[,indM[i]]=xNA # } # # # } # # # criteria <- sum( ((xold - x)/x)^2, na.rm=TRUE) ## DIRTY: (na.rm=TRUE) # colnames(x) <- colnames(xcheck) # # } # # res <- list(xOrig=xcheck, xImp=x, criteria=criteria, iter=it, # maxit=maxit, w=length(which(w)), wind=w) # class(res) <- "imp" # invisible(res) # } #