Revision f32aa1b79479456147aa29399ffadd5d74512832 authored by sobek44 on 29 October 2014, 11:11:13 UTC, committed by sobek44 on 29 October 2014, 11:11:13 UTC
1 parent 3f5d681
auxiliary_functions.R
##' Computing joint sigma for all clusters
##'
##' Computes unbiased noise estimator under assumption that
##' all subspaces are of the same dimension
##'
##' @param X data
##' @param segmentation variable segmentation
##' @param max.dim maximal subspace dimension
##' @param n number of rows
##' @param p number of variables
##' @param numb.clusters total number of clusters
##' @return unbiased noise estimator
##' @author Piotr Sobczyk
##' @keywords internal
getSigma <- function(X, segmentation, max.dim, n, p, numb.clusters){
RES.sigma=0
for(k in 1:numb.clusters){
Xk = X[,segmentation==k, drop=F]
if(dim(Xk)[2]>max.dim){ #length because it might be onedimensional
svdSIGNAL= svd(Xk)
SIGNAL = matrix(svdSIGNAL$u[, 1:max.dim], ncol=max.dim) %*%
diag(svdSIGNAL$d[1:max.dim], nrow=max.dim) %*%
t(matrix(svdSIGNAL$v[, 1:max.dim], ncol=max.dim))
RES.sigma = RES.sigma + sum((Xk - SIGNAL)^2)
}
}
degrees.freedom <- n*p-p-n*max.dim-p*max.dim+max.dim^2+max.dim
sigma <- sqrt(RES.sigma/degrees.freedom)
sigma
}
#' Choose subspace closest to the given variable
#'
#' The most similar subspace is choosen based on R^2
#'
#' @param variable variable to be assigned
#' @param pcas orthogonal basis for different subspaces
#' @param numberClusters number of subspaces (clusters)
#' @return index number of subspace closest to variable
#' @keywords internal
choose.cluster <- function(variable, pcas, numberClusters){
v1 = var(variable)
which.max( vapply(1:numberClusters, function(i){
v2 <- var(fastLmPure(pcas[[i]], variable, method = 0L)$residuals);
p <- ncol(pcas[[i]]);
n <- length(variable);
(v1-v2)/v1
}, 0.9) )
}
#' Computes missclasification rate.
#'
#' Missclasification is commonly used performance measure in subspace clustering.
#' It allows to compare two partitions with the same number of clusters.
#'
#' As getting exact value of missclasification requires checking all permutations
#' and is therefore intrackable even for modest number of clusters, a heuristic approach is proposed.
#' It is assumed that there are K classes of maximum M elements.
#' Additional requirement is that classes labels are from range [1, K].
#'
#' @param group a vector, first partiton
#' @param true_group a vector, second (reference) partition
#' @param M an integer, maximal number of elements in one class
#' @param K an integer, number of classes
#' @references {R Vidal. Subspace clustering. Signal Processing Magazine, IEEE, 28(2):52–68, 2011.}
#' @export
#' @return misclassification rate
#' @examples
#' \donttest{
#' data <- dataSIMULATION(n=100, SNR=1, K=5, numbVars=30, max.dim=2)
#' mlcc.fit <- MPCV.reps(data$X, numb.clusters=5, numb.runs=20, max.dim=2)
#' missclasification(mlcc.fit$segmentation,data$s, 30, 5)
#' }
#'
#' #one can use this function not only for clusters
#' partition1 <- sample(10, 300, replace=TRUE)
#' partition2 <- sample(10, 300, replace=TRUE)
#' missclasification(partition1, partition1, max(table(partition1)), 10)
#' missclasification(partition1, partition2, max(table(partition2)), 10)
missclasification <-function(group, true_group, M, K){
forbidden = NULL;
suma = 0;
nG = max(group);
for (i in M:1){ #differnet concordance levels
for(j in 1:nG){ #subspaces numbers (found)
if (sum(j==forbidden)==0){ #subspace not yet used
for (k in 1:K){ # subspaces numbers (true)
if (sum(j==group[true_group==k])==i){
suma = suma + i
forbidden = c(forbidden, j)
break;
}
}
}
}
}
mis = 1-suma/length(true_group)
return(mis)
}
Computing file changes ...