Revision 12acd76ec1d613505e4bb20b3012e8d8507a310a authored by Lars Kotthoff on 16 May 2018, 20:38:09 UTC, committed by cran-robot on 16 May 2018, 20:38:09 UTC
1 parent 970c4e6
selector.cfs.R
### CFS
# classification and regression
# continous and discrete data
cfs <- function(formula, data) {
cont_correlation <- function(a, b) {
result = 0
if(!is.factor(a) && !is.factor(b)) { # both continous
complete = complete.cases(a) & complete.cases(b)
if(!any(complete))
return(0)
vec1 = a[complete]
vec2 = b[complete]
if(sd(vec1) == 0 || sd(vec2) == 0)
return(0)
result = cor(vec1, vec2)
} else if(is.factor(a) && is.factor(b)) { # both discrete
tab = table(a, b)
alevels = rownames(tab)
blevels = colnames(tab)
result = sum(sapply(alevels, function(avalue) {
avec = as.numeric(a == avalue)
complete_a = complete.cases(a)
return(sum(sapply(blevels, function(bvalue) {
bvec = as.numeric(b == bvalue)
complete_b = complete.cases(b)
complete = complete_a & complete_b
avec_complete_data = avec[complete]
bvec_complete_data = bvec[complete]
if(sd(avec_complete_data, na.rm=TRUE) == 0 || sd(bvec_complete_data, na.rm=TRUE) == 0)
return(0)
return(tab[avalue, bvalue] / length(a) * cor(avec_complete_data, bvec_complete_data))
})))
}))
} else { # continous and discrete
cont = NULL;
disc = NULL;
if(is.factor(a)) {
cont = b
disc = a
} else {
cont = a
disc = b
}
cont_complete = complete.cases(cont)
disc_table = table(disc)
disc_levels = names(disc_table)
if(length(disc_levels) == 0) {
result = 0
} else {
result = sum(sapply(disc_levels, function(lev) {
disc_vec = as.numeric(disc == lev)
disc_vec_complete = complete.cases(disc_vec)
complete = cont_complete & disc_vec_complete
disc_vec_complete_data = disc_vec[complete]
cont_complete_data = cont[complete]
if(sd(cont_complete_data) == 0)
return(0)
return(disc_table[lev] / length(disc) * cor(disc_vec_complete_data, cont_complete_data))
}))
}
}
return(result)
}
# uses parent.env (correlations)
get_correlation <- function(attr1, attr2, classification, new_data, entropies) {
#lazy evaluation
if(!is.na(correlations[attr1, attr2])) {
return(correlations[attr1, attr2])
}
tmp_res = NA
if(classification) { #discrete class
tmp_res = 2.0 * (entropies[attr1] + entropies[attr2] - entropyHelper(data.frame(cbind(new_data[[attr1]], new_data[[attr2]])))) / (entropies[attr1] + entropies[attr2])
} else { #continous class
tmp_res = cont_correlation(new_data[[attr1]], new_data[[attr2]])
}
if(is.nan(tmp_res)) {
# all entropies (individual + joint) are 0
tmp_res = 0
}
correlations[attr1, attr2] <<- tmp_res
correlations[attr2, attr1] <<- tmp_res
return(tmp_res)
}
# uses parent.env
evaluator <- function(attrs) {
ff_sum = 0
ff_count = 0
fc_sum = 0
attr_count = length(attrs)
if(attr_count <= 0)
stop("Attributes not specified")
for(i in 1:attr_count) {
attr1 = attrs[i]
# feature-class correlation
cor = get_correlation(attr1, 1, classification, new_data, entropies)
fc_sum = fc_sum + cor
# feature-feature correlation
if(i == attr_count) {
next()
}
for(j in (i+1):attr_count) {
attr2 = attrs[j]
cor = get_correlation(attr1, attr2, classification, new_data, entropies)
ff_count = ff_count + 1
ff_sum = ff_sum + cor
}
}
ff_cor = ff_sum / ff_count
fc_cor = fc_sum / attr_count
if(attr_count == 1)
return(fc_cor)
else
return(attr_count * fc_cor / sqrt(attr_count + attr_count * (attr_count - 1) * ff_cor))
}
new_data = get.data.frame.from.formula(formula, data)
# prepare correlation matrix
classification = is.factor(new_data[[1]])
attr_count = dim(new_data)[2]
attr_names = colnames(new_data)
correlations = matrix(rep(NA, attr_count ^ 2), nrow = attr_count, ncol = attr_count,
dimnames = list(attr_names, attr_names))
entropies = NULL
if(classification) {
new_data = supervised.discretization(formula, data = new_data)
new_data = get.data.frame.from.formula(formula, new_data)
entropies = sapply(new_data, entropyHelper)
}
result = best.first.search(names(new_data)[-1], evaluator)
return(result)
}
Computing file changes ...