Revision 12acd76ec1d613505e4bb20b3012e8d8507a310a authored by Lars Kotthoff on 16 May 2018, 20:38:09 UTC, committed by cran-robot on 16 May 2018, 20:38:09 UTC
1 parent 970c4e6
Raw File
selector.cfs.R
### CFS
# classification and regression
# continous and discrete data
cfs <- function(formula, data) {
	cont_correlation <- function(a, b) {
		result = 0
		if(!is.factor(a) && !is.factor(b)) { # both continous
			complete = complete.cases(a) & complete.cases(b)
			if(!any(complete))
				return(0)
			vec1 = a[complete]
			vec2 = b[complete]
			if(sd(vec1) == 0 || sd(vec2) == 0)
				return(0)
			result = cor(vec1, vec2)
		} else if(is.factor(a) && is.factor(b)) { # both discrete
			tab = table(a, b)
			alevels = rownames(tab)
			blevels = colnames(tab)
			
			result = sum(sapply(alevels, function(avalue) {
					avec = as.numeric(a == avalue)
					complete_a = complete.cases(a)
					return(sum(sapply(blevels, function(bvalue) {
							bvec = as.numeric(b == bvalue)
							complete_b = complete.cases(b)
							complete = complete_a & complete_b
							avec_complete_data = avec[complete]
							bvec_complete_data = bvec[complete]
							if(sd(avec_complete_data, na.rm=TRUE) == 0 || sd(bvec_complete_data, na.rm=TRUE) == 0)
								return(0)
							return(tab[avalue, bvalue] / length(a) * cor(avec_complete_data, bvec_complete_data))
						})))
				}))
		} else { # continous and discrete
			cont = NULL;
			disc = NULL;
			if(is.factor(a)) {
				cont = b
				disc = a
			} else {
				cont = a
				disc = b
			}
			
			cont_complete = complete.cases(cont)
			disc_table = table(disc)
			disc_levels = names(disc_table)
			
			if(length(disc_levels) == 0) {
				result = 0
			} else {
				result = sum(sapply(disc_levels, function(lev) {
						disc_vec = as.numeric(disc == lev)
						disc_vec_complete = complete.cases(disc_vec)
						complete = cont_complete & disc_vec_complete
						disc_vec_complete_data = disc_vec[complete]
						cont_complete_data = cont[complete]
						if(sd(cont_complete_data) == 0)
							return(0)
						return(disc_table[lev] / length(disc) * cor(disc_vec_complete_data, cont_complete_data))
					}))
			}
		}
		return(result)
	}

	# uses parent.env (correlations)
	get_correlation <- function(attr1, attr2, classification, new_data, entropies) {
		#lazy evaluation
		if(!is.na(correlations[attr1, attr2])) {
			return(correlations[attr1, attr2])
		}
		
		tmp_res = NA
		if(classification) { #discrete class
			tmp_res = 2.0 * (entropies[attr1] + entropies[attr2] - entropyHelper(data.frame(cbind(new_data[[attr1]], new_data[[attr2]])))) / (entropies[attr1] + entropies[attr2])
		} else { #continous class
			tmp_res = cont_correlation(new_data[[attr1]], new_data[[attr2]])
		}
        if(is.nan(tmp_res)) {
            # all entropies (individual + joint) are 0
            tmp_res = 0
        }

		correlations[attr1, attr2] <<- tmp_res
		correlations[attr2, attr1] <<- tmp_res
		
		return(tmp_res)
	}

	# uses parent.env
	evaluator <- function(attrs) {
		ff_sum = 0
		ff_count = 0
		fc_sum = 0
		attr_count = length(attrs)
		
		if(attr_count <= 0)
			stop("Attributes not specified")
		
		for(i in 1:attr_count) {
			attr1 = attrs[i]
			
			# feature-class correlation
            cor = get_correlation(attr1, 1, classification, new_data, entropies)
            fc_sum = fc_sum + cor
			
			# feature-feature correlation
			if(i == attr_count) {
				next()
			}
			for(j in (i+1):attr_count) {
				attr2 = attrs[j]
                cor = get_correlation(attr1, attr2, classification, new_data, entropies)
                ff_count = ff_count + 1
                ff_sum = ff_sum + cor
			}
		}

		ff_cor = ff_sum / ff_count
		fc_cor = fc_sum / attr_count
		
		if(attr_count == 1)
			return(fc_cor)
		else
			return(attr_count * fc_cor / sqrt(attr_count + attr_count * (attr_count - 1) * ff_cor))
	}
	
	new_data = get.data.frame.from.formula(formula, data)
	
	# prepare correlation matrix
	classification = is.factor(new_data[[1]])
	attr_count = dim(new_data)[2]
	attr_names = colnames(new_data)
	correlations = matrix(rep(NA, attr_count ^ 2), nrow = attr_count, ncol = attr_count,
		dimnames = list(attr_names, attr_names))
	
	entropies = NULL
	if(classification) {
		new_data = supervised.discretization(formula, data = new_data)
		new_data = get.data.frame.from.formula(formula, new_data)
		entropies = sapply(new_data, entropyHelper)		
	}

	result = best.first.search(names(new_data)[-1], evaluator)
		
	return(result)
}
back to top