Revision a6a4107a08051dfddc3c733102d002fd8617ab9e authored by Lars Kotthoff on 25 October 2014, 00:00:00 UTC, committed by Gabor Csardi on 25 October 2014, 00:00:00 UTC
1 parent c389439
Raw File
discretize.R
# Discretize the attributes referenced by `formula`.
#
# The data frame is first rebuilt with get.data.frame.from.formula(); its first
# column (presumably the class/target attribute -- confirm against that
# helper) is turned into a factor with 5 equal-frequency bins when it is not
# already a factor. The result is then passed to supervised.discretization().
#
# Args:
#   formula: model formula naming the attributes to use.
#   data:    data frame the formula is evaluated against.
#
# Returns:
#   The data frame produced by get.data.frame.from.formula() applied to the
#   supervised-discretization result.
discretize.all <- function(formula, data) {
	new_data <- get.data.frame.from.formula(formula, data)

	# Only a non-factor first column needs to be binned.
	if (!is.factor(new_data[[1]])) {
		new_data[[1]] <- equal.frequency.binning.discretization(new_data[[1]], 5)
	}

	new_data <- supervised.discretization(formula, data = new_data)

	# reorder attributes
	get.data.frame.from.formula(formula, new_data)
}

# unsupervised
# Discretize a numeric vector into `bins` groups of (roughly) equal size.
#
# Values are ranked with order(), which sorts missing values last. With n the
# number of non-missing values, bin i receives the ranks from
# round((i - 1) * n / bins) + 1 to round(i * n / bins).
#
# Args:
#   data: numeric vector; NA entries stay NA in the result.
#   bins: requested number of bins, coerced with as.integer(); must be >= 1.
#
# Returns:
#   A factor of the same length as `data`. Its levels are the numbers of the
#   bins that received at least one value, so there may be fewer than `bins`
#   levels when `bins` exceeds the number of non-missing values.
#
# Stops with an error if `data` is not numeric or `bins` is not a single
# value of at least 1.
equal.frequency.binning.discretization <- function(data, bins) {
	bins <- as.integer(bins)
	if (!is.numeric(data))
		stop("Data must be numeric")
	if (length(bins) != 1 || is.na(bins) || bins < 1)
		stop("Number of bins too small")

	ord <- order(data)
	len <- sum(complete.cases(data))
	blen <- len / bins
	new_data <- data

	p2 <- 0
	for (i in seq_len(bins)) {
		p1 <- p2 + 1
		p2 <- min(round(i * blen), len)
		# With more bins than values some bins are empty. p1:p2 would then count
		# backwards and overwrite values that were already assigned (or write
		# into NA slots), so such bins are skipped.
		if (p2 < p1)
			next
		new_data[ord[p1:p2]] <- i
	}

	factor(new_data)
}

# unsupervised
# Discretize a numeric vector into `bins` intervals of equal width.
#
# Args:
#   data: numeric vector to discretize.
#   bins: number of intervals, handed to cut() as its `breaks` argument;
#         must be >= 1.
#
# Returns:
#   The factor produced by cut(data, bins).
#
# Stops with an error if `data` is not numeric or `bins` is below 1.
equal.width.binning.discretization <- function(data, bins) {
	numeric_input <- is.numeric(data)
	if (!numeric_input) {
		stop("Data must be numeric")
	}

	too_few_bins <- bins < 1
	if (too_few_bins) {
		stop("Number of bins too small")
	}

	cut(data, breaks = bins)
}

#MDL - Fayyad, Irani
# Supervised discretization of the attributes referenced by `formula`.
#
# The data frame is rebuilt with get.data.frame.from.formula(). Rows whose
# first column is missing are dropped, and the remainder is handed to
# Discretize() with na.action = na.pass.
# NOTE(review): Discretize() is presumably RWeka's MDL filter -- confirm.
#
# Args:
#   formula: model formula naming the attributes to use.
#   data:    data frame the formula is evaluated against.
#
# Returns:
#   Whatever Discretize() returns for the retained rows.
supervised.discretization <- function(formula, data) {
	frame <- get.data.frame.from.formula(formula, data)

	# Keep only the rows where the first column is present; the frame is
	# left untouched when nothing is missing.
	has_first <- complete.cases(frame[[1]])
	if (!all(has_first)) {
		frame <- frame[has_first, , drop = FALSE]
	}

	Discretize(formula, data = frame, na.action = na.pass)
}
back to top