Revision 738b43f6dcce877b85c10f23dafc38e0098bd8fb authored by Daniel Luedecke on 19 February 2014, 11:32:18 UTC, committed by cran-robot on 19 February 2014, 11:32:18 UTC
1 parent 64f2fd6
sjPlotFrequencies.R
# bind global variables
if(getRversion() >= "2.15.1") utils::globalVariables(c("grp", "ia", "..density.."))
#' @title Plot frequencies of (count) variables
#' @name sjp.frq
#' @references \itemize{
#' \item \url{http://strengejacke.wordpress.com/sjplot-r-package/}
#' \item \url{http://strengejacke.wordpress.com/2013/02/25/simplify-frequency-plots-with-ggplot-in-r-rstats/}
#' }
#'
#' @seealso \link{sjt.frq}
#'
#' @description Plot frequencies of a (count) variable as bar graph, histogram,
#' box plot etc. using ggplot.
#'
#' @param varCount The variable which frequencies should be plotted.
#' @param title Title of diagram as string.
#' Example: \code{title=c("my title")}
#' @param titleSize The size of the plot title. Default is 1.3.
#' @param titleColor The color of the plot title. Default is \code{"black"}.
#' @param weightBy A weight factor that will be applied to weight all cases from \code{varCount}.
#' default is \code{NULL}, so no weights are used.
#' @param weightByTitleString If a weight factor is supplied via the parameter \code{weightBy}, the diagram's title
#' may indicate this with a remark. Default is \code{NULL}, so the diagram's title will not be modified when
#' cases are weighted. Use a string as parameter, e.g.: \code{weightByTitleString=" (weighted)"}.
#' @param interactionVar An interaction variable which can be used for box plots. Divides the observations in
#' \code{varCount} into the factors (sub groups) of \code{interactionVar}. Only applies when parameter \code{"type"}
#' is \code{"box"} or \code{"violin"} (resp. their alternative strings like \code{"boxplot"}, \code{"boxplots"} or \code{"v"}).
#' @param maxYlim Indicates how to calculate the maximum limit of the y-axis.
#' If \code{TRUE}, the upper y-limit corresponds to the amount of cases,
#' i.e. y-axis for each plot of a data base are the same.
#' If \code{FALSE} (default), the maximum y-axis depends on the highest count of a
#' variable's answer category. In this case, the y-axis breaks may change,
#' depending on the variable.
#' @param upperYlim Uses a pre-defined upper limit for the y-axis. Overrides the \code{maxYlim} parameter.
#' @param order Determines whether categories on x-axis should be order according to the frequencies or not.
#' Default is \code{"none"}, so categories are not ordered by frequency. Use \code{"asc"} or
#' \code{"desc"} for sorting categories ascending or descending in relation to the frequencies.
#' @param type Specifies the type of distribution plot that will be plotted.
#' \itemize{
#' \item \code{"bar"}, \code{"bars"} or \code{"b"} for simple bars (the default setting).
#' \item \code{"dots"} or \code{"dot"} for a dot plot.
#' \item \code{"h"}, \code{"hist"} or \code{"histogram"} for a histogram.
#' \item \code{"line"}, \code{"lines"} or \code{"l"} for a histogram with filled area with line.
#' \item \code{"dens"}, \code{"d"} or \code{"density"} for a density plot.
#' \item \code{"box"}, \code{"boxplot"} or \code{"boxplots"} for box plots.
#' \item \code{"v"} or \code{"violin"} for violin plots.
#' }
#' @param axisLabels.x Labels for the x-axis breaks.
#' Example: \code{axisLabels.x=c("Label1", "Label2", "Label3")}.
#' Note: If you use the \code{\link{sji.SPSS}} function and the \code{\link{sji.getValueLabels}} function, you receive a
#' list object with label string. The labels may also be passed as list object. They will be unlisted and
#' converted to character vector automatically.
#' @param interactionVarLabels Labels for the x-axis breaks when having interaction variables included.
#' These labels replace the \code{axisLabels.x}. Only applies, when using box or violin plots
#' (i.e. \code{"type"} is \code{"box"} or \code{"violin"}) and \code{interactionVar} is not \code{NULL}.
#' Example: See \code{axisLabels.x}.
#' @param axisLabelAngle.x Angle for axis-labels.
#' @param axisLabelSize The size of axis labels of both x and y axis. Default is 1.1, recommended values range
#' between 0.5 and 3.0.
#' @param valueLabelSize The size of value labels in the diagram. Default is 4, recommended values range
#' between 2 and 8.
#' @param breakTitleAt Determines how many chars of the title are displayed in
#' one line and when a line break is inserted into the title.
#' @param breakLabelsAt Determines how many chars of the labels are displayed in
#' one line and when a line break is inserted into the axis labels.
#' @param gridBreaksAt Sets the breaks on the y axis, i.e. at every n'th position a major
#' grid is being printed.
#' @param barWidth Width of bars. Default is 0.6, recommended values range from 0.2 to 2.0
#' @param dotSize The size of dots in case of dot-plots (\code{type="dots"}).
#' @param innerBoxPlotWidth The width of the inner box plot that is plotted inside of violin plots. Only applies
#' if \code{type} is \code{"violin"}. Default value is 0.15
#' @param innerBoxPlotDotSize Size of mean dot insie a violin plot. Applies only when \code{type} is set to \code{"violin"}.
#' @param barColor User defined color for bars. If not specified, a default blue
#' color palette will be used for the bar charts.
#' @param barAlpha Specify the transparancy (alpha value) of bars.
#' @param axisLabelColor User defined color for axis labels. If not specified, a default dark gray
#' color palette will be used for the labels.
#' @param borderColor User defined color of whole diagram border (panel border).
#' @param axisColor User defined color of axis border (y- and x-axis, in case the axes should have different colors than
#' the diagram border).
#' @param barOutline If \code{TRUE}, each bar gets a colored outline. Default is \code{FALSE}.
#' @param barOutlineSize The size of the bar outlines. Only applies if \code{barOutline} is \code{TRUE}.
#' Default is 0.2
#' @param outlineColor The color of the bar outline. Only applies, if \code{barOutline} is \code{TRUE}.
#' @param majorGridColor Specifies the color of the major grid lines of the diagram background.
#' @param minorGridColor Specifies the color of the minor grid lines of the diagram background.
#' @param hideGrid.x If \code{TRUE}, the x-axis-gridlines are hidden. Default if \code{FALSE}.
#' @param hideGrid.y If \code{TRUE}, the y-axis-gridlines are hidden. Default if \code{FALSE}.
#' @param showValueLabels Whether counts and percentage values should be plotted to each bar. Default
#' is \code{TRUE}.
#' @param showCountValues If \code{TRUE} (default), count values are be plotted to each bar. If \code{FALSE},
#' count values are removed.
#' @param showPercentageValues If \code{TRUE} (default), percentage values are be plotted to each bar, if \code{FALSE},
#' percentage-values are removed.
#' @param showAxisLabels.x Whether x axis labels (category names) should be shown or not.
#' @param showAxisLabels.y Whether y axis labels (count values) should be shown or not.
#' @param showTickMarks Whether tick marks of axes should be shown or not.
#' @param showMeanIntercept If \code{TRUE}, a vertical line in histograms is drawn to indicate the mean value of the count
#' variables. Only applies to histogram-charts.
#' @param showMeanValue If \code{TRUE} (default value), the mean value is printed to the vertical line that indicates the mean value
#' of the count variables. Only applies to histogram-charts.
#' @param showStandardDeviation If \code{TRUE}, the standard deviation is annotated as shaded rectangle around the mean intercept
#' line. Only applies to histogram-charts.
#' @param meanInterceptLineType The linetype of the mean intercept line. Only applies to histogram-charts and
#' when \code{showMeanIntercept} is \code{TRUE}.
#' @param meanInterceptLineSize The size of the mean intercept line. Only applies to histogram-charts and when
#' \code{showMeanIntercept} is \code{TRUE}.
#' @param showNormalCurve If \code{TRUE}, a normal curve is plotted over the histogram. Default is
#' \code{FALSE}. Only applies when histograms are plotted.
#' @param normalCurveColor Specify the color of the normal curve line. Only
#' applies if \code{showNormalCurve} is \code{TRUE}.
#' @param normalCurveSize Specifiy the size of the normal curve line. Only
#' applies if \code{showNormalCurve} is \code{TRUE}.
#' @param normalCurveAlpha Specify the transparancy (alpha value) of the normal curve. Only
#' applies if \code{showNormalCurve} is \code{TRUE}.
#' @param valueLabelColor The color of the value labels (numbers) inside the digram.
#' @param axisTitle.x A label for the x axis. useful when plotting histograms with metric scales where no category labels
#' are assigned to the x axis.
#' @param axisTitle.y A label for the y axis. useful when plotting histograms with metric scales where no category labels
#' are assigned to the y axis.
#' @param axisTitleColor The color of the x and y axis labels. Refers to \code{axisTitle.x} and \code{axisTitle.y}, not to the tick mark
#' or category labels.
#' @param axisTitleSize the size of the x and y axis labels. Refers to \code{axisTitle.x} and \code{axisTitle.y}, not to the tick mark
#' or category labels. Default is 1.3.
#' @param startAxisAt Determines the first value on the x-axis. By default, this value is 1,
#' i.e. the value range on the x axis starts with 1, independent from the lowest
#' value of \code{varCount} (which means, you may have zero counts and hence no bars plotted
#' for these values in such cases). Change this parameter, if variables with a value range
#' starting from greater values than one (e.g. 5-10) should be plotted to avoid empty
#' bars in the plot.
#' @param autoGroupAt A value indicating at which length of unique values of \code{varCount} the variable
#' is automatically grouped into smaller units (see \link{sju.groupVar}). If \code{varCount} has large
#' numbers of unique values, too many bars for the graph have to be plotted. Hence it's recommended
#' to group such variables. For example, if \code{autoGroupAt} is 50, i.e. if \code{varCount} has 50 and more unique values
#' it will be grouped using \link{sju.groupVar} with \code{groupsize="auto"} parameter. By default,
#' the maximum group count is 30. However, if \code{autoGroupAt} is less than 30, \code{autoGroupAt}
#' groups are built. Default value for \code{autoGroupAt} is \code{NULL}, i.e. auto-grouping is off.
#' @param theme Specifies the diagram's background theme. Default (parameter \code{NULL}) is a gray
#' background with white grids.
#' \itemize{
#' \item Use \code{"bw"} for a white background with gray grids
#' \item \code{"classic"} for a classic theme (black border, no grids)
#' \item \code{"minimal"} for a minimalistic theme (no border,gray grids) or
#' \item \code{"none"} for no borders, grids and ticks.
#' }
#' The ggplot-object can be returned with \code{returnPlot} set to \code{TRUE} in order to further
#' modify the plot's theme.
#' @param flipCoordinates If \code{TRUE}, the x and y axis are swapped. Default is \code{FALSE}.
#' @param omitNA If \code{TRUE}, missings are not included in the frequency calculation and diagram plot.
#' @param returnPlot If \code{TRUE}, the ggplot-object with the complete plot will be returned (and not plotted).
#' Default is \code{FALSE}, hence the ggplot object will be plotted, not returned.
#' @return The ggplot-object with the complete plot in case \code{returnPlot} is \code{TRUE}.
#'
#' @examples
#' # boxplot
#' sjp.frq(ChickWeight$weight, type="box")
#'
#' # histogram
#' sjp.frq(discoveries, type="hist", showMeanIntercept=TRUE)
#' # histogram with minimal theme
#' sjp.frq(discoveries, type="hist", showMeanIntercept=TRUE,
#' theme="minimal", minorGridColor="white",
#' showTickMarks=FALSE, hideGrid.x=TRUE)
#'
#' # violin plot
#' sjp.frq(ChickWeight$weight, type="v")
#'
#' # bar plot
#' sjp.frq(ChickWeight$Diet)
#' sjp.frq(ChickWeight$Diet, maxYlim=TRUE)
#'
#' # bar plot with EUROFAMCARE sample dataset
#' # dataset was importet from an SPSS-file, using:
#' # efc <- sji.SPSS("efc.sav", enc="UTF-8")
#' data(efc)
#' efc.val <- sji.getValueLabels(efc)
#' efc.var <- sji.getVariableLabels(efc)
#' sjp.frq(as.factor(efc$e15relat),
#' title=efc.var[['e15relat']],
#' axisLabels.x=efc.val['e15relat'],
#' axisLabelAngle.x=90)
#'
#' # bar plot with EUROFAMCARE sample dataset
#' # grouped variable
#' ageGrp <- sju.groupVar(efc$e17age)
#' ageGrpLab <- sju.groupVarLabels(efc$e17age)
#' sjp.frq(ageGrp,
#' title=efc.var[['e17age']],
#' axisLabels.x=ageGrpLab)
#' # minimal theme
#' sjp.frq(ageGrp,
#' title=efc.var[['e17age']],
#' axisLabels.x=ageGrpLab,
#' theme="minimal",
#' minorGridColor="white",
#' showTickMarks=FALSE,
#' hideGrid.x=TRUE)
#'
#' # box plots with interaction variable
#' # the following example is equal to the function call
#' # sjp.grpfrq(efc$e17age, efc$e16sex, type="box")
#' sjp.frq(efc$e17age,
#' title=paste(efc.var[['e17age']], "by", efc.var[['e16sex']]),
#' interactionVar=efc$e16sex,
#' interactionVarLabels=efc.val['e16sex'],
#' type="box")
#'
#' # negative impact scale, ranging from 7-28, assuming that
#' # variable scale (lowest value) starts with 1 (default assumption)
#' sjp.frq(efc$neg_c_7)
#'
#' # negative impact scale, ranging from 7-28, setting
#' # start index of x-axis to lowest value (7)
#' sjp.frq(efc$neg_c_7, startAxisAt=7)
#'
#' @import ggplot2
#' @export
sjp.frq <- function(varCount,
title=NULL,
titleSize=1.3,
titleColor="black",
weightBy=NULL,
weightByTitleString=NULL,
interactionVar=NULL,
maxYlim=FALSE,
upperYlim=NULL,
order="none",
type="bars",
axisLabels.x=NULL,
interactionVarLabels=NULL,
axisLabelAngle.x=0,
axisLabelSize=1.1,
axisLabelColor="gray30",
valueLabelSize=4,
valueLabelColor="black",
breakTitleAt=50,
breakLabelsAt=12,
gridBreaksAt=NULL,
barWidth=0.6,
dotSize=4,
barColor=NULL,
barAlpha=1,
barOutline=FALSE,
barOutlineSize=0.2,
innerBoxPlotWidth=0.15,
innerBoxPlotDotSize=3,
borderColor=NULL,
axisColor=NULL,
outlineColor="black",
majorGridColor=NULL,
minorGridColor=NULL,
hideGrid.x=FALSE,
hideGrid.y=FALSE,
showValueLabels=TRUE,
showCountValues=TRUE,
showPercentageValues=TRUE,
showAxisLabels.x=TRUE,
showAxisLabels.y=TRUE,
showTickMarks=TRUE,
showMeanIntercept=FALSE,
showMeanValue=TRUE,
showStandardDeviation=TRUE,
showNormalCurve=FALSE,
meanInterceptLineType=2,
meanInterceptLineSize=0.5,
normalCurveColor="red",
normalCurveSize=0.7,
normalCurveAlpha=0.4,
axisTitle.x=NULL,
axisTitle.y=NULL,
axisTitleColor="black",
axisTitleSize=1.3,
startAxisAt=1,
autoGroupAt=NULL,
theme=NULL,
flipCoordinates=FALSE,
omitNA=TRUE,
returnPlot=FALSE) {
# --------------------------------------------------------
# count variable may not be a factor!
# --------------------------------------------------------
if (is.factor(varCount)) {
varCount <- as.numeric(as.character(varCount))
}
# --------------------------------------------------------
# We have several options to name the histrogram type
# Here we will reduce it to a unique value
# --------------------------------------------------------
if (type=="b" || type=="bar") {
type <- c("bars")
}
if (type=="dot") {
type <- c("dots")
}
if (type=="h" || type=="hist") {
type <- c("histogram")
}
if (type=="d" || type=="density") {
type <- c("dens")
}
if (type=="l" || type=="lines") {
type <- c("line")
}
if (type=="box" || type=="boxplot") {
type <- c("boxplots")
}
if (type=="v") {
type <- c("violin")
}
#---------------------------------------------------
# weight variable
#---------------------------------------------------
# weightby <- function(var, weight) {
# items <- unique(var)
# newvar <- c()
# for (i in 1:length(items)) {
# newcount = round(sum(weight[which(var==items[i])]))
# newvar <- c(newvar, rep(items[i], newcount))
# }
# return (newvar)
# }
weightby <- function(var, weight) {
# init values
weightedvar <- c()
wtab <- round(xtabs(weight ~ var, data=data.frame(cbind(weight=weight,var=var))))
# iterate all table values
for (w in 1:length(wtab)) {
# retrieve count of each table cell
w_count <- wtab[[w]]
# retrieve "cell name" which is identical to the variable value
w_value <- as.numeric(names(wtab[w]))
# append variable value, repeating it "w_count" times.
weightedvar <- c(weightedvar, rep(w_value, w_count))
}
return(weightedvar)
}
if (!is.null(weightBy)) {
varCount <- weightby(varCount, weightBy)
}
#---------------------------------------------------
# check whether variable should be auto-grouped
#---------------------------------------------------
if (!is.null(autoGroupAt) && length(unique(varCount))>=autoGroupAt) {
cat(sprintf("\nVariable has %i unique values and was grouped...\n", length(unique(varCount))))
agcnt <- ifelse (autoGroupAt<30, autoGroupAt, 30)
axisLabels.x <- sju.groupVarLabels(varCount, groupsize="auto", autoGroupCount=agcnt)
varCount <- sju.groupVar(varCount, groupsize="auto", asNumeric=TRUE, autoGroupCount=agcnt)
}
#---------------------------------------------------
# create frequency data frame
#---------------------------------------------------
df <- as.data.frame(table(varCount))
names(df) <- c("y", "Freq")
# --------------------------------------------------------
# unlist labels
# --------------------------------------------------------
# Help function that unlists a list into a vector
unlistlabels <- function(lab) {
dummy <- unlist(lab)
labels <- c()
for (i in 1:length(dummy)) {
labels <- c(labels, as.character(dummy[i]))
}
return (labels)
}
if (!is.null(axisLabels.x) && is.list(axisLabels.x)) {
axisLabels.x <- unlistlabels(axisLabels.x)
}
if (!is.null(interactionVarLabels) && is.list(interactionVarLabels)) {
interactionVarLabels <- unlistlabels(interactionVarLabels)
}
# --------------------------------------------------------
# Define amount of category, include zero counts
# --------------------------------------------------------
# Zero counts of categories are not plotted by default just becaus
# these categories don't appear in the data. If we assume a
# "quasi-continuous" scale (categories from 1 to 4 etc.), we now
# identify the zero counts and add / insert them into the data frame.
# This enables us to plot zero counts as well.
# We guess the maximum amount of categories either by the amount
# of supplied category labels. If no category labels were passed
# as parameter, we assume that the maximum value found in the category
# columns represents the highest category number
catcount <- 0
lower_lim <- 0
catmin <- min(na.omit(varCount))
# Factors have to be transformed into numeric values
# for continiuos x-axis-scale
df$y <- as.numeric(as.character(df$y))
# if categories start with zero, fix this here
if (min(df$y)==0) {
df$y <- df$y+1
}
# get the highest answer category of "y", so we know where the
# range of the x-axis ends
if (!is.null(axisLabels.x)) {
catcount <- length(axisLabels.x)
}
else {
# determine maximum values
# first, check the total amount of different factor levels
catcount_1 <- length(unique(na.omit(varCount)))
# second, check the maximum factor level
catcount_2 <- max(na.omit(varCount))
# if categories start with zero, fix this here
if (min(na.omit(varCount))==0) {
catcount_2 <- catcount_2+1
}
# catcount should contain the higher values, i.e. the maximum count of
# categories (factor levels) corresponds either to the highest factor level
# value or to the amount of different factor levels, depending on which one
# is larger
catcount <- ifelse (catcount_1 > catcount_2, catcount_1, catcount_2)
}
# Create a vector of zeros
frq <- rep(0,catcount)
# Replace the values in freq for those indices which equal dummyf$xa
# by dummyf$ya so that remaining indices are ones which you
# intended to insert
frq[df$y] <- df$Freq
# create new data frame. We now have a data frame with all
# variable categories abd their related counts, including
# zero counts, but no(!) missings!
mydat <- as.data.frame(cbind(var=startAxisAt:catcount, frq=frq[startAxisAt:catcount]))
# caculate missings here
missingcount <- length(which(is.na(varCount)))
# --------------------------------------------------------
# --------------------------------------------------------
# Trim labels and title to appropriate size
# --------------------------------------------------------
# check length of diagram title and split longer string at into new lines
# every 50 chars
if (!is.null(title)) {
# if we have weighted values, say that in diagram's title
if (!is.null(weightByTitleString)) {
title <- paste(title, weightByTitleString, sep="")
}
title <- sju.wordwrap(title, breakTitleAt)
}
# check length of x-axis title and split longer string at into new lines
# every 50 chars
if (!is.null(axisTitle.x)) {
axisTitle.x <- sju.wordwrap(axisTitle.x, breakTitleAt)
}
# check length of x-axis title and split longer string at into new lines
# every 50 chars
if (!is.null(axisTitle.y)) {
axisTitle.y <- sju.wordwrap(axisTitle.y, breakTitleAt)
}
# check length of x-axis-labels and split longer strings at into new lines
# every 10 chars, so labels don't overlap
if (!is.null(axisLabels.x)) {
axisLabels.x <- sju.wordwrap(axisLabels.x, breakLabelsAt)
}
# If axisLabels.x were not defined, simply set numbers from 1 to
# amount of categories (=number of rows) in dataframe instead
else {
axisLabels.x <- c(startAxisAt:(nrow(mydat)+startAxisAt-1))
}
# check length of x-axis-labels of interaction variable and split
# longer strings into new lines
if (!is.null(interactionVar)) {
if (!is.null(interactionVarLabels)) {
interactionVarLabels <- sju.wordwrap(interactionVarLabels, breakLabelsAt)
}
# If interaction-variable-labels were not defined, simply set numbers from 1 to
# amount of categories instead
else {
iavarLabLength <- length(unique(na.omit(interactionVar)))
interactionVarLabels <- c(1:iavarLabLength)
}
}
# --------------------------------------------------------
# --------------------------------------------------------
# Handle missings
# --------------------------------------------------------
# If missings are not removed, add an
# "NA" to labels and a new row to data frame which contains the missings
if (!omitNA) {
axisLabels.x = c(axisLabels.x, "NA")
mydat <- rbind(mydat, c(catcount+1, missingcount))
# also add a columns with percentage values of count distribution
mydat <- data.frame(cbind(mydat, prz = c(round(100*mydat$frq/length(varCount),2))))
}
else {
# also add a columns with percentage values of count distribution
mydat <- data.frame(cbind(mydat, prz = c(round(100*mydat$frq/length(na.omit(varCount)),2))))
}
# --------------------------------------------------------
# --------------------------------------------------------
# Order categories ascending or descending
# --------------------------------------------------------
if (order=="asc" || order=="desc") {
ord <- order(mydat$frq, decreasing=(order=="desc"))
mydat$frq <- mydat$frq[ord]
mydat$prz <- mydat$prz[ord]
axisLabels.x <- axisLabels.x[ord]
}
# --------------------------------------------------------
# If we have a histogram, caluclate means of groups
# --------------------------------------------------------
if (is.null(weightBy)) {
mittelwert <- mean(varCount, na.rm=TRUE)
}
else {
mittelwert <- weighted.mean(varCount, weightBy, na.rm=TRUE)
}
stddev <- sd(varCount, na.rm=TRUE)
# --------------------------------------------------------
# If we have boxplots, use different data frame structure
# --------------------------------------------------------
if (type=="boxplots" || type=="violin") {
if (is.null(interactionVar)) {
mydat <- na.omit(data.frame(cbind(grp=1, frq=varCount, var=varCount)))
}
else {
mydat <- na.omit(data.frame(cbind(grp=1, ia=interactionVar, frq=varCount, var=varCount)))
mydat$ia <- as.factor(mydat$ia)
}
mydat$grp <- as.factor(mydat$grp)
}
# --------------------------------------------------------
# Prepare bar charts
# --------------------------------------------------------
trimViolin <- FALSE
# calculate upper y-axis-range
# if we have a fixed value, use this one here
if (!is.null(upperYlim)) {
upper_lim <- upperYlim
}
else {
# in case we have a histrogram, calculate
# max. y lim depending on highest value
if (type!="bars" && type!="dots") {
# if we have boxplots, we have different ranges, so we can adjust
# the y axis
if (type=="boxplots" || type=="violin") {
# use an extra standard-deviation as limits for the y-axis when we have boxplots
lower_lim <- min(na.omit(varCount)) - floor(sd(na.omit(varCount)))
upper_lim <- max(na.omit(varCount)) + ceiling(sd(na.omit(varCount)))
# make sure that the y-axis is not below zero
if (lower_lim < 0) {
lower_lim <- 0
trimViolin <- TRUE
}
}
else {
# ... or the amount of max. answers per category
upper_lim <- histYlim(varCount)
}
}
else {
# else calculate upper y-axis-range depending
# on the amount of cases...
if (maxYlim) {
upper_lim <- basisYlim(length(varCount))
}
else {
# ... or the amount of max. answers per category
upper_lim <- freqYlim(mydat$frq)
}
}
}
# --------------------------------------------------------
# check whether bars should have an outline
# --------------------------------------------------------
if (!barOutline) {
outlineColor <- waiver()
}
# --------------------------------------------------------
# define bar colors
# --------------------------------------------------------
# check whether barcolor is defined
if (is.null(barColor)) {
# set default color for histograms
barColor <- c("#4080c0")
if (type=="bars") {
geob <- geom_bar(stat="identity", colour=outlineColor, width=barWidth, alpha=barAlpha)
}
else if (type=="dots") {
geob <- geom_point(size=dotSize, alpha=barAlpha)
}
}
else {
# continue here, if barcolor is defined.
if (type=="bars") {
geob <- geom_bar(stat="identity", fill=barColor, colour=outlineColor, width=barWidth, alpha=barAlpha)
}
else if (type=="dots") {
geob <- geom_point(colour=barColor, size=dotSize, alpha=barAlpha)
}
}
# --------------------------------------------------------
# Set theme and default grid colours. grid colours
# might be adjusted later
# --------------------------------------------------------
hideGridColor <- c("white")
if (is.null(theme)) {
ggtheme <- theme_gray()
hideGridColor <- c("gray90")
}
else if (theme=="bw") {
ggtheme <- theme_bw()
}
else if (theme=="classic") {
ggtheme <- theme_classic()
}
else if (theme=="minimal") {
ggtheme <- theme_minimal()
}
else if (theme=="none") {
ggtheme <- theme_minimal()
majorGridColor <- c("white")
minorGridColor <- c("white")
showTickMarks <-FALSE
}
# --------------------------------------------------------
# Hide or show Tick Marks and Category Labels (x axis text)
# --------------------------------------------------------
if (!showTickMarks) {
ggtheme <- ggtheme + theme(axis.ticks = element_blank())
}
if (!showAxisLabels.x) {
axisLabels.x <- c("")
}
# --------------------------------------------------------
# Set value labels
# --------------------------------------------------------
if (flipCoordinates) {
# adjust vertical position for labels, based on whether percentage values
# are shown or not
vert <- ifelse((showPercentageValues == TRUE && showCountValues == TRUE), 0.5, 0.1)
hort <- 1.5
}
else {
# adjust vertical position for labels, based on whether percentage values
# are shown or not
vert <- ifelse((showPercentageValues == TRUE && showCountValues == TRUE), -0.2, -0.6)
hort <- waiver()
}
# --------------------------------------------------------
# Set value labels
# --------------------------------------------------------
# don't display value labels when we have boxplots or violin plots
if (type=="boxplots" || type=="violin") {
showValueLabels <- FALSE
}
if (showValueLabels) {
# here we have counts and percentages
if (showPercentageValues && showCountValues) {
ggvaluelabels <- geom_text(label=sprintf("%i\n(%.01f%%)", mydat$frq, mydat$prz),
size=valueLabelSize,
vjust=vert,
hjust = hort,
colour=valueLabelColor)
}
else if (showCountValues) {
# here we have counts, without percentages
ggvaluelabels <- geom_text(label=sprintf("%i", mydat$frq),
hjust = hort,
size=valueLabelSize,
vjust=vert,
colour=valueLabelColor)
}
else if (showPercentageValues) {
# here we have counts, without percentages
ggvaluelabels <- geom_text(label=sprintf("%.01f%%", mydat$prz),
hjust = hort,
size=valueLabelSize,
vjust=vert,
colour=valueLabelColor)
}
else {
# no labels
ggvaluelabels <- geom_text(label="")
}
}
else {
# no labels
ggvaluelabels <- geom_text(label="")
}
# --------------------------------------------------------
# Set up grid breaks
# --------------------------------------------------------
maxx <- max(mydat$var) + 1
if (is.null(gridBreaksAt)) {
gridbreaks <- waiver()
histgridbreaks <- waiver()
}
else {
gridbreaks <- c(seq(lower_lim, upper_lim, by=gridBreaksAt))
histgridbreaks <- c(seq(lower_lim, maxx, by=gridBreaksAt))
}
# --------------------------------------------------------
# Set up grid colours
# --------------------------------------------------------
majorgrid <- NULL
minorgrid <- NULL
if (!is.null(majorGridColor)) {
majorgrid <- element_line(colour=majorGridColor)
}
if (!is.null(minorGridColor)) {
minorgrid <- element_line(colour=minorGridColor)
}
hidegrid <- element_line(colour=hideGridColor)
# ----------------------------------
# set y scaling and label texts
# ----------------------------------
# set Y-axis, depending on the calculated upper y-range.
# It either corresponds to the maximum amount of cases in the data set
# (length of var) or to the highest count of var's categories.
if (showAxisLabels.y) {
yscale <- scale_y_continuous(limits=c(lower_lim, upper_lim), expand=c(0,0), breaks=gridbreaks)
}
else {
yscale <- scale_y_continuous(limits=c(lower_lim, upper_lim), expand=c(0,0), breaks=gridbreaks, labels=NULL)
}
# ----------------------------------
# Print plot
# ----------------------------------
# Check how many categories we have on the x-axis.
# If it exceeds the user defined limits, plot
# histrogram instead of bar chart
# ----------------------------------
if (type=="bars" || type=="dots") {
# mydat is a data frame that only contains one variable (var).
# Must be declared as factor, so the bars are central aligned to
# each x-axis-break.
baseplot <- ggplot(mydat, aes(x=factor(var), y=frq, fill=var)) +
geob +
yscale +
# remove guide / legend
guides(fill=FALSE) +
# show absolute and percentage value of each bar.
ggvaluelabels +
# print value labels to the x-axis.
# If parameter "axisLabels.x" is NULL, the category numbers (1 to ...)
# appear on the x-axis
scale_x_discrete(labels=axisLabels.x)
# check whether coordinates should be flipped, i.e.
# swap x and y axis
if (flipCoordinates) {
baseplot <- baseplot + coord_flip()
}
}
else {
# --------------------------------------------------
# Here we start when we have a histogram instead of
# bar plot.
# --------------------------------------------------
# Start density plot here
# --------------------------------------------------
if (type=="boxplots" || type=="violin") {
if (is.null(interactionVar)) {
baseplot <- ggplot(mydat, aes(x=grp, y=frq))
scalex <- scale_x_discrete(labels="")
}
else {
baseplot <- ggplot(mydat, aes(x=interaction(ia, grp), y=frq))
scalex <- scale_x_discrete(labels=interactionVarLabels)
}
if (type=="boxplots") {
baseplot <- baseplot +
geom_boxplot(colour=outlineColor, width=barWidth, alpha=barAlpha, fill=barColor)
}
else {
baseplot <- baseplot +
geom_violin(colour=outlineColor, width=barWidth, alpha=barAlpha, fill=barColor, trim=trimViolin) +
# if we have a violin plot, add an additional boxplot inside to show
# more information
geom_boxplot(width=innerBoxPlotWidth, fill="white", outlier.colour=NA)
}
# if we have boxplots or violon plots, also add a point that indicates
# the mean value
# different fill colours, because violin boxplots have white background
fcsp <- ifelse(type=="boxplots", "white", "black")
baseplot <- baseplot +
stat_summary(fun.y="mean", geom="point", shape=21, size=innerBoxPlotDotSize, fill=fcsp)
# no additional labels for the x- and y-axis, only diagram title
baseplot <- baseplot +
yscale +
scalex
}
# --------------------------------------------------
# Start density plot here
# --------------------------------------------------
else if (type=="dens") {
geoh <- geom_histogram(fill=barColor, colour=outlineColor, size=barOutlineSize, alpha=barAlpha)
densityDat <- data.frame(cbind(varCount))
# First, plot histogram with density curve
baseplot <- ggplot(densityDat, aes(x=varCount, y=..density..)) +
geoh +
# transparent density curve above bars
geom_density(fill="cornsilk", alpha=0.3) +
# remove margins from left and right diagram side
scale_x_continuous(expand=c(0,0), breaks=histgridbreaks)
}
else {
# -----------------------------------------------------------------
# Since the density curve shows no absolute numbers (counts) on the
# y-axis, have also the opportunity to plot "real" histrograms with
# counts on the y-axis
# -----------------------------------------------------------------
basehist <- geom_histogram(stat="identity", fill=barColor, colour=outlineColor, size=barOutlineSize, alpha=barAlpha, binwidth=barWidth)
basehistline <- geom_area(fill=barColor, alpha=0.3)
# base constructor
baseplot <- ggplot(mydat, aes(x=var, y=frq))
# check whether user wants line or bar histogram
if (type=="line") {
baseplot <- baseplot + basehistline + geom_line()
}
else {
baseplot <- baseplot + basehist
}
# check whether user wants to overlay the histogram
# with a normal curve
if (showNormalCurve) {
baseplot <- baseplot +
stat_function(fun=function(x, mean, sd, n) { n*dnorm(x=x, mean=mean, sd=sd) },
args=with(mydat, c(mean=mittelwert, sd=stddev, n=length(varCount))),
colour=normalCurveColor,
size=normalCurveSize,
alpha=normalCurveAlpha)
}
# if we have a histogram, add mean-lines
if (showMeanIntercept) {
baseplot <- baseplot +
# vertical lines indicating the mean
geom_vline(xintercept=mittelwert, linetype=meanInterceptLineType, size=meanInterceptLineSize)
# check whether meanvalue should be shown.
if (showMeanValue) {
baseplot <- baseplot +
# use annotation instead of geomtext, because we need mean value only printed once
annotate("text", x=mittelwert, y=upper_lim, parse=TRUE, label=paste("italic(bar(x)) == ", "'", c(round(mittelwert,1)), "'"), size=valueLabelSize, colour=valueLabelColor, hjust=1.1, vjust=2.2)
}
# check whether the user wants to plot standard deviation area
if (showStandardDeviation) {
baseplot <- baseplot +
# first draw shaded rectangle. these are by default in grey colour with very high transparancy
annotate("rect", xmin=mittelwert-stddev, xmax=mittelwert+stddev, ymin=0, ymax=c(upper_lim), fill="grey70", alpha=0.2) +
# draw border-lines for shaded rectangle
geom_vline(xintercept=mittelwert-stddev, linetype=3, size=meanInterceptLineSize, alpha=0.7) +
geom_vline(xintercept=mittelwert+stddev, linetype=3, size=meanInterceptLineSize, alpha=0.7)
# if mean values are plotted, plot standard deviation values as well
if (showMeanValue) {
baseplot <- baseplot +
# use annotation instead of geomtext, because we need mean value only printed once
annotate("text", x=mittelwert, y=upper_lim, label=sprintf("italic(s) == %.2f", round(stddev,1)), parse=TRUE, size=valueLabelSize, colour=valueLabelColor, hjust=1.15, vjust=4.2)
}
}
}
baseplot <- baseplot +
# show absolute and percentage value of each bar.
ggvaluelabels +
# remove margins from left and right diagram side
scale_x_continuous(limits=c(catmin,maxx), expand=c(0,0), breaks=histgridbreaks) +
yscale
}
}
# set axes text and
baseplot <- baseplot +
labs(title=title, x=axisTitle.x, y=axisTitle.y) +
ggtheme +
theme(axis.text = element_text(size=rel(axisLabelSize), colour=axisLabelColor),
axis.text.x = element_text(angle=axisLabelAngle.x),
axis.title = element_text(size=rel(axisTitleSize), colour=axisTitleColor),
plot.title = element_text(size=rel(titleSize), colour=titleColor))
# the panel-border-property can only be applied to the bw-theme
if (!is.null(borderColor)) {
if (!is.null(theme) && theme=="bw") {
baseplot <- baseplot +
theme(panel.border = element_rect(colour=borderColor))
}
else {
cat("\nParameter 'borderColor' can only be applied to 'bw' theme.\n")
}
}
if (!is.null(axisColor)) {
baseplot <- baseplot +
theme(axis.line = element_line(colour=axisColor))
}
if (!is.null(minorgrid)) {
baseplot <- baseplot +
theme(panel.grid.minor = minorgrid)
}
if (!is.null(majorgrid)) {
baseplot <- baseplot +
theme(panel.grid.major = majorgrid)
}
if (hideGrid.x) {
baseplot <- baseplot +
theme(panel.grid.major.x = hidegrid,
panel.grid.minor.x = hidegrid)
}
if (hideGrid.y) {
baseplot <- baseplot +
theme(panel.grid.major.y = hidegrid,
panel.grid.minor.y = hidegrid)
}
# ---------------------------------------------------------
# Check whether ggplot object should be returned or plotted
# ---------------------------------------------------------
if (returnPlot) {
return(baseplot)
}
else {
plot(baseplot)
}
}
# Berechnet die aufgerundete Obergrenze der y-Achse anhand
# der maximal möglichen Fallzahl einer Antwortmöglichkeit
# Dadurch werden Balkendiagramme eines Datensatzes immer im
# gleichen Vergältnis dargestellt, da die y-Achse nie variiert,
# sondern immer von 0 bis (Anzahl der Fälle) geht.
#
# Parameter:
# - len: die Anzahl an max. möglichen Fällen
basisYlim <- function(len) {
anzahl <- 1
while (len>=(10*anzahl)) {
anzahl <- anzahl * 10
}
while(len>=anzahl) {
anzahl <- anzahl + round(anzahl/10,0)
}
# retval <- (ceiling(len/anzahl)*anzahl)
# return (retval)
return (anzahl)
}
# Berechnet die aufgerundete Obergrenze der y-Achse anhand
# des höchsten Datenwertes einer Antwortmöglichkeit.
# Dadurch werden Balkendiagramme eines Datensatzes immer unterschiedlich
# dargestellt, je nach Anzahl der häufigsten Antworten. Die y-Achse
# geht immer von 0 bis (maximale Antworthäufigkeit einer Variable)
#
# Parameter:
# - var: die Variable mit den Antwortmöglichkeiten
freqYlim <- function(var) {
# suche die Antwort mit den häufigsten Antworten,
# also den höchsten Wert einer Variablenausprägung
len <- max(var)
anzahl <- 5
while (len>=(10*anzahl)) {
anzahl <- anzahl + 5
}
correct <- 10+(floor(log10(len))-1)
return (correct*anzahl)
}
histYlim <- function(var) {
# suche die Antwort mit den häufigsten Antworten,
# also den höchsten Wert einer Variablenausprägung
len <- max(table(var))
if (len<100) {
anzahl <- 10
}
else {
anzahl <- 100
}
li <- ceiling(len/anzahl)
if ((li %% 2) == 1) {
li <- li+1
}
retval <- li*anzahl
return (retval)
}
# usage:
# df<-insertRowToDF(df,5,c(16,0)); # inserting the values (16,0) after the 5th row
insertRowToDF<-function(X,index_after,vector_to_insert){
stopifnot(length(vector_to_insert) == ncol(X)) # to check valid row to be inserted
X<-rbind(X[1:index_after,],vector_to_insert,X[(index_after+1):nrow(X),])
row.names(X)<-1:nrow(X)
return (X)
}
Computing file changes ...