https://github.com/cran/sjPlot
Tip revision: 01a8850ec45ac0f48f2b4fad6d22cf7b88ac0df0 authored by Daniel Lüdecke on 23 August 2017, 07:16:47 UTC
version 2.3.3
version 2.3.3
Tip revision: 01a8850
sjPlotClusterAnalysis.R
# bind global variables
utils::globalVariables(c("xpos", "value", "Var2", "grp", "prc", "fg", "cprc", "se", "group", "var", "kmeans"))
#' @title Compute quick cluster analysis
#' @name sjc.qclus
#' @description Compute a quick kmeans or hierarchical cluster analysis and displays "cluster characteristics"
#' as plot.
#'
#' @references Maechler M, Rousseeuw P, Struyf A, Hubert M, Hornik K (2014) cluster: Cluster Analysis Basics and Extensions. R package.
#'
#' @param data A data frame with variables that should be used for the
#' cluster analysis.
#' @param groupcount Amount of groups (clusters) used for the cluster solution. May also be
#' a set of initial (distinct) cluster centres, in case \code{method = "kmeans"}
#' (see \code{\link{kmeans}} for details on \code{centers} argument).
#' If \code{groupcount = NULL} and \code{method = "kmeans"}, the optimal
#' amount of clusters is calculated using the gap statistics (see
#' \code{\link{sjc.kgap}}). For \code{method = "hclust"}, \code{groupcount}
#' needs to be specified. Following functions may be helpful for estimating
#' the amount of clusters:
#' \itemize{
#' \item Use \code{\link{sjc.elbow}} to determine the group-count depending on the elbow-criterion.
#' \item If \code{method = "kmeans"}, use \code{\link{sjc.kgap}} to determine the group-count according to the gap-statistic.
#' \item If \code{method = "hclust"} (hierarchical clustering, default), use \code{\link{sjc.dend}} to inspect different cluster group solutions.
#' \item Use \code{\link{sjc.grpdisc}} to inspect the goodness of grouping (accuracy of classification).
#' }
#' @param groups Optional, by default, this argument is \code{NULL} and will be
#' ignored. However, to plot existing cluster groups, specify \code{groupcount}
#' and \code{groups}. \code{groups} is a vector of same length as
#' \code{nrow(data)} and indicates the group classification of the cluster
#' analysis. The group classification can be computed with the
#' \code{\link{sjc.cluster}} function. See 'Examples'.
#' @param method Method for computing the cluster analysis. By default (\code{"kmeans"}), a
#' kmeans cluster analysis will be computed. Use \code{"hclust"} to
#' compute a hierarchical cluster analysis. You can specify the
#' initial letters only.
#' @param distance Distance measure to be used when \code{method = "hclust"} (for hierarchical
#' clustering). Must be one of \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"},
#' \code{"canberra"}, \code{"binary"} or \code{"minkowski"}. See \code{\link{dist}}.
#' If is \code{method = "kmeans"} this argument will be ignored.
#' @param agglomeration Agglomeration method to be used when \code{method = "hclust"} (for hierarchical
#' clustering). This should be one of \code{"ward"}, \code{"single"}, \code{"complete"}, \code{"average"},
#' \code{"mcquitty"}, \code{"median"} or \code{"centroid"}. Default is \code{"ward"} (see \code{\link{hclust}}).
#' If \code{method = "kmeans"} this argument will be ignored. See 'Note'.
#' @param iter.max Maximum number of iterations allowed. Only applies, if
#' \code{method = "kmeans"}. See \code{\link{kmeans}} for details on this argument.
#' @param algorithm Algorithm used for calculating kmeans cluster. Only applies, if
#' \code{method = "kmeans"}. May be one of \code{"Hartigan-Wong"} (default),
#' \code{"Lloyd"} (used by SPSS), or \code{"MacQueen"}. See \code{\link{kmeans}}
#' for details on this argument.
#' @param show.accuracy Logical, if \code{TRUE}, the \code{\link{sjc.grpdisc}} function will be called,
#' which computes a linear discriminant analysis on the classified cluster groups and plots a
#' bar graph indicating the goodness of classification for each group.
#' @param show.grpcnt Logical, if \code{TRUE} (default), the count within each cluster group is added to the
#' legend labels (e.g. \code{"Group 1 (n=87)"}).
#' @param reverse.axis Logical, if \code{TRUE}, the values on the x-axis are reversed.
#'
#' @inheritParams sjp.grpfrq
#'
#' @return (Invisibly) returns an object with
#' \itemize{
#' \item \code{data}: the used data frame for plotting,
#' \item \code{plot}: the ggplot object,
#' \item \code{groupcount}: the number of found cluster (as calculated by \code{\link{sjc.kgap}})
#' \item \code{classification}: the group classification (as calculated by \code{\link{sjc.cluster}}), including missing values, so this vector can be appended to the original data frame.
#' \item \code{accuracy}: the accuracy of group classification (as calculated by \code{\link{sjc.grpdisc}}).
#' }
#'
#' @details Following steps are computed in this function:
#' \enumerate{
#' \item If \code{method = "kmeans"}, this function first determines the optimal group count via gap statistics (unless argument \code{groupcount} is specified), using the \code{\link{sjc.kgap}} function.
#' \item A cluster analysis is performed by running the \code{\link{sjc.cluster}} function to determine the cluster groups.
#' \item Then, all variables in \code{data} are scaled and centered. The mean value of these z-scores within each cluster group is calculated to see how certain characteristics (variables) in a cluster group differ in relation to other cluster groups.
#' \item These results are plotted as graph.
#' }
#' This method can also be used to plot existing cluster solution as graph witouth computing
#' a new cluster analysis. See argument \code{groups} for more details.
#'
#' @note See 'Note' in \code{\link{sjc.cluster}}
#'
#' @examples
#' \dontrun{
#' # k-means clustering of mtcars-dataset
#' sjc.qclus(mtcars)
#'
#' # k-means clustering of mtcars-dataset with 4 pre-defined
#' # groups in a faceted panel
#' sjc.qclus(airquality, groupcount = 4, facet.grid = TRUE)}
#'
#' # k-means clustering of airquality data
#' # and saving the results. most likely, 3 cluster
#' # groups have been found (see below).
#' airgrp <- sjc.qclus(airquality)
#'
#' # "re-plot" cluster groups, without computing
#' # new k-means cluster analysis.
#' sjc.qclus(airquality, groupcount = 3, groups = airgrp$classification)
#'
#' @import ggplot2
#' @importFrom stats na.omit
#' @importFrom graphics plot
#' @importFrom sjmisc std
#' @export
sjc.qclus <- function(data,
groupcount = NULL,
groups = NULL,
method = c("kmeans", "hclust"),
distance = c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"),
agglomeration = c("ward", "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid"),
iter.max = 20,
algorithm = c("Hartigan-Wong", "Lloyd", "MacQueen"),
show.accuracy = FALSE,
title = NULL,
axis.labels = NULL,
wrap.title = 40,
wrap.labels = 20,
wrap.legend.title = 20,
wrap.legend.labels = 20,
facet.grid = FALSE,
geom.colors = "Paired",
geom.size = 0.5,
geom.spacing = 0.1,
show.legend = TRUE,
show.grpcnt = TRUE,
legend.title = NULL,
legend.labels = NULL,
coord.flip = FALSE,
reverse.axis = FALSE,
prnt.plot = TRUE) {
# --------------------------------------------------------
# match arguments
# --------------------------------------------------------
distance <- match.arg(distance)
method <- match.arg(method)
agglomeration <- match.arg(agglomeration)
algorithm <- match.arg(algorithm)
# --------------------------------------------------------
# save original data frame
# --------------------------------------------------------
rownames(data) <- seq_len(nrow(data))
data.origin <- data
# remove missings
data <- stats::na.omit(data)
# check for valid argument
if (is.null(axis.labels)) axis.labels <- colnames(data)
# --------------------------------------------------------
# Trim labels and title to appropriate size
# --------------------------------------------------------
# check length of diagram title and split longer string at into new lines
if (!is.null(title)) title <- sjmisc::word_wrap(title, wrap.title)
# check length of legend title and split longer string at into new lines
if (!is.null(legend.title)) legend.title <- sjmisc::word_wrap(title, wrap.legend.title)
# check length of y-axis title and split longer string at into new lines
if (!is.null(legend.labels)) legend.labels <- sjmisc::word_wrap(legend.labels, wrap.legend.labels)
# check length of x-axis-labels and split longer strings at into new lines
# every 10 chars, so labels don't overlap
axis.labels <- sjmisc::word_wrap(axis.labels, wrap.labels)
# ---------------------------------------------
# check for auto-groupcount
# ---------------------------------------------
if (is.null(groupcount)) {
# ------------------------
# check if suggested package is available
# ------------------------
if (!requireNamespace("cluster", quietly = TRUE)) {
stop("Package `cluster` needed for this function to work. Please install it.", call. = FALSE)
}
# check whether method is kmeans. hierarchical clustering
# requires a specified groupcount
if (method != "kmeans") {
message("Cannot compute hierarchical cluster analysis when `groupcount` is NULL. Using kmeans clustering instead.")
method <- "kmeans"
}
# retrieve optimal group count via gap statistics
kgap <- sjc.kgap(data, plotResults = F)
# save group counts
groupcount <- kgap$solution
}
# ---------------------------------------------
# run cluster analysis with claculated group count
# ---------------------------------------------
if (is.null(groups)) {
# check for argument and R version
if (agglomeration == "ward") agglomeration <- "ward.D2"
grp.class <- grp <- sjc.cluster(data.origin, groupcount, method, distance, agglomeration, iter.max, algorithm)
} else {
grp.class <- grp <- groups
}
# remove missings
grp <- stats::na.omit(grp)
# ---------------------------------------------
# check whether groupcount was matrix or not
# ---------------------------------------------
if (is.matrix(groupcount)) groupcount <- length(unique(grp))
# ---------------------------------------------
# auto-set legend labels
# ---------------------------------------------
if (is.null(legend.labels)) legend.labels <- sprintf("Group %i", seq_len(groupcount))
# --------------------------------------------------------
# show goodness of classification
# --------------------------------------------------------
grp.accuracy <- sjc.grpdisc(data,
groups = grp,
groupcount = groupcount,
prnt.plot = show.accuracy)
# ---------------------------------------------
# Add group count to legend labels
# ---------------------------------------------
if (show.grpcnt || show.accuracy) {
# iterate legend labels
for (i in seq_len(length(legend.labels))) {
# label string for group count
gcnt.label <- sprintf("n=%i", length(which(grp == i)))
# label string for accuracy
acc.label <- sprintf("accuracy=%.2f%%", 100 * grp.accuracy$accuracy[i])
# prepare legend label
legend.labels[i] <- paste0(legend.labels[i], " (")
# add group count to each label
if (show.grpcnt) legend.labels[i] <- paste0(legend.labels[i], gcnt.label)
if (show.grpcnt && show.accuracy) legend.labels[i] <- paste0(legend.labels[i], ", ")
if (show.accuracy) legend.labels[i] <- paste0(legend.labels[i], acc.label)
legend.labels[i] <- paste0(legend.labels[i], ")")
}
}
# scale data
z <- sjmisc::std(data)
# retrieve column count
colnr <- ncol(data)
# init data frame
df <- data.frame()
# ---------------------------------------------
# iterate all columns to calculate group means
# ---------------------------------------------
for (cnt in seq_len(colnr)) {
# retrieve column data
coldat <- z[[cnt]]
# ---------------------------------------------
# iterate groups
# ---------------------------------------------
for (i in seq_len(groupcount)) {
# retrieve column values for each group
grpvalues <- coldat[which(grp == i)]
# calculate mean
mw <- mean(grpvalues, na.rm = TRUE)
df <- rbind(df, cbind(x = cnt, y = mw, group = i))
}
# df %.% group_by(grp) %.% summarise(mean(a))
}
# --------------------------------------------------------
# factor for discrete scale
# --------------------------------------------------------
df$group <- as.factor(df$group)
# --------------------------------------------------------
# create plot
# --------------------------------------------------------
if (reverse.axis) {
gp <- ggplot(df, aes(x = rev(x), y = y, fill = group))
axis.labels <- rev(axis.labels)
} else {
gp <- ggplot(df, aes_string(x = "x", y = "y", fill = "group"))
}
gp <- gp +
geom_bar(stat = "identity",
position = position_dodge(geom.size + geom.spacing),
width = geom.size) +
scale_x_discrete(breaks = seq_len(colnr),
limits = seq_len(colnr),
labels = axis.labels) +
labs(title = title, x = "Cluster group characteristics", y = "Mean of z-scores", fill = legend.title)
# --------------------------------------------------------
# check whether coordinates should be flipped, i.e.
# swap x and y axis
# --------------------------------------------------------
if (coord.flip) gp <- gp + coord_flip()
# --------------------------------------------------------
# use facets
# --------------------------------------------------------
if (facet.grid) gp <- gp + facet_wrap(~group)
# ---------------------------------------------------------
# set geom colors
# ---------------------------------------------------------
gp <- sj.setGeomColors(gp,
geom.colors,
length(legend.labels),
show.legend,
legend.labels)
# --------------------------------------------------------
# plot
# --------------------------------------------------------
if (prnt.plot) graphics::plot(gp)
# --------------------------------------------------------
# return values
# --------------------------------------------------------
invisible(structure(class = "sjcqclus",
list(data = df,
groupcount = groupcount,
classification = grp.class,
accuracy = grp.accuracy$accuracy,
plot = gp)))
}
#' @title Compute hierarchical or kmeans cluster analysis
#' @name sjc.cluster
#' @description Compute hierarchical or kmeans cluster analysis and return the group
#' association for each observation as vector.
#'
#' @references Maechler M, Rousseeuw P, Struyf A, Hubert M, Hornik K (2014) cluster: Cluster Analysis Basics and Extensions. R package.
#'
#' @inheritParams sjc.qclus
#'
#' @return The group classification for each observation as vector. This group
#' classification can be used for \code{\link{sjc.grpdisc}}-function to
#' check the goodness of classification.
#' The returned vector includes missing values, so it can be appended
#' to the original data frame \code{data}.
#'
#' @note Since R version > 3.0.3, the \code{"ward"} option has been replaced by
#' either \code{"ward.D"} or \code{"ward.D2"}, so you may use one of
#' these values. When using \code{"ward"}, it will be replaced by \code{"ward.D2"}.
#' \cr \cr
#' To get similar results as in SPSS Quick Cluster function, following points
#' have to be considered:
#' \enumerate{
#' \item Use the \code{/PRINT INITIAL} option for SPSS Quick Cluster to get a table with initial cluster centers.
#' \item Create a \code{\link{matrix}} of this table, by consecutively copying the values, one row after another, from the SPSS output into a matrix and specify \code{nrow} and \code{ncol} arguments.
#' \item Use \code{algorithm="Lloyd"}.
#' \item Use the same amount of \code{iter.max} both in SPSS and this \code{sjc.qclus}.
#' }
#' This ensures a fixed initial set of cluster centers (as in SPSS), while \code{\link{kmeans}} in R
#' always selects initial cluster sets randomly.
#'
#' @examples
#' # Hierarchical clustering of mtcars-dataset
#' groups <- sjc.cluster(mtcars, 5)
#'
#' # K-means clustering of mtcars-dataset
#' groups <- sjc.cluster(mtcars, 5, method="k")
#'
#' @import ggplot2
#' @importFrom stats dist na.omit hclust kmeans cutree
#' @export
sjc.cluster <- function(data,
groupcount = NULL,
method = c("hclust", "kmeans"),
distance = c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"),
agglomeration = c("ward", "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid"),
iter.max = 20,
algorithm = c("Hartigan-Wong", "Lloyd", "MacQueen")) {
# --------------------------------------------------------
# match arguments
# --------------------------------------------------------
distance <- match.arg(distance)
method <- match.arg(method)
agglomeration <- match.arg(agglomeration)
algorithm <- match.arg(algorithm)
# --------------------------------------------------------
# save original data frame
# --------------------------------------------------------
data.origin <- data
# create id with index numbers for rows
data.origin$sj.grp.id <- seq_len(nrow(data.origin))
# create NA-vector of same length as data frame
complete.groups <- rep(NA, times = nrow(data.origin))
# Prepare Data
# listwise deletion of missing
data <- stats::na.omit(data)
# remove missings
data.origin <- stats::na.omit(data.origin)
# ---------------------------------------------
# check for auto-groupcount
# ---------------------------------------------
if (is.null(groupcount)) {
# ------------------------
# check if suggested package is available
# ------------------------
if (!requireNamespace("cluster", quietly = TRUE)) {
stop("Package `cluster` needed for this function to work. Please install it.", call. = FALSE)
}
# check whether method is kmeans. hierarchical clustering
# requires a specified groupcount
if (method != "kmeans") {
message("Cannot compute hierarchical cluster analysis when `groupcount` is NULL. Using kmeans clustering instead.")
method <- "kmeans"
}
# retrieve optimal group count via gap statistics
kgap <- sjc.kgap(data, plotResults = F)
# save group counts
groupcount <- kgap$solution
}
# --------------------------------------------------
# Ward Hierarchical Clustering
# --------------------------------------------------
if (method == "hclust") {
# check for argument and R version
if (agglomeration == "ward") agglomeration <- "ward.D2"
# distance matrix
d <- stats::dist(data, method = distance)
# hierarchical clustering, using ward
hc <- stats::hclust(d, method = agglomeration)
# cut tree into x clusters
groups <- stats::cutree(hc, k = groupcount)
} else {
km <- stats::kmeans(data, centers = groupcount, iter.max = iter.max, algorithm = algorithm)
# return cluster assignment
groups <- km$cluster
}
# -----------------------------------
# create vector with cluster group classification,
# including missings
# -----------------------------------
# assign valid group values
complete.groups[data.origin$sj.grp.id] <- groups
# return group assignment
return(complete.groups)
}
#' @title Compute hierarchical cluster analysis and visualize group classification
#' @name sjc.dend
#' @description Computes a hierarchical cluster analysis and plots a hierarchical
#' dendrogram with highlighted rectangles around the classified groups.
#' Can be used, for instance, as visual tool to verify the elbow-criterion
#' (see \code{\link{sjc.elbow}}).
#'
#' @param groupcount The amount of groups (clusters) that should be used.
#' \itemize{
#' \item Use \code{\link{sjc.elbow}}-function to determine the group-count depending on the elbow-criterion.
#' \item Use \code{\link{sjc.grpdisc}}-function to inspect the goodness of grouping (accuracy of classification).
#' }
#' Solutions for multiple cluster groups can be plotted, for instance with \code{"groupcount = c(3:6)"}.
#'
#' @inheritParams sjc.qclus
#'
#' @note Since R version > 3.0.3, the \code{"ward"} option has
#' been replaced by either \code{"ward.D"} or \code{"ward.D2"},
#' so you may use one of these values. When using \code{"ward"},
#' it will be replaced by \code{"ward.D2"}.
#'
#' @examples
#' # Plot dendrogram of hierarchical clustering of mtcars-dataset
#' # and show group classification
#' sjc.dend(mtcars, 5)
#'
#' # Plot dendrogram of hierarchical clustering of mtcars-dataset
#' # and show group classification for 2 to 4 groups
#' sjc.dend(mtcars, 2:4)
#'
#' @importFrom stats dist hclust cutree na.omit rect.hclust
#' @importFrom scales brewer_pal
#' @importFrom graphics plot rect par
#' @export
sjc.dend <- function(data, groupcount, distance = "euclidean", agglomeration = "ward") {
# Prepare Data
# listwise deletion of missing
data <- stats::na.omit(data)
# --------------------------------------------------
# Ward Hierarchical Clustering
# --------------------------------------------------
# distance matrix
d <- stats::dist(data, method = distance)
# check for argument and R version
if (agglomeration == "ward") agglomeration <- "ward.D2"
# hierarchical clustering, using ward
hc <- stats::hclust(d, method = agglomeration)
# display simple dendrogram
graphics::plot(hc,
main = "Cluster Dendrogramm",
xlab = sprintf("Hierarchical Cluster Analysis, %s-Method",
agglomeration))
# now plot overlaying rectangles, depending on the amount of groupcounts
gl <- length(groupcount)
if (gl > 1) {
# retrieve different colors
color <- scales::brewer_pal("qual", "Set1")(gl)
# iterate all groupcounts
for (cnt in seq_len(gl)) {
k <- groupcount[cnt]
# retrieve cluster
cluster <- stats::cutree(hc, k = k)
# create table with cluster groups
clustab <- table(cluster)[unique(cluster[hc$order])]
m <- c(0, cumsum(clustab))
which <- 1L:k
# draw dendrogram with red borders around the clusters
# source code taken from "rect.hclust" from base-package
for (n in seq_along(which)) {
graphics::rect(m[which[n]] + 0.46 + (cnt * 0.2),
graphics::par("usr")[3L],
m[which[n] + 1] + 0.53 - (cnt * 0.2),
mean(rev(hc$height)[(k - 1):k]),
border = color[cnt],
lwd = 2)
}
}
} else {
# draw dendrogram with red borders around the clusters
stats::rect.hclust(hc, k = groupcount, border = "red")
}
}
#' @title Compute a linear discriminant analysis on classified cluster groups
#' @name sjc.grpdisc
#' @description Computes linear discriminant analysis on classified cluster groups.
#' This function plots a bar graph indicating the goodness of classification
#' for each group.
#'
#' @param groups group classification of the cluster analysis that was returned
#' from the \code{\link{sjc.cluster}}-function
#' @param groupcount amount of groups (clusters) that should be used. Use
#' \code{\link{sjc.elbow}} to determine the group-count depending
#' on the elbow-criterion.
#' @param clss.fit logical, if \code{TRUE} (default), a vertical line indicating the
#' overall goodness of classification is added to the plot, so one can see
#' whether a certain group is below or above the average classification goodness.
#'
#' @inheritParams sjc.cluster
#' @inheritParams sjp.grpfrq
#'
#' @return (Invisibly) returns an object with
#' \itemize{
#' \item \code{data}: the used data frame for plotting,
#' \item \code{plot}: the ggplot object,
#' \item \code{accuracy}: a vector with the accuracy of classification for each group,
#' \item \code{total.accuracy}: the total accuracy of group classification.
#' }
#'
#' @examples
#' # retrieve group classification from hierarchical cluster analysis
#' # on the mtcars data set (5 groups)
#' groups <- sjc.cluster(mtcars, 5)
#'
#' # plot goodness of group classificatoin
#' sjc.grpdisc(mtcars, groups, 5)
#'
#' @importFrom stats na.omit
#' @importFrom graphics plot
#' @importFrom MASS lda
#' @import ggplot2
#' @export
sjc.grpdisc <- function(data, groups, groupcount, clss.fit = TRUE, prnt.plot = TRUE) {
# Prepare Data
# listwise deletion of missing
data <- stats::na.omit(data)
groups <- stats::na.omit(groups)
# ---------------------------------------------------------------
# compute discriminant analysis of groups on original data frame
# ---------------------------------------------------------------
disc <- MASS::lda(groups ~ ., data = data, na.action = "na.omit", CV = TRUE)
# ---------------------------------------------------------------
# Assess the accuracy of the prediction
# percent correct for each category of groups
# ---------------------------------------------------------------
ct <- table(groups, disc$class)
dg <- diag(prop.table(ct, 1))
# ---------------------------------------------------------------
# print barplot for correct percentage for each category of groups
# ---------------------------------------------------------------
perc <- round(100 * dg, 2)
percrest <- round(100 - perc, 2)
# total correct percentage
totalcorrect <- sum(diag(prop.table(ct)))
# round total percentages and transform to percent value
totalcorrect <- round(100 * totalcorrect, 2)
# create three data columns for data frame which is
# needed to plot the barchart with ggplot
newdat <- NULL
tmpdat <- NULL
filldat <- NULL
labeldat <- NULL
# data frame has flexible count of rows, depending on
# the amount of groups in the lda
for (i in seq_len(groupcount)) {
# first columns indicates the two parts of each group
# (correct percentage and remaining percentage untill 100%)
newdat <- rbind(newdat, paste("g", i, sep = ""))
newdat <- rbind(newdat, paste("g", i, sep = ""))
# second columns contains the percentage of lda
# followed by the remaining percentage to 100%
tmpdat <- rbind(tmpdat, perc[i])
tmpdat <- rbind(tmpdat, percrest[i])
# third columns indicates both which data row contains
# the lda-percentage and which one the remaining percentage
filldat <- rbind(filldat, "1")
filldat <- rbind(filldat, "2")
# last column is created for printing the label-values
# we need only on percentage value, otherwise double labels are
# printed
labeldat <- rbind(labeldat, perc[i])
labeldat <- rbind(labeldat, 0)
}
# create data frame
mydat <- data.frame(filldat, newdat, tmpdat, labeldat)
# name columns
names(mydat) <- c("fg", "grp", "prc", "cprc")
# fillgroup-indicator ($fg) needs to be a factor
mydat$fg <- factor(mydat$fg)
# plot bar charts, stacked proportional
# this works, because we always have two "values" (variables)
# for the X-axis in the $grp-columns indicating a group
classplot <- ggplot(mydat, aes_string(x = "grp", y = "prc", fill = "fg")) +
# use stat identity to show value, not count of $prc-variable
# draw no legend!
geom_bar(stat = "identity", colour = "black", show.legend = FALSE) +
# fill bars
scale_fill_manual(values = c("#235a80", "#80acc8")) +
# give chart and X-axis a title
labs(title = "Accuracy of cluster group classification (in %)",
x = "cluster groups",
y = NULL) +
# print value labels into bar chart
geom_text(aes_string(label = "cprc", y = "cprc"),
vjust = 1.2,
colour = "white") +
# larger font size for axes
theme(axis.line = element_line(colour = "gray"),
axis.text = element_text(size = rel(1.2)),
axis.title = element_text(size = rel(1.2))) +
# set ticks
scale_y_continuous(breaks = seq(0, 100, 10)) +
# change range on x-axis, so the text annotation is visible and
# beside the bars and not printed into them
coord_cartesian(ylim = c(0, 100),
xlim = c(-0.5, groupcount + 1))
if (clss.fit) {
classplot <- classplot +
# set line across all bars which indicates the total percentage of
# correct assigned cases
geom_hline(yintercept = totalcorrect,
linetype = 2,
colour = "#333333") +
# print text annotation
annotate("text",
x = 0,
y = totalcorrect,
vjust = 1.2,
label = paste("overall", c(totalcorrect), sep = "\n"),
size = 5,
colour = "#333333")
}
# --------------------------------------------------------
# plot
# --------------------------------------------------------
if (prnt.plot) graphics::plot(classplot)
# --------------------------------------------------------
# return values
# --------------------------------------------------------
invisible(structure(class = "sjcgrpdisc",
list(data = mydat,
accuracy = as.vector(dg),
total.accuracy = totalcorrect,
plot = classplot)))
}
#' @title Compute elbow values of a k-means cluster analysis
#' @name sjc.elbow
#' @description Plot elbow values of a k-means cluster analysis. This function
#' computes a k-means cluster analysis on the provided data frame
#' and produces two plots: one with the different elbow values
#' and a second plot that maps the differences between each
#' "step" (i.e. between elbow values) on the y-axis. An
#' increase in the second plot may indicate the elbow criterion.
#'
#' @param data data frame containing all variables that should be used for
#' determining the elbow criteria
#' @param steps maximum group-count for the k-means cluster analysis for
#' which the elbow-criterion should be displayed. Default is \code{15}.
#' @param show.diff logical, if \code{TRUE}, an additional plot with the differences between
#' each fusion step of the Elbow criterion calculation is shown. This plot
#' may help identifying the "elbow". Default for this argument is \code{FALSE}.
#'
#' @examples
#' # plot elbow values of mtcars dataset
#' sjc.elbow(mtcars)
#'
#' @import ggplot2
#' @importFrom tidyr gather
#' @importFrom grDevices rgb
#' @importFrom stats na.omit
#' @importFrom graphics plot
#' @importFrom rlang .data
#' @export
sjc.elbow <- function(data, steps = 15, show.diff = FALSE) {
# Prepare Data
# listwise deletion of missing
data <- stats::na.omit(data)
# define line linecolor
lcol <- grDevices::rgb(128, 172, 200, maxColorValue = 255)
# calculate elbow values (sum of squares)
wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
for (i in 2:steps) wss[i] <- sum(kmeans(data, centers = i)$withinss)
# round and print elbow values
wssround <- round(wss, 0)
dfElbowValues <- as.data.frame(wssround)
dfElbowValues <- cbind(dfElbowValues, xpos = seq_len(nrow(dfElbowValues)))
# calculate differences between each step
diff <- c()
for (i in 2:steps) diff <- cbind(diff,wssround[i - 1] - wssround[i])
dfElbowDiff <- tidyr::gather(as.data.frame(diff),
"Var2",
"value",
!! seq_len(ncol(diff)),
factor_key = TRUE)
# --------------------------------------------------
# Plot diagram with sum of squares
# all pointes are connected with a line
# a bend the in curve progression might indicate elbow
# --------------------------------------------------
graphics::plot(ggplot(dfElbowValues, aes_string(x = "xpos", y = "wssround", label = "wssround")) +
geom_line(colour = lcol) +
geom_point(colour = lcol, size = 3) +
geom_text(hjust = -0.3) +
labs(title = "Elbow criterion (sum of squares)",
x = "Number of clusters",
y = "elbow value"))
# --------------------------------------------------
# Plot diagram with differences between each step
# increasing y-value on x-axis (compared to former y-values)
# might indicate elbow
# --------------------------------------------------
if (show.diff) {
graphics::plot(ggplot(dfElbowDiff, aes_string(x = "Var2", y = "value", label = "value")) +
geom_line(colour = lcol) +
geom_point(colour = lcol, size = 3) +
geom_text(hjust = -0.3) +
labs(title = "Elbow criterion (differences between sum of squares)",
x = "difference to previews cluster",
y = "delta"))
}
}
#' @title Compute gap statistics for k-means-cluster
#' @name sjc.kgap
#' @description An implementation of the gap statistic algorithm from Tibshirani, Walther, and Hastie's
#' "Estimating the number of clusters in a data set via the gap statistic".
#' This function calls the \code{\link[cluster]{clusGap}}-function of the
#' \pkg{cluster}-package to calculate the data for the plot.
#'
#' @seealso \code{\link{sjc.elbow}}
#'
#' @param x matrix, where rows are observations and columns are individual dimensions,
#' to compute and plot the gap statistic (according to a uniform reference distribution).
#' @param max maximum number of clusters to consider, must be at least two. Default
#' is 10.
#' @param B integer, number of Monte Carlo ("bootstrap") samples. Default is 100.
#' @param SE.factor [When \code{method} contains "SE"] Determining the optimal
#' number of clusters, Tibshirani et al. proposed the "1 S.E."-rule.
#' Using an SE.factor f, the "f S.E."-rule is used, more generally.
#' @param method character string indicating how the "optimal" number of clusters,
#' k^, is computed from the gap statistics (and their standard deviations),
#' or more generally how the location k^ of the maximum of f[k] should be
#' determined. Default is \code{"Tibs2001SEmax"}. Possible value are:
#' \describe{
#' \item{\code{"globalmax"}}{simply corresponds to the global maximum, i.e., is which.max(f).}
#' \item{\code{"firstmax"}}{gives the location of the first local maximum.}
#' \item{\code{"Tibs2001SEmax"}}{uses the criterion, Tibshirani et al(2001) proposed: "the smallest k such that f(k) >= f(k+1) - s_{k+1}". Note that this chooses k = 1 when all standard deviations are larger than the differences f(k+1) - f(k).}
#' \item{\code{"firstSEmax"}}{is the location of the first f() value which is not larger than the first local maximum minus SE.factor * SE.f[], i.e, within an "f S.E." range of that maximum (see also SE.factor).}
#' \item{\code{"globalSEmax"}}{(used in Dudoit and Fridlyand (2002), supposedly following Tibshirani's proposition) is the location of the first f() value which is not larger than the global maximum minus SE.factor * SE.f[], i.e, within an "f S.E." range of that maximum (see also SE.factor).}
#' }
#' @param plotResults logical, if \code{TRUE} (default), a graph visualiting the gap statistic will
#' be plotted. Use \code{FALSE} to omit the plot.
#'
#' @return An object containing the used data frame for plotting, the ggplot object
#' and the number of found cluster.
#'
#' @references \itemize{
#' \item Tibshirani R, Walther G, Hastie T (2001) Estimating the number of clusters in a data set via gap statistic. J. R. Statist. Soc. B, 63, Part 2, pp. 411-423
#' \item Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M., Hornik, K.(2013). cluster: Cluster Analysis Basics and Extensions. R package version 1.14.4. (\href{https://cran.r-project.org/package=cluster}{web})
#' }
#'
#' @examples
#' \dontrun{
#' # plot gap statistic and determine best number of clusters
#' # in mtcars dataset
#' sjc.kgap(mtcars)
#'
#' # and in iris dataset
#' sjc.kgap(iris[,1:4])}
#'
#' @import ggplot2
#' @importFrom stats na.omit
#' @importFrom graphics plot
#' @export
sjc.kgap <- function(x,
max = 10,
B = 100,
SE.factor = 1,
method = "Tibs2001SEmax",
plotResults = TRUE) {
# ------------------------
# check if suggested package is available
# ------------------------
if (!requireNamespace("cluster", quietly = TRUE)) {
stop("Package 'cluster' needed for this function to work. Please install it.", call. = FALSE)
}
# Prepare Data
# listwise deletion of missing
x <- stats::na.omit(x)
# Gap Statistic for Estimating the Number of Clusters
gap <- cluster::clusGap(x, kmeans, max, B)
stopifnot((K <- nrow(T <- gap$Tab)) >= 1, SE.factor >= 0)
message("Clustering Gap statistic [\"clusGap\"].\n",
sprintf("B=%d simulated reference sets, k = 1..%d\n", gap$B, K),
sep = "")
# Gap Statistic for Estimating the Number of Clusters
nc <- cluster::maxSE(f = T[, "gap"],
SE.f = T[, "SE.sim"],
method = method,
SE.factor = SE.factor)
message(sprintf(" --> Number of clusters (method '%s'%s): %d\n",
method,
if (grepl("SE", method, fixed = T)) sprintf(", SE.factor=%g", SE.factor) else "",
nc))
# point size for cluster solution
nclus <- rep(2, max)
nclus[nc] <- 4
# coliur for cluster solution
cclus <- rep("black", max)
cclus[nc] <- "#cc3366"
# create data frame
df <- data.frame(x = seq_len(max),
y = gap$Tab[, 'gap'],
se = gap$Tab[, 'SE.sim'],
psize = nclus,
pcol = cclus)
# plot cluster solution
gp <- ggplot(df, aes_string(x = "x", y = "y")) +
geom_errorbar(aes(ymin = y - se, ymax = y + se),
width = 0,
size = 0.5,
colour = "#3366cc") +
geom_line(colour = "gray50") +
geom_point(colour = df$pcol, size = df$psize) +
scale_x_discrete(breaks = seq_len(nrow(df))) +
labs(x = "Number of clusters",
y = "Gap",
title = sprintf("Estimation of clusters (gap statistics)\n%i-cluster solution found", nc)) +
theme_classic()
if (plotResults) graphics::plot(gp)
# return value
invisible(structure(class = "sjckgap",
list(data = df,
plot = gp,
solution = nc)))
}