Revision 738b43f6dcce877b85c10f23dafc38e0098bd8fb authored by Daniel Luedecke on 19 February 2014, 11:32:18 UTC, committed by cran-robot on 19 February 2014, 11:32:18 UTC
1 parent 64f2fd6
sjTabFrequencies.R
#' @title Save frequencies as HTML table
#' @name sjt.frq
#'
#' @description Save (multiple) frequency tables as HTML file.
#'
#' @seealso \link{sjp.frq}
#'
#' @param data The variables which frequencies should be printed as table. Either use a single variable
#' (vector) or a data frame where each column represents a variable (see examples.
#' @param file The destination file, which will be in html-format. If no filepath is specified,
#' the file will be saved as temporary file and openend either in the RStudio View pane or
#' in the default web browser.
#' @param weightBy A weight factor that will be applied to weight all cases from \code{data}.
#' default is \code{NULL}, so no weights are used.
#' @param variableLabels A single character vector or a list of character vectors that indicate
#' the variable names of those variables from \code{data} and will be used as variable labels
#' in the output. Note that if multiple variables
#' are supplied (as data frame), the variable labels must be supplied as \code{list} object
#' (see examples).
#' @param valueLabels A list of character vectors that indicate the value labels of those variables
#' from \code{data}. Note that if multiple variables are supplied (as data frame), the
#' value labels must be supplied as nested \code{list} object (see examples).
#' @param autoGroupAt A value indicating at which length of unique values a variable from \code{data}
#' is automatically grouped into smaller units (see \link{sju.groupVar}). Variables with large
#' numbers of unique values may be too time consuming when a HTML table is created and R would
#' not respond any longer. Hence it's recommended to group such variables. Default value is 50,
#' i.e. variables with 50 and more unique values will be grouped using \link{sju.groupVar} with
#' \code{groupsize="auto"} parameter. By default, the maximum group count is 30. However, if
#' \code{autoGroupAt} is less than 30, \code{autoGroupAt} groups are built. Default value is \code{NULL},
#' i.e. auto-grouping is turned off.
#' @param alternateRowColors If \code{TRUE}, alternating rows are highlighted with a light gray
#' background color.
#' @param stringValue String label for the very first table column containing the values (see
#' \code{valueLabels}).
#' @param stringCount String label for the first table data column containing the counts. Default is \code{"N"}.
#' @param stringPerc String label for the second table data column containing the percentages, where the
#' count percentages include missing values.
#' @param stringValidPerc String label for the third data table column containing the valid percentages, i.e. the
#' count percentage value exluding possible missing values.
#' @param stringCumPerc String label for the last table data column containing the cumulative percentages.
#' @param stringMissingValue String label for the last table data row containing missing values.
#' @param highlightMedian If \code{TRUE}, the table row indicating the median value will
#' be highlighted.
#' @param highlightQuartiles If \code{TRUE}, the table row indicating the lower and upper quartiles will
#' be highlighted.
#' @param skipZeroRows If \code{TRUE}, rows with only zero-values are not printed. Default is \code{FALSE}.
#' @param showSummary If \code{TRUE} (default), a summary row with total and valid N as well as mean and
#' standard deviation is shown.
#' @param encoding The charset encoding used for variable and value labels. Default is \code{"UTF-8"}. Change
#' encoding if specific chars are not properly displayed (e.g.) German umlauts).
#' @return Invisibly returns a \link{structure} with the web page style sheet (\code{page.style}) and each
#' frequency table as web page content (\code{page.content.list}) for further use.
#'
#' @note The HTML tables can either be saved as file and manually opened (specify parameter \code{file}) or
#' they can be saved as temporary files and will be displayed in the RStudio Viewer pane (if working with RStudio)
#' or opened with the default web browser. Displaying resp. opening a temporary file is the
#' default behaviour (i.e. \code{file=NULL}).
#'
#' @examples
#' # load sample data
#' data(efc)
#'
#' # retrieve value and variable labels
#' variables <- sji.getVariableLabels(efc)
#' values <- sji.getValueLabels(efc)
#'
#' # show frequencies of "e42dep" in RStudio Viewer Pane
#' # or default web browser
#' \dontrun{
#' sjt.frq(efc$e42dep)}
#'
#' # plot and save frequency table of "e42dep" with labels
#' \dontrun{
#' sjt.frq(efc$e42dep,
#' file="dependency_labels.html",
#' variableLabels=variables['e42dep'],
#' valueLabels=values[['e42dep']])}
#'
#' # plot frequencies of e42dep, e16sex and c172code in one HTML file
#' # and show table in RStudio Viewer Pane or default web browser
#' \dontrun{
#' sjt.frq(as.data.frame(cbind(efc$e42dep, efc$e16sex, efc$c172code)),
#' variableLabels=list(variables['e42dep'], variables['e16sex'], variables['c172code']),
#' valueLabels=list(values[['e42dep']], values[['e16sex']], values[['c172code']]))}
#'
#' # plot larger scale including zero-counts
#' # and save to file, indicating median and quartiles
#' \dontrun{
#' sjt.frq(efc$neg_c_7,
#' file="negativeimpact.html",
#' variableLabels=variables['neg_c_7'],
#' valueLabels=values[['neg_c_7']],
#' highlightMedian=TRUE,
#' highlightQuartiles=TRUE)}
#'
#' @export
sjt.frq <- function (data,
file=NULL,
weightBy=NULL,
variableLabels=NULL,
valueLabels=NULL,
autoGroupAt=NULL,
alternateRowColors=FALSE,
stringValue="value",
stringCount="N",
stringPerc="raw %",
stringValidPerc="valid %",
stringCumPerc="cumulative %",
stringMissingValue="missings",
highlightMedian=FALSE,
highlightQuartiles=FALSE,
skipZeroRows=FALSE,
showSummary=TRUE,
encoding="UTF-8") {
# -------------------------------------
# table init
# -------------------------------------
toWrite <- sprintf("<html>\n<head>\n<meta http-equiv=\"Content-type\" content=\"text/html;charset=%s\">\n", encoding)
page.style <- "<style>\n.arc { background-color:#eaeaea }\n.qrow { border-bottom: 1px solid #cc3333 }\n.mdrow { font-weight:bolder; font-style:italic; color:#993333 }\n.abstand { margin-bottom: 2em }\ntable { border-collapse:collapse; border:none }\nth { border-top: double; text-align:center; font-style:italic; font-weight:normal }\ntable td { padding:0.2cm }\ntd.summary { text-align:right; font-style:italic; font-size:0.9em; padding-top:0.1cm; padding-bottom:0.1cm }\n.lasttablerow { border-top:1px solid; border-bottom: double }\n.firsttablerow { border-bottom:1px solid }\n.leftalign { text-align:left }\n.centeralign { text-align:center }\ncaption { font-weight: bold; text-align:left }\n</style>"
toWrite <- paste(toWrite, page.style)
toWrite <- paste(toWrite, "\n</head>\n<body>\n")
# -------------------------------------
# make data frame of single variable, so we have
# unique handling for the data
# -------------------------------------
if (!is.data.frame(data)) {
data <- as.data.frame(data)
}
# -------------------------------------
# determine number of variables
# -------------------------------------
nvar <- ncol(data)
# -------------------------------------
# transform variable and value labels
# to list object
# -------------------------------------
if (!is.null(variableLabels) && !is.list(variableLabels)) {
# if we have variable labels as vector, convert them to list
variableLabels <- as.list(variableLabels)
}
else if (is.null(variableLabels)) {
# if we have no variable labels, use column names
# of data frame
variableLabels <- as.list(colnames(data))
}
if (!is.null(valueLabels) && !is.list(valueLabels)) {
# if we have value labels as vector, convert them to list
valueLabels <- list(valueLabels)
}
else if (is.null(valueLabels)) {
# create list
valueLabels <- list()
# iterate all variables
for (i in 1:nvar) {
# retrieve variable
dummy <- data[,i]
# and add label range to value labels list
valueLabels <- c(valueLabels, list(min(dummy, na.rm=TRUE):max(dummy, na.rm=TRUE)))
}
}
# -------------------------------------
# header row of table
# -------------------------------------
page.content.list <- list()
headerRow <- sprintf(" <tr class=\"firsttablerow\">\n <th>%s</th>\n <th>%s</th>\n <th>%s</th>\n <th>%s</th>\n <th>%s</th>\n </tr>\n\n", stringValue, stringCount, stringPerc, stringValidPerc, stringCumPerc)
# -------------------------------------
# start iterating all variables
# -------------------------------------
for (cnt in 1:nvar) {
# -----------------------------------------------
# prepare data: create frequencies and weight them,
# if requested. put data into a data frame
#---------------------------------------------------
# get variable
orivar <- var <- data[,cnt]
# -----------------------------------------------
# check for length of unique values and skip if too long
# -----------------------------------------------
if (!is.null(autoGroupAt) && length(unique(var))>=autoGroupAt) {
cat(sprintf("\nVariable %s with %i unique values was grouped...\n", colnames(data)[cnt], length(unique(var))))
varsum <- var
agcnt <- ifelse (autoGroupAt<30, autoGroupAt, 30)
valueLabels[[cnt]] <- sju.groupVarLabels(var, groupsize="auto", autoGroupCount=agcnt)
var <- sju.groupVar(var, groupsize="auto", asNumeric=TRUE, autoGroupCount=agcnt)
}
# create frequency table
if (is.null(weightBy)) {
# unweighted, including NA
ftab <- table(var)
ftab.NA <- table(var, useNA="always")
ftab.perc <- 100*round(prop.table(ftab.NA),6)
ftab.valid <- 100*round(prop.table(ftab),6)
# unweihted
weightedvar <- NULL
varsum <- var
}
else {
# weighted, including NA
ftab <- round(xtabs(weightBy ~ var, data=data.frame(cbind(weightBy=weightBy,var=var))))
ftab.NA <- round(xtabs(weightBy ~ var, data=data.frame(cbind(weightBy=weightBy,var=var)), exclude=NULL, na.action=na.pass),0)
ftab.perc <- 100*round(prop.table(ftab.NA),6)
ftab.valid <- 100*round(prop.table(ftab),6)
#---------------------------------------------------
# retrieve summary. we reproduce the variable from the table
# matrix here because we have weights included
#---------------------------------------------------
# init values
weightedvar <- c()
# iterate all table values
for (w in 1:length(ftab)) {
# retrieve count of each table cell
w_count <- ftab[[w]]
# retrieve "cell name" which is identical to the variable value
w_value <- as.numeric(names(ftab[w]))
# append variable value, repeating it "w_count" times.
weightedvar <- c(weightedvar, rep(w_value, w_count))
}
varsum <- weightedvar
}
# retrieve summary
varsummary <- summary(varsum)
# retrieve median
var.median <- varsummary[[3]]
# retrieve quartiles
var.lowerq <- round(varsummary[[2]])
var.upperq <- round(varsummary[[5]])
#---------------------------------------------------
# new data frame from frequencies for current variable
#---------------------------------------------------
df <- as.data.frame(cbind(freq=c(ftab.NA), perc=c(ftab.perc), valid=c(ftab.valid,0)))
# add cumulative percentages
df$cumperc <- cumsum(df$valid)
# rename "NA" row
rownames(df)[nrow(df)] <- "NA"
# save rownames index numbers
rowindexnumbers <- as.numeric(c(rownames(df)))
# -------------------------------------
# start table tag
# -------------------------------------
page.content <- "<table>\n"
# -------------------------------------
# retrieve variable label
# -------------------------------------
varlab <- variableLabels[[cnt]]
# -------------------------------------
# table caption, variable label
# -------------------------------------
page.content <- paste(page.content, sprintf(" <caption>%s</caption>\n", varlab))
# -------------------------------------
# header row with column labels
# -------------------------------------
page.content <- paste(page.content, headerRow)
# -------------------------------------
# data rows with value labels
# -------------------------------------
# retrieve value labels
vallab <- valueLabels[[cnt]]
# determine value range
minval <- min(var, na.rm=TRUE)
maxval <- max(var, na.rm=TRUE)
# determine catcount, which is +1 if minval = 0
catcount <- ifelse(minval==0, maxval+1, maxval)
# check if value labels are NULL
if (is.null(vallab)) {
# set range as value labels
vallab <- as.character(c(minval:maxval))
}
# check whether value label length exceeds maximum value
# (i.e. missing in upper category)
if (catcount<length(vallab)) {
# correct maximum value
maxval <- length(vallab)
}
# set value range
valrange <- c(minval:maxval)
# iterate all labels, each one in one row
for (j in 1:length(valrange)) {
# search row index. we may have a zero value here
# in case a certain category value is zero (zero counts)
ri <- which(rowindexnumbers==valrange[j])
# check for zero count
if (is.null(ri) || length(ri)==0 || ri==0) {
# create zero-count-row
datarow <- c(0,0,0)
zerorow <- TRUE
}
else {
# retrieve data row
datarow <- df[ri,]
zerorow <- FALSE
}
# -------------------------------------
# check if to skip zero rows
# -------------------------------------
if (skipZeroRows && zerorow) {
# nothing here...
}
else {
# -------------------------------------
# access cell data via "celldata <- c(datarow[XY])
# we have 4 data cells (freq, perc, valid and cumperc)
# -------------------------------------
# write table data row
# -------------------------------------
# init default values
rowstring <- ""
rowcss <- "<tr>"
# init default value for alternating colors
if (alternateRowColors) rowstring <- ifelse(j %% 2 ==0, " arc", "")
# check whether we have median row and whether it should be highlighted
if (highlightMedian && ((j+minval)==(var.median+1))) {
rowcss <- sprintf("<tr class=\"mdrow%s\">", rowstring)
}
else {
# check whether we have lower quartile and whether it should be highlighted
if (highlightQuartiles) {
if(((j+minval)==(var.lowerq+1)) || ((j+minval)==(var.upperq+1))) {
rowcss <- sprintf("<tr class=\"qrow%s\">", rowstring)
}
else {
if (alternateRowColors) rowcss <- ifelse(j %% 2 ==0, "<tr class=\"arc\">", "<tr>")
}
}
else {
if (alternateRowColors) rowcss <- ifelse(j %% 2 ==0, "<tr class=\"arc\">", "<tr>")
}
}
# value label
page.content <- paste(page.content, sprintf(" %s\n <td class=\"leftalign\">%s</td>\n", rowcss, vallab[j]))
# cell values, first value is integer
page.content <- paste(page.content, sprintf(" <td class=\"centeralign\">%i</td>\n", as.integer(c(datarow[1])[[1]])))
for (i in 2:4) {
# following values are float
page.content <- paste(page.content, sprintf(" <td class=\"centeralign\">%.2f</td>\n", c(datarow[i])[[1]]))
}
# close row-tag
page.content <- paste(page.content, " </tr>\n", "\n")
}
}
# -------------------------------------
# write last table data row with NA
# -------------------------------------
# retrieve data row
datarow <- df[nrow(df),]
# -------------------------------------
# write table data row
# -------------------------------------
# value label
page.content <- paste(page.content, sprintf(" <tr class=\"lasttablerow\">\n <td class=\"leftalign\">%s</td>\n", stringMissingValue))
# cell values, first value is integer
page.content <- paste(page.content, sprintf(" <td class=\"centeralign\">%i</td>\n", as.integer(c(datarow[1])[[1]])))
# 2nd value is float. we don't need 3rd and 4th value as they are always 0 and 100
page.content <- paste(page.content, sprintf(" <td class=\"centeralign\">%.2f</td>\n <td></td>\n <td></td>\n", c(datarow[2])[[1]]))
# -------------------------------------
# add info for mean, standard deviation
# -------------------------------------
if (showSummary) {
vartot <- length(var)
varvalid <- vartot - length(var[which(is.na(var))])
if (is.null(weightBy)) {
mw <- mean(orivar, na.rm=TRUE)
}
else {
mw <- weighted.mean(orivar, weightBy, na.rm=TRUE)
}
page.content <- paste(page.content, sprintf(" </tr>\n\n <tr>\n <td class=\"summary\" colspan=\"5\">total N=%i · valid N=%i · x̄=%.2f · σ=%.2f</td>\n", vartot, varvalid, mw, sd(orivar, na.rm=TRUE)))
}
# -------------------------------------
# finish table
# -------------------------------------
page.content <- paste(page.content, " </tr>\n </table>")
# -------------------------------------
# add table to return value list, so user can access each
# single frequency table
# -------------------------------------
page.content.list[[length(page.content.list)+1]] <- page.content
toWrite <- paste(toWrite, page.content, "\n")
# -------------------------------------
# add separator in case we have more than one table
# -------------------------------------
if (nvar>1) {
toWrite = paste(toWrite, "\n<p class=\"abstand\"> </p>\n", "\n")
}
}
# -------------------------------------
# finish html page
# -------------------------------------
toWrite = paste(toWrite, "</body></html>", "\n")
# -------------------------------------
# check if we have filename specified
# -------------------------------------
if (!is.null(file)) {
# write file
write(toWrite, file=file)
}
else {
# else create and browse temporary file
htmlFile <- tempfile(fileext=".html")
write(toWrite, file=htmlFile)
# check whether we have RStudio Viewer
viewer <- getOption("viewer")
if (!is.null(viewer)) {
viewer(htmlFile)
}
else {
utils::browseURL(htmlFile)
}
# delete temp file
# unlink(htmlFile)
}
invisible (structure(class = "sjtfrq",
list(page.style = page.style,
page.content.list = page.content.list)))
}
Computing file changes ...