CCA_wrappers.r
#' Reorder a matrix
#'
#' This function reorders a matrix by rows of columns
#'
#' @param matrix1 a matrix (rows=genes x columns=samples) of gene expression data
#' (e.g., scRNA-seq)
#' @param by.rows By rows (TRUE; default) or by columns
#'
#' @return a reordered matrix
#'
#' @export
reorder_matrix <- function(matrix1,
by.rows = TRUE) {
if (by.rows == TRUE) {
conf.order <- order(apply(matrix1, 1, which.max))
matrix1.reordered <- matrix1[conf.order, ]
} else {
conf.order <- order(apply(matrix1, 2, which.max))
matrix1.reordered <- matrix1[, conf.order]
}
matrix1.reordered
}
#' Compare two cluster sets matched to CCA
#'
#' This function takes cluster calls defined in two different data sets and then
#' determines to what extent these cluster calls match up with cluster calls from CCA.
#'
#' @param cl a matrix (rows=genes x columns=samples) of gene expression data
#' (e.g., scRNA-seq)
#' @param by.rows By rows (TRUE; default) or by columns
#'
#' @return a reordered matrix
#'
#' @export
compareClusterCalls <- function(cl,
ref.cl,
cl.anno,
plot.title = NA, plot.silent = TRUE,
heat.colors = colorRampPalette(c("grey99", "orange", "red"))(100),
row.cl.num = min(length(unique(cl)), length(unique(ref.cl)))) {
library(grid)
library(pheatmap)
conf1 <- table(cl, ref.cl)
conf1 <- sweep(conf1, 1, rowSums(conf1), "/")
conf2 <- reorder_matrix(conf1)
# Cluster co-occurence
cl.prop.cocl <- apply(conf1, 2, function(x) {
grid1 <- expand.grid(x, x)
min.prop <- apply(grid1, 1, min)
})
cl.prop.cocl.total <- apply(cl.prop.cocl, 1, sum)
cl.prop.cocl.m <- matrix(cl.prop.cocl.total, nrow(conf1), nrow(conf1),
dimnames = list(rownames(conf1), rownames(conf1))
)
ph1 <- pheatmap(conf2,
cutree_rows = row.cl.num, clustering_method = "ward.D2",
annotation_row = cl.anno, color = heat.colors, fontsize = 6,
main = plot.title, silent = plot.silent
)
list(conf = conf2, cocl = cl.prop.cocl.m, ph = ph1)
}
#' Get some summary statistics
#'
#' This does the summary. For each group return a vector with N, mean, and sd.
#' This is called by qcPlot.
#'
#' @param data Annotation data frame
#' @param measurevar what variables to measure
#' @param groupvars what to group by
#' @param na.rm how to treat NA
#' @param conf.interval confidence interval (default = .95)
#' @param .drop drop something?
#' @param roundall should everything be rounded
#'
#' @return a data frame of statistics
#'
#' @export
summarySE <- function(data = NULL,
measurevar,
groupvars = NULL,
na.rm = FALSE,
conf.interval = .95,
.drop = TRUE,
roundall = F) {
require(dplyr)
#
names(data)[names(data) == measurevar] <- "measurevar"
datac <- data %>%
select(one_of(groupvars, "measurevar")) %>%
filter(ifelse(na.rm == T, !is.na(measurevar), T)) %>%
mutate(measurevar = as.numeric(measurevar)) %>%
group_by_(c(groupvars)) %>%
summarise(
N = n(),
median = median(measurevar),
mean = mean(measurevar),
max = max(measurevar),
sd = ifelse(N == 1, 0, sd(measurevar)),
q25 = as.numeric(quantile(measurevar, 0.25)),
q75 = as.numeric(quantile(measurevar, 0.75))
) %>%
mutate(se = sd / sqrt(N))
# %>% mutate(ci = se * qt(conf.interval/2 + 0.5, N-1))
if (roundall) {
roundcols <- c("median", "mean", "max", "sd", "q25", "q75", "se", "ci")
datac[roundcols] <- round(datac[roundcols], 3)
}
# datac <- datac %>% mutate(xpos = 1:n())
datac
}
#' Make QC plots
#'
#' Makes QC plots
#'
#' @param anno Annotation data frame for patch-seq
#' @param dendcluster_anno what to cluster by
#' @param groupvars what to group by
#' @param scaleLimits scaleLimits
#' @param scaleBreaks scaleBreaks
#' @param scaleLabels scaleLabels
#' @param ylab ylab
#' @param fileName fileName
#'
#' @return the plot is returned
#'
#' @export
qcPlot <- function(anno,
dendcluster_anno,
name,
scaleLimits = c(-5000, 12000),
scaleBreaks = seq(0, 12000, 2000),
scaleLabels = seq(0, 12, 2),
ylab = "value",
fileName = gsub("\\.", "_", gsub("_label", "", name)),
outputFolder) {
# dendcluster_id is the annotation for cluster ordering based on the current, bootstrapped dendrogram
stats <- summarySE(data = anno, measurevar = name, groupvars = "dendcluster_id")
genes_plot <- ggplot() +
# geom_quasirandom from the ggbeeswarm package
# makes violin-shaped jittered point plots
geom_quasirandom(
data = anno,
aes(
x = dendcluster_id,
y = eval(parse(text = name))
),
color = "skyblue",
# Need to set position_jitter height = 0 to prevent
# jitter on the y-axis, which changes data representation
position = position_jitter(width = .3, height = 0), size = 0.1
) +
# Errorbars built using stats values
geom_errorbar(
data = stats,
aes(x = dendcluster_id, ymin = q25, ymax = q75),
size = 0.2
) +
# Median points from stats
geom_point(
data = stats,
aes(x = dendcluster_id, y = median),
color = "red",
size = 0.5
) +
# Cluster labels as text objects
geom_text(
data = dendcluster_anno,
aes(
x = dendcluster_id, y = 0, label = dendcluster_label,
color = dendcluster_color
),
angle = 90,
hjust = 2,
vjust = 0.3,
size = 2 * 5 / 6
) +
scale_color_identity() +
# Expand the y scale so that the labels are visible
scale_y_continuous(ylab,
limits = scaleLimits,
breaks = scaleBreaks,
labels = scaleLabels
) +
# Remove X-axis title
scale_x_continuous("") +
theme_bw() +
# Theme tuning
theme(
axis.text.x = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank()
)
ggsave(paste0(outputFolder, fileName, "_QC.pdf"), genes_plot, width = 8, height = 4, useDingbats = F)
genes_plot
}