https://github.com/MolecularCellBiologyImmunology/cytof-periventricular-ms
Raw File
Tip revision: 70e973c2d935e4ff2cb080d7feef0dd08c23e061 authored by sabelarl on 07 July 2021, 12:56:38 UTC
Update README.md
Tip revision: 70e973c
testDS_LMM_adjusted.R
#' Adjusted version of the DS-LMM method. It applies a different threshold for 
#' the minimum number of cells; it only considers selected population & marker 
#' combinations for the analysis of the differential expresssion; it specifies 
#' the contrasts between disease groups.

testDS_LMM_adjusted <- function (d_counts, d_medians, formula, contrast, weights = TRUE, 
          markers_to_test = NULL, min_cells = 100, min_samples = NULL, 
          contrast_extra = F, contrast_extra_con = NULL) 
{
    # if (is.null(min_samples)) {
    #     min_samples <- ncol(d_counts)/3
    # }
    if (!is.null(markers_to_test)) {
        markers_to_test <- markers_to_test
    }
    else {
        markers_to_test <- metadata(d_medians)$id_state_markers
    }
    counts <- assays(d_counts)[["counts"]]
    cluster_id <- rowData(d_counts)$cluster_id
    ## Threshold of minimum number of cells per cluster:
    # tf <- counts >= min_cells
    # ix_keep <- apply(tf, 1, function(r) sum(r) >= min_samples)
    ix_keep <- apply(counts, 1, function(r) sum(r) > min_cells) # threshold per cluster, not per sample
    
    counts <- counts[ix_keep, , drop = FALSE]
    cluster_id <- cluster_id[ix_keep]
    if (is.logical(weights)) {
        if (weights) {
            weights <- colSums(counts)
        }
        else {
            weights <- NULL
        }
    }
    else if (is.numeric(weights)) {
        stopifnot(length(weights) == ncol(d_counts))
    }
    ## REMOVE this because we specify contrast below 
    # if (ncol(contrast) == 1 & nrow(contrast) > 1) { 
    #     contrast <- t(contrast)
    # }
    
    ### In order to see which cell pop and marker we're analysing:
    state_names <- names(assays(d_medians))[markers_to_test]
    state_names <- state_names[state_names != "CD39"]
    foo <- as.list(assays(d_medians)[state_names])
    bar <- foo
    meds <- rbindlist(lapply(names(bar), function(a) {
        aa <- as.data.table(bar[[a]], keep.rownames = T)
        aa[, marker := a]
        setnames(aa, 'rn', 'pops')
        aa
        }))
    meds <- meds[pops %in% cluster_id] # keep only pops with enough cells (>100 per cluster)
    meds[, unique := paste(pops, marker, sep = "_")]
    meds <- meds[unique %in% state_dt$unique] # Keep only selected combination marker-pop
    # Remove combinations marker-pop with zero expression 
    limit <- ncol(meds[, 2:ncol(meds)])*2/3 # set limit at third samples (at max 0 expression in 2/3 of samples)
    meds <- meds[rowSums(meds==0, na.rm=TRUE) < limit, ]
    
    combi <- unique(meds$unique)
    meds <- as.matrix(meds, rownames = meds$unique)
    meds_all <- do.call("rbind", as.list(assays(d_medians)[state_names]))
    
    p_vals <- c()
    for (i in combi) {
        p_vals[i] <- tryCatch({
            y <- meds[i,]
            print(y[length(y)]) # print cluster name and marker
            to_remove <- c(1, length(y), length(y)-1)
            y <- y[-to_remove]
            data_i <- cbind(cbind.data.frame(y, weights), formula$data)
            data_i$y <- as.numeric(data_i$y)
            if ("age" %in% colnames(data_i)) {
                data_i$age <- as.numeric(data_i$age) # if added to the model
            }
            if ("pmd_hours" %in% colnames(data_i)) {
                data_i$pmd_hours <- as.numeric(data_i$pmd_hours) #  if added to the model
            }
            if (formula$random_terms) {
                fit <- lmer(formula$formula, data = data_i, 
                            weights = weights)
            }
            else {
                fit <- lm(formula$formula, data = data_i, weights = weights)
            }
            print(summary(fit)) # print model
            ### Create contrast (from https://publicifsv.sund.ku.dk/~jufo/courses/rm2018/nlmePackage.pdf):
            name.coef <- names(coef(fit))#[["sample_id"]])
            n.coef <- length(name.coef)
            contr <- matrix(0, nrow = 1, ncol = n.coef,
                            dimnames = list("contrast", name.coef))
            contr[1, contrast] <- 1  # choose among: conditionms, conditionad, age, genderm, pmd_hours
            if (contrast_extra) {
                contr[1, contrast_extra_con] <- -1 
            }# C[1, "conditionms"] <- -1 # optional, if you want to compare to other than intercept (control)
            test <- glht(fit, contr)
            print(summary(test))
            print(contrast_extra)
            summary(test)$test$pvalues
        }, error = function(e) NA)
    }
    
    ### Reduce multiple testing adjustment
    #Adjust per pop + Keep only p_vals of state markers of interest per cell pop
    pvals_dt <- as.data.table(p_vals, keep.rownames = T) 
    setnames(pvals_dt, "rn", "unique")
    # Keep only selected markers:
    pvals_dt <- merge(pvals_dt, state_dt)[, c("unique", "p_vals", "pop", "marker")]
    padj_dt <- c()
    for (popu in unique(pvals_dt$pop)) {
        pop_vals <- pvals_dt[pop == popu]
        pop_vals[, p_adj := p.adjust(p_vals, method = "fdr")]
        padj_dt <- rbind(padj_dt, pop_vals)
    }
    setnames(padj_dt, c("pop", "p_vals", "marker"), c("cluster_id", "p_val", "marker_id"))
    # padj_dt <- padj_dt[, c("p_val", "cluster_id", "p_adj", "marker_id")]
    padj_dt <- as.data.frame(padj_dt)
    res <- padj_dt
    
    ### This is not necessary anymore, since we just return the list of pvals and padjs:
    # p_adj <- p.adjust(p_vals[!is.na(p_vals)], method = "fdr")
    # stopifnot(length(p_vals) == length(p_adj))
    # out <- data.frame(p_val = p_vals, p_adj = p_adj, stringsAsFactors = FALSE)
    # row_data <- as.data.frame(matrix(as.numeric(NA), nrow = length(p_adj), ncol = ncol(out)))
    # colnames(row_data) <- colnames(out)
    # cluster_id_nm <- as.numeric(cluster_id)
    # s <- seq(0, nlevels(cluster_id) * (length(state_names) - 
    #                                        1), by = nlevels(cluster_id))
    # r1 <- rep(cluster_id_nm, length(state_names))
    # r2 <- rep(s, each = length(cluster_id_nm))
    # stopifnot(length(s) == length(state_names))
    # stopifnot(length(r1) == length(r2))
    # rows <- r1 + r2
    # row_data[rows, ] <- out
    # clus <- factor(rep(levels(cluster_id), length(state_names)), 
    #                levels = levels(cluster_id))
    # stat <- factor(rep(state_names, each = length(levels(cluster_id))), 
    #                levels = state_names)
    # stopifnot(length(clus) == nrow(row_data), length(stat) == 
    #               nrow(row_data))
    # row_data <- cbind(data.frame(cluster_id = clus, marker_id = stat, 
    #                              stringsAsFactors = FALSE), row_data)
    # col_data <- colData(d_medians)
    # res <- SummarizedExperiment(meds_all, rowData = row_data, 
    #                             colData = col_data)
    return(list(res = res, meds = meds))
}
back to top