Raw File
differential_analysis.R
#' Plots and tables with the proportions of cell populations;
#' setting the formulas for differential abundance and 
#' differential expression analysis. Visualising marker expression
#' with heatmaps.

### PLOTS ###
stack_and_boxplots <- function(sce, settings) {
    ### Stacked plot cell type composition per sample
    set_dev(plot_path(settings, "de_abundance_stack"))
    p1 <- plotAbundances(sce, k = "merge1", by = "sample_id")
    print(p1)
    unset_dev()
    
    ### Boxplots
    set_dev(plot_path(settings, "de_abundance_boxplot"))
    p2 <- plotAbundances(sce, k = "merge1", by = "cluster_id", shape = "patient_id")
    print(p2)
    unset_dev()
    
    return(p1)
    return(p2)
}

### PROP TABLE ###
export_prop_table <- function(sce, merge = "merge1", name = "prop_table", settings) {
    ### Export proportion data (out of CD45+ immune cells) ###
    # Exclude CD45- cells from sce:
    equi <- as.data.table(metadata(sce)$cluster_codes)
    cd45_som <- equi[merge1 == "CD45-", som100]
    if (length(cd45_som) != 0) {
        sce_immune <- sce[, sce$cluster_id != cd45_som]
    } else {
        sce_immune <- sce
    }
    
    # Exclude Undefined to calculate proportions:
    exclusion <- cluster_ids(sce_immune, merge) != "Undefined"
    final_pops <- cluster_ids(sce_immune, merge)[exclusion] 
    final_sample_ids <- sample_ids(sce_immune)[exclusion]
    # Create proportion table from sce_immune:
    prop_table <- prop.table(table(final_pops, final_sample_ids), 2)
    prop_table <- as.data.table(prop_table)
    setnames(prop_table, c("final_pops", "final_sample_ids", "N"), c("pop", "sample_id", "prop"))
    # Clarify that Undefined are not counted for calculating proportion by changing values to NA:
    if ("Undefined" %in% prop_table$pop) {
        prop_table[pop == "Undefined", prop := NA]    
    }
    print(table_path(settings, name = name))
    exp_tab(table = prop_table, tab_path = table_path(settings, name = name))
}


### ABSOLUTE TABLE ###
### Create absolute tables and export them:
export_absolute_table <- function(sce, merge = "merge1", name = "absolute_table", settings) {
    absolute_table <- as.data.table(table(cluster_ids(sce, merge), sample_ids(sce)))
    setnames(absolute_table, c("V1", "V2", "N"), c("pop", "sample_id", "n"))
    exp_tab(table = absolute_table, tab_path = table_path(settings, name = name))
}

export_absolute_props <- function(sce, merge = "merge1", name = "abs_props", settings,
                                  import_tab = "absolute_table") {
    abs_zoomed <- fread(paste0("./", settings$tissue,
                               "/subset/original/", settings$zoom_pop, "/", settings$tissue,
                               "_", import_tab, ".csv" )) 
    abs_immune <- fread(paste0("./", settings$tissue,
                              "/subset/original/", settings$tissue, "_", import_tab, ".csv" ))
    abs_immune <- abs_immune[, sum(n), by = "sample_id"]
    setnames(abs_immune, "V1", "n_immune")
    abs_props <- merge(abs_zoomed, abs_immune, by = "sample_id")
    abs_props[, prop := n/n_immune]
    abs_props <- abs_props[, c("sample_id", "pop", "prop")]
    exp_tab(table = abs_props, tab_path = table_path(settings, name = name))
}


### DIFFERENTIAL ABUNDANCE ###
### CREATE FORMULAS
create_da_formulas <- function(sce, settings) {
    ### Compare proportions with GLMM
    ei <- metadata(sce)$experiment_info
    
    ### Create formulas (two depending if we include PMD or not)
    # General
    (da_formula1 <- createFormula(ei,
                                  cols_fixed = c("condition"
                                                 # , "age"
                                                 # , "gender"
                                                 # , "pmd_hours"
                                                 # , "batch_id"
                                  ),
                                  cols_random = "sample_id")) 
    
    # For septum, PMD is not different in MS vs con and AD vs con
    if (settings$tissue == "septum") {
        (da_formula2 <- createFormula(ei,
                                      cols_fixed = c("condition"
                                                     # ,"age"
                                                     # , "gender"
                                                     # # , "pmd_hours" # not significant vs control
                                                     # , "batch_id"
                                      ),
                                      cols_random = "sample_id")) 
    } else {
        (da_formula2 <- createFormula(ei,
                                      cols_fixed = c("condition"
                                                     # ,"age"
                                                     # , "gender"
                                                     # , "pmd_hours"
                                                     # , "batch_id"
                                      ),
                                      cols_random = "sample_id")) 
    }
    return(list(da_formula1 = da_formula1, da_formula2 = da_formula2))
}


### DIFFERENTIAL ABUNDANCE
differential_abundance <- function(sce, settings, contrast = "conditionms", formula = "da_formula1", 
                      contrast_extra = F, contrast_extra_con = NULL, clustering_to_use = "merge1",
                      group1 = "control", group2 = "ms", different_prop = F,
                      name = "da_res_ms") {
    # Start writing to an output file
    sink(table_path(settings, name = name, format = ".txt"))
    da_res <- diffcyt_adjusted(sce,
                               formula = formula,
                               contrast = contrast, # choose among: conditionms, conditionad, age, genderm, pmd_hours
                               contrast_extra = contrast_extra, # optional, if you want to compare to other than intercept (control)
                               contrast_extra_con = contrast_extra_con, # optional, if you want to compare to other than intercept (control)
                               analysis_type = "DA", method_DA = "diffcyt-DA-GLMM", settings = settings, 
                               different_prop = different_prop, 
                               clustering_to_use = clustering_to_use, verbose = TRUE)
    sink()
    
    rowData(da_res$res)
    table(rowData(da_res$res)$p_adj < FDR_cutoff)
    topTable <- as.data.frame(topTable(da_res,show_props = TRUE,
                                       format_vals = TRUE, digits = 4))
    exp_tab(table = topTable, tab_path = table_path(settings, name = paste0("topTable_", name)))

    # Plot heatmap
    sce_comp <- sce[,sce$condition %in% c(group1, group2)]
    set_dev(plot_path(settings, paste0("diff_heatmap_", name)))
    hm <- plotDiffHeatmap(sce_comp, rowData(da_res$res), all = TRUE, fdr = FDR_cutoff)
    print(hm)
    unset_dev()
    
}

### NON-PARAMETRIC TESTS ###
### GET FREQUENCIES
get_freqs <- function(sce, merge = "merge1") {
    ## Create table with cell frequencies of each cluster:
    fq <- prop.table(table(cluster_ids(sce, "merge1"), sample_ids(sce)), 
                     2) * 100
    df <- melt(fq, value.name = "freq", varnames = c("cluster_id", 
                                                     "sample_id"))
    m <- match(df$sample_id, ei(sce)$sample_id)
    cols <- setdiff(names(ei(sce)), names(df))
    df <- data.frame(df, ei(sce)[m, cols])
    df <- as.data.table(df)
    return(df)
}

### KRUSKALL-WALLIS
## Perform Kruskall-Wallis for each cluster:
kruskal_per_cluster <- function(df, settings, name = "kruskall") {
    cluster_id <- unique(df$cluster_id)
    kruskall <- as.data.table(cluster_id)
    kruskall[, pvalue := -1]
    p <- c()
    for (cluster in cluster_id) {
        res.kruskal <- kruskal.test(freq ~ condition, data = df[cluster_id==cluster])
        print(cluster)
        print(res.kruskal)
        # p <- append(p, res.kruskal$p.value)
        p <- res.kruskal$p.value
        kruskall[cluster_id == cluster, pvalue := p]
    }
    
    kruskall[, padjust := p.adjust(kruskall$pvalue, method = "fdr")]
    exp_tab(table = kruskall, tab_path = table_path(settings, name = name))
}

### POSTHOC
posthoc_pairwise <- function(df) {
    ## Post hoc pairwise control, MS, AD
    cluster_id <- unique(df$cluster_id)
    p.pairwise <- c()
    for (cluster in cluster_id) {
        res.pairwise <- pairwise.wilcox.test(df[cluster_id==cluster]$freq,
                                             df[cluster_id==cluster]$condition,
                                             p.adjust.method = "BH")
        print(cluster)
        print(res.pairwise)
        p.pairwise <- append(p.pairwise, res.pairwise$p.value)
    }
}


### DIFFERENTIAL EXPRESSION ###
### Differential analysis of marker expression stratified by cell population ###
plot_median_expression <- function(sce, settings) {
    # Plot median expression of all markers in each cluster for each sample, by condition
    set_dev(plot_path(settings, "marker_expression"))
    p <- plotPbExprs(sce, k = "merge1",
                     facet = "cluster_id", shape_by = "patient_id")
    p$facet$params$ncol <- 2
    print(p)
    unset_dev()
}

### CREATE FORMULAS
create_ds_formulas <- function(sce) {
    ei <- metadata(sce)$experiment_info
    ### Compare expression with LMM linear model
    # Formula
    ds_formula1 <- createFormula(ei, cols_fixed = c("condition"
                                                    # ,"age"
                                                    # , "gender"
                                                    # , "pmd_hours"
                                                    # , "batch_id"
    )
    # ,cols_random = "" # no random effect
    )
    return(list(ds_formula1 = ds_formula1))
}

### Select markers of interest
select_markers <- function(markers_of_interest = "CP_CyTOF_state_markers_per_pop.csv") {
    ### To only study DE in markers of interest for each cell pop:
    # Import table with state markers of interest per cell pop
    markers_test <- fread(paste0("./", markers_of_interest),
                          na.strings = c("", NULL, "NA"))
    state <- colnames(markers_test)[2:17] # Create list with all state marker names
    state_dt <- melt(markers_test, id.vars = c("pop"), measure.vars = state, 
                     variable.name = "marker", value.name = "include")
    state_dt <- state_dt[include == "x"] # Keep only those of interest
    state_dt[, unique := paste(pop, marker, sep = "_")] # Create single unique identifier per combi
    return(state_dt)
}

### DIFFERENTIAL EXPRESSION
differential_expression <- function(sce, settings, contrast = "conditionms", formula = "ds_formula1", 
                                    contrast_extra = F, contrast_extra_con = NULL, 
                                    clustering_to_use = "merge1",
                                    group1 = "control", group2 = "ms",
                                    name = "ds_res_ms") {
    sink(table_path(settings, name = name, format = ".txt"))
    ds_res <- diffcyt_adjusted(sce,
                                  formula = formula, 
                                  contrast = contrast,
                                  contrast_extra = contrast_extra,
                                  contrast_extra_con = contrast_extra_con, 
                                  analysis_type = "DS", method_DS = "diffcyt-DS-LMM",
                                  clustering_to_use = clustering_to_use, verbose = FALSE)
    sink()
    
    ds_pvals <- as.data.table(ds_res$res)
    ds_medians <- ds_res$meds
    ds_results <- merge(ds_pvals, ds_medians, by = "unique")
    ds_results <- ds_results[, !c("pops", "marker")]
    table(ds_results$p_adj < FDR_cutoff)
    exp_tab(table = ds_results, tab_path = table_path(settings, name = paste0(name, "_results")))
    
    ###
    # Heatmap not working because I removed 
    # metadata(res) <- as.list(c(metadata(res), clustering_name = clustering_name))
    # from diffcyt_adjusted()
    # Heatmap to report the differential signals
    # plotDiffHeatmap(sce, ds_res_ms, top_n = 50, order = TRUE,
    #                 th = FDR_cutoff, normalize = TRUE, hm1 = FALSE)
    # set_dev(plot_path(SETTINGS, "ds_heatmap_ms"))
    # plotDiffHeatmap(sce_ms, ds_res_ms, top_n = 50, order = TRUE,
    #                 th = FDR_cutoff, normalize = TRUE, hm1 = FALSE)
    # unset_dev()
}


plot_heatmap_expresssion <- function(sce, fdr = 0.1, settings,
                                     name1 = "ds_res_ms_results.csv", name2 = "ds_res_advsms_results.csv") {
    # Load latest version of heatmap.3 function:
    source_url("https://raw.githubusercontent.com/obigriffith/biostar-tutorials/master/Heatmaps/heatmap.3.R")
    
    if (settings$norm) {
        normalised_folder <- "normalised/"
    } else {
        normalised_folder <- "original/"
    }
    if (settings$subsetting) {
        subset_folder <- "subset/"
    } else {
        subset_folder <- "whole/"
    }
    if (settings$zoom) {
        zoom_folder <- paste0(settings$zoom_pop, "/")
    } else {
        zoom_folder <- NULL
    }
    tissue <- settings$tissue
    
    populations <- as.list(unique(metadata(sce)$cluster_codes["merge1"]))$merge1
    populations <- populations[!populations %in% c("CD45-", "Undefined")]
    
    for (population in populations) {
        ### Import table with median expressions
        # MS
        dt_ms <- as.data.table(
            fread(paste0(settings$experiment_dir, "/", tissue, "/", 
                         subset_folder, normalised_folder, zoom_folder, tissue, "_", name1)))
        dt2_ms <- dt_ms[cluster_id == population]
        dt2_ms <-  unique(dt2_ms, by = "marker_id") # remove duplicated rows
        dt3 <- dt2_ms[, !(c("p_val", "p_adj"))] # remove the p-value for heatmap
        
        ### Define order of heatmap:
        sample_order <- c("control01", "control02", "control03", "control04", "control05", 
                          "control06", "control07", "control08", "control09", "control10",  
                          "control11", "control12",    
                          "ad01", "ad02", "ad03", "ad04", "ad05", "ad06","ad07", "ad08",
                          "ms01","ms02", "ms03", "ms04", "ms05", "ms06", 
                          "ms07", "ms08", "ms09", "ms10", "ms11", "ms12", "ms13" )
        sample_order <- sample_order[sample_order %in% colnames(dt3)] # keep only existing samples 
        setcolorder(dt3, sample_order) # reorder columns for heatmap
        
        # AD (load for p-values only)
        dt_ad <- as.data.table(
            fread(paste0(settings$experiment_dir, "/", tissue, "/", 
                         subset_folder, normalised_folder, zoom_folder, tissue, "_", name2)))
        dt2_ad <- dt_ad[cluster_id == population]
        dt2_ad <-  unique(dt2_ad, by = "marker_id") # remove duplicated rows
        
        ### Get adjusted p-values
        padj_ms <- dt2_ms[, p_adj] # MS adjusted p-value for annotations
        padj_ad <- dt2_ad[, p_adj] # AD adjusted p-value for annotations
        padj <- cbind(padj_ad, padj_ms)
        padj_colors <- ifelse(padj > fdr, "gray", 
                              ifelse(padj < 0.05 & padj > 0.01, "olivedrab3", "olivedrab2")) # assign colors
        padj_colors <- t(padj_colors) # transpose for the heatmap
        
        ### Create condition groups for annotating the heatmap
        condition_colors <- ifelse(grepl("control", sample_order), "#043741",
                                   ifelse(grepl("ms", sample_order), "#e79d24", 
                                          ifelse(grepl("ad", sample_order), "#189cb3", NA)))
        condition_colors <- as.matrix(condition_colors)
        # 
        # ### Create marker groups
        # marker_order <- c("CD49d", "CD54", "CD31/PECAM_1", 
        #                   "Tbet", "CD45RA", "CD45RO",
        #                   "PDL1", 
        #                   "CD25", "CD69")
        # foo <- data.table(marker_order)
        # foo[, marker_type := ifelse(marker_order %in% c("CD49d", "CD54", "CD31/PECAM_1"), "adhesion", 
        #                             ifelse(marker_order %in% c("Tbet", "CD45RA", "CD45RO"), "memory/maturation", 
        #                                    ifelse(marker_order %in% c("PDL1"), "inhibition", "residency/activation")))]
        
        ### Create matrix
        mat <- as.matrix(dt3[, !c("unique", "cluster_id", "marker_id")])
        rownames(mat) <- dt3$marker_id
        
        palette = rev(brewer.pal(11, "RdYlBu"))
        # palette = colorRampPalette(c("deepskyblue4","white","darkorange2"))(200)
        
        ### Create heatmap
        
        main_title <- paste(population, tissue)
        set_dev(plot_path(settings, paste0("expr_heatmap_", population)))
        heatmap.3(mat, #hclustfun=myclust, distfun=mydist, na.rm = TRUE,
                  scale="row",
                  dendrogram="none", 
                  # margins=c(9,9),
                  Rowv=TRUE, Colv=F, #ColSideColors=clab,
                  RowSideColors=padj_colors, RowSideColorsSize=2,
                  ColSideColors=condition_colors, ColSideColorsSize=1,
                  symbreaks=T,
                  key=TRUE, keysize = 0.75, symkey=T,
                  # density.info="histogram",
                  main=main_title, 
                  cexRow=2, cexCol=2,
                  col=palette
                  )
        legend("topright",legend=c("padj > 0.1","0.05 < padj < 0.1", "padj < 0.05"),
               fill=c("gray","olivedrab3", "olivedrab2"), border=FALSE, bty="n", 
               y.intersp = 1, cex=1.25)
        unset_dev()
        
              }
}



back to top