Revision 8f750d2bd2afc7bd12844aedf402519ea117930a authored by cscjohns on 03 July 2021, 19:17:11 UTC, committed by GitHub on 03 July 2021, 19:17:11 UTC
1 parent f66f892
Raw File
dbi_8_WGCNA.R
################################################################################
#
# Weighted Gene Co-Expression Network Analysis (WGCNA)
#
################################################################################

# Display the current working directory
getwd()

# Load the WGCNA package
library(WGCNA)

# The following setting is important, do not omit.
options(stringsAsFactors = FALSE)
enableWGCNAThreads()

#Read in the WGCNA data
load("diet_pbmc_data/clean_wgcna_data.RData")

#===============================================================================
#
#  Code chunk 2
#
#===============================================================================

# Choose a set of soft-thresholding powers
powers <- c(c(1:10), seq(from = 12, to = 20, by = 2))
# Call the network topology analysis function
sft <- pickSoftThreshold(t_matrix, powerVector = powers, verbose = 5,
                         blockSize = 12240)
# Plot the results:
pdf("soft_thresholding.pdf")
sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[, 3])*sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit,signed R^2", type = "n",
     main = paste("Scale independence"))
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers,cex = cex1, col = "red")
# this line corresponds to using an R^2 cut-off of h
abline(h = 0.75,col = "red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity", type = "n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5], labels = powers,
     cex = cex1, col = "red")

dev.off()


#===============================================================================
#
#  Code chunk 3
#
#===============================================================================


softPower <- 6
adjacency <- adjacency(t_matrix, power = softPower)


#===============================================================================
#
#  Code chunk 4
#
#===============================================================================


# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency)
dissTOM <- 1 - TOM


#===============================================================================
#
#  Code chunk 5
#
#===============================================================================


# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
pdf("dendrogram_1.pdf")
sizeGrWindow(12, 9)
plot(geneTree, xlab = "", sub = "",
     main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)
dev.off()


#===============================================================================
#
#  Code chunk 6
#
#===============================================================================


# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
                             deepSplit = 2, pamRespectsDendro = FALSE,
                             minClusterSize = minModuleSize)
table(dynamicMods)


#===============================================================================
#
#  Code chunk 7
#
#===============================================================================


# Convert numeric lables into colors
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
pdf("dendrogram_2.pdf")
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05,
                    main = "Gene dendrogram and module colors")
dev.off()

#===============================================================================
#
#  Code chunk 8
#
#===============================================================================


# Calculate eigengenes
MEList <- moduleEigengenes(t_matrix, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1 - cor(MEs)
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average")
# Plot the result
pdf("ME_cluster.pdf")
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "")


#===============================================================================
#
#  Code chunk 9
#
#===============================================================================


MEDissThres <- 0.25
# Plot the cut line into the dendrogram
abline(h = MEDissThres, col = "red")
dev.off()
# Call an automatic merging function
merged <- mergeCloseModules(t_matrix, dynamicColors, cutHeight = MEDissThres,
                            verbose = 3)
# The merged module colors
mergedColors <- merged$colors
# Eigengenes of the new merged modules:
mergedMEs <- merged$newMEs


#===============================================================================
#
#  Code chunk 10
#
#===============================================================================

pdf(file = "dendrogram_3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
dev.off()


#===============================================================================
#
#  Code chunk 11
#
#===============================================================================


# Rename to moduleColors
moduleColors <- mergedColors
# Construct numerical labels corresponding to the colors
colorOrder <- c("grey", standardColors(50))
moduleLabels <- match(moduleColors, colorOrder)-1
MEs <- mergedMEs
# Save module colors and labels for use in subsequent parts
save(MEs, mergedMEs, dynamicColors, mergedColors, geneTree,
     file = "cyno_wgcna_output.RData")


#===============================================================================
#
#  Correlation between diet and module eigengenes
#
#===============================================================================

eigengene_diet_cor <- tibble(
  "module" = str_c("Module ", 1:16),
  "module_color" = str_remove(names(mergedMEs), "^ME"),
  "cor_diet" = apply(mergedMEs, 2, function(x){
    cor.test(sample_info$diet_med, x)$estimate}),
  "cor_pval" = apply(mergedMEs, 2, function(x){
    cor.test(sample_info$diet_med, x)$p.value}),
  "lowci" = apply(mergedMEs, 2, function(x){
    cor.test(sample_info$diet_med, x)$conf.int[1]}),
  "hici" = apply(mergedMEs, 2, function(x){
    cor.test(sample_info$diet_med, x)$conf.int[2]}),
  "ttest_t" = apply(mergedMEs, 2, function(x){
    t.test(x[sample_info$diet_med == 0],
           x[sample_info$diet_med == 1])$statistic}),
  "ttest_df" = apply(mergedMEs, 2, function(x){
    t.test(x[sample_info$diet_med == 0],
           x[sample_info$diet_med == 1])$parameter}),
  "ttest_pval" = apply(mergedMEs, 2, function(x){
    t.test(x[sample_info$diet_med == 0],
           x[sample_info$diet_med == 1])$p.value})
)
eigengene_diet_cor$padj <- p.adjust(eigengene_diet_cor$ttest_pval)

#===============================================================================
#
#  Enrichment for diet genes in modules
#
#===============================================================================

module_deg_info <- DEG_info %>%
  add_column("module_color" = mergedColors) %>%
  left_join(eigengene_diet_cor %>%
              select(module, module_color),
            by = "module_color")

module_fet <- module_deg_info %>%
  mutate(diet_dir = ifelse(diet_sig == "TRUE", diet_dir, "ns")) %>%
  group_by(module, diet_dir) %>%
  summarize(n = n()) %>%
  pivot_wider(names_from = "diet_dir",
              values_from = "n") %>%
  replace_na(list(MED = 0, WEST = 0, ns = 0)) %>%
  mutate(n_genes = MED + ns + WEST,
         WD_lor = log2(fisher.test(matrix(c(WEST,
                                            MED + ns,
                                            2236 - WEST,
                                            10004 - MED - ns),
                                          nrow = 2))$estimate),
         WD_lowci = log2(fisher.test(matrix(c(WEST,
                                              MED + ns,
                                              2236 - WEST,
                                              10004 - MED - ns),
                                            nrow = 2))$conf.int[1]),
         WD_hici = log2(fisher.test(matrix(c(WEST,
                                             MED + ns,
                                             2236 - WEST,
                                             10004 - MED - ns),
                                           nrow = 2))$conf.int[2]),
         WD_pval = fisher.test(matrix(c(WEST,
                                        MED + ns,
                                        2236 - WEST,
                                        10004 - MED - ns),
                                      nrow = 2))$p.value,
         MD_lor = log2(fisher.test(matrix(c(MED,
                                            WEST + ns,
                                            2664 - MED,
                                            9576 - WEST - ns),
                                          nrow = 2))$estimate),
         MD_lowci = log2(fisher.test(matrix(c(MED,
                                              WEST + ns,
                                              2664 - MED,
                                              9576 - WEST - ns),
                                            nrow = 2))$conf.int[1]),
         MD_hici = log2(fisher.test(matrix(c(MED,
                                             WEST + ns,
                                             2664 - MED,
                                             9576 - WEST - ns),
                                           nrow = 2))$conf.int[2]),
         MD_pval = fisher.test(matrix(c(MED,
                                        WEST + ns,
                                        2664 - MED,
                                        9576 - WEST - ns),
                                      nrow = 2))$p.value) %>%
  pivot_longer(WD_lor:MD_pval,
               names_to = "measure",
               values_to = "value") %>%
  mutate(value = ifelse(value == -Inf, -12, value)) %>%
  separate(measure,
           into = c("diet", "measure"),
           sep = "_") %>%
  pivot_wider(names_from = "measure",
              values_from = "value") %>%
  left_join(eigengene_diet_cor %>%
              select(module, cor_diet),
            by = "module") %>%
  mutate(module_label = as_factor(str_c(module, " (", n_genes, ")")),
         diet_genes = ifelse(diet == "MD", MED, WEST),
         diet = ifelse(diet == "MD", "Mediterranean", "Western"),
         order = as.numeric(str_remove(module, "Module ")))

module_fet$padj <- p.adjust(module_fet$pval)

module_fet %>%
  filter(n_genes > 30) %>%
  ggplot(aes(x = fct_reorder(module_label, desc(order)), y = lor)) +
  geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
  geom_errorbar(aes(ymin = lowci, ymax = hici, col = diet),
                size = 3, alpha = 0.6, width = 0) +
  geom_label(aes(label = diet_genes, fill = diet), col = "white") +
  scale_color_manual(breaks = c("Mediterranean", "Western"),
                     values = c(med_col, wes_col),
                     guide = NULL) +
  scale_fill_manual(breaks = c("Mediterranean", "Western"),
                    values = c(med_col, wes_col),
                    guide = NULL) +
  scale_y_continuous(limits = c(-12, 4),
                     breaks = seq(-12, 4, 2),
                     labels = c("-Inf", seq(-10, 4, 2))) +
  labs(x = NULL,
       y = expression(Log[2]*"(Odds Ratio)"),
       title = "Enrichment of DE Genes in WGCNA Modules") +
  coord_flip()

module_fet %>%
  filter(n_genes > 30) %>%
  ggplot(aes(x = fct_reorder(module_label, desc(order)), y = lor)) +
  geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
  geom_errorbar(aes(ymin = lowci, ymax = hici, col = ifelse(lowci * hici > 0, diet, "gray")),
                size = 3, alpha = 0.6, width = 0) +
  geom_label(aes(label = diet_genes, fill = ifelse(lowci * hici > 0, diet, "gray")), col = "white") +
  scale_color_manual(breaks = c("Mediterranean", "Western", "gray"),
                     values = c(med_col, wes_col, "gray"),
                     guide = NULL) +
  scale_fill_manual(breaks = c("Mediterranean", "Western", "gray"),
                    values = c(med_col, wes_col, "gray"),
                    guide = NULL) +
  scale_y_continuous(limits = c(-12, 4),
                     breaks = seq(-12, 4, 2),
                     labels = c("-Inf", seq(-10, 4, 2))) +
  labs(x = NULL,
       y = expression(Log[2]*"(Odds Ratio)"),
       title = "Enrichment of DE Genes in WGCNA Modules") +
  coord_flip() +
  facet_wrap(~diet)

#===============================================================================
#
#  GO enrichment
#
#===============================================================================

library(topGO)
library(biomaRt)

# Create a biomart object with ensembl mfas genes
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                dataset = "mfascicularis_gene_ensembl")

# Create a vector of gene names from expression matrix
ensembl_gene_names <- DEG_info$stable_ID

# Create a biomart object of GO terms associated with genes from study
mfas_GO <- getBM(attributes = c("ensembl_gene_id", "go_id"),
                 filters = "ensembl_gene_id",
                 values = ensembl_gene_names,
                 mart = mart)

# Create a list of 
gene_ID2GO <- lapply(unique(mfas_GO$ensembl_gene_id),
                     function(x){sort(mfas_GO[mfas_GO$ensembl_gene_id == x,
                                              "go_id"])})
names(gene_ID2GO) <- unique(mfas_GO$ensembl_gene_id)

# Create a list of module gene sets
module_genesets <- lapply(unique(mdi$module),
                          function(x){sort(mdi[mdi$module == x,
                                               "stable_ID"])})
names(module_genesets) <- unique(mdi$module)

all_genes <- DEG_info$stable_ID
names(all_genes) <- DEG_info$stable_ID

# Create topGOdata objects for each module geneset
module_topGOdata <- list()
module_go_FET <- list()
module_go_results <- list()
module_res2 <- list()

for(i in 1:length(module_genesets)){
  geneList <- factor(as.integer(all_genes %in% module_genesets[[i]]))
  names(geneList) <- names(all_genes)
  module_topGOdata[i] <- new("topGOdata",
                             description = "Simple session",
                             ontology = "BP",
                             allGenes = geneList,
                             nodeSize = 10,
                             annot = annFUN.gene2GO,
                             gene2GO = gene_ID2GO)
  this_topGOdata <- module_topGOdata[[i]]
  module_go_FET[i] <- runTest(this_topGOdata,
                              algorithm = "weight01",
                              statistic = "fisher")
  this_go_FET <- module_go_FET[[i]]
  module_go_results[[i]] <- GenTable(this_topGOdata,
                                     FET.weight01 = this_go_FET,
                                     orderBy = "FET.weight01",
                                     ranksOf = "FET.weight01",
                                     topNodes = this_go_FET@geneData[4],
                                     numChar = 1000)
  this_go_results <- module_go_results[[i]]
  module_res2[[i]] <- this_go_results %>%
    mutate(pval = as.numeric(FET.weight01),
           qval = qvalue(pval)$qvalues,
           padj = p.adjust(pval, method = "BH"),
           lor = log2(Significant / (Annotated - Significant)) -
             log2((this_go_FET@geneData[2] - Significant) /
                    (this_go_FET@geneData[1] - Annotated -
                       this_go_FET@geneData[2] - Significant))) %>%
    filter(pval < 0.01) %>%
    select(GO.ID, Term, Annotated, Significant, Expected, lor, pval, qval, padj) %>%
    arrange(desc(lor))
}

names(module_topGOdata) <- names(module_genesets)
names(module_go_FET) <- names(module_genesets)
names(module_go_results) <- names(module_genesets)
names(module_res2) <- names(module_genesets)


wgcna_topGO_results <- module_res2[[1]]
wgcna_topGO_results$module <- rep(names(module_res2)[1],
                                  times = nrow(module_res2[[1]]))

for(i in 2:length(module_genesets)){
  tempdf <- module_res2[[i]]
  tempdf$module <- rep(names(module_res2)[i],
                       times = nrow(module_res2[[i]]))
  wgcna_topGO_results <- rbind(wgcna_topGO_results,
                               tempdf)
}

wgcna_topGO_results %>%
  left_join(eigengene_diet_cor %>%
              select(module, cor_diet),
            by = "module") %>%
  arrange(desc(cor_diet), padj) %>%
  select(module, cor_diet, GO.ID, Term, Annotated, Significant, Expected, lor,
         pval, padj) %>%
  write_csv("wgcna_topGO_results.csv")

#===============================================================================
#
#  Enrichment for polarization genes in modules
#
#===============================================================================

module_pol <- module_deg_info %>%
  mutate(polarization = ifelse(polarization == "Pro-Inflammatory", "PRO",
                               ifelse(polarization == "Regulatory", "REG", "ns"))) %>%
  group_by(module, polarization) %>%
  summarize(n = n()) %>%
  pivot_wider(names_from = "polarization",
              values_from = "n") %>%
  replace_na(list(REG = 0, PRO = 0, ns = 0)) %>%
  mutate(n_genes = REG + ns + PRO,
         PR_lor = log2(fisher.test(matrix(c(PRO,
                                            REG + ns,
                                            698 - PRO,
                                            11542 - REG - ns),
                                          nrow = 2))$estimate),
         PR_lowci = log2(fisher.test(matrix(c(PRO,
                                              REG + ns,
                                              698 - PRO,
                                              11542 - REG - ns),
                                            nrow = 2))$conf.int[1]),
         PR_hici = log2(fisher.test(matrix(c(PRO,
                                             REG + ns,
                                             698 - PRO,
                                             11542 - REG - ns),
                                           nrow = 2))$conf.int[2]),
         PR_pval = fisher.test(matrix(c(PRO,
                                        REG + ns,
                                        698 - PRO,
                                        11542 - REG - ns),
                                      nrow = 2))$p.value,
         RG_lor = log2(fisher.test(matrix(c(REG,
                                            PRO + ns,
                                            138 - REG,
                                            12102 - PRO - ns),
                                          nrow = 2))$estimate),
         RG_lowci = log2(fisher.test(matrix(c(REG,
                                              PRO + ns,
                                              138 - REG,
                                              12102 - PRO - ns),
                                            nrow = 2))$conf.int[1]),
         RG_hici = log2(fisher.test(matrix(c(REG,
                                             PRO + ns,
                                             138 - REG,
                                             12102 - PRO - ns),
                                           nrow = 2))$conf.int[2]),
         RG_pval = fisher.test(matrix(c(REG,
                                        PRO + ns,
                                        138 - REG,
                                        12102 - PRO - ns),
                                      nrow = 2))$p.value) %>%
  pivot_longer(PR_lor:RG_pval,
               names_to = "measure",
               values_to = "value") %>%
  mutate(value = ifelse(value == -Inf, -10, value)) %>%
  separate(measure,
           into = c("polarization", "measure"),
           sep = "_") %>%
  pivot_wider(names_from = "measure",
              values_from = "value") %>%
  left_join(eigengene_diet_cor %>%
              select(module, cor_diet),
            by = "module") %>%
  mutate(module_label = as_factor(str_c(module, " (", n_genes, ")")),
         polarization_genes = ifelse(polarization == "RG", REG, PRO),
         polarization = ifelse(polarization == "RG", "Regulatory", "Pro-Inflammatory"),
         order = as.numeric(str_remove(module, "Module ")))

module_pol$padj <- p.adjust(module_pol$pval)

module_pol %>%
  filter(n_genes > 30) %>%
  ggplot(aes(x = fct_reorder(module_label, desc(order)), y = lor)) +
  geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
  geom_errorbar(aes(ymin = lowci, ymax = hici, col = ifelse(lowci * hici > 0, polarization, "gray")),
                size = 3, alpha = 0.6, width = 0) +
  geom_label(aes(label = polarization_genes, fill = ifelse(lowci * hici > 0, polarization, "gray")), col = "white") +
  scale_color_manual(breaks = c("Regulatory", "Pro-Inflammatory", "gray"),
                     values = c(m2_col, m1_col, "gray"),
                     guide = NULL) +
  scale_fill_manual(breaks = c("Regulatory", "Pro-Inflammatory", "gray"),
                    values = c(m2_col, m1_col, "gray"),
                    guide = NULL) +
  scale_y_continuous(limits = c(-10, 8),
                     breaks = seq(-10, 8, 2),
                     labels = c("-Inf", seq(-8, 8, 2))) +
  labs(x = NULL,
       y = expression(Log[2]*"(Odds Ratio)"),
       title = "Enrichment of Polarization Genes in WGCNA Modules") +
  coord_flip() +
  facet_wrap(~polarization)
back to top