Revision 8f750d2bd2afc7bd12844aedf402519ea117930a authored by cscjohns on 03 July 2021, 19:17:11 UTC, committed by GitHub on 03 July 2021, 19:17:11 UTC
1 parent f66f892
dbi_8_WGCNA.R
################################################################################
#
# Weighted Gene Co-Expression Network Analysis (WGCNA)
#
################################################################################
# Display the current working directory
getwd()
# Load the WGCNA package
library(WGCNA)
# The following setting is important, do not omit.
options(stringsAsFactors = FALSE)
enableWGCNAThreads()
#Read in the WGCNA data
load("diet_pbmc_data/clean_wgcna_data.RData")
#===============================================================================
#
# Code chunk 2
#
#===============================================================================
# Choose a set of soft-thresholding powers
powers <- c(c(1:10), seq(from = 12, to = 20, by = 2))
# Call the network topology analysis function
sft <- pickSoftThreshold(t_matrix, powerVector = powers, verbose = 5,
blockSize = 12240)
# Plot the results:
pdf("soft_thresholding.pdf")
sizeGrWindow(9, 5)
par(mfrow = c(1, 2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[, 3])*sft$fitIndices[, 2],
xlab = "Soft Threshold (power)",
ylab = "Scale Free Topology Model Fit,signed R^2", type = "n",
main = paste("Scale independence"))
text(sft$fitIndices[, 1], -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
labels = powers,cex = cex1, col = "red")
# this line corresponds to using an R^2 cut-off of h
abline(h = 0.75,col = "red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[, 1], sft$fitIndices[, 5],
xlab = "Soft Threshold (power)",
ylab = "Mean Connectivity", type = "n",
main = paste("Mean connectivity"))
text(sft$fitIndices[, 1], sft$fitIndices[, 5], labels = powers,
cex = cex1, col = "red")
dev.off()
#===============================================================================
#
# Code chunk 3
#
#===============================================================================
softPower <- 6
adjacency <- adjacency(t_matrix, power = softPower)
#===============================================================================
#
# Code chunk 4
#
#===============================================================================
# Turn adjacency into topological overlap
TOM <- TOMsimilarity(adjacency)
dissTOM <- 1 - TOM
#===============================================================================
#
# Code chunk 5
#
#===============================================================================
# Call the hierarchical clustering function
geneTree <- hclust(as.dist(dissTOM), method = "average")
# Plot the resulting clustering tree (dendrogram)
pdf("dendrogram_1.pdf")
sizeGrWindow(12, 9)
plot(geneTree, xlab = "", sub = "",
main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE, hang = 0.04)
dev.off()
#===============================================================================
#
# Code chunk 6
#
#===============================================================================
# We like large modules, so we set the minimum module size relatively high:
minModuleSize <- 30
# Module identification using dynamic tree cut:
dynamicMods <- cutreeDynamic(dendro = geneTree, distM = dissTOM,
deepSplit = 2, pamRespectsDendro = FALSE,
minClusterSize = minModuleSize)
table(dynamicMods)
#===============================================================================
#
# Code chunk 7
#
#===============================================================================
# Convert numeric lables into colors
dynamicColors <- labels2colors(dynamicMods)
table(dynamicColors)
# Plot the dendrogram and colors underneath
pdf("dendrogram_2.pdf")
sizeGrWindow(8,6)
plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut",
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05,
main = "Gene dendrogram and module colors")
dev.off()
#===============================================================================
#
# Code chunk 8
#
#===============================================================================
# Calculate eigengenes
MEList <- moduleEigengenes(t_matrix, colors = dynamicColors)
MEs <- MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss <- 1 - cor(MEs)
# Cluster module eigengenes
METree <- hclust(as.dist(MEDiss), method = "average")
# Plot the result
pdf("ME_cluster.pdf")
sizeGrWindow(7, 6)
plot(METree, main = "Clustering of module eigengenes",
xlab = "", sub = "")
#===============================================================================
#
# Code chunk 9
#
#===============================================================================
MEDissThres <- 0.25
# Plot the cut line into the dendrogram
abline(h = MEDissThres, col = "red")
dev.off()
# Call an automatic merging function
merged <- mergeCloseModules(t_matrix, dynamicColors, cutHeight = MEDissThres,
verbose = 3)
# The merged module colors
mergedColors <- merged$colors
# Eigengenes of the new merged modules:
mergedMEs <- merged$newMEs
#===============================================================================
#
# Code chunk 10
#
#===============================================================================
pdf(file = "dendrogram_3.pdf", wi = 9, he = 6)
plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE, hang = 0.03,
addGuide = TRUE, guideHang = 0.05)
dev.off()
#===============================================================================
#
# Code chunk 11
#
#===============================================================================
# Rename to moduleColors
moduleColors <- mergedColors
# Construct numerical labels corresponding to the colors
colorOrder <- c("grey", standardColors(50))
moduleLabels <- match(moduleColors, colorOrder)-1
MEs <- mergedMEs
# Save module colors and labels for use in subsequent parts
save(MEs, mergedMEs, dynamicColors, mergedColors, geneTree,
file = "cyno_wgcna_output.RData")
#===============================================================================
#
# Correlation between diet and module eigengenes
#
#===============================================================================
eigengene_diet_cor <- tibble(
"module" = str_c("Module ", 1:16),
"module_color" = str_remove(names(mergedMEs), "^ME"),
"cor_diet" = apply(mergedMEs, 2, function(x){
cor.test(sample_info$diet_med, x)$estimate}),
"cor_pval" = apply(mergedMEs, 2, function(x){
cor.test(sample_info$diet_med, x)$p.value}),
"lowci" = apply(mergedMEs, 2, function(x){
cor.test(sample_info$diet_med, x)$conf.int[1]}),
"hici" = apply(mergedMEs, 2, function(x){
cor.test(sample_info$diet_med, x)$conf.int[2]}),
"ttest_t" = apply(mergedMEs, 2, function(x){
t.test(x[sample_info$diet_med == 0],
x[sample_info$diet_med == 1])$statistic}),
"ttest_df" = apply(mergedMEs, 2, function(x){
t.test(x[sample_info$diet_med == 0],
x[sample_info$diet_med == 1])$parameter}),
"ttest_pval" = apply(mergedMEs, 2, function(x){
t.test(x[sample_info$diet_med == 0],
x[sample_info$diet_med == 1])$p.value})
)
eigengene_diet_cor$padj <- p.adjust(eigengene_diet_cor$ttest_pval)
#===============================================================================
#
# Enrichment for diet genes in modules
#
#===============================================================================
module_deg_info <- DEG_info %>%
add_column("module_color" = mergedColors) %>%
left_join(eigengene_diet_cor %>%
select(module, module_color),
by = "module_color")
module_fet <- module_deg_info %>%
mutate(diet_dir = ifelse(diet_sig == "TRUE", diet_dir, "ns")) %>%
group_by(module, diet_dir) %>%
summarize(n = n()) %>%
pivot_wider(names_from = "diet_dir",
values_from = "n") %>%
replace_na(list(MED = 0, WEST = 0, ns = 0)) %>%
mutate(n_genes = MED + ns + WEST,
WD_lor = log2(fisher.test(matrix(c(WEST,
MED + ns,
2236 - WEST,
10004 - MED - ns),
nrow = 2))$estimate),
WD_lowci = log2(fisher.test(matrix(c(WEST,
MED + ns,
2236 - WEST,
10004 - MED - ns),
nrow = 2))$conf.int[1]),
WD_hici = log2(fisher.test(matrix(c(WEST,
MED + ns,
2236 - WEST,
10004 - MED - ns),
nrow = 2))$conf.int[2]),
WD_pval = fisher.test(matrix(c(WEST,
MED + ns,
2236 - WEST,
10004 - MED - ns),
nrow = 2))$p.value,
MD_lor = log2(fisher.test(matrix(c(MED,
WEST + ns,
2664 - MED,
9576 - WEST - ns),
nrow = 2))$estimate),
MD_lowci = log2(fisher.test(matrix(c(MED,
WEST + ns,
2664 - MED,
9576 - WEST - ns),
nrow = 2))$conf.int[1]),
MD_hici = log2(fisher.test(matrix(c(MED,
WEST + ns,
2664 - MED,
9576 - WEST - ns),
nrow = 2))$conf.int[2]),
MD_pval = fisher.test(matrix(c(MED,
WEST + ns,
2664 - MED,
9576 - WEST - ns),
nrow = 2))$p.value) %>%
pivot_longer(WD_lor:MD_pval,
names_to = "measure",
values_to = "value") %>%
mutate(value = ifelse(value == -Inf, -12, value)) %>%
separate(measure,
into = c("diet", "measure"),
sep = "_") %>%
pivot_wider(names_from = "measure",
values_from = "value") %>%
left_join(eigengene_diet_cor %>%
select(module, cor_diet),
by = "module") %>%
mutate(module_label = as_factor(str_c(module, " (", n_genes, ")")),
diet_genes = ifelse(diet == "MD", MED, WEST),
diet = ifelse(diet == "MD", "Mediterranean", "Western"),
order = as.numeric(str_remove(module, "Module ")))
module_fet$padj <- p.adjust(module_fet$pval)
module_fet %>%
filter(n_genes > 30) %>%
ggplot(aes(x = fct_reorder(module_label, desc(order)), y = lor)) +
geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
geom_errorbar(aes(ymin = lowci, ymax = hici, col = diet),
size = 3, alpha = 0.6, width = 0) +
geom_label(aes(label = diet_genes, fill = diet), col = "white") +
scale_color_manual(breaks = c("Mediterranean", "Western"),
values = c(med_col, wes_col),
guide = NULL) +
scale_fill_manual(breaks = c("Mediterranean", "Western"),
values = c(med_col, wes_col),
guide = NULL) +
scale_y_continuous(limits = c(-12, 4),
breaks = seq(-12, 4, 2),
labels = c("-Inf", seq(-10, 4, 2))) +
labs(x = NULL,
y = expression(Log[2]*"(Odds Ratio)"),
title = "Enrichment of DE Genes in WGCNA Modules") +
coord_flip()
module_fet %>%
filter(n_genes > 30) %>%
ggplot(aes(x = fct_reorder(module_label, desc(order)), y = lor)) +
geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
geom_errorbar(aes(ymin = lowci, ymax = hici, col = ifelse(lowci * hici > 0, diet, "gray")),
size = 3, alpha = 0.6, width = 0) +
geom_label(aes(label = diet_genes, fill = ifelse(lowci * hici > 0, diet, "gray")), col = "white") +
scale_color_manual(breaks = c("Mediterranean", "Western", "gray"),
values = c(med_col, wes_col, "gray"),
guide = NULL) +
scale_fill_manual(breaks = c("Mediterranean", "Western", "gray"),
values = c(med_col, wes_col, "gray"),
guide = NULL) +
scale_y_continuous(limits = c(-12, 4),
breaks = seq(-12, 4, 2),
labels = c("-Inf", seq(-10, 4, 2))) +
labs(x = NULL,
y = expression(Log[2]*"(Odds Ratio)"),
title = "Enrichment of DE Genes in WGCNA Modules") +
coord_flip() +
facet_wrap(~diet)
#===============================================================================
#
# GO enrichment
#
#===============================================================================
library(topGO)
library(biomaRt)
# Create a biomart object with ensembl mfas genes
mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "mfascicularis_gene_ensembl")
# Create a vector of gene names from expression matrix
ensembl_gene_names <- DEG_info$stable_ID
# Create a biomart object of GO terms associated with genes from study
mfas_GO <- getBM(attributes = c("ensembl_gene_id", "go_id"),
filters = "ensembl_gene_id",
values = ensembl_gene_names,
mart = mart)
# Create a list of
gene_ID2GO <- lapply(unique(mfas_GO$ensembl_gene_id),
function(x){sort(mfas_GO[mfas_GO$ensembl_gene_id == x,
"go_id"])})
names(gene_ID2GO) <- unique(mfas_GO$ensembl_gene_id)
# Create a list of module gene sets
module_genesets <- lapply(unique(mdi$module),
function(x){sort(mdi[mdi$module == x,
"stable_ID"])})
names(module_genesets) <- unique(mdi$module)
all_genes <- DEG_info$stable_ID
names(all_genes) <- DEG_info$stable_ID
# Create topGOdata objects for each module geneset
module_topGOdata <- list()
module_go_FET <- list()
module_go_results <- list()
module_res2 <- list()
for(i in 1:length(module_genesets)){
geneList <- factor(as.integer(all_genes %in% module_genesets[[i]]))
names(geneList) <- names(all_genes)
module_topGOdata[i] <- new("topGOdata",
description = "Simple session",
ontology = "BP",
allGenes = geneList,
nodeSize = 10,
annot = annFUN.gene2GO,
gene2GO = gene_ID2GO)
this_topGOdata <- module_topGOdata[[i]]
module_go_FET[i] <- runTest(this_topGOdata,
algorithm = "weight01",
statistic = "fisher")
this_go_FET <- module_go_FET[[i]]
module_go_results[[i]] <- GenTable(this_topGOdata,
FET.weight01 = this_go_FET,
orderBy = "FET.weight01",
ranksOf = "FET.weight01",
topNodes = this_go_FET@geneData[4],
numChar = 1000)
this_go_results <- module_go_results[[i]]
module_res2[[i]] <- this_go_results %>%
mutate(pval = as.numeric(FET.weight01),
qval = qvalue(pval)$qvalues,
padj = p.adjust(pval, method = "BH"),
lor = log2(Significant / (Annotated - Significant)) -
log2((this_go_FET@geneData[2] - Significant) /
(this_go_FET@geneData[1] - Annotated -
this_go_FET@geneData[2] - Significant))) %>%
filter(pval < 0.01) %>%
select(GO.ID, Term, Annotated, Significant, Expected, lor, pval, qval, padj) %>%
arrange(desc(lor))
}
names(module_topGOdata) <- names(module_genesets)
names(module_go_FET) <- names(module_genesets)
names(module_go_results) <- names(module_genesets)
names(module_res2) <- names(module_genesets)
wgcna_topGO_results <- module_res2[[1]]
wgcna_topGO_results$module <- rep(names(module_res2)[1],
times = nrow(module_res2[[1]]))
for(i in 2:length(module_genesets)){
tempdf <- module_res2[[i]]
tempdf$module <- rep(names(module_res2)[i],
times = nrow(module_res2[[i]]))
wgcna_topGO_results <- rbind(wgcna_topGO_results,
tempdf)
}
wgcna_topGO_results %>%
left_join(eigengene_diet_cor %>%
select(module, cor_diet),
by = "module") %>%
arrange(desc(cor_diet), padj) %>%
select(module, cor_diet, GO.ID, Term, Annotated, Significant, Expected, lor,
pval, padj) %>%
write_csv("wgcna_topGO_results.csv")
#===============================================================================
#
# Enrichment for polarization genes in modules
#
#===============================================================================
module_pol <- module_deg_info %>%
mutate(polarization = ifelse(polarization == "Pro-Inflammatory", "PRO",
ifelse(polarization == "Regulatory", "REG", "ns"))) %>%
group_by(module, polarization) %>%
summarize(n = n()) %>%
pivot_wider(names_from = "polarization",
values_from = "n") %>%
replace_na(list(REG = 0, PRO = 0, ns = 0)) %>%
mutate(n_genes = REG + ns + PRO,
PR_lor = log2(fisher.test(matrix(c(PRO,
REG + ns,
698 - PRO,
11542 - REG - ns),
nrow = 2))$estimate),
PR_lowci = log2(fisher.test(matrix(c(PRO,
REG + ns,
698 - PRO,
11542 - REG - ns),
nrow = 2))$conf.int[1]),
PR_hici = log2(fisher.test(matrix(c(PRO,
REG + ns,
698 - PRO,
11542 - REG - ns),
nrow = 2))$conf.int[2]),
PR_pval = fisher.test(matrix(c(PRO,
REG + ns,
698 - PRO,
11542 - REG - ns),
nrow = 2))$p.value,
RG_lor = log2(fisher.test(matrix(c(REG,
PRO + ns,
138 - REG,
12102 - PRO - ns),
nrow = 2))$estimate),
RG_lowci = log2(fisher.test(matrix(c(REG,
PRO + ns,
138 - REG,
12102 - PRO - ns),
nrow = 2))$conf.int[1]),
RG_hici = log2(fisher.test(matrix(c(REG,
PRO + ns,
138 - REG,
12102 - PRO - ns),
nrow = 2))$conf.int[2]),
RG_pval = fisher.test(matrix(c(REG,
PRO + ns,
138 - REG,
12102 - PRO - ns),
nrow = 2))$p.value) %>%
pivot_longer(PR_lor:RG_pval,
names_to = "measure",
values_to = "value") %>%
mutate(value = ifelse(value == -Inf, -10, value)) %>%
separate(measure,
into = c("polarization", "measure"),
sep = "_") %>%
pivot_wider(names_from = "measure",
values_from = "value") %>%
left_join(eigengene_diet_cor %>%
select(module, cor_diet),
by = "module") %>%
mutate(module_label = as_factor(str_c(module, " (", n_genes, ")")),
polarization_genes = ifelse(polarization == "RG", REG, PRO),
polarization = ifelse(polarization == "RG", "Regulatory", "Pro-Inflammatory"),
order = as.numeric(str_remove(module, "Module ")))
module_pol$padj <- p.adjust(module_pol$pval)
module_pol %>%
filter(n_genes > 30) %>%
ggplot(aes(x = fct_reorder(module_label, desc(order)), y = lor)) +
geom_hline(yintercept = 0, linetype = "dashed", col = "black") +
geom_errorbar(aes(ymin = lowci, ymax = hici, col = ifelse(lowci * hici > 0, polarization, "gray")),
size = 3, alpha = 0.6, width = 0) +
geom_label(aes(label = polarization_genes, fill = ifelse(lowci * hici > 0, polarization, "gray")), col = "white") +
scale_color_manual(breaks = c("Regulatory", "Pro-Inflammatory", "gray"),
values = c(m2_col, m1_col, "gray"),
guide = NULL) +
scale_fill_manual(breaks = c("Regulatory", "Pro-Inflammatory", "gray"),
values = c(m2_col, m1_col, "gray"),
guide = NULL) +
scale_y_continuous(limits = c(-10, 8),
breaks = seq(-10, 8, 2),
labels = c("-Inf", seq(-8, 8, 2))) +
labs(x = NULL,
y = expression(Log[2]*"(Odds Ratio)"),
title = "Enrichment of Polarization Genes in WGCNA Modules") +
coord_flip() +
facet_wrap(~polarization)
Computing file changes ...