--- Author: "Amin Haghani" Paper: "DNA Methylation Networks Underlying Mammalian Traits" Authors: "Amin Haghani1, 2 †*; Caesar Z. Li3, 4 †; Todd R. Robeck5; Joshua Zhang1; Ake T. Lu1, 2; Julia Ablaeva6; Victoria A. Acosta-Rodríguez7; Danielle M. Adams8; Abdulaziz N. Alagaili9, 10; Javier Almunia11; Ajoy Aloysius12; Nabil M.S. Amor13; Reza Ardehali14; Adriana Arneson15, 16; C. Scott Baker17; Gareth Banks18; Katherine Belov19; Nigel C. Bennett20; Peter Black21; Daniel T. Blumstein22, 23; Eleanor K. Bors17; Charles E. Breeze24; Robert T. Brooke25; Janine L. Brown26; Gerald Carter27; Alex Caulton28, 29; Julie M. Cavin30; Lisa Chakrabarti31; Ioulia Chatzistamou32; Andreas S. Chavez27, 33; Hao Chen34; Kaiyang Cheng35; Priscila Chiavellini36; Oi-Wa Choi37, 38; Shannon Clarke28; Joseph A. Cook39; Lisa N. Cooper40; Marie-Laurence Cossette41; Joanna Day42; Joseph DeYoung37, 38; Stacy Dirocco43; Christopher Dold44; Jonathan L. Dunnum39; Erin E. Ehmke45; Candice K. Emmons46; Stephan Emmrich6; Ebru Erbay47, 48, 49; Claire Erlacher-Reid43; Chris G. Faulkes50, 51; Zhe Fei3, 52; Steven H. Ferguson53, 54; Carrie J. Finno55; Jennifer E. Flower56; Jean-Michel Gaillard57; Eva Garde58; Livia Gerber59, 60; Vadim N. Gladyshev61; Rodolfo G. Goya36; Matthew J Grant62; Carla B. Green7; M. Bradley Hanson46; Daniel W. Hart20; Martin Haulena63; Kelsey Herrick64; Andrew N. Hogan65; Carolyn J. Hogg19; Timothy A. Hore66; Taosheng Huang67; Juan Carlos Izpisua Belmonte2; Anna J. Jasinska37, 68, 69; Gareth Jones70; Eve Jourdain71; Olga Kashpur72; Harold Katcher73; Etsuko Katsumata74; Vimala Kaza75; Hippokratis Kiaris76; Michael S. Kobor77; Pawel Kordowitzki78; William R. Koski79; Michael Krützen60; Soo Bin Kwon16, 15; Brenda Larison22, 80; Sang-Goo Lee61; Marianne Lehmann36; Jean-François Lemaître57; Andrew J. Levine81; Xinmin Li82; Cun Li83, 84; Andrea R. Lim1; David T.S. Lin85; Dana M. Lindemann43; Schuyler W. Liphardt86; Thomas J. Little87; Nicholas Macoretta6; Dewey Maddox88; Craig O. Matkin89; Julie A. Mattison90; Matthew McClure91; June Mergl92; Jennifer J. Meudt93; Gisele A. Montano5; Khyobeni Mozhui94; Jason Munshi-South95; William J. Murphy96, 97; Asieh Naderi76; Martina Nagy98; Pritika Narayan62; Peter W. Nathanielsz83, 84; Ngoc B. Nguyen14; Christof Niehrs99, 100; Batsaikhan Nyamsuren101; Justine K. O'Brien42; Perrie O'Tierney Ginn72; Duncan T Odom102, 103; Alexander G. Ophir104; Steve Osborn105; Elaine A. Ostrander65; Kim M. Parsons46; Kimberly C. Paul81; Amy B. Pedersen87; Matteo Pellegrini106; Katharina J. Peters60, 107; Jessica L. Petersen108; Darren W. Pietersen109; Gabriela M. Pinho22; Jocelyn Plassais65; Jesse R. Poganik61; Natalia A. Prado110, 26; Pradeep Reddy111, 2; Benjamin Rey57; Beate R. Ritz112, 113, 81; Jooke Robbins114; Magdalena Rodriguez115; Jennifer Russell105; Elena Rydkina6; Lindsay L. Sailer104; Adam B. Salmon116; Akshay Sanghavi73; Kyle M. Schachtschneider117, 118, 119; Dennis Schmitt120; Todd Schmitt64; Lars Schomacher99; Lawrence B. Schook117, 121; Karen E. Sears22; Ashley W. Seifert12; Aaron B.A. Shafer122; Anastasia V. Shindyapina61; Melanie Simmons45; Kavita Singh123; Ishani Sinha22; Jesse Slone67; Russel G. Snell62; Elham Soltanmohammadi76; Matthew L. Spangler108; Maria Spriggs21; Lydia Staggs43; Nancy Stedman21; Karen J. Steinman124; Donald T Stewart125; Victoria J. Sugrue66; Balazs Szladovits126; Joseph S. Takahashi7, 127; Masaki Takasugi6; Emma C. Teeling128; Michael J. Thompson106; Bill Van Bonn129; Sonja C. Vernes130, 131; Diego Villar132; Harry V. Vinters133; Ha Vu15, 16; Mary C. Wallingford72; Nan Wang37, 38; Gerald S. Wilkinson8; Robert W. Williams134; Qi Yan3, 2; Mingjia Yao3; Brent G. Young54; Bohan Zhang61; Zhihui Zhang6; Yang Zhao6; Peng Zhao14, 135; Wanding Zhou136, 137; Joseph A. Zoller3; Jason Ernst15, 16; Andrei Seluanov138; Vera Gorbunova138; X. William Yang37, 38; Ken Raj139; Steve Horvath1, 2 *" Date: "07-24-2023" output: html_document --- ```{r libraries, message=FALSE} library(easypackages) libraries("readxl", "psych", "tidyr", "dplyr","gplots", "RColorBrewer","limma", "ggplot2", "metap", "anRichment", "clusterProfiler", "RColorBrewer", "ggstatsplot", "gridExtra", "ggupset", "psych", "corrplot", "limma", "edgeR", "gridExtra", "ggupset", "ggrepel", "data.table", "stringr") source("~/Google Drive/Amin documents/Steve projects/Research projects/Human unique methylation project/summarizeFunctions.R") ``` Color palattes ```{r colour pallets} my_palette <- rev(colorRampPalette(brewer.pal(11, "RdBu"))(256))[25:231] my_palette2 <- colorRampPalette(c("green", "white", "red"))(n = 299) my_palette3 <- brewer.pal(9,"RdYlGn") my_palette4 <- brewer.pal(8,"Dark2") my_palette5 <- rev(brewer.pal(9,"BrBG")) my_palette5 <- rev(brewer.pal(9,"PiYG")) my_palette6 <- brewer.pal(9,"YlOrBr")[2:6] my_palette7 <- colorRampPalette(c("yellow", "orange", "red"))(n = 299) my_palette8 <- colorRampPalette(c("blue", "purple"))(n = 299) my_palette9 <- brewer.pal(9,"Purples")[2:9] my_palette10 <- brewer.pal(9,"Oranges")[2:9] blues <- brewer.pal(9,"Blues") reds <- brewer.pal(9,"Reds") my_palette11 <- colorRampPalette(c(rev(blues)[1:3],"grey", "grey" ,"grey","grey", reds[2],reds[2:5]))(n = 299) my_palette12 <- colorRampPalette(c("blue", "grey", "red"))(n = 299) my_palette13 <- rev(brewer.pal(9,"Spectral")) ``` ```{r import data, message=FALSE, warning=FALSE} samplesNoMars <- readRDS("samplesNoMars.RDS") bValsNoMars <- readRDS("DNAmDataNoMarsupials.RDS") ``` ```{r} multiSampels <- samplesNoMars %>% group_split(SpeciesLatinName,Tissue) names(multiSampels) <- sapply(multiSampels, function(x){paste(x$SpeciesLatinName[1], x$Tissue[1], sep = "_")}) multiSampels <- plyr::compact(plyr::llply(multiSampels, function(x){ if(nrow(x)<50){x <- NULL} return(x) })) multiExpr <- plyr::llply(multiSampels, function(x){ bval <- bValsNoMars %>% dplyr::select(x$Basename) bval <- list(data = t(bval)) }) #multiExpr <- multiExpr[1:10] #saveRDS(multiExpr, "multiExpr.RDS") ``` ```{r consensus WGCNA} # Check that the data has the correct format for many functions operating on multiple sets: exprSize = checkSets(multiExpr) exprSize # Define data set dimensions nGenes = exprSize$nGenes; nSamples = exprSize$nSamples; # Get the number of sets in the multiExpr structure. nSets = checkSets(multiExpr)$nSets # Choose a set of soft-thresholding powers powers = c(c(1), seq(from = 2, to=20, by=2)); # Initialize a list to hold the results of scale-free analysis powerTables = vector(mode = "list", length = nSets); # Call the network topology analysis function for each set in turn for (set in 1:nSets) powerTables[[set]] = list(data = pickSoftThreshold(multiExpr[[set]]$data, powerVector=powers, verbose = 2)[[2]]); collectGarbage(); names(powerTables) <- names(multiExpr) dat <- powerTables[[1]]$data powers <- rbindlist(lapply(powerTables, function(x){ dat <- x$data }), idcol = "set", use.names = TRUE) #saveRDS(powers, "ConsensusPowerTable.RDS") p1 <- ggplot(powers, aes(x=Power, y=`SFT.R.sq` ))+geom_point(aes(color=as.character(set)))+geom_line(aes(color=as.character(set)))+geom_smooth(method = "loess") pdf("ConsensusPowerTrend.pdf", width = 4, height = 4) p1 dev.off() power <- powers %>% group_by(Power) %>% summarise(medianRsq = median(`SFT.R.sq`)) %>% top_n(1, medianRsq) powers <- powers %>% group_by(set) %>% top_n(1, `SFT.R.sq`)%>% top_n(1, -Power) %>% ungroup() %>% tibble::column_to_rownames(var = "set") powers <- powers[names(multiExpr),] ``` ```{r soft power} # Initialize an appropriate array to hold the adjacencies adjacencies = array(0, dim = c(nSets, nGenes, nGenes)); # Calculate adjacencies in each individual data set for (set in 1:nSets) adjacencies[set, , ] = abs(cor(multiExpr[[set]]$data, use = "p"))^powers$Power[set]; # Initialize an appropriate array to hold the TOMs TOM = array(0, dim = c(nSets, nGenes, nGenes)); # Calculate TOMs in each individual data set for (set in 1:nSets) TOM[set, , ] = TOMsimilarity(adjacencies[set, , ]); ``` Since the data is too large, I decided to split them in chunk and create a concensus TOM ```{r split in chunk analysis} x <- seq_along(1:nSets) sets <- split(1:nSets, ceiling(x/5)) z <- lapply(1:length(sets), function(x){ adjacencies = array(0, dim = c(length(sets[[x]]), nGenes, nGenes)); # Calculate adjacencies in each individual data set for (set in 1:length(sets[[x]])) adjacencies[set, , ] = abs(cor(multiExpr[[set]]$data, use = "p"))^powers$Power[set]; # Initialize an appropriate array to hold the TOMs TOM = array(0, dim = c(length(sets[[x]]), nGenes, nGenes)); # Calculate TOMs in each individual data set for (set in 1:length(sets[[x]])) TOM[set, , ] = TOMsimilarity(adjacencies[set, , ]); #saveRDS(TOM, paste("TOM_", x, ".RDS" ,sep = "")) # Define the reference percentile scaleP = 0.95 # Set RNG seed for reproducibility of sampling set.seed(12345) nSet <- dim(TOM)[1] # Sample sufficiently large number of TOM entries nSamples = as.integer(1/(1-scaleP) * 1000); # Choose the sampled TOM entries scaleSample = sample(nGenes*(nGenes-1)/2, size = nSamples) TOMScalingSamples = list(); # These are TOM values at reference percentile scaleQuant = rep(1, nSet) # Scaling powers to equalize reference TOM values scalePowers = rep(1, nSet) # Loop over sets for (set in 1:nSet){ # Select the sampled TOM entries TOMScalingSamples[[set]] = as.dist(TOM[set, , ])[scaleSample] # Calculate the 95th percentile scaleQuant[set] = quantile(TOMScalingSamples[[set]], probs = scaleP, type = 8); assign("scaleQuant1", scaleQuant[1], envir = .GlobalEnv) # Scale the TOM if ((x==1&set>1)|x>1) { scalePowers[set] = log(scaleQuant1)/log(scaleQuant[set]); TOM[set, ,] = TOM[set, ,]^scalePowers[set]; } } z1 <- lapply(1:dim(TOM)[1], function(x){TOM[x, , ]}) consensusTOM <- do.call(pmin,z1) saveRDS(consensusTOM, paste("TOM_", x, ".RDS" ,sep = "")) return(consensusTOM) }) consensusTOM <- do.call(pmin,z) rm(z) saveRDS(consensusTOM, "consensusTOM.RDS") ``` ```{r scaling topology ovelap} # Define the reference percentile scaleP = 0.95 # Set RNG seed for reproducibility of sampling set.seed(12345) # Sample sufficiently large number of TOM entries nSamples = as.integer(1/(1-scaleP) * 1000); # Choose the sampled TOM entries scaleSample = sample(nGenes*(nGenes-1)/2, size = nSamples) TOMScalingSamples = list(); # These are TOM values at reference percentile scaleQuant = rep(1, nSets) # Scaling powers to equalize reference TOM values scalePowers = rep(1, nSets) # Loop over sets for (set in 1:nSets){ # Select the sampled TOM entries TOMScalingSamples[[set]] = as.dist(TOM[set, , ])[scaleSample] # Calculate the 95th percentile scaleQuant[set] = quantile(TOMScalingSamples[[set]], probs = scaleP, type = 8); # Scale the TOM if (set>1) { scalePowers[set] = log(scaleQuant[1])/log(scaleQuant[set]); TOM[set, ,] = TOM[set, ,]^scalePowers[set]; } } ``` ```{r calculating consensus TOM} z <- lapply(1:dim(TOM)[1], function(x){TOM[x, , ]}) consensusTOM <- do.call(pmin,z) rm(z) ``` ```{r} # Clustering consTree = hclust(as.dist(1-consensusTOM), method = "average"); # We like large modules, so we set the minimum module size relatively high: minModuleSize = 30; # Module identification using dynamic tree cut: unmergedLabels = cutreeDynamic(dendro = consTree, distM = 1-consensusTOM, deepSplit = 2, cutHeight = 0.995, minClusterSize = minModuleSize, pamRespectsDendro = FALSE ); unmergedColors = labels2colors(unmergedLabels) ``` ```{r merge modules} # Calculate module eigengenes unmergedMEs = multiSetMEs(multiExpr, colors = NULL, universalColors = unmergedColors) # Calculate consensus dissimilarity of consensus module eigengenes consMEDiss = consensusMEDissimilarity(unmergedMEs); # Cluster consensus modules consMETree = hclust(as.dist(consMEDiss), method = "average"); # Plot the result sizeGrWindow(7,6) par(mfrow = c(1,1)) plot(consMETree, main = "Consensus clustering of consensus module eigengenes", xlab = "", sub = "") abline(h=0.25, col = "red") merge = mergeCloseModules(multiExpr, unmergedLabels, cutHeight = 0.25, verbose = 3) # Numeric module labels moduleLabels = merge$colors; # Convert labels to colors moduleColors = labels2colors(moduleLabels) # Eigengenes of the new merged modules: consMEs = merge$newMEs; save(consMEs, moduleColors, moduleLabels, consTree, file = "Consensus-NetworkConstruction-mammals.RData") ``` ```{r plot} pdf("consensus tree 1.pdf", width = 9, height = 6) sizeGrWindow(9,6) plotDendroAndColors(consTree, cbind(unmergedColors, moduleColors), c("Unmerged", "Merged"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) dev.off() pdf("consensus tree 2.pdf", width = 9, height = 6) sizeGrWindow(9,6) plotDendroAndColors(consTree, cbind(moduleColors), c("Modules"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) dev.off() ``` ## Read consesus WGCNA ```{r} load("WGCNA results/Consensus WGCNA 57 species tissues/Consensus-NetworkConstruction-mammals.RData") ``` ```{r} datExpr <- t(bValsNoMars) consMEList = moduleEigengenes(datExpr, colors = matchedCons$consMatched) consMEs = consMEList$eigengenes saveRDS(consMEs, "WGCNA results/Consensus WGCNA 57 species tissues/newConsMEsMatchedCols.RDS") ```