################################################################################ # # Transcriptomic analysis # ################################################################################ # Format R session ------------------------------------------------------------- # Load packages library(biomaRt) library(cobs) library(doParallel) library(edgeR) library(EMMREML) library(limma) library(tidyverse) select <- dplyr::select # Set theme and colors for graphing theme_set(theme_classic(base_size = 20, base_family = "")) # graphing defaults wes_col <- "#D55E00" # specify color for Western diet med_col <- "#56B4E9" # specify color for Mediterranean diet # Import raw data load("data_files/processed_data_2.RData") # PCA on Gene Expression ------------------------------------------------------- # Conduct principal component analysis on the correlation matrix of normalized # residual gene expression. # PCA GE_PCA <- prcomp(cor(r_matrix)) sum_GE_PCA <- summary(GE_PCA) # store PC importance information sample_info$ge_pc1 <- GE_PCA$x[, 1] # add PC projection to Sample Info matrix # T test comparing PC1 projection between diet groups t.test(sample_info[sample_info$diet == "MD", "ge_pc1"], sample_info[sample_info$diet == "WD", "ge_pc1"]) # Generate Figure 1A figure_1a <- sample_info %>% ggplot(aes(x = diet, y = ge_pc1, fill = diet, color = diet)) + geom_hline(yintercept = 0, size = 1, linetype = "dotted") + geom_boxplot(outlier.alpha = 0, col = "black", alpha = 0.3, coef = 0) + geom_jitter(size = 2, width = 0.1) + labs(y = "Projection onto Gene Expression PC1") + scale_fill_manual(name = "Diet", breaks = c("WD", "MD"), values = c(wes_col, med_col), labels = c("WD", "MD")) + scale_color_manual(name = "Diet", breaks = c("WD", "MD"), values = c(wes_col, med_col), labels = c("WD", "MD")) + scale_x_discrete(labels = c("WD" = "Western", "MD" = "Mediterranean")) + theme(axis.title.x = element_blank(), legend.position = "none") saveRDS(figure_1a, file = "plots/figure_1a.RDS") # Gene expression (GE) ~ diet -------------------------------------------------- # Model normalized residual gene expression as a function of diet using linear # mixed effects model. # Create Z matrix Z_matrix <- diag(1, nrow = 35) # identity matrix # Set the parameters for parallel computing for lmm using EMMREML ncores <- detectCores(logical = TRUE) # find number of available cores clus <- makeCluster(ncores, setup_strategy = "sequential") # create cluster registerDoParallel(cores = ncores) clusterExport(clus, varlist = c("r_matrix", "sample_info", "Z_matrix", "kinship"), envir = environment()) # initiate local cluster # Model residual gene expression as a function of diet in genewise models emma_GE_diet <- data.frame(t(parApply(clus, r_matrix, 1, function(y){ library(EMMREML) emma <- emmreml(y = y, # model each gene X = model.matrix(~ sample_info$diet_west), # design matrix Z = as.matrix(Z_matrix), # identity matrix K = as.matrix(kinship), # relatedness matrix varbetahat = T, varuhat = T, PEVuhat = T, test = T) return(c(emma$betahat[2], emma$pvalbeta[2, "none"], emma$varbetahat[2])) }))) colnames(emma_GE_diet) <- c("es_diet", "diet_pval", "diet_var") # rename columns # Calculate standardized betas emma_GE_diet$std_beta_diet <- emma_GE_diet$es_diet / sqrt(emma_GE_diet$diet_var) # Empirical False Discovery Rate (FDR) ----------------------------------------- # Model the effect of diet (or DAB) on gene expression in 1000 permutations, # permuting the meta data in order to determine a null distribtuion of p-values. # Then, use the permuted null distribution to calculate an empirical FDR, # (Snyder-Mackler et al., 2016). Due to computational demands, this is best # completed on a high performance computing cluster. # Create a biomart object with ensembl mfas genes mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mfascicularis_gene_ensembl") # Create a biomart object of GO terms associated with genes from study mfas_genes <- getBM(attributes = c("ensembl_gene_id", "external_gene_name"), filters = "ensembl_gene_id", values = unlist(emma_GE_diet$stable_ID), mart = mart) # Create data frame combining diet and DAB models emma_GE_diet$stable_ID <- row.names(emma_GE_diet) emma_GE_diet <- as_tibble(emma_GE_diet) %>% arrange(stable_ID) # Add gene names to DEG_info table DEG_info <- emma_GE_diet %>% left_join(mfas_genes, by = c("stable_ID" = "ensembl_gene_id")) # Save output data ------------------------------------------------------------- # Save data for downstream analyses save(gene_counts, sample_info, r_matrix, kinship, DEG_info, emma_GE_diet, file = "data_files/processed_data_3.RData")