https://github.com/emkcosta/KillifishAutomaticFeederPaper
Tip revision: 451bc5d78cd266a00b53612585d201d404f73920 authored by Emma on 24 October 2022, 20:33:32 UTC
Delete SuppFig2 directory
Delete SuppFig2 directory
Tip revision: 451bc5d
9_sexDEG_enrich_220805_codechk.R
# Title: PCA and DEseq2 analysis comparing across tissues, sexes, and dietary conditions
# Author: Jingxun Chen
# Date: code compiled on 20220805
# Related publication: Andrew McKay, Emma K. Costa, and Jingxun Chen, eLife, 2022
# ------------------------------------------------------------------
# Set up
# ------------------------------------------------------------------
# Set wd to the current directory and seed (for reproducible results)
setwd("/Users/jingxun/Dropbox/KillifishFeederPaper_AndrewMcKay/Revision/Code/RNAseq_Code_check/sexDEG_enrich")
set.seed(1234)
# load package
library(dplyr)
# ------------------------------------------------------------------
# Load data and find overlaps
# ------------------------------------------------------------------
# Pick either the sexDEGs identified in AL or those identified in DR as input:
# sexDEG <- read.csv("Input/liver_SexDiverged_AL_MoverF_DEG_220426.csv", header = T)
sexDEG <- read.csv("Input/liver_SexDiverged_DR_MoverF_DEG_220714.csv", header = T)
# Load the dietDEGs identified in males or in females
dietDEG_f <- read.csv("Input/liver_Female_DRoverAL_DEG_220401.csv", header = T)
dietDEG_m <- read.csv("Input/liver_Male_DRoverAL_DEG_220401.csv", header = T)
# Find the number of genes overlapped between female dietDEG and male dietDEG
dietDEG_f_sig <- subset(dietDEG_f, padj < 0.05)
dietDEG_m_sig <- subset(dietDEG_m, padj < 0.05)
overlap <- inner_join(dietDEG_m_sig, dietDEG_f_sig, by = 'Gene')
length(overlap$Gene)
# ------------------------------------------------------------------
# Select input to run the following common script
# ------------------------------------------------------------------
# Pick either male dietDEGs or female dietDEGs as input
# dietDEG <- dietDEG_f
dietDEG <- dietDEG_m
# ------------------------------------------------------------------
# common script for males and females - start here **************
# ------------------------------------------------------------------
# Rename the columns to reflect whether they are from the sexDEG vs. dietDEG dataframes
colnames(sexDEG) <- c('Gene', 'baseMean_s', 'log2FoldChange_s', 'lfcSE_s', 'stat_s', 'pvalue_s', 'padj_s')
colnames(dietDEG) <- c('Gene', 'baseMean_d', 'log2FoldChange_d', 'lfcSE_d', 'stat_d', 'pvalue_d', 'padj_d')
# Keep all genes that are only present in both data frames
join_sexDiet <- inner_join(sexDEG, dietDEG, by = 'Gene')
# Remove the genes with 'NA' for padj (in either sexDEG, dietDEG, or both)
join_sexDiet_noNA <- subset(join_sexDiet, (!is.na(join_sexDiet$padj_s)) & (!is.na(join_sexDiet$padj_d)))
# Note: dietDEG_female joined with sexDEG_DR = 16426, dietDEG_male joined with sexDEG_DR = 17732
# Note: dietDEG_female joined with sexDEG_AL = 16426, dietDEG_male joined with sexDEG_AL = 17732
# Subset based on criteria
sexDEG_sig <- subset(join_sexDiet_noNA, padj_s < 0.05)
sexDEG_notsig <- subset(join_sexDiet_noNA, padj_s >= 0.05)
dietDEG_sig <- subset(join_sexDiet_noNA, padj_d < 0.05)
dietDEG_notsig <- subset(join_sexDiet_noNA, padj_d >= 0.05)
# Find genes that are shared in sexDEG and dietDEG lists
share_notsig <- inner_join(sexDEG_notsig, dietDEG_notsig , by = 'Gene')
share_sig <- inner_join(sexDEG_sig, dietDEG_sig , by = 'Gene')
# ------------------------------ Find the non-dietDEG 'control' gene set ------------------------------
# The control gene set shares the same expression distribution and gene number as dietDEGs
nCounts <- read.csv('Input/CountsNormDESeq2_Liver_ALDR_splitMF_220402.csv', header = T)
# Find the average of normalized counts, using all data
rownames(nCounts) <- nCounts$Gene
nCounts <- subset(nCounts, select = -Gene) # remove Gene column to do math in the next step
nCounts$ave <- rowMeans(nCounts) # find the average counts of all samples
nCounts <- cbind(Gene = rownames(nCounts), nCounts) # add the Gene column back for 'join' function later
nCounts <- subset(nCounts, select = c(Gene, ave)) # clean up dataframe
# Find the average expression for dietDEGs, using all data
dietDEG_sig_nCounts_all <- left_join(dietDEG_sig, nCounts, by ='Gene') # find counts for dietDEG_sig
dietDEG_sig_nCounts_all <- subset(dietDEG_sig_nCounts_all, select = c(Gene, ave)) # keep only the average column
n <- length(dietDEG_sig$Gene) # calculate the number of genes
nonDietDEG_ref <- data.frame(matrix(ncol = 4, nrow = n)) # create an empty dataframe
colnames(nonDietDEG_ref) <- c('Gene', 'nonDietDEG_RefGenes', 'num_sexDEG', 'num_notSexDEG') # name each column
# Find genes whose expression are +/- 2% of each diet DEG's normalized count
for (i in (1:n)) {
nCounts_i <- subset(nCounts, (nCounts$ave < (dietDEG_sig_nCounts_all$ave[i])*1.02) & (nCounts$ave > (dietDEG_sig_nCounts_all$ave[i]*0.98)))# within 2 % of expression (note: 1% results in 1 DEG without REF gene)
nCounts_i <- subset(nCounts_i, Gene!= dietDEG_sig_nCounts_all$Gene[i]) # remove the dietDEG from the dataframe
nonDietDEG_ref$Gene[i] <- dietDEG_sig_nCounts_all$Gene[i] # save the dietDEG name
a <- list(nCounts_i$Gene)
nonDietDEG_ref$nonDietDEG_RefGenes[i] <- a # save the non-dietDEG reference gene list
nonDietDEG_ref$num_sexDEG[i] <- length((inner_join(nCounts_i,sexDEG_sig, by = 'Gene')$Gene)) # find the overlap between sexDEG and this non-dietDEG reference gene list
nonDietDEG_ref$num_notSexDEG[i] <- length(nCounts_i$Gene)-nonDietDEG_ref$num_sexDEG[i] # find the non-sexDEG, non-dietDEG list
}
# -------------------------- Sanity checks --------------------------
# sexDEG enrichment in the not-diet DEG reference gene list
sexDEG_ref <- sum(nonDietDEG_ref$num_sexDEG)
notsexDEG_ref <- sum(nonDietDEG_ref$num_notSexDEG)
ratio = sexDEG_ref/(sexDEG_ref+notsexDEG_ref) #female = 2per: 6.3808%; male = 2per: 6.5584%
# find the number of dietDEGs with no reference genes within 2% normalized counts away from their expression
zero_nonDietDEG_ref <- subset(nonDietDEG_ref, (num_sexDEG=='0') &(num_notSexDEG == '0'))
percent_Non_zero <- (n - length(zero_nonDietDEG_ref$Gene))/n # should be 100% non-zero
# -------------------------- Create non-diet DEG reference gene sets --------------------------
nonDietDEG_ref_set <- data.frame(matrix(ncol = 1, nrow = n))
colnames(nonDietDEG_ref_set) <- c('Gene')
# -------Run Fisher's exact test using the non-Diet DEG control gene set ---------
# Simulate the non-dietDEG control gene set for 1000 times, and run Fisher's exact test each time.
# Set up the data table
refset <- data.frame(matrix(ncol=1, nrow=n)) # create a new dataframe
colnames(refset) <- c('Gene') # set column names
ref_table <- data.frame(matrix(ncol=2, nrow=1000)) # create a new dataframe
colnames(ref_table) <- c('enrichment','pvalue') # set column names
# 1000-times Loop to make the control set, test, and store enrichment & p-value each time.
for (k in (1:1000)){
for (i in (1:n)) {
refset_i <- data.frame(nonDietDEG_ref$nonDietDEG_RefGenes[i]) # for each gene, convert its control gene set from being a list to dataframe
colnames(refset_i) <- c('Gene') # set column name
refGene <- sample(refset_i$Gene, 1) # randomly select 1 member from the control gene set
refset$Gene[i] <- refGene # save the selected gene.
} # At the end of this loop, each gene's control gene constitutes a member of the non-Diet DEG control gene set.
a <- nrow(inner_join(refset, sexDEG_sig, by = 'Gene'))
c <- nrow(refset) - a
b <- nrow(sexDEG_sig) - a
d <- nrow(share_notsig)
dat <- data.frame ('dietRefDEG_sig' = c (a, c), 'dietRefDEG_notsig' = c(b, d), row.names = c ('sexDEG_sig', 'sexDEG_notsig'), stringsAsFactors = FALSE)
colnames(dat) <- c('dietRefDEG_sig', 'dietRefDEG_notsig')
test <- fisher.test(dat)
ref_table$pvalue[k] <- test$p.value
ref_table$enrichment[k] <- a/n
}
# ------------------------------------------------------------------
# common script for males and females - end here **************
# ------------------------------------------------------------------
# Below are the codes for exporting data & plotting graphs. Select the proper file name for each run.
# ----------------------------------------------------------------------------------------------
# Export data
# ----------------------------------------------------------------------------------------------
# Export p-values and enrichment
# write.csv(ref_table, 'Output/FisherExactTest_2per_NonDietDEG_REF_f_220706.csv')
# write.csv(ref_table, 'Output/FisherExactTest_2per_NonDietDEG_REF_m_220706.csv')
# write.csv(ref_table, 'Output/FisherExactTest_2per_DRsexDEG_NonDietDEG_REF_f_220723.csv')
write.csv(ref_table, 'Output/FisherExactTest_2per_DRsexDEG_NonDietDEG_REF_m_220723.csv')
# ---------------------------------------------------------------------------------------------
# Plotting Pi charts
# ---------------------------------------------------------------------------------------------
# *************** PART 1: Observed data ***************
# Run each section separately for each input chosen above.
# # ------------ The percentage of AL-sexDEGs for the female dietDEGs ------------
# both <- nrow(share_sig)
# dietOnly <- nrow(dietDEG_sig) - both
# slices <- c(both, dietOnly)
# lbls <- c("sex-DEG", "not sex-DEG")
# pct <- round(slices/sum(slices)*100)
# lbls <- paste(lbls, pct) # add percents to labels
# lbls <- paste(lbls,"%",sep="") # ad % to labels
#
# pdf('Plots/PiChart_Liver_ALsexDEG_dieDEG_enrich_f_220724.pdf')
# pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of AL-sex-DEGs among the liver female diet-DEGs")
# dev.off()
# ------------ The percentage of AL-sexDEGs for the male dietDEGs ------------
# both <- nrow(share_sig)
# dietOnly <- nrow(dietDEG_sig) - both
# slices <- c(both, dietOnly)
# lbls <- c("sex-DEG", "not sex-DEG")
# pct <- round(slices/sum(slices)*100)
# lbls <- paste(lbls, pct) # add percents to labels
# lbls <- paste(lbls,"%",sep="") # ad % to labels
#
# pdf('Plots/PiChart_Liver_ALsexDEG_dieDEG_enrich_m_220724.pdf')
# pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of AL-sex-DEGs among the liver male diet-DEGs")
# dev.off()
# # ------------ The percentage of DR-sexDEGs for the female dietDEGs ------------
# both <- nrow(share_sig)
# dietOnly <- nrow(dietDEG_sig) - both
# slices <- c(both, dietOnly)
# lbls <- c("sex-DEG", "not sex-DEG")
# pct <- round(slices/sum(slices)*100)
# lbls <- paste(lbls, pct) # add percents to labels
# lbls <- paste(lbls,"%",sep="") # ad % to labels
#
# pdf('Plots/PiChart_Liver_DRsexDEG_dieDEG_enrich_f_220724.pdf')
# pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of DR-sex-DEGs among the liver female diet-DEGs")
# dev.off()
#
# # ------------ The percentage of DR-sexDEGs for the male dietDEGs ------------
both <- nrow(share_sig)
dietOnly <- nrow(dietDEG_sig) - both
slices <- c(both, dietOnly)
lbls <- c("sex-DEG", "not sex-DEG")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pdf('Plots/PiChart_Liver_DRsexDEG_dieDEG_enrich_m_220724.pdf')
pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of DR-sex-DEGs among the liver male diet-DEGs")
dev.off()
#
# *************** PART 2: Simulated control data ***************
# load data
# ref_table_al_f <- read.csv('Output/FisherExactTest_2per_NonDietDEG_REF_f_220706.csv', row.names = 1)
# ref_table_al_m <- read.csv('Output/FisherExactTest_2per_NonDietDEG_REF_m_220706.csv', row.names = 1)
# ref_table_dr_f <- read.csv('Output/FisherExactTest_2per_DRsexDEG_NonDietDEG_REF_f_220723.csv', row.names = 1)
ref_table_dr_m <- read.csv('Output/FisherExactTest_2per_DRsexDEG_NonDietDEG_REF_m_220723.csv', row.names = 1)
# select data to run the common script in the next step
# table <- ref_table_al_f
# table <- ref_table_al_m
# table <- ref_table_dr_f
table <- ref_table_dr_m
# ---- common script starts here **** -----
both <- median(table$enrichment)
dietOnly <- 1 - both
slices <- c(both, dietOnly)
lbls <- c("sex-DEG", "not sex-DEG")
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
# ---- common script ends here **** -----
# Pick the section to run based on which input 'table' was selected in Line 248.
# Plotting
# pdf('Plots/PiChart_Liver_ALsexDEG_dieDEG_enrich_ctrl_f_220724.pdf')
# pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of AL-sex-DEGs among the liver female non-diet-DEGs controls")
# dev.off()
# pdf('Plots/PiChart_Liver_ALsexDEG_dieDEG_enrich_ctrl_m_220724.pdf')
# pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of AL-sex-DEGs among the liver male non-diet-DEGs controls")
# dev.off()
# pdf('Plots/PiChart_Liver_DRsexDEG_dieDEG_enrich_ctrl_f_220724.pdf')
# pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of DR-sex-DEGs among the liver female non-diet-DEGs controls")
# dev.off()
#
pdf('Plots/PiChart_Liver_DRsexDEG_dieDEG_enrich_ctrl_m_220724.pdf')
pie(slices,labels = lbls, col=c("gray","white"), main="Percentage of DR-sex-DEGs among the liver male non-diet-DEGs controls")
dev.off()
#
sessionInfo()
