https://github.com/emkcosta/KillifishAutomaticFeederPaper
Tip revision: 451bc5d78cd266a00b53612585d201d404f73920 authored by Emma on 24 October 2022, 20:33:32 UTC
Delete SuppFig2 directory
Delete SuppFig2 directory
Tip revision: 451bc5d
4_GSEA_220805_codechk.R
# Title: Perfrom functional enrichemnt using GSEA based on GO ontology
# Author: Jingxun Chen
# Date: code compiled on 20220805
# Related publication: Andrew McKay, Emma K. Costa, and Jingxun Chen, eLife, 2022
# ------------------------------------------------------------------
# Set up
# ------------------------------------------------------------------
# Set wd to the current directory
setwd("/Users/jingxun/Dropbox/KillifishFeederPaper_AndrewMcKay/Revision/Code/RNAseq_Code_check/GSEA")
# Load packages
library("DOSE")
library("clusterProfiler")
library("org.Hs.eg.db")
# To convert NCBI ids to human entrez ids. This is needed to run the package. There are ways to adapt it for nfur only, but for now I do everything based on human orthologs
hSymbols = read.table("Input/NCBI-Human-orthologs.txt", head = T, sep = "\t")
# ------------------------------------------------------------------
# Select input
# ------------------------------------------------------------------
# Select a proper input for GSEA by removing the '#' sign.
# Add a '#' sign for input not analyzed currently.
# Run each input one at a time.
#data <- read.csv("Input/liver_Female_DRoverAL_DEG_220401.csv")
data <- read.csv("Input/liver_Male_DRoverAL_DEG_220401.csv")
# ------------------------------------------------------------------
# Generate GSEA input, which is a ranked list of genes.
# ------------------------------------------------------------------
# calculate ranks based on (-log10(p-value) x log2FoldChange)
data$mlog10QvalxFC <- (-log10(data$pvalue))*(data$log2FoldChange)
data <- subset(data, select = c(Gene, mlog10QvalxFC))
head(data)
# ------------------------------------------------------------------------------------------------
# Get human ortholog symbols based on the BLAST results file using org.Hs.eg.db package
# ------------------------------------------------------------------------------------------------
# Some Ids will fail to map and will be ignored
dataH = merge(hSymbols, data, by.x = "ncbi", by.y = "Gene")
entrezIds = bitr(as.character(dataH[,2]), fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db") # Get entrez ids for annotation
### Note: "Warning message: 4.13% of input gene IDs are fail to map" ###
dataHE = merge(dataH, entrezIds, by.x = "human", by.y = "SYMBOL") # Get human symbols
head(dataHE)
# There can be duplicate values because of paralogs. I take the average of those for quantitative score
dataHE$mlog10QvalxFC <- as.numeric(dataHE$mlog10QvalxFC)
unique = aggregate(dataHE[,3], list(dataHE$human), mean)
dataHEU = merge(unique, entrezIds, by.x = "Group.1", by.y = "SYMBOL")
colnames(dataHEU) = c("human", "mlog10QvalxFC", "entrez")
head(dataHEU)
geneList = dataHEU[,2] # gene list for GO
names(geneList) = as.character(dataHEU[,1]) # with entrez ids as names
# *** Sort the gene list based on quantitative score in decreasing order. This is critical for GSEA
geneList = sort(geneList, decreasing = TRUE)
head(geneList)
tail(geneList)
# ------------------------------------------------------------------------------------------------
# Do different enrichment analyses
# ------------------------------------------------------------------------------------------------
# --------------------- Gene Ontology -------------------
ego3 <- gseGO(geneList = geneList,
OrgDb = org.Hs.eg.db,
keyType = 'SYMBOL',
ont = c("ALL"),
pvalueCutoff = 1)
head(ego3)
# Select proper outfile name based on input
#write.table(ego3, "Output/liver_Female_DRoverAL_DEG_220401_GOGSEA.csv", sep = ",", quote = T, row.names = F)
write.table(ego3, "Output/liver_Male_DRoverAL_DEG_220401_GOGSEA.csv", sep = ",", quote = T, row.names = F)
# ------------------------------------------------------------------
# Generate the killifish & human gene name conversion file
# ------------------------------------------------------------------
# Select the proper output name based on input
#write.table(dataHE, "Output/liver_Female_DRoverAL_DEG_220401_GOGSEA_HumanName.csv", sep = ",", quote = T, row.names = F)
write.table(dataHE, "Output/liver_Male_DRoverAL_DEG_220401_GOGSEA_HumanName.csv", sep = ",", quote = T, row.names = F)
# ------------------------------------------------------------------
# Clear list to run the script again
# ------------------------------------------------------------------
rm(list=ls())
sessionInfo() 