1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
# Title: Perfrom functional enrichemnt using GSEA based on GO ontology
# Author: Jingxun Chen
# Date: code compiled on 20220805
# Related publication: Andrew McKay, Emma K. Costa, and Jingxun Chen, eLife, 2022


# ------------------------------------------------------------------
# Set up 
# ------------------------------------------------------------------

# Set wd to the current directory
setwd("/Users/jingxun/Dropbox/KillifishFeederPaper_AndrewMcKay/Revision/Code/RNAseq_Code_check/GSEA")

# Load packages
library("DOSE")
library("clusterProfiler") 
library("org.Hs.eg.db")

# To convert NCBI ids to human entrez ids. This is needed to run the package. There are ways to adapt it for nfur only, but for now I do everything based on human orthologs
hSymbols = read.table("Input/NCBI-Human-orthologs.txt", head = T, sep = "\t")


# ------------------------------------------------------------------
# Select input
# ------------------------------------------------------------------
# Select a proper input for GSEA by removing the '#' sign. 
# Add a '#' sign for input not analyzed currently. 
# Run each input one at a time.

#data <- read.csv("Input/liver_Female_DRoverAL_DEG_220401.csv")
data <- read.csv("Input/liver_Male_DRoverAL_DEG_220401.csv")

# ------------------------------------------------------------------
# Generate GSEA input, which is a ranked list of genes.
# ------------------------------------------------------------------
# calculate ranks based on (-log10(p-value) x log2FoldChange)

data$mlog10QvalxFC <- (-log10(data$pvalue))*(data$log2FoldChange)
data <- subset(data, select = c(Gene, mlog10QvalxFC))
head(data)


# ------------------------------------------------------------------------------------------------
# Get human ortholog symbols based on the BLAST results file using org.Hs.eg.db package
# ------------------------------------------------------------------------------------------------

# Some Ids will fail to map and will be ignored
dataH = merge(hSymbols, data, by.x = "ncbi", by.y = "Gene") 
entrezIds = bitr(as.character(dataH[,2]), fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db") # Get entrez ids for annotation
### Note: "Warning message: 4.13% of input gene IDs are fail to map" ###
dataHE = merge(dataH, entrezIds, by.x = "human", by.y = "SYMBOL") # Get human symbols
head(dataHE)

# There can be duplicate values because of paralogs. I take the average of those for quantitative score
dataHE$mlog10QvalxFC <- as.numeric(dataHE$mlog10QvalxFC)
unique = aggregate(dataHE[,3], list(dataHE$human), mean)
dataHEU = merge(unique, entrezIds, by.x = "Group.1", by.y = "SYMBOL")
colnames(dataHEU) = c("human", "mlog10QvalxFC", "entrez")
head(dataHEU)

geneList = dataHEU[,2]  # gene list for GO 
names(geneList) = as.character(dataHEU[,1]) # with entrez ids as names
  
# *** Sort the gene list based on quantitative score in decreasing order. This is critical for GSEA  
geneList = sort(geneList, decreasing = TRUE)
  
head(geneList)
tail(geneList)
  


# ------------------------------------------------------------------------------------------------
# Do different enrichment analyses
# ------------------------------------------------------------------------------------------------

# --------------------- Gene Ontology -------------------
ego3 <- gseGO(geneList     = geneList,
              OrgDb        = org.Hs.eg.db,
              keyType      = 'SYMBOL',
              ont          = c("ALL"),
              pvalueCutoff = 1)

head(ego3)

# Select proper outfile name based on input
#write.table(ego3, "Output/liver_Female_DRoverAL_DEG_220401_GOGSEA.csv", sep = ",", quote = T, row.names = F)
write.table(ego3, "Output/liver_Male_DRoverAL_DEG_220401_GOGSEA.csv", sep = ",", quote = T, row.names = F)


# ------------------------------------------------------------------
# Generate the killifish & human gene name conversion file  
# ------------------------------------------------------------------
# Select the proper output name based on input

#write.table(dataHE, "Output/liver_Female_DRoverAL_DEG_220401_GOGSEA_HumanName.csv", sep = ",", quote = T, row.names = F)
write.table(dataHE, "Output/liver_Male_DRoverAL_DEG_220401_GOGSEA_HumanName.csv", sep = ",", quote = T, row.names = F)


# ------------------------------------------------------------------
# Clear list to run the script again  
# ------------------------------------------------------------------

rm(list=ls()) 

sessionInfo()