R_pgm5_EWAS-GWAS_overlap
#(5) EWAS-GWAS overlap
#The following R code is a function to compute the summary statistics for our EWAS-GWAS overlap analaysis based on hypergeometric tests. The parameters in the function are: ORDER is a dummy variable, aar0 is data frame of EWAS summary statistics that lists the variables as described in head(aars), other0 lists the gene P value generated by MAGENTA, and HGNC gene symbol from a test GWAS, other.name0 is the label of the test GWAS, cutoff is 2.5% in our study, n.topcpg0=1000 (the top number of CpGs) in our study, and background.hg is the background based on the genomic regions in our custom Mammalian array, as described in head(background.hg).
#
#R code
#
> head(background.hg)
HGNC_gene CHR bp CGid GeneID
1 A1BG 19 58864846 cg22568540 1
2 A1CF 10 52619664 cg02811585 29974
3 A1CF 10 52573791 cg25496747 29974
4 AACS 12 125578355 cg02591564 65985
6 AAK1 2 69811375 cg05656329 22848
7 AATF 17 35336293 cg04920416 26574
> head(aars)
CGid Meta.Z P HGNC_gene CHR bp GeneID
17511 cg18141557 58.21023 0 POU3F3 2 105472226 5455
8958 cg09227056 55.57959 0 EVX2 2 176940449 344191
12408 cg12879445 55.28574 0 COX8C 14 93897410 341947
12409 cg12879445 55.28574 0 KIAA1409 14 93897410 57578
15207 cg15682828 55.10546 0 ZIC2 13 100635161 7546
5426 cg05551621 54.09696 0 FOXD3 1 63789233 27022
F_enrich<- function(ORDER,aar0,other0,other.name0,cutoff,n.topcpg0,background.hg=background.hg19){
aar0=subset(aar0,!is.na(aar0$Meta.Z))
other0=subset(other0,!is.na(other0$p.other))
other0.gene=other0
other0=merge(by='HGNC_gene',other0,subset(background.hg,select=c(HGNC_gene,CHR,bp,CGid,HGNC_gene.amin)))
other0$genomic_region=paste(as.character(other0$HGNC_gene),as.character(other0$CGid),sep='-')
#length(unique(other0$GeneID))
#
#
background=other0
#very important
ntot=nrow(background)
#
aar0=aar0[order(-abs(aar0$Meta.Z)),]
aar0$genomic_region=paste(as.character(aar0$HGNC_gene),as.character(aar0$CGid),sep='-')
aar.top=vector(length=2,mode='list')
aar.top[[1]]=subset(aar0,Meta.Z>0)
aar.top[[2]]=subset(aar0,Meta.Z<0)
aar.top[[1]]=aar.top[[1]][1:n.topcpg0,]
aar.top[[2]]=aar.top[[2]][1:n.topcpg0,]
names(aar.top)=c('pos','neg')
#
other0=other0[order(other0$p.other),]
other0.gene=other0.gene[order(other0.gene$p.other),]
#
output={}
n2=round(cutoff*dim(other0.gene)[1])#top cutoff genes
other.gene=other0.gene[1:n2,]
#
other=subset(other0,HGNC_gene%in%other.gene$HGNC_gene)
#
for(t in 1:2){
index=is.element(aar.top[[t]]$genomic_region,other$genomic_region)
x=sum(as.numeric(index))
m=dim(other)[1]# number white balls/other gwas
n=ntot-m
k=dim(aar.top[[t]])[1] # numer of draw
#
p.enrich= phyper(x-1,m=m,n=n,k=k,lower.tail=F)
logp.enrich =phyper(x-1,m=m,n=n,k=k,lower.tail=F,log.p=T)
log10p.enrich= -(logp.enrich/log(10))
a1=10^(ceiling(log10p.enrich)-log10p.enrich)
Hyper.P.scientic=ifelse(p.enrich>=0.1,
round(p.enrich,digits=2),
paste0(round(a1,1),'e-',ceiling(log10p.enrich)))
#
#
OverlapGenes=paste(unique(aar.top[[t]]$HGNC_gene[index]),collapse =';')
OverlapGenes.CpG=paste(aar.top[[t]]$CGid[index],collapse =';')
#
n.overlapgene=length(unique(aar.top[[t]]$HGNC_gene[index]))
n.annotation=length(unique((other$HGNC_gene)))
#
output0=data.frame(index=ORDER,class=names(aar.top)[t],cutoff=cutoff,GWAS=other.name0,
n.overlapgene=n.overlapgene,
n.annotation=n.annotation,
n.overlapGR=x,
n.annotationGR=m,
Overlap=paste(x,m,sep="/"),
P=p.enrich,log10P=log10p.enrich,P.scientic=Hyper.P.scientic,
OverlapGenes=OverlapGenes,OverlapGenes.CpG=OverlapGenes.CpG)
output=rbind(output,output0)
}
return(output)
}