#(5) EWAS-GWAS overlap #The following R code is a function to compute the summary statistics for our EWAS-GWAS overlap analaysis based on hypergeometric tests. The parameters in the function are: ORDER is a dummy variable, aar0 is data frame of EWAS summary statistics that lists the variables as described in head(aars), other0 lists the gene P value generated by MAGENTA, and HGNC gene symbol from a test GWAS, other.name0 is the label of the test GWAS, cutoff is 2.5% in our study, n.topcpg0=1000 (the top number of CpGs) in our study, and background.hg is the background based on the genomic regions in our custom Mammalian array, as described in head(background.hg). # #R code # > head(background.hg) HGNC_gene CHR bp CGid GeneID 1 A1BG 19 58864846 cg22568540 1 2 A1CF 10 52619664 cg02811585 29974 3 A1CF 10 52573791 cg25496747 29974 4 AACS 12 125578355 cg02591564 65985 6 AAK1 2 69811375 cg05656329 22848 7 AATF 17 35336293 cg04920416 26574 > head(aars) CGid Meta.Z P HGNC_gene CHR bp GeneID 17511 cg18141557 58.21023 0 POU3F3 2 105472226 5455 8958 cg09227056 55.57959 0 EVX2 2 176940449 344191 12408 cg12879445 55.28574 0 COX8C 14 93897410 341947 12409 cg12879445 55.28574 0 KIAA1409 14 93897410 57578 15207 cg15682828 55.10546 0 ZIC2 13 100635161 7546 5426 cg05551621 54.09696 0 FOXD3 1 63789233 27022 F_enrich<- function(ORDER,aar0,other0,other.name0,cutoff,n.topcpg0,background.hg=background.hg19){ aar0=subset(aar0,!is.na(aar0$Meta.Z)) other0=subset(other0,!is.na(other0$p.other)) other0.gene=other0 other0=merge(by='HGNC_gene',other0,subset(background.hg,select=c(HGNC_gene,CHR,bp,CGid,HGNC_gene.amin))) other0$genomic_region=paste(as.character(other0$HGNC_gene),as.character(other0$CGid),sep='-') #length(unique(other0$GeneID)) # # background=other0 #very important ntot=nrow(background) # aar0=aar0[order(-abs(aar0$Meta.Z)),] aar0$genomic_region=paste(as.character(aar0$HGNC_gene),as.character(aar0$CGid),sep='-') aar.top=vector(length=2,mode='list') aar.top[[1]]=subset(aar0,Meta.Z>0) aar.top[[2]]=subset(aar0,Meta.Z<0) aar.top[[1]]=aar.top[[1]][1:n.topcpg0,] aar.top[[2]]=aar.top[[2]][1:n.topcpg0,] names(aar.top)=c('pos','neg') # other0=other0[order(other0$p.other),] other0.gene=other0.gene[order(other0.gene$p.other),] # output={} n2=round(cutoff*dim(other0.gene)[1])#top cutoff genes other.gene=other0.gene[1:n2,] # other=subset(other0,HGNC_gene%in%other.gene$HGNC_gene) # for(t in 1:2){ index=is.element(aar.top[[t]]$genomic_region,other$genomic_region) x=sum(as.numeric(index)) m=dim(other)[1]# number white balls/other gwas n=ntot-m k=dim(aar.top[[t]])[1] # numer of draw # p.enrich= phyper(x-1,m=m,n=n,k=k,lower.tail=F) logp.enrich =phyper(x-1,m=m,n=n,k=k,lower.tail=F,log.p=T) log10p.enrich= -(logp.enrich/log(10)) a1=10^(ceiling(log10p.enrich)-log10p.enrich) Hyper.P.scientic=ifelse(p.enrich>=0.1, round(p.enrich,digits=2), paste0(round(a1,1),'e-',ceiling(log10p.enrich))) # # OverlapGenes=paste(unique(aar.top[[t]]$HGNC_gene[index]),collapse =';') OverlapGenes.CpG=paste(aar.top[[t]]$CGid[index],collapse =';') # n.overlapgene=length(unique(aar.top[[t]]$HGNC_gene[index])) n.annotation=length(unique((other$HGNC_gene))) # output0=data.frame(index=ORDER,class=names(aar.top)[t],cutoff=cutoff,GWAS=other.name0, n.overlapgene=n.overlapgene, n.annotation=n.annotation, n.overlapGR=x, n.annotationGR=m, Overlap=paste(x,m,sep="/"), P=p.enrich,log10P=log10p.enrich,P.scientic=Hyper.P.scientic, OverlapGenes=OverlapGenes,OverlapGenes.CpG=OverlapGenes.CpG) output=rbind(output,output0) } return(output) }