bootstrap<-function(dataset,iterations,index,groups,para1,para2){ s.size.g1 <- length(dataset[,index][dataset[,groups]%in%para1]) s.size.g2 <- length(dataset[,index][dataset[,groups]%in%para2]) pool <- dataset[,index] obs.diff.b1 <- median (dataset[,index][dataset[,groups]==para1]) - median (dataset[,index][dataset[,groups]==para2]) iterations <- iterations sampl.dist.b1 <- NULL for (i in 1 : iterations) { resample <- sample (c(1:length (pool)), length(pool), replace = TRUE) g1.perm = pool[resample][1 : s.size.g1] g2.perm = pool[resample][(s.size.g1+1) : length(pool)] sampl.dist.b1[i] = median (g1.perm) - median (g2.perm) } p.boot1 <- (sum ( abs(sampl.dist.b1) >= abs(obs.diff.b1)) + 1)/ (iterations+1) return(p.boot1) } set.seed(1234) setwd("/Users/zhaox12/Dropbox (NYU Langone Health)/Xin_backup/Teresa_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig2/210806.phylopscore/") ################################################################################################# # # # COLON,breast,ovarian,CCRCC,LUAD,ENDOMETRIAL # Lung squamous, embargo date is Dec 1 2021 # Head and neck is Aug 3, 2021 # GBM is March 1 2021 # # ################################################################################################# corum <- as.data.frame(read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/database/coreComplexesv3.0.txt", sep = "\t")) corum<-corum[corum$Organism=="Human",] score<-na.omit(read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/36.evolution/hg19.gene.score.txt",sep="\t",header=T)) library(ggplot2) library(boot) library(ggpubr) library(rstatix) master_list <- strsplit(as.character(corum$subunits.Entrez.IDs.), split = ";") master_list <- unique(as.numeric(as.character(unlist(master_list)))) # Create master list of CORUM UniProt IDs master_list_uniprot <- strsplit(as.character(corum$subunits.UniProt.IDs.), split = ";") master_list_uniprot <- unique(as.character(unlist(master_list_uniprot))) # Create master list of CORUM gene names (use this) master_list_names <- strsplit(as.character(corum$subunits.Gene.name.), split = ";") master_list_names <- unique(as.character(unlist(master_list_names))) score$corum<-ifelse(score$genes%in%master_list_names,TRUE,FALSE) corums_cors <- score[score$corum == TRUE, "median.PhyloP.score"] noncorums_cors <- score[score$corum == FALSE, "median.PhyloP.score"] ######################################################################################################## ## generate pan-cancer data ###############################3 colon<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/Colon.corr.table.txt",sep="\t") colnames(colon)[2]<-"colon.cor.rna.dna" colnames(colon)[5]<-"colon.cor.rna.pro" breast<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/BREAST.corr.table.txt",sep="\t") colnames(breast)[2]<-"breast.cor.rna.dna" colnames(breast)[5]<-"breast.cor.rna.pro" ovarian<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/OV.corr.table.txt",sep="\t") colnames(ovarian)[2]<-"ovarian.cor.rna.dna" colnames(ovarian)[5]<-"ovarian.cor.rna.pro" ccRCC<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/ccRCC.corr.table.txt",sep = "\t") colnames(ccRCC)[2]<-"ccRCC.cor.rna.dna" colnames(ccRCC)[5]<-"ccRCC.cor.rna.pro" endometrial<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/Endometrial.corr.table.txt",sep = "\t") colnames(endometrial)[2]<-"endometrial.cor.rna.dna" colnames(endometrial)[5]<-"endometrial.cor.rna.pro" luad<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/LUAD.corr.table.txt",sep = "\t") colnames(luad)[2]<-"luad.cor.rna.dna" colnames(luad)[5]<-"luad.cor.rna.pro" hnsc<-read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/10.protein/11.12-For paper/Manuscript_2021/figure_2021/fig1/data/HNSC.corr.table.txt",sep = "\t") colnames(hnsc)[2]<-"hnsc.cor.rna.dna" colnames(hnsc)[5]<-"hnsc.cor.rna.pro" score<-na.omit(read.delim("/Users/zhaox12/Desktop/Teresas_lab/project/36.evolution/hg19.gene.score.txt",sep="\t",header=T)) score$corum<-ifelse(score$genes%in%master_list_names,TRUE,FALSE) merge.data<-merge(merge(merge(merge(merge(merge(colon,breast,by="gene",all = T),ovarian,by="gene",all = T),ccRCC,by="gene",all = T), endometrial,by="gene",all = T),luad,by="gene",all = T),hnsc,by="gene",all = T) Pan.dna.rna<-cbind(merge.data$gene, merge.data[,colnames(merge.data) %like% "cor.rna.dna"]) Pan.rna.pro<-cbind(merge.data$gene, merge.data[,colnames(merge.data) %like% "cor.rna.pro"]) Pan.dna.rna$max.rna.dna<-apply(Pan.dna.rna[,2:8],1,function(x)max(x,na.rm=T)) Pan.rna.pro$max.rna.pro<-apply(Pan.rna.pro[,2:8],1,function(x)max(x,na.rm=T)) Pan.data<-merge(Pan.dna.rna,Pan.rna.pro,by="merge.data$gene",all = T) colnames(Pan.data)[1]<-"gene" names<-list(colon,breast,ovarian,ccRCC,endometrial,luad,hnsc,Pan.data) names1<-c("Colon","Breast","OV","ccRCC","Endometrial","LUAD","HNSC","PAN-Cancer") col1<-"#13502f" ###Dark green col2<-"#79b695" ## light green ##76b493 cancer<-"PhyloP.score" for (i in 1:length(names)){ #i<-1 merge.data<-names[[i]] CPTAC<-names1[i] if(names1[i]=="PAN-Cancer"){ data.dna.rna<-cbind(merge.data$gene, merge.data[,colnames(merge.data) %like% ".rna.dna"]) colnames(data.dna.rna)[1]<-"genes" score.dna.rna<-merge(score,data.dna.rna[,c("genes","max.rna.dna")],by="genes") score.dna.rna1<-score.dna.rna score.dna.rna$group<-ifelse(score.dna.rna$median.PhyloP.score>quantile(score.dna.rna$median.PhyloP.score,0.7),"TOP30%", ifelse(score.dna.rna$median.PhyloP.scorecorum.top30,"Corum.TOP30%", ifelse(score.dna.rna$Corum=="Corum" & score.dna.rna$median.PhyloP.scorenocorum.top30,"No_Corum.TOP30%", ifelse(score.dna.rna$Corum=="No_Corum" & score.dna.rna$median.PhyloP.scorequantile(score.rna.pro$median.PhyloP.score,0.7),"TOP30%", ifelse(score.rna.pro$median.PhyloP.scorecorum.top30,"Corum.TOP30%", ifelse(score.rna.pro$Corum=="Corum" & score.rna.pro$median.PhyloP.scorenocorum.top30,"No_Corum.TOP30%", ifelse(score.rna.pro$Corum=="No_Corum" & score.rna.pro$median.PhyloP.scorequantile(score.dna.rna[,2],0.7),"TOP30%", ifelse(score.dna.rna[,2]corum.top30,"Corum.TOP30%", ifelse(score.dna.rna$Corum=="Corum" & score.dna.rna[,2]nocorum.top30,"No_Corum.TOP30%", ifelse(score.dna.rna$Corum=="No_Corum" & score.dna.rna[,2]quantile(score.rna.pro[,2],0.7),"TOP30%", ifelse(score.rna.pro[,2]corum.top30,"Corum.TOP30%", ifelse(score.rna.pro$Corum=="Corum" & score.rna.pro[,2]nocorum.top30,"No_Corum.TOP30%", ifelse(score.rna.pro$Corum=="No_Corum" & score.rna.pro[,2]% wilcox_test(median.PhyloP.score ~ Corum) %>% adjust_pvalue(method = "bonferroni") %>% add_significance("p.adj") stat.test1 <- stat.test1 %>% add_xy_position(x = "Corum", dodge = 0.8) stat.test1$y.position<-1.15 boot1<-signif(bootstrap(score.com,10000,"median.PhyloP.score","Corum","Corum","No_Corum"),3) stat.test1$boot<-boot1 n_fun <- function(x){ return(data.frame(y = 1*1.35, label = paste0("N=",length(x)))) } col3<-"#aaa900" #dark yellow col4<-"#6500aa" #purple g1<-ggplot(score.com, aes(x=Corum, y=median.PhyloP.score)) + # geom_point(aes(fill = Corum, color=Corum), alpha=0.5, # size = 2, shape = 21, position = position_jitterdodge()) + geom_violin(aes(fill = Corum))+ geom_boxplot(aes(fill = Corum),alpha = 0.5,outlier.colour = NA,width=0.5)+ stat_summary(fun.data = n_fun, geom = "text", aes(group=Corum), hjust = 0.5, position = position_dodge(0.8),size=5) + scale_color_manual(values=c(alpha(col3,1),alpha(col4,1)))+ scale_fill_manual(values=c(alpha(col3,1),alpha(col4,1))) + theme_classic2()+ ggtitle(paste0(""))+ ylab("median of PhyloP score")+xlab("")+scale_y_continuous(limits = c(-1, 1.4))+ theme(axis.text=element_text(size=12,face = "bold"), axis.title=element_text(size=14,face="bold"), plot.title = element_text(size = 16, face = "bold"), legend.text=element_text(size=10,face = "bold"), legend.title=element_text(size=12,face = "bold"))+ theme(legend.position = "right")+stat_pvalue_manual( stat.test1, label = "boot",tip.length = 0,size=5 ) #####rects plot rects <- data.frame(y = 1:3, colors = c("grey","grey","grey"), text = paste(c("NoCorum","Corum","ALL"))) p3 <- ggplot(rects[1,], aes(x=0, y-1 , fill = colors,label=text)) + geom_tile(width = .1, height = .6) + # make square tiles geom_text(color = "grey",size=0.1) + # add white text in the middle scale_fill_identity(guide = "none") +# color the tiles with the colors in the data frame coord_fixed() + # make sure tiles are square theme_void()+ # remove any axis markings annotate(geom = "text", x = 0, y = rects$y[1]-1, label = rects$text[1], color = "white",size=7, angle = 90) p2 <- ggplot(rects[2,], aes(x=0, y-1 , fill = colors,label=text)) + geom_tile(width = .1, height = .6) + # make square tiles geom_text(color = "grey",size=0.1) + # add white text in the middle scale_fill_identity(guide = "none") +# color the tiles with the colors in the data frame coord_fixed() + # make sure tiles are square theme_void()+ # remove any axis markings annotate(geom = "text", x = 0, y = rects$y[2]-1, label = rects$text[2], color = "white",size=7, angle = 90) p1 <- ggplot(rects[3,], aes(x=0, y-1 , fill = colors,label=text)) + geom_tile(width = .1, height = .6) + # make square tiles geom_text(color = "grey",size=0.1) + # add white text in the middle scale_fill_identity(guide = "none") +# color the tiles with the colors in the data frame coord_fixed() + # make sure tiles are square theme_void()+ # remove any axis markings annotate(geom = "text", x = 0, y = rects$y[3]-1, label = rects$text[3], color = "white",size=7, angle = 90) figure<-ggarrange(ggarrange(RNA_DNA1,RNA_PRO1,p1,nrow=1,ncol=3,common.legend=T,legend = "right",widths=c(1,1,0.1),heights = c(1,1,0.75)), ggarrange(RNA_DNA2,RNA_PRO2,p2,nrow=1,ncol=3,common.legend=T,legend = "right",widths=c(1,1,0.1),heights = c(1,1,0.75)), ggarrange(RNA_DNA3,RNA_PRO3,p3,nrow=1,ncol=3,common.legend=T,legend = "right",widths=c(1,1,0.1),heights = c(1,1,0.75)), ggarrange(g1,NULL,NULL,nrow=1,ncol=3,common.legend=F,widths=c(1,0.6,0.1),heights = c(1,1,0.75)), nrow=4,ncol = 1,heights =c(1,1,1,1.2)) pdf(paste0(names1[i],".phylopscore.pdf"),height=12,width=9) print(figure) dev.off() }