https://github.com/DanniGadd/EpiScores-for-protein-levels
Raw File
Tip revision: a5130fab3895a0d95f0dcc8826aa9fb5e8c0fa86 authored by DanniGadd on 16 February 2022, 08:46:38 UTC
Merge branch 'main' of https://github.com/DanniGadd/EpiScores-for-protein-levels
Tip revision: a5130fa
19_cox_corr_plots_for_proteins.R
Copyright (c) <2022>, <DanniGadd>
All rights reserved.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory.

####################################################################################
####################################################################################
############################## CORR PLOTS ##########################################
####################################################################################
####################################################################################

# Correlation plots for top episcores/proxies against key variables 

library(tidyverse)

### SOMA LOAD IN AND SELECT PROXIES FOR COX MODELS (135)
prot <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/GS_combined_9537_projections.csv")
names(prot)[1] <- "XID"
colnames(prot) <- sub('.', '', colnames(prot))
#prot = read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/olink_files/GS_combined_9537_projections.csv")

# read in proteins taken forward from validation test steps 
results <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Correlations_020221.csv")
results <- results %>% filter(r > 0.1)
results <- results %>% filter(p < 0.05)
list3 <- results$SeqId %>% as.data.frame()
names(list3)[1] <- "Protein"
list3$Protein <- str_replace_all(list3$Protein, "-", ".")
#list3 <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/olink_files/List_of_proteins_for_cox_models_220121.csv")


ID <- prot$ID
over <- which(colnames(prot) %in% list3$Protein)
pre <- prot[over]
pred <- cbind(ID, pre)
soma <- pred

### OLINK LOAD IN AND SELECT PROXIES FOR COX MODELS ()
prot = read.csv("/Cluster_Filespace/Marioni_Group/Danni/LBC_proteins_Jan2021/Model_projections/GS_combined_9537_projections.csv")
# read in proteins taken forward from validation test steps 
list3 <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/LBC_proteins_Jan2021/Choosing_list_of_top_proxies/List_of_proteins_for_cox_models_220121.csv")
ID <- prot$ID
over <- which(colnames(prot) %in% list3$Protein)
pre <- prot[over]
pred <- cbind(ID, pre)
olink <- pred

### JOIN UP PROXIES 

# first match the olink to the soma IDs 
people <- soma$ID
matched <- olink[match(people, olink$ID),]
join <- cbind(soma, matched)
join <- join[-137]

join <- join[-1]


####################################################################################

### Corr plots 

####################################################################################

data <- join 

cor <- cor(data)

library(ggcorrplot)

pdf("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/corr_plot_for_proteins_164.pdf",width = 40, height = 40)
ggcorrplot(cor, hc.order = TRUE, type = "lower",
     outline.col = "white")
dev.off()


# Restrict to those which had disease associations 

results <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/Joint_results_with_olink/results_cox_040221.csv")

proteins <- results$SeqId %>% unique() # 82

data2 <- join 

data2 <- data2[which(colnames(data2) %in% proteins)]


cor <- cor(data2)

library(ggcorrplot)

pdf("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/corr_plot_for_proteins_associated_with_disease_82.pdf",width = 30, height = 30)
ggcorrplot(cor, hc.order = TRUE, type = "lower",
     outline.col = "white")
dev.off()




####################################################################################

### Grimage vs proxies 

####################################################################################

### Reload the following and join the join object to d1 to get age accel scores 


library(tidyverse)

### SOMA LOAD IN AND SELECT PROXIES FOR COX MODELS (135)

prot <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/GS_combined_9537_projections.csv")
names(prot)[1] <- "XID"
colnames(prot) <- sub('.', '', colnames(prot))

#prot = read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/olink_files/GS_combined_9537_projections.csv")

# read in proteins taken forward from validation test steps 
results <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Correlations_020221.csv")
results <- results %>% filter(r > 0.1)
results <- results %>% filter(p < 0.05)
list3 <- results$SeqId %>% as.data.frame()
names(list3)[1] <- "Protein"
list3$Protein <- str_replace_all(list3$Protein, "-", ".")
#list3 <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/olink_files/List_of_proteins_for_cox_models_220121.csv")


ID <- prot$ID
over <- which(colnames(prot) %in% list3$Protein)
pre <- prot[over]
pred <- cbind(ID, pre)

soma <- pred

### OLINK LOAD IN AND SELECT PROXIES FOR COX MODELS ()

prot = read.csv("/Cluster_Filespace/Marioni_Group/Danni/LBC_proteins_Jan2021/Model_projections/GS_combined_9537_projections.csv")

# read in proteins taken forward from validation test steps 

list3 <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/LBC_proteins_Jan2021/Choosing_list_of_top_proxies/List_of_proteins_for_cox_models_220121.csv")

ID <- prot$ID
over <- which(colnames(prot) %in% list3$Protein)
pre <- prot[over]
pred <- cbind(ID, pre)

olink <- pred

### JOIN UP PROXIES 

# first match the olink to the soma IDs 
people <- soma$ID
matched <- olink[match(people, olink$ID),]
join <- cbind(soma, matched)

join <- join[-137]


# d1 file

d1 <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/d1.csv")

cols <- d1[c(1,2,3,4,25,108,109)]
names(cols)[2] <- "ID"

joint <- left_join(join, cols, by = "ID")

names <- colnames(joint)[2:165]

### Plot proxies vs grimage accel 

library(ggpubr)

# Set dataset 
data <- joint
data$AgeAccelGrim <- as.numeric(data$AgeAccelGrim)

# Plot
plot_list <- list()

for(i in names){
      id <- i
      interest <- data[c("AgeAccelGrim", id)]
      names(interest)[2] <- "protein"
      p <- ggplot(interest, aes(x=AgeAccelGrim, y=protein)) +
        geom_point() + geom_smooth(method = 'lm', color = "grey") + theme_minimal() + ggtitle(i) + ylab(i) + xlab("GrimAge acceleration")
        stat_cor(aes(label = ..r.label..), method = "pearson", cor.coef.name = "r", size = 7)
      plot_list[[i]] <- p 
}

pdf(file = paste0("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_train_recieved_020221/Cox/grimage_accel_vs_proxies.pdf"))
for (i in 1:164) {
    print(plot_list[[i]])
}
dev.off()


### Record the correlations for the plots

# Make results table 
results <- data.frame(Proxy = names, r = 1:29, p = 1:29, LC = 1:29, UC = 1:29)

# Correlate for each protein against the proxy 

for (i in 1:29){
name <- names[i]
cor1 <- cor.test(data[,name], data$AgeAccelGrim.y)
int <- cor1$conf.int[1:2]
p <- cor1$p.value 
r <- cor1$estimate 
results[i,"r"] <- r
results[i,"p"] <- p
results[i,4:5] <- int
}

# Order by r
results <- results[rev(order(results$r)),]

write.csv(results, "/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Grimage/grimage_accel_vs_proxies_results_table.csv", row.names = F)


####################################################################################

### Age vs proxies

####################################################################################

### Plot proxies vs grimage accel 

library(ggpubr)

names <- read.csv("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Choosing_list_of_top_proxies/List_of_proteins_for_cox_models_220121.csv")
names <- names$Protein %>% as.character()

# Set dataset 
data <- join
data$Age <- as.numeric(data$Age)

# Plot
plot_list <- list()

for(i in names){
      id <- i
      interest <- data[c("Age", id)]
      names(interest)[2] <- "protein"
      p <- ggplot(interest, aes(x=Age, y=protein)) +
        geom_point() + geom_smooth(method = 'lm', color = "grey") + theme_minimal() + ggtitle(i) + ylab(i) + xlab("Age")
        stat_cor(aes(label = ..r.label..), method = "pearson", cor.coef.name = "r", size = 7)
      plot_list[[i]] <- p 
}

pdf(file = paste0("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Grimage/age_vs_proxies.pdf"))
for (i in 1:29) {
    print(plot_list[[i]])
}
dev.off()


### Record the correlations for the plots

# Make results table 
results <- data.frame(Proxy = names, r = 1:29, p = 1:29, LC = 1:29, UC = 1:29)

# Correlate for each protein against the proxy 

for (i in 1:29){
name <- names[i]
cor1 <- cor.test(data[,name], data$Age)
int <- cor1$conf.int[1:2]
p <- cor1$p.value 
r <- cor1$estimate 
results[i,"r"] <- r
results[i,"p"] <- p
results[i,4:5] <- int
}

# Order by r
results <- results[rev(order(results$r)),]

write.csv(results, "/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Grimage/age_vs_proxies_results_table.csv", row.names = F)


#################################################################################################################

# Plot proxy levels by Sex in the GS population

#################################################################################################################

names <- read.csv("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Choosing_list_of_top_proxies/List_of_proteins_for_cox_models_220121.csv")
names <- names$Protein %>% as.character()

# Set dataset 
data$Female <- as.character(data$Female)

# Plot 
plot_list <- list()

for(i in names){
      id <- i
      interest <- data[c("Female", id)]
      names(interest)[2] <- "protein"
      p <- ggplot(interest, aes(x=Female, y=protein)) +
        geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
        geom_boxplot(width=0.1) + theme_minimal() + ggtitle(i)

      plot_list[[i]] <- p 
}

pdf(file = paste0("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Grimage/proxies_by_sex.pdf"))
for (i in 1:29) {
    print(plot_list[[i]])
}
dev.off()



#################################################################################################################

# Plot proxy levels by Set in the GS population

#################################################################################################################


# Set dataset 
data$Set <- as.character(data$Set)

# Plot each protein proxy by set 1 and 2 as violin boxplots first 
plot_list <- list()

for(i in names){
      id <- i
      interest <- data[c("Set", id)]
      names(interest)[2] <- "protein"
      p <- ggplot(interest, aes(x=Set, y=protein)) +
        geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
        geom_boxplot(width=0.1) + theme_minimal() + ggtitle(i)

      plot_list[[i]] <- p 
}

pdf(file = paste0("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Grimage/proxies_by_set.pdf"))
for (i in 1:29) {
    print(plot_list[[i]])
}
dev.off()




#################################################################################################################

# Disease specific plots 

#################################################################################################################

# read in the cox tables for each trait

AD <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/AD_extracted_cases_over_60_years_old.csv")
bowel <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Bowel.Cancer_all_cases.csv")
stroke <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Stroke_all_cases.csv")
RA <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/RA_all_cases.csv")
pain <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Pain_all_cases.csv")
lung <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Lung.Cancer_all_cases.csv")
IHD <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/IHD_all_cases.csv")
diab <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Diabetes_all_cases.csv")
depr <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Depression_all_cases.csv")
IBD <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/IBD_all_cases.csv")
COPD <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/COPD_all_cases.csv")
breast <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Cox_tte_tables_for_12_traits/Breast.Cancer_all_cases.csv")

#################################################################################################################

# Disease specific plots - plot by age in each case/control case (f.adj and basic versions for comparison as cases change)

#################################################################################################################

### BASIC MODELS 

data <- d1 

my.list = list(AD, bowel, stroke, RA, pain, lung, IHD, diab, depr, IBD, COPD, breast)
names <- c("AD", "bowel", "stroke", "RA", "pain", "lung", "IHD", "diab", "depr", "IBD", "COPD", "breast")

# Plot each disease by cases and controls vs age 
plot_list <- list()

for(i in 1:12){
  disease <- my.list[[i]]
  join <- left_join(disease, data, by = "Sample_Name")
  # add status identifier 
  join <- join %>% mutate(Status = case_when(
  Event == "1" ~ "Cases",
  Event == "0" ~ "Controls"))
  # remove NA event rows
  overlap <- which(!join$Event == "NA")
  join <- join[overlap,]
  # get relevant info for plot 
  interest <- join[c("Status", "age_at_event")]
  # violin plot 
  p <- ggplot(interest, aes(x=Status, y=age_at_event)) +
  geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
  geom_boxplot(width=0.1) + theme_minimal() + ylab("Age") + ggtitle(names[[i]]) + xlab(names[[i]])
  plot_list[[i]] <- p
}
  
pdf(file = paste0("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Grimage/All_disease_cases_and_controls_by_age_basic_model.pdf"))
for (i in 1:12) {
    print(plot_list[[i]])
}
dev.off()






#################################################################################################################

### PLOTS FOR EACH DISEASE AND PROTEIN (f.adj and basic versions for comparison as cases change)

#################################################################################################################


### FULLY ADJUSTED 

data <- d1 

my.list = list(AD, bowel, stroke, RA, pain, lung, IHD, diab, depr, IBD, COPD, breast)
names <- c("AD", "bowel", "stroke", "RA", "pain", "lung", "IHD", "diab", "depr", "IBD", "COPD", "breast")


names <- read.csv("/Volumes/marioni-lab/Danni/LBC_proteins_Jan2021/Choosing_list_of_top_proxies/List_of_proteins_for_cox_models_220121.csv")
proteins <- names$Protein %>% as.character()

# Plot each disease by cases and controls vs age 
plot_list <- list()
plot_list2 <- list()

for(i in 1:12){
  disease <- my.list[[i]]
  join <- left_join(disease, data, by = "Sample_Name")
  # remove missing covariates 
  join <- join[!is.na(join$units), ]
  join <- join[!is.na(join$usual), ]
  join <- join[!is.na(join$smokingScore), ]
  join <- join[!is.na(join$simd), ]
  join <- join[!is.na(join$EA), ]
  join <- join[!is.na(join$bmi), ]
  # add status identifier 
  join <- join %>% mutate(Status = case_when(
  Event == "1" ~ "Cases",
  Event == "0" ~ "Controls"))
  # remove NA event rows
  overlap <- which(!join$Event == "NA")
  join <- join[overlap,]
  for (j in proteins){
    id <- j
    dataset <- join
    # get relevant info for plot 
    interest <- dataset[c("Status", id)]
    names(interest)[2] <- "protein"
    # violin plot 
    p <- ggplot(interest, aes(x=Status, y=protein)) +
    geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
    geom_boxplot(width=0.1) + theme_minimal() + ylab(j) + ggtitle(j) + xlab(names[[i]])
    plot_list2[[j]] <- p
  }
  plot_list[[i]] <- plot_list2
}

plot_list[[2]][[1]] # disease then protein


pdf(file = paste0("/Volumes/marioni-lab/Protein_DNAm_Proxies/Plots_for_cox_model_proxy_distributions/All_disease_cases_and_controls_by_proxy_fully_adjusted_model.pdf"))
for (i in 1:12){
  for (j in 1:56){
    print(plot_list[[i]][[j]])
  }
}
dev.off()


### BASIC MODEL 


data <- d1 

my.list = list(AD, bowel, stroke, RA, pain, lung, IHD, diab, depr, IBD, COPD, breast)
names <- c("AD", "bowel", "stroke", "RA", "pain", "lung", "IHD", "diab", "depr", "IBD", "COPD", "breast")

proteins <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Glmnet_predictor_results/Model_results_tables_for_full_training/names_of_56_proteins.csv")
proteins <- proteins$Protein %>% as.character()

# Plot each disease by cases and controls vs age 
plot_list <- list()
plot_list2 <- list()

for(i in 1:12){
  disease <- my.list[[i]]
  join <- left_join(disease, data, by = "Sample_Name")
  # add status identifier 
  join <- join %>% mutate(Status = case_when(
  Event == "1" ~ "Cases",
  Event == "0" ~ "Controls"))
  # remove NA event rows
  overlap <- which(!join$Event == "NA")
  join <- join[overlap,]
  for (j in proteins){
    id <- j
    dataset <- join
    # get relevant info for plot 
    interest <- dataset[c("Status", id)]
    names(interest)[2] <- "protein"
    # violin plot 
    p <- ggplot(interest, aes(x=Status, y=protein)) +
    geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
    geom_boxplot(width=0.1) + theme_minimal() + ylab(j) + ggtitle(j) + xlab(names[[i]])
    plot_list2[[j]] <- p
  }
  plot_list[[i]] <- plot_list2
}

plot_list[[2]][[1]] # disease then protein


pdf(file = paste0("/Volumes/marioni-lab/Protein_DNAm_Proxies/Plots_for_cox_model_proxy_distributions/All_disease_cases_and_controls_by_proxy_basic_model.pdf"))
for (i in 1:12){
  for (j in 1:56){
    print(plot_list[[i]][[j]])
  }
}
dev.off()


### WBC plots - by cases and controls for each disease 


### FULLY ADJUSTED 

data <- d1 

my.list = list(AD, bowel, stroke, RA, pain, lung, IHD, diab, depr, IBD, COPD, breast)
names <- c("AD", "bowel", "stroke", "RA", "pain", "lung", "IHD", "diab", "depr", "IBD", "COPD", "breast")

WBCs <- c("CD4T", "CD8T", "NK", "Bcell", "Gran", "Mono")

# Plot each disease by cases and controls vs age 
plot_list <- list()
plot_list2 <- list()

for(i in 1:12){
  disease <- my.list[[i]]
  join <- left_join(disease, data, by = "Sample_Name")
  # remove missing covariates 
  join <- join[!is.na(join$units), ]
  join <- join[!is.na(join$usual), ]
  join <- join[!is.na(join$smokingScore), ]
  join <- join[!is.na(join$simd), ]
  join <- join[!is.na(join$EA), ]
  join <- join[!is.na(join$bmi), ]
  # add status identifier 
  join <- join %>% mutate(Status = case_when(
  Event == "1" ~ "Cases",
  Event == "0" ~ "Controls"))
  # remove NA event rows
  overlap <- which(!join$Event == "NA")
  join <- join[overlap,]
  for (j in WBCs){
    id <- j
    dataset <- join
    # get relevant info for plot 
    interest <- dataset[c("Status", id)]
    names(interest)[2] <- "WBC"
    # violin plot 
    p <- ggplot(interest, aes(x=Status, y=WBC)) +
    geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
    geom_boxplot(width=0.1) + theme_minimal() + ylab(j) + ggtitle(j) + xlab(names[[i]])
    plot_list2[[j]] <- p
  }
  plot_list[[i]] <- plot_list2
}

plot_list[[2]][[1]] # disease then protein


pdf(file = paste0("/Volumes/marioni-lab/Protein_DNAm_Proxies/Plots_for_cox_model_proxy_distributions/All_disease_cases_and_controls_by_WBC_props_fully_adjusted_model.pdf"))
for (i in 1:12){
  for (j in 1:6){
    print(plot_list[[i]][[j]])
  }
}
dev.off()



### BASIC MODEL

data <- d1 

my.list = list(AD, bowel, stroke, RA, pain, lung, IHD, diab, depr, IBD, COPD, breast)
names <- c("AD", "bowel", "stroke", "RA", "pain", "lung", "IHD", "diab", "depr", "IBD", "COPD", "breast")

WBCs <- c("CD4T", "CD8T", "NK", "Bcell", "Gran", "Mono")

# Plot each disease by cases and controls vs age 
plot_list <- list()
plot_list2 <- list()

for(i in 1:12){
  disease <- my.list[[i]]
  join <- left_join(disease, data, by = "Sample_Name")
  # add status identifier 
  join <- join %>% mutate(Status = case_when(
  Event == "1" ~ "Cases",
  Event == "0" ~ "Controls"))
  # remove NA event rows
  overlap <- which(!join$Event == "NA")
  join <- join[overlap,]
  for (j in WBCs){
    id <- j
    dataset <- join
    # get relevant info for plot 
    interest <- dataset[c("Status", id)]
    names(interest)[2] <- "WBC"
    # violin plot 
    p <- ggplot(interest, aes(x=Status, y=WBC)) +
    geom_violin(trim=FALSE, fill='#A4A4A4', color="darkred")+
    geom_boxplot(width=0.1) + theme_minimal() + ylab(j) + ggtitle(j) + xlab(names[[i]])
    plot_list2[[j]] <- p
  }
  plot_list[[i]] <- plot_list2
}

plot_list[[2]][[1]] # disease then protein


pdf(file = paste0("/Volumes/marioni-lab/Protein_DNAm_Proxies/Plots_for_cox_model_proxy_distributions/All_disease_cases_and_controls_by_WBC_props_basic_model.pdf"))
for (i in 1:12){
  for (j in 1:6){
    print(plot_list[[i]][[j]])
  }
}
dev.off()































back to top