https://github.com/DanniGadd/EpiScores-for-protein-levels
Raw File
Tip revision: a5130fab3895a0d95f0dcc8826aa9fb5e8c0fa86 authored by DanniGadd on 16 February 2022, 08:46:38 UTC
Merge branch 'main' of https://github.com/DanniGadd/EpiScores-for-protein-levels
Tip revision: a5130fa
20_cox_processing_models_generation_scotland.R
Copyright (c) <2022>, <DanniGadd>
All rights reserved.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory.

####################################################################################
####################################################################################
############################## PROCESSING COX MODEL RESULTS ########################
####################################################################################
####################################################################################

### MAIN MODEL RESULTS - post rank inverse normalisation, collated and plotted

library(tidyverse)

setwd("Y:/Danni/LBC_proteins_Jan2021/00_Running_pQTLs_regressed/Cox_250221_agreed_model_results")

# Load in basic model 
comb <- read.csv("RI_sensitivity_basic_all_ordered.csv")
outcomes <- comb$Outcome %>% as.data.frame() 
names(outcomes)[1] <- "X"
count(outcomes, X)
breast <- comb %>% filter(Outcome == "Breast.Cancer")
diab <- comb %>% filter(Outcome == "Diabetes")
diab[which(!diab$Predictor %in% breast$Predictor),]
# 3195-50 - this is missing from breast in basic model again (as per previous basic model)
# Run and add into basic model as above 
breast <- read.csv("Breast.Cancer_all_cases.csv", check.names = F)

# Add rank-inverse step 
pred <- breast
for(i in names(pred)[109:218]){ 
 pred[,i] <- qnorm((rank(pred[,i],na.last="keep")-0.5)/sum(!is.na(pred[,i])))
}
cox <- pred 

library(survival)
library(kinship2)
library(coxme)
library(readxl)
library(tidyverse)
library(gdata)

# Function to Extract Lmekin Results  
extract_coxme_table <- function (mod){
  beta <- mod$coefficients #$fixed is not needed
  nvar <- length(beta)
  nfrail <- nrow(mod$var) - nvar
  se <- sqrt(diag(mod$var)[nfrail + 1:nvar])
  z<- round(beta/se, 2)
  p<- signif(1 - pchisq((beta/se)^2, 1), 2)
  table=data.frame(cbind(beta,se,z,p))
  return(table)
}

# Read kin model in 
ped <- read.csv("pedigree_formatted.csv")
kin <- with(ped, pedigree(volid, father, mother, sex, famid=famid))
kin_model <- kinship(kin) 

# Use BFGS but with relaxed threshold 
checks <- coxme.control(optpar = list(method = "BFGS", control=list(reltol = 1e-3)))
mod <- coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,"3195-50"]) + cox$Age + factor(cox$Set) + (1|cox$Sample_Name), varlist = kin_model*2, control=checks)

output <- data.frame(V1 = 1, V2 = 1)
output[1,1] <- as.character("3195-50")
  output[1,2] <- as.character("Breast.Cancer")
  output[1,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output[1,6] <- extract_coxme_table(mod)[1,4]
  output[1,7] <- mod$n[1]
  output[1,8] <- mod$n[2]-mod$n[1]
  output[1,9] <-cox.zph(mod)[1][[1]][3]

output_breast <- output 
colnames(output_breast) <- colnames(comb)

# Combine with the basic model and save out 
comb <- read.csv("RI_sensitivity_basic_all_ordered.csv")
comb2 <- rbind(comb, output_breast)
write.csv(comb2, "RI_basic_1320.csv", row.names = F)

# Load in fully adjusted model 
comb_full <- read.csv("RI_sensitivity_fully_adjusted_all.csv")
outcomes2 <- comb_full$Outcome %>% as.data.frame() 
names(outcomes2)[1] <- "X"
count(outcomes2, X)
breast <- comb_full %>% filter(Outcome == "Breast.Cancer")
diab <- comb_full %>% filter(Outcome == "Diabetes")
breast[which(!breast$Predictor %in% diab$Predictor),]

# 2580-83
# 3175-51
# Here is the file:
test_file <- read.csv("exact_diab_set.csv", check.names = F)

# Rank inverse transform the proteins 
pred <- test_file
for(i in names(pred)[109:218]){ 
 pred[,i] <- qnorm((rank(pred[,i],na.last="keep")-0.5)/sum(!is.na(pred[,i])))
}
cox <- pred  
mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,"2580-83"]) + factor(cox$Female) + cox$bmi + cox$Age + factor(cox$Set) + cox$units + cox$smokingScore + cox$simd + cox$EA + (1|cox$Sample_Name), varlist = kin_model*2)
output <- data.frame(V1 = 1, V2 = 1)
output[1,1] <- as.character("2580-83")
  output[1,2] <- as.character("Diabetes")
  output[1,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output[1,6] <- extract_coxme_table(mod)[1,4]
  output[1,7] <- mod$n[1]
  output[1,8] <- mod$n[2]-mod$n[1]
  output[1,9] <-cox.zph(mod)[1][[1]][3]

diab1 <- output 
colnames(diab1) <- colnames(comb)

# Protein 2 
mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,"3175-51"]) + factor(cox$Female) + cox$bmi + cox$Age + factor(cox$Set) + cox$units + cox$smokingScore + cox$simd + cox$EA + (1|cox$Sample_Name), varlist = kin_model*2)
output <- data.frame(V1 = 1, V2 = 1)
output[1,1] <- as.character("3175-51")
  output[1,2] <- as.character("Diabetes")
  output[1,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output[1,6] <- extract_coxme_table(mod)[1,4]
  output[1,7] <- mod$n[1]
  output[1,8] <- mod$n[2]-mod$n[1]
  output[1,9] <-cox.zph(mod)[1][[1]][3]

diab2 <- output 
colnames(diab2) <- colnames(comb)
comb3 <- rbind(comb_full, diab1)
comb3 <- rbind(comb3, diab2)
write.csv(comb3, "RI_sensitivity_fully_adjusted_1320.csv", row.names = F)


####################################################################################

### ANNOTATE RESULTS TABLES AND CALCULATE BASIC > FA  

####################################################################################

library(tidyverse)
library(readxl)

setwd("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/Cox_250221_agreed_model_results")

# Read them back in 

comb <- read.csv("RI_basic_1320.csv")

comb_full <- read.csv("RI_sensitivity_fully_adjusted_1320.csv")

# Add the annotations into the files 

anno <- read_excel("Y:/STRADL/SOMAscan_Assay_v4_Annotations_version3.3.2_DG.xlsx")
anno <- anno[c(1:4,13,18,20)]
names(anno)[1] <- "Predictor"

comb <- left_join(comb, anno, by = "Predictor")

comb_full <- left_join(comb_full, anno, by = "Predictor")


# remove those which dont meet the proportionality assumption
length(which(comb$cox.zph < 0.05)) ## 2
comb = comb[-which(comb$cox.zph < 0.05),] 

# Remove the il.12B connections

comb <- comb[-which(comb$Predictor == "IL.12B"),]

# Do FDR correction
comb$FDR <- p.adjust(comb$P.Value, method = "BH")

# Write basic model which has been FDR corrected out 
write.csv(comb, "RI_sensitivity_Basic_1320_FDR_corrected_annotated.csv", row.names =F)


# Keep any that meet significance threshold 
keep1 <- comb[which(comb$FDR < 0.05),] # 294 associations 
keep = paste(keep1$Predictor, keep1$Outcome, sep = "_")

# Keep only those passing the threshold from the basic model in the fully adjusted 
comb_full$retain <- paste(comb_full$Predictor, comb_full$Outcome, sep = "_")
comb_full <- comb_full[which(comb_full$retain %in% keep),]
comb_full$retain <- NULL
names(comb_full)[6] <- "P.Value" 

# make sure filtered ones from fully adjusted dont have coxzph P of less than 0.05 
length(which(comb_full$cox.zph < 0.05)) ## 0

# Save the final model results 
write.csv(keep1, "RI_sensitivity_Basic_1320_that_pass_FDR_standard_model_annotated.csv", row.names =F)
write.csv(comb_full, "RI_sensitivity_Fully_adj_1320_filtered_to_basic_passing_FDR_standard_model_annotated.csv", row.names =F)

# Save a copy of just the significant associations 
sig <- comb_full %>% filter(P.Value < 0.05)

dim(sig) # 137
write.csv(sig, "RI_sensitivity_Fully_adj_1320_filtered_to_just_137_significant_associations_annotated.csv", row.names = F)

#######################################################################

### Comparing results post rank inverse against pre 

#######################################################################

# Read in both sets of results 
RI <- read.csv("RI_sensitivity_Fully_adj_1320_filtered_to_just_137_significant_associations_annotated.csv")
OR <- read.csv("Fully_adj_1320_filtered_to_just_138_significant_associations_annotated.csv")

keep = paste(RI$Predictor, RI$Outcome, sep = "_")
length(keep) 

keep2 = paste(OR$Predictor, OR$Outcome, sep = "_")
length(keep2) 

# Compare the keep lists to find those that are different across the 2 sets 
ov1 <- which(keep %in% keep2) # 129 common across both sets 
ov2 <- which(!keep %in% keep2) # 8 are in the RI results that arent in the OR results
ov3 <- which(!keep2 %in% keep) # 9 are in the OR that arent in the RI results 

# So 285 associations (of a possible 297 in the standard and 303 in the outlier removed models) are common to both 
# of the sets of results pre/post outlier removal.

# Lets look into associations that differ to see what role outliers are playing 

RI_res <- keep[ov2]

# So the following associations are present in the RI model, but dissapear when in standard model 
# [1] "SMPD1_Alzheimer's Disease" "4160-49_Stroke"
# [3] "4568-17_Stroke"            "5363-51_COPD"
# [5] "4435-66_Diabetes"          "3298-52_Depression"
# [7] "5028-59_Diabetes"          "3805-16_Diabetes"

OR_res <- keep2[ov3]

# So the following associations are in the standard model, but dissappear after RI
# [1] "2950-57_Diabetes"     "2982-82_RA"           "2658-27_Diabetes"
# [4] "4924-32_Depression"   "2665-26_Bowel.Cancer" "4160-49_Diabetes"
# [7] "4160-49_IHD"          "3175-51_Bowel.Cancer" "3216-2_Pain"

# Subset the results based on these lists 
RI$retain <- paste(RI$Predictor, RI$Outcome, sep = "_")
RI1 <- RI[which(RI$retain %in% RI_res),]

# Subset the results based on these lists 
OR$retain <- paste(OR$Predictor, OR$Outcome, sep = "_")
OR1 <- OR[which(OR$retain %in% OR_res),]


# Plot the log HR and p vals between the common assocs - both basic and FA models 

#######################################################################

### Processing WBC sensitivity results  

#######################################################################

# RI WBC results 

### Code used to calculate overlap between basic and fully adjusted models (to be used once all 1320 associations present)

# Load in basic model 
comb <- read.csv("RI_basic_1320.csv")

# Load in fully adjusted model 
comb_full <- read.csv("RI_WBC_added_fully_adjusted_all.csv")

# Add the annotations into the files 

anno <- read_excel("Y:/STRADL/SOMAscan_Assay_v4_Annotations_version3.3.2_DG.xlsx")
anno <- anno[c(1:4,13,18,20)]
names(anno)[1] <- "Predictor"

comb <- left_join(comb, anno, by = "Predictor")

comb_full <- left_join(comb_full, anno, by = "Predictor")


# remove those which dont meet the proportionality assumption
length(which(comb$cox.zph < 0.05)) ## 1
comb = comb[-which(comb$cox.zph < 0.05),] 

# Do FDR correction
comb$FDR <- p.adjust(comb$P.Value, method = "BH")

# Write basic model which has been FDR corrected out 
write.csv(comb, "RI_WBC_sensitivity_Basic_1320_FDR_corrected_annotated.csv", row.names =F)


# Keep any that meet significance threshold 
keep1 <- comb[which(comb$FDR < 0.05),] # 297 associations 
keep = paste(keep1$Predictor, keep1$Outcome, sep = "_")

# Keep only those passing the threshold from the basic model in the fully adjusted 
comb_full$retain <- paste(comb_full$Predictor, comb_full$Outcome, sep = "_")
comb_full <- comb_full[which(comb_full$retain %in% keep),]
comb_full$retain <- NULL
names(comb_full)[6] <- "P.Value" 

# make sure filtered ones from fully adjusted dont have coxzph P of less than 0.05 
length(which(comb_full$cox.zph < 0.05)) ## 0

# Save the final model results 
write.csv(keep1, "RI_WBC_sensitivity_Basic_1320_that_pass_FDR_standard_model_annotated.csv", row.names =F)
write.csv(comb_full, "RI_WBC_sensitivity_Fully_adj_1320_filtered_to_basic_passing_FDR_standard_model_annotated.csv", row.names =F)

# Save a copy of just the significant associations 
sig <- comb_full %>% filter(P.Value < 0.05)

dim(sig) # 117
write.csv(sig, "RI_WBC_sensitivity_Fully_adj_1320_filtered_to_just_116_significant_associations_annotated.csv", row.names = F)



#######################################################################

### Processing grimage sensitivity results 

#######################################################################

## RI GRIMAGE MODEL 

### Code used to calculate overlap between basic and fully adjusted models (to be used once all 1320 associations present)

# Load in basic model 
comb <- read.csv("RI_basic_1320.csv")

# Load in fully adjusted model 
comb_full <- read.csv("RI_grimage_added_fully_adjusted_all.csv")

# Add the annotations into the files 

anno <- read_excel("Y:/STRADL/SOMAscan_Assay_v4_Annotations_version3.3.2_DG.xlsx")
anno <- anno[c(1:4,13,18,20)]
names(anno)[1] <- "Predictor"

comb <- left_join(comb, anno, by = "Predictor")

comb_full <- left_join(comb_full, anno, by = "Predictor")


# remove those which dont meet the proportionality assumption
length(which(comb$cox.zph < 0.05)) ## 1
comb = comb[-which(comb$cox.zph < 0.05),] 

# Do FDR correction
comb$FDR <- p.adjust(comb$P.Value, method = "BH")

# Write basic model which has been FDR corrected out 
write.csv(comb, "RI_grimage_sensitivity_Basic_1320_FDR_corrected_annotated.csv", row.names =F)


# Keep any that meet significance threshold 
keep1 <- comb[which(comb$FDR < 0.05),] # 297 associations 
keep = paste(keep1$Predictor, keep1$Outcome, sep = "_")

# Keep only those passing the threshold from the basic model in the fully adjusted 
comb_full$retain <- paste(comb_full$Predictor, comb_full$Outcome, sep = "_")
comb_full <- comb_full[which(comb_full$retain %in% keep),]
comb_full$retain <- NULL
names(comb_full)[6] <- "P.Value" 

# make sure filtered ones from fully adjusted dont have coxzph P of less than 0.05 
length(which(comb_full$cox.zph < 0.05)) ## 0

# Save the final model results 
write.csv(keep1, "RI_grimage_sensitivity_Basic_1320_that_pass_FDR_standard_model_annotated.csv", row.names =F)
write.csv(comb_full, "RI_grimage_sensitivity_Fully_adj_1320_filtered_to_basic_passing_FDR_standard_model_annotated.csv", row.names =F)

# Save a copy of just the significant associations 
sig <- comb_full %>% filter(P.Value < 0.05)

dim(sig) # 94
write.csv(sig, "RI_grimage_sensitivity_Fully_adj_1320_filtered_to_just_105_significant_associations_annotated.csv", row.names = F)




#######################################################################

### Processing WBC + grimage sensitivity results 

#######################################################################

## RI WBC and Grimage results 

### Code used to calculate overlap between basic and fully adjusted models (to be used once all 1320 associations present)

# Load in basic model 
comb <- read.csv("RI_basic_1320.csv")

# Load in fully adjusted model 
comb_full <- read.csv("RI_WBC_and_grimage_added_fully_adjusted_all.csv")

# Add the annotations into the files 

anno <- read_excel("Y:/STRADL/SOMAscan_Assay_v4_Annotations_version3.3.2_DG.xlsx")
anno <- anno[c(1:4,13,18,20)]
names(anno)[1] <- "Predictor"

comb <- left_join(comb, anno, by = "Predictor")

comb_full <- left_join(comb_full, anno, by = "Predictor")


# remove those which dont meet the proportionality assumption
length(which(comb$cox.zph < 0.05)) ## 1
comb = comb[-which(comb$cox.zph < 0.05),] 

# Do FDR correction
comb$FDR <- p.adjust(comb$P.Value, method = "BH")

# Write basic model which has been FDR corrected out 
write.csv(comb, "RI_WBC_and_grimage_added_Basic_1320_FDR_corrected_annotated.csv", row.names =F)


# Keep any that meet significance threshold 
keep1 <- comb[which(comb$FDR < 0.05),] # 297 associations 
keep = paste(keep1$Predictor, keep1$Outcome, sep = "_")

# Keep only those passing the threshold from the basic model in the fully adjusted 
comb_full$retain <- paste(comb_full$Predictor, comb_full$Outcome, sep = "_")
comb_full <- comb_full[which(comb_full$retain %in% keep),]
comb_full$retain <- NULL
names(comb_full)[6] <- "P.Value" 

# make sure filtered ones from fully adjusted dont have coxzph P of less than 0.05 
length(which(comb_full$cox.zph < 0.05)) ## 0

# Save the final model results 
write.csv(keep1, "RI_WBC_and_grimage_added_Basic_1320_that_pass_FDR_standard_model_annotated.csv", row.names =F)
write.csv(comb_full, "RI_WBC_and_grimage_added_Fully_adj_1320_filtered_to_basic_passing_FDR_standard_model_annotated.csv", row.names =F)

# Save a copy of just the significant associations 
sig <- comb_full %>% filter(P.Value < 0.05)

dim(sig) # 89
write.csv(sig, "RI_WBC_and_grimage_added_Fully_adj_1320_filtered_to_just_105_significant_associations_annotated.csv", row.names = F)


### TIME TO COLLATE RESULTS AND CREATE SUPPL TABLES AND PLOTS 

##################################################################################################

### Processing significant associations - collating to results tables in right format 

##################################################################################################

setwd("Y:/Danni/LBC_proteins_Jan2021/00_Running_pQTLs_regressed/Cox_250221_agreed_model_results")

library(tidyverse)


### format basic file results table 

basic <- read.csv("RI_sensitivity_Basic_1320_FDR_corrected_annotated.csv")

# Get a unified naming column which has both protein names in 2 panels merged 

library(dplyr)

basic <- basic %>% 
    mutate(naming = coalesce(Target,Predictor))

# Add an identifier for those that are olink vs somascan panels 

basic <- basic %>% 
    mutate(identifier = case_when(Target != "NA" ~ "SomaScan"))

basic$identifier[is.na(basic$identifier)] <- "Olink"

# Select the results you want from the columns 

basic1 <- basic[c(1,17,2,3:8,16,18)]

write.csv(basic1, "RI_basic_results_table_formatted_1.csv")


### UPDATE: add naming column which is corrected to an appropriate format 
library(readxl)
linker <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/somascan_annotation.xlsx")
linker <- linker %>% as.data.frame()
linker <- linker[c(1,2,7)]
names(linker) <- c("Predictor", "Panel", "Name")

basic2 <- left_join(basic1, linker, by = "Predictor")

# write this file out as table for suppl information

basic2 <- basic2[c(1,13,12,3,4,5,6,7,8,9,10)]

write.csv(basic2, "Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/RI_basic_results_table_formatted_1_updated_naming.csv", row.names = F)

# this has now been inlcuded as suppl basic model results 

### Format fully adjusted results table 

sig <- read.csv("RI_sensitivity_Fully_adj_1320_filtered_to_just_137_significant_associations_annotated.csv") # 138 total associations 

outcomes <- length(sig$Outcome %>% unique()) # 11 outcomes

proteins <- length(sig$Predictor %>% unique()) # 78 episcores

### format sig table first 

# Get a unified naming column which has both protein names in 2 panels merged 

library(dplyr)

sig <- sig %>% 
    mutate(naming = coalesce(Target,Predictor))

# Add an identifier for those that are olink vs somascan panels 

sig <- sig %>% 
    mutate(identifier = case_when(Target != "NA" ~ "SomaScan"))

sig$identifier[is.na(sig$identifier)] <- "Olink"


sig1 <- sig[c(1,16,2,3:8,17)]

write.csv(sig1, "RI_fully_adjusted_results_table_formatted.csv", row.names = F)


### See if we can add the WBC results in 

WBC_list <- read.csv("RI_WBC_sensitivity_Fully_adj_1320_filtered_to_just_116_significant_associations_annotated.csv")
WBC_list <- WBC_list[c(1,2)]
WBC_list$WBC_pass <- "remains"

WBC <- read.csv("RI_WBC_added_fully_adjusted_all.csv")

sig <- WBC 

sig2 <- sig[c(1:8)]

join <- left_join(sig1, sig2, by = c("Predictor", "Outcome"))

join2 <- left_join(join, WBC_list, by = c("Predictor", "Outcome"))

join2$WBC_pass[is.na(join2$WBC_pass)] <- "attenuated"


### See if we can add grimage results in 

WBC_list <- read.csv("RI_WBC_and_grimage_added_Fully_adj_1320_filtered_to_just_105_significant_associations_annotated.csv")
WBC_list <- WBC_list[c(1,2)]
WBC_list$grim_pass <- "remains"

WBC <- read.csv("RI_WBC_and_grimage_added_fully_adjusted_all.csv")

sig <- WBC 

sig3 <- sig[c(1:8)]

join3 <- left_join(join2, sig3, by = c("Predictor", "Outcome"))

join3 <- left_join(join3, WBC_list, by = c("Predictor", "Outcome"))

join3$grim_pass[is.na(join3$grim_pass)] <- "attenuated"

# write this file out as table for suppl information

write.csv(join3, "RI_suppl_table_FA_results_formatted.csv", row.names = F)


### UPDATE: add naming column which is corrected to an appropriate format 
library(readxl)
linker <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/somascan_annotation.xlsx")
linker <- linker %>% as.data.frame()
linker <- linker[c(1,2,7)]
names(linker) <- c("Predictor", "Panel", "Name")

join4 <- left_join(join3, linker, by = "Predictor")

# write this file out as table for suppl information

write.csv(join4, "Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/RI_suppl_table_FA_results_formatted_updated_naming.csv", row.names = F)


# ##################################################################################################

# ### Processing significant associations - fully adj main model results 

# ##################################################################################################

# # I guess we can read back in the previously genertaed file 

# #setwd("Y:/Danni/LBC_proteins_Jan2021/00_Running_pQTLs_regressed/Cox_250221_agreed_model_results")

# library(tidyverse)

# # sig <- read.csv("RI_suppl_table_FA_results_formatted.csv")
# sig <- read.csv("RI_suppl_table_FA_results_formatted_updated_naming.csv")

# sig <- sig[c(1,25,26,3:10,17,24)]

# write.csv(sig, "RI_suppl_table_FA_results_formatted_slimmed_to_key_info.csv", row.names = F)

# # Get a list of all individual associations - outcomes grouped by protein

# sig$Short_name <- as.character(sig$Short_name)
# sig$Outcome <- as.character(sig$Outcome)

# prot <- sig[,c("Short_name", "Outcome")] 

# prot <- as.data.frame(prot)

# prot <- prot %>% group_by(Short_name) 

# counts <- count(prot, Short_name)

# counts <- counts[order(-counts$n),]

# #  1 C5a             5
# #  2 Contactin-4     4
# #  3 sE-Selectin     4
# #  4 EN.RAGE         3
# #  5 FGF.21          3
# #  6 HGF             3
# #  7 LY9             3
# #  8 MMP-9           3
# #  9 OSM             3
# # 10 sL-Selectin     3

# write.csv(counts, "summary_counts_per_episcore_138.csv", row.names = F)



# # Count the number of associations for each disease 

# prot <- sig[,c("Short_name", "Outcome")] 

# prot <- as.data.frame(prot)

# prot <- prot %>% group_by(Outcome) 

# counts2 <- count(prot, Outcome)

# counts2 <- counts2[order(-counts2$n),]

# #    Outcome                 n
# #    <chr>               <int>
# #  1 COPD                   41
# #  2 Diabetes               33
# #  3 IHD                    17
# #  4 RA                     14
# #  5 Depression             10
# #  6 Lung.Cancer             8
# #  7 Stroke                  8
# #  8 IBD                     3
# #  9 Alzheimer's Disease     1
# # 10 Bowel.Cancer            1
# # 11 Pain                    1


# write.csv(counts2, "summary_counts_per_disease_138.csv", row.names = F)


##################################################################################################

### DO THE REPLICATION ASSESSMENT IN DIABETES DATASETS 

##################################################################################################

setwd("Y:/Danni/LBC_proteins_Jan2021/00_Running_pQTLs_regressed/Cox_250221_agreed_model_results")

library(tidyverse)

### Get episcores in format that cna be matched across results from both studies 

# Fully adjusted model results 
pass <- read.csv("RI_suppl_table_FA_results_formatted.csv")

# filter to diabetes only 
pass <- pass %>% filter(Outcome == "Diabetes")

#  [1] SHBG                             Adiponectin
#  [3] IGFBP-1                          sE-Selectin
#  [5] Stanniocalcin-1                  LG3BP
#  [7] Aminoacylase-1                   Growth hormone receptor
#  [9] Trypsin 2                        NCAM-120
# [11] NEP                              TECK
# [13] FGF.21                           6Ckine
# [15] BMP-1                            C5a
# [17] SLIK5                            sL-Selectin
# [19] OMD                              Afamin
# [21] IR                               Thrombopoietin Receptor
# [23] Notch 1                          C9
# [25] alpha-1-antichymotrypsin complex VEGFA
# [27] WFKN2                            N.CDase
# [29] Contactin-4                      TSP2
# [31] ENPP7                            sCD163
# [33] Endocan

# Get names used for diab assocs 
passed <- pass[c(2,10)]
passed$pass <- "Replicated"

# filter out the 4 olink proteins for this replication part 
passed <- passed %>% filter(identifier == "SomaScan")

# annotations file
library(readxl)
library(tidyverse)
anno <- read_excel("Y:/STRADL/SOMAscan_Assay_v4_Annotations_version3.3.2_DG.xlsx")
anno <- anno[c(1:4,13,18,20)]
anno <- as.data.frame(anno)
names(anno)[6] <- "gene"
names(anno)[2] <- "naming"

# Join in annotations so we can select gene info to match from as well as seqid and naming 
passed2 <- left_join(passed, anno, by = "naming")

# Select details for matching table 
passed3 <- passed2[c("SeqId", "gene", "naming", "pass")]

##### Elhadad study

## PREVALENT markers

# I have taken the list of top bioamrkers from the table 2 provided in the study to 
# create a crude match up between these proteins and the episcore associations 
# These only include those which replicated across KORA and HUNT 

list <- c("IDUA", "Aminoacylase-1", "Apo B", "CATZ", "ARMEL", "C2", "LG3BP",
  "Gelsolin", "Met", "Kallikrein 7", "Cathepsin A", "MATN2", "OMD", "PYY", 
  "Periostin", "C1-esterase inhibitor", "Renin", "RGMB", "SHBG", "SLIK5", "TGFbR3",
  "Trypsin", "TSG-6", "WIF1")

test <- passed3[which(passed$naming %in% list),]

# 5 of the original 24 associations for prevalent diabetes replicated 


### INCIDENT markers

# I've taken the list of markers from table 3 which were associated with incident 
# diabetes and replicated across KORA and HUNT

list2 <- c("Aminoacylase-1", "Growth hormone receptor", "Insulin-like growth factor-binding protein 2")

test2 <- passed3[which(passed$naming %in% list2),]

# 2 of the original 3 incident associations replicated with episcores

# Collate these results into a table format 

start <- passed3[c(1,2,3)]

start <- left_join(start, test, by = "SeqId")
start <- left_join(start, test2, by = "SeqId")

start <- start[c(1,2,3,6,9)]

# name columns 
names(start) <- c("SeqId", "Gene", "Naming", "Prevalent Elhadad et al", "Incident Elhadad et al")

##### Gudmundsdottir_et_al - notes 

library(readxl)

### Prevlaent in AGES-Reykjavik

prev <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/Suppl_info_valborg_circulating_candidates/Gudmundsdottir_et_al_table_3_AGES_prev.xls") 
prev <- prev %>% as.data.frame()

prev1 <- prev %>% filter(P.bon...6 < 0.05) # 555 proteins significant in base model
names(prev1)[1] <- "Protein"
prev_un <- length(unique(prev1$Protein)) # 520 unique proteins in this group 

prev2 <- prev %>% filter(P.bon...26 < 0.05) # 154 proteins significant in fully adjusted model
names(prev2)[1] <- "Protein"
prev_un2 <- length(unique(prev2$Protein)) # 142 unique proteins in this group 

### QMDiab prevalent replication

prev_QMD <- read.csv("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/Suppl_info_valborg_circulating_candidates/Gudmundsdottir_et_al_edited_table_5_QMD_prev_rep.csv")
prev_QMD <- prev_QMD %>% as.data.frame()

prev_QMD <- prev_QMD %>% filter(P.bon < 0.05) # 43 proteins significant in base model 
names(prev_QMD)[1] <- "Protein"

prev_QMD2 <- prev_QMD %>% filter(P.bon.1 < 0.05) # 33 after controlling for BMI 
names(prev_QMD2)[1] <- "Protein"

prot_QMD <- prev_QMD[c(1)] # Isolate proteins which were prev in some form (from base model for now)

### Incident in AGES-Reykjavik

inc <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/Suppl_info_valborg_circulating_candidates/Gudmundsdottir_et_al_edited_table_6_AGES_incident.xls")
inc <- inc %>% as.data.frame()

inc1 <- inc %>% filter(P.bon...6 < 0.05)
names(inc1)[1] <- "Protein"
inc1_un <- length(unique(inc1$Protein)) # 99 incident unique 


inc2 <- inc %>% filter(P.bon...26 < 0.05)
names(inc2)[1] <- "Protein"
inc2_un <- length(unique(inc2$Protein)) # none reached significance in the full model 

# Find those episcore-diabetes associations that repliacted across this study 

AGES_basic <- passed3[which(passed3$gene %in% prev1$Protein),]
names(AGES_basic)[2] <- "Gene"
names(AGES_basic)[4] <- "Replicated_AGES_basic_prevalent"
AGES_basic <- AGES_basic[c(2,4)]


AGES_full <- passed3[which(passed3$gene %in% prev2$Protein),]
names(AGES_full)[2] <- "Gene"
names(AGES_full)[4] <- "Replicated_AGES_full_prevalent"
AGES_full <- AGES_full[c(2,4)]


QMD_basic <- passed3[which(passed3$gene %in% prev_QMD$Protein),]
names(QMD_basic)[2] <- "Gene"
names(QMD_basic)[4] <- "Replicated_QMD_basic_prevalent"
QMD_basic <- QMD_basic[c(2,4)]


inc_AGES_basic <- passed3[which(passed3$gene %in% inc1$Protein),]
names(inc_AGES_basic)[2] <- "Gene"
names(inc_AGES_basic)[4] <- "Replicated_AGES_basic_incident"
inc_AGES_basic <- inc_AGES_basic[c(2,4)]

### NOW JOIN THINGS UP 

start <- left_join(start, AGES_basic, by = "Gene")
start <- left_join(start, AGES_full, by = "Gene")
start <- left_join(start, QMD_basic, by = "Gene")
start <- left_join(start, inc_AGES_basic, by = "Gene")
write.csv(start, "Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/replication_summary.csv", row.names = F)

# Write out a file with diabetes info for our study, to lookup direction of effect and check all replicate in this regard too 
pass2 <- pass[c(1:4)]
write.csv(pass2, "Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/diabetes_soma_associations_summary.csv", row.names = F)


##################################################################################################

### MAKE THE FORREST PLOT FOR DIABETES

##################################################################################################

## Plot - REMOVING OLINK AND DOING SEPARATE 

setwd("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/Cox_250221_agreed_model_results")

library(tidyverse)

# load in the datset withthe HR info for FA results 
x <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/RI_suppl_table_FA_results_formatted_updated_naming_clec11a_plots.xlsx")
x <- as.data.frame(x)

# Join in the correct naming for the plots as generated above 
naming <- read.csv("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/plot_annotations.csv")
names(naming)[4] <- "Plot_names"
naming <- naming[c(1,4)]
naming$Predictor <- as.character(naming$Predictor)

# check order of predictor is the same 
identical(naming$Predictor, x$Predictor) # TRUE 

# Join in the plot naming 
naming <- naming[2]
x <- cbind(x, naming)

# remove diabetes and COPD 

x <- x[which(x$Outcome %in% c("Diabetes")),]

# read in results of replication 
library(readxl)
rep <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/replication_summary_formatted.xlsx")
rep <- as.data.frame(rep)
names(rep)[1] <- "Predictor"

# select the info you need for replication annotations in the plot 
rep2 <- rep[c(1,10,11,13)]

x <- left_join(x, rep2, by = "Predictor")
names(x)[28:30] <- c("Cohort", "Diabetes", "Replication")

# This was just placeholder data for testing so we can comment out now 
# x$Replicates <- rep(c("Gudmundsdottir et al, 2020", "Elhadad et al, 2020", "Unreplicated"), times = 11)
# x$points <- rep(c("prev", "inc", "both"), times = 11)

x <- na.omit(x)

x$Outcome2 <- x$Outcome
#levels(x$Outcome2) = c(" ", "  ", "    ")
x$TraitVar <- paste0(x$Plot_names)
x$TraitVar = factor(x$TraitVar, levels=unique(x$TraitVar[rev(order(x$Hazard.Ratio.x))]))


My_Theme = theme(
  axis.title.x = element_text(size = 20),
  axis.text.x = element_text(size = 20),
  axis.text.y = element_text(size = 20),
  axis.title.y = element_text(size = 25),
  strip.text = element_text(size = 25, face = "bold"),
  legend.text=element_text(size=16),
  legend.title=element_text(size=20, face = "bold"))

plot1 <- ggplot(x,aes(y=Hazard.Ratio.x, x=TraitVar)) + 
  geom_point(aes(shape = Diabetes, color = Replication), size = 5)+
  geom_errorbar(aes(ymin = LCI.x, ymax = UCI.x),
                position = position_dodge(0.5), width = 0.05,
                colour = "black")+
  ylab("Hazard Ratio [95% Confidence Interval]")+ xlab ("")+
  geom_hline(yintercept = 1, linetype = "dotted")+
  theme(axis.text.x = element_text(size = 12, vjust = 0.5), legend.position = "right",
        plot.title = element_text(size = 12))+ theme(legend.title = element_text(hjust = 0.5)) +
  coord_flip() + facet_wrap(~Outcome, scales = "free_y") + My_Theme + ylim(0.30, 2.8)

## Now plot the olink as a separate panel 

setwd("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/Cox_250221_agreed_model_results")

library(tidyverse)

# load in the datset withthe HR info for FA results 
x <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/RI_suppl_table_FA_results_formatted_updated_naming_clec11a_plots.xlsx")
x <- as.data.frame(x)

# Join in the correct naming for the plots as generated above 
naming <- read.csv("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/plot_annotations.csv")
names(naming)[4] <- "Plot_names"
naming <- naming[c(1,4)]
naming$Predictor <- as.character(naming$Predictor)

# check order of predictor is the same 
identical(naming$Predictor, x$Predictor) # TRUE 

# Join in the plot naming 
naming <- naming[2]
x <- cbind(x, naming)

# remove diabetes and COPD 

x <- x[which(x$Outcome %in% c("Diabetes")),]

# read in results of replication 
library(readxl)
rep <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/replication/replication_summary_formatted.xlsx")
rep <- as.data.frame(rep)
names(rep)[1] <- "Predictor"

# select the info you need for replication annotations in the plot 
rep2 <- rep[c(1,10,11,13)]

x <- left_join(x, rep2, by = "Predictor")
names(x)[28:30] <- c("Cohort", "Diabetes", "Replication")

# This was just placeholder data for testing so we can comment out now 
# x$Replicates <- rep(c("Gudmundsdottir et al, 2020", "Elhadad et al, 2020", "Unreplicated"), times = 11)
# x$points <- rep(c("prev", "inc", "both"), times = 11)

x <- x[-which(x$Panel == "SomaScan"),]

x$Outcome2 <- x$Outcome
#levels(x$Outcome2) = c(" ", "  ", "    ")
x$TraitVar <- paste0(x$Plot_names)
x$TraitVar = factor(x$TraitVar, levels=unique(x$TraitVar[rev(order(x$Hazard.Ratio.x))]))


My_Theme = theme(
  axis.title.x = element_text(size = 20),
  axis.text.x = element_text(size = 20),
  axis.text.y = element_text(size = 20),
  axis.title.y = element_text(size = 25),
  strip.text = element_text(size = 25, face = "bold"),
  legend.text=element_text(size=16),
  legend.title=element_text(size=20, face = "bold"))

plot2 <- ggplot(x,aes(y=Hazard.Ratio.x, x=TraitVar)) + 
  geom_point(size = 5, color = "black")+
  geom_errorbar(aes(ymin = LCI.x, ymax = UCI.x),
                position = position_dodge(0.5), width = 0.05,
                colour = "black")+
  ylab("Hazard Ratio [95% Confidence Interval]")+ xlab ("")+
  geom_hline(yintercept = 1, linetype = "dotted")+
  theme(axis.text.x = element_text(size = 12, vjust = 0.5), legend.position = "right",
        plot.title = element_text(size = 12))+ theme(legend.title = element_text(hjust = 0.5)) +
  coord_flip() + My_Theme + ylim(0.30, 2.8)

  # + facet_wrap(~Outcome, scales = "free_y") + My_Theme

# Patchwork them together 

### COMBINE PLOTS 1 and 2 
library(patchwork)
pdf("diabetes_joint.pdf", width = 15, height = 10)
plot1 / plot2 + plot_layout(heights = c(3, 0.44))
dev.off()






############################################################################################

# Igraph plot

############################################################################################


setwd("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/Cox_250221_agreed_model_results")

library(tidyverse)

# load in the datset withthe HR info for FA results 
x <- read_excel("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/RI_suppl_table_FA_results_formatted_updated_naming_clec11a_plots.xlsx")
x <- as.data.frame(x)

# Join in the correct naming for the plots as generated above 
naming <- read.csv("Y:/Protein_DNAm_Proxies/Work_and_code_post_KORA/plot_annotations.csv")
names(naming)[4] <- "Plot_names"
naming <- naming[c(1,4)]
naming$Predictor <- as.character(naming$Predictor)

# check order of predictor is the same 
identical(naming$Predictor, x$Predictor) # TRUE 

# Join in the plot naming 
naming <- naming[2]
x <- cbind(x, naming)

# Look at the counts data for these proteins 

prot <- x[,c("Plot_names", "Outcome")] 

prot <- as.data.frame(prot)

prot <- prot %>% group_by(Plot_names) 

counts <- count(prot, Plot_names)

counts <- counts[order(-counts$n),]

counts <- as.data.frame(counts)

#    Plot_names n
# 1          C5 5
# 2       CNTN4 4
# 3        SELE 4
# 4     EN-RAGE 3
# 5      FGF-21 3
# 6         HGF 3
# 7         LY9 3
# 8        MMP9 3
# 9     NTRK3.2 3
# 10        OSM 3
# 11      PRSS2 3
# 12       SELL 3
# 13    SLITRK5 3


setwd("Y:/Danni/LBC_proteins_Jan2021/00_Running_pQTLs_regressed/Cox_250221_agreed_model_results")

library(tidyverse)
library(igraph)
library(statnet)
library(ggraph)
library(graphlayouts)
library(tidygraph)


# Fully adjusted model results 
pass <- read.csv("RI_suppl_table_FA_results_formatted.csv")

counts3 <- read.csv("summary_counts_per_episcore_138.csv")

data <- left_join(pass, counts3, by = "naming")

incidence <- data[which(data$n > 2),]

#incidence <- incidence[c(29,30,34,20,32, 8,12,17,18,23,1,22,26,3,7,24,35,6,13,14,16,33,28,2,4,5,9,10,11,15,19,21,25,27,31),]


# Create log HR 
incidence$logHR <- log(incidence$Hazard.Ratio.x)

# 2 edge columns 
data <- incidence[c(2,3,26)]
names(data)[3] <- "Importance"

# filter to remove COPD 
# data <- data %>% filter(!Outcome == "COPD")

# Change naming 
data <- data %>% 
  mutate(Outcome = str_replace(Outcome, "Lung.Cancer", "Lung cancer"))

data <- data %>% 
mutate(Outcome = str_replace(Outcome, "Bowel.Cancer", "Bowel cancer"))


# Get colour coding info for nodes ready 
info <- data$naming %>% as.data.frame()
names(info)[1] <- "Node"
info$Type <- "EpiScore"

# # Match order to main info going into plot 
match <- info[match(data$naming, info$Node),]

# Outcomes coded to different label
outcomes <- data[2] %>% unique()
outcomes$type <- outcomes$Outcome
names(outcomes)[1] <- "Node"
names(outcomes)[2] <- "Type"

rbind <- rbind(match, outcomes) %>% unique()

data$color <- ifelse(data$Importance > 0,'black','#BBBBBB')

data$Importance <- abs(data$Importance)

names(rbind)[2] <- "Nodes"

#######################################################################

### NETWORK PLOT

#######################################################################

setwd("Y:/Danni/LBC_proteins_Jan2021/00_Running_pQTLs_regressed/Cox_250221_agreed_model_results")

set.seed(597)

library(tidyverse)
library(igraph)
library(statnet)
library(ggraph)
library(graphlayouts)
library(tidygraph)


# Fully adjusted model results 
pass <- read.csv("RI_suppl_table_FA_results_formatted.csv")
counts3 <- read.csv("summary_counts_per_episcore_138.csv")
data <- left_join(pass, counts3, by = "naming")
incidence <- data[which(data$n > 2),]

# Create log HR (importance variable)
incidence$logHR <- log(incidence$Hazard.Ratio.x)

# Create the relationship variable 
incidence$Relationship <- ifelse(incidence$logHR < 0,"protective","predictive")

# Select data needed  
data <- incidence[c(2,3,26,27)]
names(data)[3] <- "Importance"
data$Importance <- abs(data$Importance)

# Add disease group 
data$Disease <- data$Outcome

# Change naming 
data <- data %>% 
  mutate(Outcome = str_replace(Outcome, "Lung.Cancer", "Lung cancer"))

data <- data %>% 
mutate(Outcome = str_replace(Outcome, "Bowel.Cancer", "Bowel cancer"))

data$Disease <- as.factor(data$Disease)

graph2 <- graph_from_data_frame(data, vertices=rbind, directed = FALSE)

# Make a palette of 3 colors
coul  <- c( "gray34", "gray34", "gray34", "gray34", "gray50", "gray34", "gray34", "gray34", "gray34")
 
# Create a vector of color
my_color <- coul[as.numeric(as.factor(V(graph2)$Nodes))]

My_Theme = theme(
  axis.title.x = element_text(size = 20),
  axis.text.x = element_text(size = 20),
  axis.text.y = element_text(size = 20),
  axis.title.y = element_text(size = 25),
  strip.text = element_text(size = 25, face = "bold"),
  legend.text=element_text(size=20),
  legend.title=element_text(size=25, face = "bold"))

# save the plot 

pdf("network_plot_labels_1.pdf", width = 12, height = 8)
  ggraph(graph2,"stress",bbox = 15)+
  geom_edge_link2(aes(edge_colour = Relationship)) +
  geom_node_point(size = 7, colour = my_color) +
  #geom_node_point(aes(fill = Disease),shape = 21,size = 3)+
  #geom_node_text(aes(label = name), repel = T, nudge_x = 0, nudge_y = 0, size = 4) +
  geom_node_label(aes(label = name), repel = T, nudge_x = 0, nudge_y = 0, label.padding = unit(0.6, "lines"), label.size = 0.2) +
  scale_size(range=c(2,5),guide = FALSE) + theme(legend.position = "none", panel.background = element_rect(fill = "white"), legend.text=element_text(size=16),
  legend.title=element_text(size=18, face = "bold"))
  #theme_graph(background = "white")
  dev.off()

# save the plot 

pdf("network_plot_unlabelled_1.pdf", width = 12, height = 8)
  ggraph(graph2,"stress",bbox = 15)+
  geom_edge_link2(aes(edge_colour = Relationship)) +
  geom_node_point(size = 7, colour = my_color) +
  #geom_node_point(aes(fill = Disease),shape = 21,size = 3)+
  #geom_node_text(aes(label = name), repel = T, nudge_x = 0, nudge_y = 0, size = 4) +
  #geom_node_label(aes(label = name), repel = T, nudge_x = 0, nudge_y = 0, label.padding = unit(0.6, "lines"), label.size = 0.2) +
  scale_size(range=c(2,5),guide = FALSE) + theme(legend.position = "none", panel.background = element_rect(fill = "white"), legend.text=element_text(size=16),
  legend.title=element_text(size=18, face = "bold"))
  #theme_graph(background = "white")
  dev.off()
back to top