https://github.com/DanniGadd/EpiScores-for-protein-levels
Raw File
Tip revision: a5130fab3895a0d95f0dcc8826aa9fb5e8c0fa86 authored by DanniGadd on 16 February 2022, 08:46:38 UTC
Merge branch 'main' of https://github.com/DanniGadd/EpiScores-for-protein-levels
Tip revision: a5130fa
21_cox_models_WBCs_as_predictors_sensitivity.R
Copyright (c) <2022>, <DanniGadd>
All rights reserved.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory.

####################################################################################
####################################################################################
############################## COXME MODEL #########################################
####################################################################################
####################################################################################

# white blood cells as predictors sensitivity check 

####################################################################################

## Installing Requisite Packages 

####################################################################################

if(!require(survival)){
  install.packages("survival")
}

if(!require(kinship2)){
  install.packages("kinship2")
}

if(!require(coxme)){
  install.packages("coxme")
}

if(!require(readxl)){
  install.packages("readxl")
}

library(survival)
library(kinship2)
library(coxme)
library(readxl)
library(tidyverse)
library(gdata)

####################################################################################

## Create Kinship Matrix

####################################################################################

ped = read.csv("/Volumes/marioni-lab/Danni/LBC_proteins_project/Incidence/pedigree.csv")

ped$father <- as.numeric(ped$father)
ped$mother <- as.numeric(ped$mother)
ped$father[ped$father==0] <- NA
ped$mother[ped$mother==0] <- NA
table(ped$sex)
ped$sex <- as.numeric(ped$sex)
ped$sex[ped$sex==2] <- 0
ped$sex <- ped$sex+1
kin <- with(ped, pedigree(volid, father, mother, sex, famid=famid))
kin_model <- kinship(kin) 


# Function to Extract Lmekin Results  
extract_coxme_table <- function (mod){
  beta <- mod$coefficients #$fixed is not needed
  nvar <- length(beta)
  nfrail <- nrow(mod$var) - nvar
  se <- sqrt(diag(mod$var)[nfrail + 1:nvar])
  z<- round(beta/se, 2)
  p<- signif(1 - pchisq((beta/se)^2, 1), 2)
  table=data.frame(cbind(beta,se,z,p))
  return(table)
}

####################################################################################

#### Preparing of Phenotype Files

####################################################################################

d1 <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/d1_080321.csv")

####################################################################################

## Read in Incidence Files for each trait - combining pirmary and secondary cases

####################################################################################

AD <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/AD_combined.csv")

Bowel.Cancer <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Bowel_cancer_combined.csv")

Breast.Cancer <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Breast_cancer_combined.csv")

COPD <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/COPD_combined.csv")

Depression <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Depression_mod_severe_combined.csv")

Diabetes <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Diabetes_combined.csv")

IBD <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/IBD_combined.csv")

IHD <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/IHD_combined.csv")

Lung.Cancer <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Lung_cancer_combined.csv")

Pain <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Pain_combined.csv")

RA <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/RA_combined.csv")

Stroke <- read.csv("/Volumes/marioni-lab/Generation_Scotland_data/GP_data_October_2020_updated_cases_12_traits/Merged_primary_secondary_codes_with_terms_removed_29_10_20/Stroke_combined.csv")



####################################################################################

## HAZARD MODELS - PREDICTING TIME-TO-ONSET OF DISEASES FROM STUDY BASELINE (2006)

####################################################################################


# the usual clock for running all proteins
clock <- names(d1)[7:11] 

# Edit down the clock to run just for one protein to extract the tables for cox info 
# clock <- names(d1)[110]



####################################################################################

## HAZARD MODELS - PREDICTING TIME-TO-ONSET OF DISEASES FROM STUDY BASELINE (2006)

####################################################################################


## BASIC MODEL

d1_AD <- d1

## Run Incidence of Alzheimer's Disease Analysis - Restricted to Participants > 60 

mat_hazard_ad <- matrix(nrow=length(clock),ncol=9)
output_hazard_AD<- as.data.frame(mat_hazard_ad)
for(j in 1:length(clock)){ 
  tryCatch({ 
  dat1= d1_AD
  tmp1 = AD[which(AD$id %in% dat1$Sample_Name),]
  
  ## Obtain Age of Onset 
  affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
  age_onset = AD[,c("first", "id")]
  affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
  affected$Event = 1
  affected$yoe = substring(affected$first, 1, 4)
  affected$moe = substring(affected$first, 5,6)
  affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
  affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
  affected$age_event = affected$age_event1 + affected$month_event1
  affected$first = NULL
  affected$yoe = NULL 
  affected$moe = NULL
  affected$month_event1 = NULL 
  affected$age_event1 = NULL
  
  healthy = dat1[-which(dat1$Sample_Name %in% AD$id),]
  healthy$Event = 0
  healthy$age_event = 0 
  affected$id.y <- NULL
  healthy$id <- NULL
  names(affected)[names(affected)=="id"] <- "Sample_Name"
  cox = rbind(affected, healthy)
  
  cox$age_death = 0
  cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
  cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
  cox$tte = cox$age_at_event - cox$Age
  cox$tte = as.numeric(cox$tte)
  cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
  cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
  cox$Event = as.numeric(cox$Event)
  cox$tte<-as.numeric(cox$tte)
  
  cox = cox[cox$age_at_event >=65,]
  mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female) + cox$Age + factor(cox$Set) + (1|cox$Sample_Name), varlist = kin_model*2)
  print(clock[[j]])
  print("AD")
  output_hazard_AD[j,1] <- as.character(clock[[j]])
  output_hazard_AD[j,2] <- as.character("Alzheimer's Disease")
  output_hazard_AD[j,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output_hazard_AD[j,6] <- extract_coxme_table(mod)[1,4]
  output_hazard_AD[j,7] <- mod$n[1]
  output_hazard_AD[j,8] <- mod$n[2]-mod$n[1]
  output_hazard_AD[j,9] <-cox.zph(mod)[1][[1]][3]
  }, error = function(e) cat("skipped"))
} 



## Create List of Remaining Dataframes - not Cancer  

my.list = list(COPD,Depression,Stroke,Diabetes,Pain,RA,IHD)
names = list("COPD","Depression","Stroke","Diabetes","Pain","RA","IHD")
names(my.list) <- names 

l=lapply(my.list, "[", c(1:2))

names(d1)[names(d1) == "COPD_Y"] <- "COPD"
names(d1)[names(d1) == "depression_Y"] <- "Depression"
names(d1)[names(d1) == "heart_disease_Y"] <- "IHD"
names(d1)[names(d1) == "stroke_Y"] <- "Stroke"
names(d1)[names(d1) == "diabetes_Y"] <- "Diabetes"
names(d1)[names(d1) == "rheum_arthritis_Y"] <- "RA"

mat_hazard <- matrix(nrow=62*length(my.list),ncol=9)
output_hazard <- as.data.frame(mat_hazard)
k=c(0,62,124,186,248,310,372,434)

## Loop of Survival Models - Longitudinal Associations
for(j in 1:length(clock)){
  for(i in 1:length(l)){ 
   tryCatch({ 
    tmp <- l[[i]]
    
    ## Exclude Indiviudals who Reported Disease at Study Baseline  
    dat1= d1[-which(d1[,names[[i]]] %in% 1),]
    tmp1 = tmp[which(tmp$id %in% dat1$Sample_Name),]
    
    ## Obtain Age of Onset 
    affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
    age_onset = tmp[,c("first", "id")]
    affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
    affected$Event = 1
    affected$yoe = substring(affected$first, 1, 4)
    affected$moe = substring(affected$first, 5,6)
    affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
    affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
    affected$age_event = affected$age_event1 + affected$month_event1
    affected$first = NULL
    affected$yoe = NULL 
    affected$moe = NULL
    affected$month_event1 = NULL 
    affected$age_event1 = NULL
    
    healthy = dat1[-which(dat1$Sample_Name %in% tmp$id),]
    healthy$Event = 0
    healthy$age_event = 0 
    affected$id.y <- NULL
    healthy$id <- NULL
    names(affected)[names(affected)=="id"] <- "Sample_Name"
    cox = rbind(affected, healthy)
    
    cox$age_death = 0
    cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
    cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
    cox$tte = cox$age_at_event - cox$Age
    cox$tte = as.numeric(cox$tte)
    cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
    cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
    cox$Event = as.numeric(cox$Event)
    cox$tte<-as.numeric(cox$tte)
       
  
    mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + cox$Age + factor(cox$Female) + factor(cox$Set) + (1|cox$Sample_Name), varlist = kin_model*2)
    print(names[[i]])
    print(clock[[j]])
    output_hazard[j+k[[i]],1] <- as.character(clock[[j]])
    output_hazard[j+k[[i]],2] <- as.character(names[[i]])
    output_hazard[j+k[[i]],3:5] <- round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
    output_hazard[j+k[[i]],6] <- extract_coxme_table(mod)[1,4]
    output_hazard[j+k[[i]],7] <- mod$n[1]
    output_hazard[j+k[[i]],8] <- mod$n[2]-mod$n[1]
    output_hazard[j+k[[i]],9] <-cox.zph(mod)[1][[1]][3]
   }, error = function(e) cat("skipped"))
  } 
} 


## Create List of Remaining Dataframes - Cancer excluding breast 

names(d1)[names(d1) == "bowel_cancer_Y"] <- "Bowel.Cancer"
names(d1)[names(d1) == "lung_cancer_Y"] <- "Lung.Cancer"

my.list.cancer = list(Lung.Cancer,Bowel.Cancer)
names = list("Lung.Cancer","Bowel.Cancer") 
names(my.list.cancer) <- names 

mat_hazard_cancer <- matrix(nrow=62*length(my.list.cancer),ncol=9)
output_hazard_cancer<- as.data.frame(mat_hazard_cancer)
k=c(0,62)


l.cancer=lapply(my.list.cancer, "[", 1:3)
smr=lapply(my.list.cancer, "[", c(1,3))
smr1=lapply(smr,subset, smr==1)

for(j in 1:length(clock)){ 
  for(i in 1:length(l.cancer)){ 
   tryCatch({ 
    tmp <- l.cancer[[i]]
    smr_tmp <-  smr1[[i]]
    
    ## Exclude Indiviudals who Reported Disease at Study Baseline 
    ## Exclude Indiviudals Reported on Cancer Inpatient Day Visit List - might not have had cancer 
    
    dat1= d1[-which(d1[,names[[i]]] %in% 1),]
    dat1=dat1[-which(dat1$Sample_Name %in% smr_tmp$id),]
    tmp1 = tmp[which(tmp$id %in% dat1$Sample_Name),]
    
    ## Obtain Age of Onset 
    affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
    age_onset = tmp[,c("dt", "id")]
    affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
    affected$Event = 1
    affected$yoe = substring(affected$dt, 1, 4)
    affected$moe = substring(affected$dt, 5,6)
    affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
    affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
    affected$age_event = affected$age_event1 + affected$month_event1
    affected$dt = NULL
    affected$yoe = NULL 
    affected$moe = NULL
    affected$month_event1 = NULL 
    affected$age_event1 = NULL
    
    healthy = dat1[-which(dat1$Sample_Name %in% tmp$id),]
    healthy$Event = 0
    healthy$age_event = 0 
    affected$id.y <- NULL
    healthy$id <- NULL
    names(affected)[names(affected)=="id"] <- "Sample_Name"
    cox = rbind(affected, healthy)
    
    cox$age_death = 0
    cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
    cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
    cox$tte = cox$age_at_event - cox$Age
    cox$tte = as.numeric(cox$tte)
    cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
    cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
    cox$Event = as.numeric(cox$Event)
    cox$tte<-as.numeric(cox$tte)
    
    mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + cox$Age + factor(cox$Set) +  factor(cox$Female) + (1|cox$Sample_Name), varlist = kin_model*2)
    print(clock[[j]])
    print(names[[i]])
    output_hazard_cancer[j+k[[i]],1] <- as.character(clock[[j]])
    output_hazard_cancer[j+k[[i]],2] <- as.character(names[[i]])
    output_hazard_cancer[j+k[[i]], 3:5]<- round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
    output_hazard_cancer[j+k[[i]],6] <- extract_coxme_table(mod)[1,4]
    output_hazard_cancer[j+k[[i]],7] <- mod$n[1]
    output_hazard_cancer[j+k[[i]],8] <- mod$n[2]-mod$n[1]
    output_hazard_cancer[j+k[[i]],9] <-cox.zph(mod)[1][[1]][3]
   }, error = function(e) cat("skipped"))
  } 
} 



## Create List of Remaining Dataframes - Breast cancer only (due to separate female covariate being removed) 

names(d1)[names(d1) == "breast_cancer_Y"] <- "Breast.Cancer"

my.list.cancer = list(Breast.Cancer)
names = list("Breast.Cancer") 
names(my.list.cancer) <- names 

mat_hazard_breast <- matrix(nrow=62*length(my.list.cancer),ncol=9)
output_hazard_breast<- as.data.frame(mat_hazard_breast)
k=c(0,62,124)


l.cancer=lapply(my.list.cancer, "[", 1:3)
smr=lapply(my.list.cancer, "[", c(1,3))
smr1=lapply(smr,subset, smr==1)

for(j in 1:length(clock)){ 
  for(i in 1:length(l.cancer)){ 
   tryCatch({ 
    tmp <- l.cancer[[i]]
    smr_tmp <-  smr1[[i]]
    
    ## Exclude Indiviudals who Reported Disease at Study Baseline 
    ## Exclude Indiviudals Reported on Cancer Inpatient Day Visit List - might not have had cancer 
    
    dat1= d1[-which(d1[,names[[i]]] %in% 1),]
    dat1=dat1[-which(dat1$Sample_Name %in% smr_tmp$id),]
    tmp1 = tmp[which(tmp$id %in% dat1$Sample_Name),]
    
    ## Obtain Age of Onset 
    affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
    age_onset = tmp[,c("dt", "id")]
    affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
    affected$Event = 1
    affected$yoe = substring(affected$dt, 1, 4)
    affected$moe = substring(affected$dt, 5,6)
    affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
    affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
    affected$age_event = affected$age_event1 + affected$month_event1
    affected$dt = NULL
    affected$yoe = NULL 
    affected$moe = NULL
    affected$month_event1 = NULL 
    affected$age_event1 = NULL
    
    healthy = dat1[-which(dat1$Sample_Name %in% tmp$id),]
    healthy$Event = 0
    healthy$age_event = 0 
    affected$id.y <- NULL
    healthy$id <- NULL
    names(affected)[names(affected)=="id"] <- "Sample_Name"
    cox = rbind(affected, healthy)
    cox = subset(cox, Female == "1")
    
    cox$age_death = 0
    cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
    cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
    cox$tte = cox$age_at_event - cox$Age
    cox$tte = as.numeric(cox$tte)
    cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
    cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
    cox$Event = as.numeric(cox$Event)
    cox$tte<-as.numeric(cox$tte)

    
    mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + cox$Age + factor(cox$Set) + (1|cox$Sample_Name), varlist = kin_model*2)
    print(clock[[j]])
    print(names[[i]])
    output_hazard_breast[j+k[[i]],1] <- as.character(clock[[j]])
    output_hazard_breast[j+k[[i]],2] <- as.character(names[[i]])
    output_hazard_breast[j+k[[i]], 3:5]<- round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
    output_hazard_breast[j+k[[i]],6] <- extract_coxme_table(mod)[1,4]
    output_hazard_breast[j+k[[i]],7] <- mod$n[1]
    output_hazard_breast[j+k[[i]],8] <- mod$n[2]-mod$n[1]
    output_hazard_breast[j+k[[i]],9] <-cox.zph(mod)[1][[1]][3]
   }, error = function(e) cat("skipped"))
  } 
} 


### IBD dealt with separately, as it has no baseline data

mat_hazard_IBD <- matrix(nrow=length(clock),ncol=9)
output_hazard_IBD<- as.data.frame(mat_hazard_IBD)
for(j in 1:length(clock)){ 
  tryCatch({ 
  dat1= d1
  tmp1 = IBD[which(IBD$id %in% dat1$Sample_Name),]
  
  ## Obtain Age of Onset 
  affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
  age_onset =  IBD[,c("first", "id")]
  affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
  affected$Event = 1
  affected$yoe = substring(affected$first, 1, 4)
  affected$moe = substring(affected$first, 5,6)
  affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
  affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
  affected$age_event = affected$age_event1 + affected$month_event1
  affected$first = NULL
  affected$yoe = NULL 
  affected$moe = NULL
  affected$month_event1 = NULL 
  affected$age_event1 = NULL
  
  healthy = dat1[-which(dat1$Sample_Name %in%  IBD$id),]
  healthy$Event = 0
  healthy$age_event = 0 
  affected$id.y <- NULL
  healthy$id <- NULL
  names(affected)[names(affected)=="id"] <- "Sample_Name"
  cox = rbind(affected, healthy)
  
  cox$age_death = 0
  cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
  cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
  cox$tte = cox$age_at_event - cox$Age
  cox$tte = as.numeric(cox$tte)
  cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
  cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
  cox$Event = as.numeric(cox$Event)
  cox$tte<-as.numeric(cox$tte)
  
  mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female) + cox$Age + factor(cox$Set) + (1|cox$Sample_Name), varlist = kin_model*2)
  print(clock[[j]])
  print("IBD")
  output_hazard_IBD[j,1] <- as.character(clock[[j]])
  output_hazard_IBD[j,2] <- as.character("IBD")
  output_hazard_IBD[j,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output_hazard_IBD[j,6] <- extract_coxme_table(mod)[1,4]
  output_hazard_IBD[j,7] <- mod$n[1]
  output_hazard_IBD[j,8] <- mod$n[2]-mod$n[1]
  output_hazard_IBD[j,9] <-cox.zph(mod)[1][[1]][3]
  }, error = function(e) cat("skipped"))
} 


# Save results from the basic model 

comb = rbind(output_hazard_AD, output_hazard)
comb = rbind(comb, output_hazard_cancer)
comb = rbind(comb, output_hazard_IBD)
comb = rbind(comb, output_hazard_breast)
names(comb) <- c("Predictor", "Outcome", "Hazard Ratio", "LCI", "UCI", "P Value", "No. of Cases", "No. of Controls", "cox.zph")

comb <- comb[complete.cases(comb), ]

#ahrr <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Predictors_with_cg05575921.csv")
#comb$AHRR <- 0
#comb[comb$Predictor %in% ahrr$Predictor, "AHRR"] <- "Yes"
#comb[-which(comb$Predictor %in% ahrr$Predictor), "AHRR"] <- "No"
comb <- comb[order(comb$`P Value`),]

write.csv(comb, "/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/basic_model.csv",row.names=F)



####################################################################################

## FULLY-ADJUSTED MODEL 

####################################################################################

# the usual clock for running all proteins
# clock <- names(d1)[110:171] 

# Edit down the clock to run just for one protein to extract the tables for cox info 
# clock <- names(d1)[156]

## model structure:  
# mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female)+ cox$Age + factor(cox$Set)  + cox$units + factor(cox$usual) + cox$smokingScore + cox$simd + cox$EA + cox$bmi + (1|cox$Sample_Name), varlist = kin_model*2)

## Run Incidence of Alzheimer's Disease Analysis - Restricted to Participants > 60 

d1_AD <- d1

mat_hazard_ad <- matrix(nrow=length(clock),ncol=9)
output_hazard_AD<- as.data.frame(mat_hazard_ad)
for(j in 1:length(clock)){ 
  tryCatch({ 
  dat1= d1_AD
  tmp1 = AD[which(AD$id %in% dat1$Sample_Name),]
  
  ## Obtain Age of Onset 
  affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
  age_onset = AD[,c("first", "id")]
  affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
  affected$Event = 1
  affected$yoe = substring(affected$first, 1, 4)
  affected$moe = substring(affected$first, 5,6)
  affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
  affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
  affected$age_event = affected$age_event1 + affected$month_event1
  affected$first = NULL
  affected$yoe = NULL 
  affected$moe = NULL
  affected$month_event1 = NULL 
  affected$age_event1 = NULL
  
  healthy = dat1[-which(dat1$Sample_Name %in% AD$id),]
  healthy$Event = 0
  healthy$age_event = 0 
  affected$id.y <- NULL
  healthy$id <- NULL
  names(affected)[names(affected)=="id"] <- "Sample_Name"
  cox = rbind(affected, healthy)
  
  cox$age_death = 0
  cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
  cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
  cox$tte = cox$age_at_event - cox$Age
  cox$tte = as.numeric(cox$tte)
  cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
  cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
  cox$Event = as.numeric(cox$Event)
  cox$tte<-as.numeric(cox$tte)
  
  cox = cox[cox$age_at_event >=65,]
  mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female)+ cox$Age + factor(cox$Set)  + cox$units + cox$smokingScore + cox$simd + cox$EA + cox$bmi + (1|cox$Sample_Name), varlist = kin_model*2)
  print(clock[[j]])
  print("AD")
  output_hazard_AD[j,1] <- as.character(clock[[j]])
  output_hazard_AD[j,2] <- as.character("Alzheimer's Disease")
  output_hazard_AD[j,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output_hazard_AD[j,6] <- extract_coxme_table(mod)[1,4]
  output_hazard_AD[j,7] <- mod$n[1]
  output_hazard_AD[j,8] <- mod$n[2]-mod$n[1]
  output_hazard_AD[j,9] <-cox.zph(mod)[1][[1]][3]
  }, error = function(e) cat("skipped"))
} 



## Create List of Remaining Dataframes - not Cancer  

my.list = list(COPD,Depression,Stroke,Diabetes,Pain,RA,IHD)
names = list("COPD","Depression","Stroke","Diabetes","Pain","RA","IHD")
names(my.list) <- names 

l=lapply(my.list, "[", c(1:2))

names(d1)[names(d1) == "COPD_Y"] <- "COPD"
names(d1)[names(d1) == "depression_Y"] <- "Depression"
names(d1)[names(d1) == "heart_disease_Y"] <- "IHD"
names(d1)[names(d1) == "stroke_Y"] <- "Stroke"
names(d1)[names(d1) == "diabetes_Y"] <- "Diabetes"
names(d1)[names(d1) == "rheum_arthritis_Y"] <- "RA"

mat_hazard <- matrix(nrow=62*length(my.list),ncol=9)
output_hazard <- as.data.frame(mat_hazard)
k=c(0,62,124,186,248,310,372,434)

## Loop of Survival Models - Longitudinal Associations
for(j in 1:length(clock)){
  for(i in 1:length(l)){ 
   tryCatch({ 
    tmp <- l[[i]]
    
    ## Exclude Indiviudals who Reported Disease at Study Baseline  
    dat1= d1[-which(d1[,names[[i]]] %in% 1),]
    tmp1 = tmp[which(tmp$id %in% dat1$Sample_Name),]
    
    ## Obtain Age of Onset 
    affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
    age_onset = tmp[,c("first", "id")]
    affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
    affected$Event = 1
    affected$yoe = substring(affected$first, 1, 4)
    affected$moe = substring(affected$first, 5,6)
    affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
    affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
    affected$age_event = affected$age_event1 + affected$month_event1
    affected$first = NULL
    affected$yoe = NULL 
    affected$moe = NULL
    affected$month_event1 = NULL 
    affected$age_event1 = NULL
    
    healthy = dat1[-which(dat1$Sample_Name %in% tmp$id),]
    healthy$Event = 0
    healthy$age_event = 0 
    affected$id.y <- NULL
    healthy$id <- NULL
    names(affected)[names(affected)=="id"] <- "Sample_Name"
    cox = rbind(affected, healthy)
    
    cox$age_death = 0
    cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
    cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
    cox$tte = cox$age_at_event - cox$Age
    cox$tte = as.numeric(cox$tte)
    cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
    cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
    cox$Event = as.numeric(cox$Event)
    cox$tte<-as.numeric(cox$tte)
       
  
    mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female)+ cox$Age + factor(cox$Set)  + cox$units + cox$smokingScore + cox$simd + cox$EA + cox$bmi + (1|cox$Sample_Name), varlist = kin_model*2)
    print(names[[i]])
    print(clock[[j]])
    output_hazard[j+k[[i]],1] <- as.character(clock[[j]])
    output_hazard[j+k[[i]],2] <- as.character(names[[i]])
    output_hazard[j+k[[i]],3:5] <- round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
    output_hazard[j+k[[i]],6] <- extract_coxme_table(mod)[1,4]
    output_hazard[j+k[[i]],7] <- mod$n[1]
    output_hazard[j+k[[i]],8] <- mod$n[2]-mod$n[1]
    output_hazard[j+k[[i]],9] <-cox.zph(mod)[1][[1]][3]
   }, error = function(e) cat("skipped"))
  } 
} 


## Create List of Remaining Dataframes - Cancer excluding breast 

names(d1)[names(d1) == "bowel_cancer_Y"] <- "Bowel.Cancer"
names(d1)[names(d1) == "lung_cancer_Y"] <- "Lung.Cancer"

my.list.cancer = list(Lung.Cancer,Bowel.Cancer)
names = list("Lung.Cancer","Bowel.Cancer") 
names(my.list.cancer) <- names 

mat_hazard_cancer <- matrix(nrow=62*length(my.list.cancer),ncol=9)
output_hazard_cancer<- as.data.frame(mat_hazard_cancer)
k=c(0,62)


l.cancer=lapply(my.list.cancer, "[", 1:3)
smr=lapply(my.list.cancer, "[", c(1,3))
smr1=lapply(smr,subset, smr==1)

for(j in 1:length(clock)){ 
  for(i in 1:length(l.cancer)){ 
   tryCatch({ 
    tmp <- l.cancer[[i]]
    smr_tmp <-  smr1[[i]]
    
    ## Exclude Indiviudals who Reported Disease at Study Baseline 
    ## Exclude Indiviudals Reported on Cancer Inpatient Day Visit List - might not have had cancer 
    
    dat1= d1[-which(d1[,names[[i]]] %in% 1),]
    dat1=dat1[-which(dat1$Sample_Name %in% smr_tmp$id),]
    tmp1 = tmp[which(tmp$id %in% dat1$Sample_Name),]
    
    ## Obtain Age of Onset 
    affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
    age_onset = tmp[,c("dt", "id")]
    affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
    affected$Event = 1
    affected$yoe = substring(affected$dt, 1, 4)
    affected$moe = substring(affected$dt, 5,6)
    affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
    affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
    affected$age_event = affected$age_event1 + affected$month_event1
    affected$dt = NULL
    affected$yoe = NULL 
    affected$moe = NULL
    affected$month_event1 = NULL 
    affected$age_event1 = NULL
    
    healthy = dat1[-which(dat1$Sample_Name %in% tmp$id),]
    healthy$Event = 0
    healthy$age_event = 0 
    affected$id.y <- NULL
    healthy$id <- NULL
    names(affected)[names(affected)=="id"] <- "Sample_Name"
    cox = rbind(affected, healthy)
    
    cox$age_death = 0
    cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
    cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
    cox$tte = cox$age_at_event - cox$Age
    cox$tte = as.numeric(cox$tte)
    cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
    cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
    cox$Event = as.numeric(cox$Event)
    cox$tte<-as.numeric(cox$tte)
    
    mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female)+ cox$Age + factor(cox$Set)  + cox$units + cox$smokingScore + cox$simd + cox$EA + cox$bmi + (1|cox$Sample_Name), varlist = kin_model*2)
    print(clock[[j]])
    print(names[[i]])
    output_hazard_cancer[j+k[[i]],1] <- as.character(clock[[j]])
    output_hazard_cancer[j+k[[i]],2] <- as.character(names[[i]])
    output_hazard_cancer[j+k[[i]], 3:5]<- round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
    output_hazard_cancer[j+k[[i]],6] <- extract_coxme_table(mod)[1,4]
    output_hazard_cancer[j+k[[i]],7] <- mod$n[1]
    output_hazard_cancer[j+k[[i]],8] <- mod$n[2]-mod$n[1]
    output_hazard_cancer[j+k[[i]],9] <-cox.zph(mod)[1][[1]][3]
   }, error = function(e) cat("skipped"))
  } 
} 


## Create List of Remaining Dataframes - Breast cancer only (due to separate female covariate being removed) 

names(d1)[names(d1) == "breast_cancer_Y"] <- "Breast.Cancer"

my.list.cancer = list(Breast.Cancer)
names = list("Breast.Cancer") 
names(my.list.cancer) <- names 

mat_hazard_breast <- matrix(nrow=62*length(my.list.cancer),ncol=9)
output_hazard_breast<- as.data.frame(mat_hazard_breast)
k=c(0,62,124)


l.cancer=lapply(my.list.cancer, "[", 1:3)
smr=lapply(my.list.cancer, "[", c(1,3))
smr1=lapply(smr,subset, smr==1)

for(j in 1:length(clock)){ 
  for(i in 1:length(l.cancer)){ 
   tryCatch({ 
    tmp <- l.cancer[[i]]
    smr_tmp <-  smr1[[i]]
    
    ## Exclude Indiviudals who Reported Disease at Study Baseline 
    ## Exclude Indiviudals Reported on Cancer Inpatient Day Visit List - might not have had cancer 
    
    dat1= d1[-which(d1[,names[[i]]] %in% 1),]
    dat1=dat1[-which(dat1$Sample_Name %in% smr_tmp$id),]
    tmp1 = tmp[which(tmp$id %in% dat1$Sample_Name),]
    
    ## Obtain Age of Onset 
    affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
    age_onset = tmp[,c("dt", "id")]
    affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
    affected$Event = 1
    affected$yoe = substring(affected$dt, 1, 4)
    affected$moe = substring(affected$dt, 5,6)
    affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
    affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
    affected$age_event = affected$age_event1 + affected$month_event1
    affected$dt = NULL
    affected$yoe = NULL 
    affected$moe = NULL
    affected$month_event1 = NULL 
    affected$age_event1 = NULL
    
    healthy = dat1[-which(dat1$Sample_Name %in% tmp$id),]
    healthy$Event = 0
    healthy$age_event = 0 
    affected$id.y <- NULL
    healthy$id <- NULL
    names(affected)[names(affected)=="id"] <- "Sample_Name"
    cox = rbind(affected, healthy)
    cox = subset(cox, Female == "1")
    
    cox$age_death = 0
    cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
    cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
    cox$tte = cox$age_at_event - cox$Age
    cox$tte = as.numeric(cox$tte)
    cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
    cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
    cox$Event = as.numeric(cox$Event)
    cox$tte<-as.numeric(cox$tte)
    
    mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + cox$Age + factor(cox$Set)  + cox$units + cox$smokingScore + cox$simd + cox$EA + cox$bmi + (1|cox$Sample_Name), varlist = kin_model*2)
    print(clock[[j]])
    print(names[[i]])
    output_hazard_breast[j+k[[i]],1] <- as.character(clock[[j]])
    output_hazard_breast[j+k[[i]],2] <- as.character(names[[i]])
    output_hazard_breast[j+k[[i]], 3:5]<- round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
    output_hazard_breast[j+k[[i]],6] <- extract_coxme_table(mod)[1,4]
    output_hazard_breast[j+k[[i]],7] <- mod$n[1]
    output_hazard_breast[j+k[[i]],8] <- mod$n[2]-mod$n[1]
    output_hazard_breast[j+k[[i]],9] <-cox.zph(mod)[1][[1]][3]
   }, error = function(e) cat("skipped"))
  } 
} 


### IBD dealt with separately, as it has no baseline data

mat_hazard_IBD <- matrix(nrow=length(clock),ncol=9)
output_hazard_IBD<- as.data.frame(mat_hazard_IBD)
for(j in 1:length(clock)){ 
  tryCatch({ 
  dat1= d1
  tmp1 = IBD[which(IBD$id %in% dat1$Sample_Name),]
  
  ## Obtain Age of Onset 
  affected = dat1[which(dat1$Sample_Name %in% tmp1$id),] 
  age_onset =  IBD[,c("first", "id")]
  affected = merge(age_onset, affected, by.x = "id", by.y = "Sample_Name")
  affected$Event = 1
  affected$yoe = substring(affected$first, 1, 4)
  affected$moe = substring(affected$first, 5,6)
  affected$month_event1 = (as.numeric(affected$moe) - as.numeric(affected$mob))/12
  affected$age_event1 = as.numeric(affected$yoe) - as.numeric(affected$yob)
  affected$age_event = affected$age_event1 + affected$month_event1
  affected$first = NULL
  affected$yoe = NULL 
  affected$moe = NULL
  affected$month_event1 = NULL 
  affected$age_event1 = NULL
  
  healthy = dat1[-which(dat1$Sample_Name %in%  IBD$id),]
  healthy$Event = 0
  healthy$age_event = 0 
  affected$id.y <- NULL
  healthy$id <- NULL
  names(affected)[names(affected)=="id"] <- "Sample_Name"
  cox = rbind(affected, healthy)
  
  cox$age_death = 0
  cox$age_death = ifelse(cox$dead %in% 1, cox$aged, 0)
  cox$age_at_event = ifelse(cox$Event %in% 1, cox$age_event, (ifelse(cox$dead %in% 1 & cox$Event %in% 0, cox$age_death, cox$aged)))
  cox$tte = cox$age_at_event - cox$Age
  cox$tte = as.numeric(cox$tte)
  cox$tte <- ifelse(cox$tte < -1, "NA", cox$tte)
  cox$tte = ifelse(cox$tte < 0, 0, cox$tte)
  cox$Event = as.numeric(cox$Event)
  cox$tte<-as.numeric(cox$tte)
  
  mod = coxme(Surv(cox$tte, cox$Event) ~ scale(cox[,clock[[j]]]) + factor(cox$Female)+ cox$Age + factor(cox$Set)  + cox$units + cox$smokingScore + cox$simd + cox$EA + cox$bmi + (1|cox$Sample_Name), varlist = kin_model*2)
  print(clock[[j]])
  print("IBD")
  output_hazard_IBD[j,1] <- as.character(clock[[j]])
  output_hazard_IBD[j,2] <- as.character("IBD")
  output_hazard_IBD[j,3:5]<-round(exp(cbind(coef(mod), confint(mod)))[1,1:3],2)
  output_hazard_IBD[j,6] <- extract_coxme_table(mod)[1,4]
  output_hazard_IBD[j,7] <- mod$n[1]
  output_hazard_IBD[j,8] <- mod$n[2]-mod$n[1]
  output_hazard_IBD[j,9] <-cox.zph(mod)[1][[1]][3]
  }, error = function(e) cat("skipped"))
} 


# Save results from fully adjusted model 

comb_full = rbind(output_hazard_AD, output_hazard)
comb_full = rbind(comb_full, output_hazard_cancer)
comb_full = rbind(comb_full, output_hazard_IBD)
comb_full = rbind(comb_full, output_hazard_breast)
names(comb_full) <- c("Predictor", "Outcome", "Hazard Ratio", "LCI", "UCI", "P.Value", "No. of Cases", "No. of Controls", "cox.zph")

comb_full <- comb_full[complete.cases(comb_full), ]

#ahrr <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Predictors_with_cg05575921.csv")
#comb_full$AHRR <- 0
#comb_full[comb_full$Predictor %in% ahrr$Predictor, "AHRR"] <- "Yes"
#comb_full[-which(comb_full$Predictor %in% ahrr$Predictor), "AHRR"] <- "No"
comb_full <- comb_full[order(comb_full$`P.Value`),]

# write.csv(comb_full, "/Volumes/marioni-lab/Protein_DNAm_Proxies/Incidence_results/WBC_as_predictors_fully_adjusted_model_sensitivity.csv",row.names=F)


write.csv(comb_full, "/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/full_model.csv",row.names=F)




# Load in basic model 
comb <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/basic_model.csv")

# Load in fully adjusted model 
comb_full <- read.csv("/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/full_model.csv")

# remove those which dont meet the proportionality assumption
length(which(comb$cox.zph < 0.05)) ## 0
#comb = comb[-which(comb$cox.zph < 0.05),]

# Do FDR correction
comb$FDR <- p.adjust(comb$P.Value, method = "BH")
names(comb)[6] <- "P.Value"

# Keep any that meet significance threshold 
keep1 <- comb[which(comb$FDR < 0.05),] # 90 associations of 742 total 
keep = paste(keep1$Predictor, keep1$Outcome, sep = "_")

# Keep only those passing the threshold from the basic model in the fully adjusted 
comb_full$retain <- paste(comb_full$Predictor, comb_full$Outcome, sep = "_")
comb_full <- comb_full[which(comb_full$retain %in% keep),]
comb_full$retain <- NULL
names(comb_full)[6] <- "P.Value"

# make sure filtered ones from fully adjusted dont have coxzph P of less than 0.05 
length(which(comb_full$cox.zph < 0.05)) ## 0

# Save the final model results 
write.csv(keep1, "/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/keep1_model.csv", row.names =F)
write.csv(comb_full, "/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/full_filtered_by_basic_FDR.csv", row.names =F)

### COMBINE RESULTS 

join <- left_join(keep1, comb_full, by = c("Predictor", "Outcome"))

write.csv(join, "/Volumes/marioni-lab/Protein_DNAm_Proxies/Work_and_code_post_KORA/WBC_as_predictors/suppl_table_joint_full_basic.csv")





























back to top