https://github.com/DanniGadd/EpiScores-for-protein-levels
Raw File
Tip revision: a5130fab3895a0d95f0dcc8826aa9fb5e8c0fa86 authored by DanniGadd on 16 February 2022, 08:46:38 UTC
Merge branch 'main' of https://github.com/DanniGadd/EpiScores-for-protein-levels
Tip revision: a5130fa
06_run_KORA_models.R
Copyright (c) <2022>, <DanniGadd>
All rights reserved.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory.

############################################################################################
############################################################################################
################# Elastic net models for KORA ##############################################
############################################################################################
############################################################################################

# Here, we train elastic net models for 793 KORA proteins 

### INFLAM TRAIN FULL

# libraries
library(tidyverse)
library(glmnet)
library(imputeTS)
library(bestNormalize)
library(foreign)


load("allData.RData")
# open a tsv file

nproc=1

basename = "fit_KORA1"
outfile = paste(basename, nproc, "tsv", sep = ".")
cat ("output to", outfile, "\n")

Output = list()
k = 1
l = 1
cat(file = outfile, append = FALSE, "SeqId", "r", "p-value", "r valid", "p-value valid", "fit$df", "fit$dev.ratio", "fit$lambda", "label", sep = "\t", "\n")
#this next loop needs to be parallelized, otherwise will take ages to run
for (i in 1:793) {
  
  cat("working on protein", l, "of", length(range), "\n")
  
  ## Residualise proteins and scale - use covars
  odata = y[,i]
  SeqId = colnames(y)[i]
  label = ProteinSeqID_positions$EntrezGeneSymbol[match(colnames(y)[i],ProteinSeqID_positions$SeqId)]
  cat(SeqId, label, "\n")
  name_p = paste(SeqId, ":", label)
  
  
  
  start_time <- Sys.time()
  lasso.cv <- cv.glmnet(x, odata, family="gaussian", alpha = 0.5, nfolds=10) # cross validation to get best lambda (this would normally be set to 10)
  fit <- glmnet(x, odata, family = "gaussian", alpha = 0.5, standardize = F, lambda = lasso.cv$lambda.min) # model fit 
  save(fit,file=paste("fit",i,".RData",sep=""))
  end_time <- Sys.time()
  print( end_time - start_time)
  
  coefs <- coef(fit) # Extract coeficients 
  # coefs <- as.data.frame(coefs[which(coefs!=0),]) # Remove coeficients that are 0 
  # coefs$Predictor <- name_p # Assign protein identifier
  # names(coefs)[1] <- "Coefficient" # Tidy naming 
  # coefs$CpG_Site <- rownames(coefs) # Create cpg column
  # coefs <- coefs[c(3,1,2)] # order 
  # coefs3 <- coefs[-1,] # Remove intercept (if there was only intercept, this will now have 0 rows)
  
  coefs3 = data.frame(
    CpG_site = coefs@Dimnames[[1]][coefs@i+1],
    Coefficient = coefs@x,
    Predictor = name_p
  )
  
  features <- nrow(coefs3) # Count number of features contributing for the protein - i.e. if its > 0 it has features 
  if (features > 0) {
    rownames(coefs3) <- NULL # remove rows as they arent needed 
    Output[[k]] <- coefs3 # save weights as outputs list to bind together for all proteins
    
    # test the prediction
    ypred = predict(fit, newx = x)
    c = cor.test(odata,ypred)
    
    yvalidpred = predict(fit, newx = xvalid)
    cvalid = cor.test(yvalid[,i],yvalidpred)
    
    cat(file = outfile, append = TRUE, SeqId, c$estimate, c$p.value, cvalid$estimate, cvalid$p.value, fit$df, fit$dev.ratio, fit$lambda, label, sep = "\t", "\n")
    cat(SeqId, c$estimate, cvalid$estimate, fit$df, label, sep = "\t", "\n")
    
    k = k+1
  } else {
    cat(file = outfile, append = TRUE, SeqId, rep(".",5), label, sep = "\t", "\n")
  }
  message(paste0(features, " weights for protein ", name_p, " have been calculated."))
  l = l+1
}

# save Output
weightfile = paste("weights", nproc, "csv", sep=".")
weight_total <- do.call(rbind, Output) # Bind predictor weights together
write.csv(weight_total, weightfile, row.names = F)
back to top