Content - ef3df3c670b18b425d84ea650647c32cd8ea99ea - 0ba921d/04_prep_Methylation_stradl_projections.R

visit type:
https://github.com/DanniGadd/EpiScores-for-protein-levels

17 November 2023, 07:53:22 UTC
Tip revision: a5130fab3895a0d95f0dcc8826aa9fb5e8c0fa86 authored by DanniGadd on 16 February 2022, 08:46:38 UTC
Merge branch 'main' of https://github.com/DanniGadd/EpiScores-for-protein-levels
Tip revision: a5130fa
04_prep_Methylation_stradl_projections.R
Copyright (c) <2022>, <DanniGadd>
All rights reserved.

This source code is licensed under the MIT license found in the
LICENSE file in the root directory.

############################################################################################
############################################################################################
################# METHYLATION PREP - STRADL ################################################
############################################################################################
############################################################################################

# This script preps methylation data in STRADL and saves the CpG sites common to cohorts 

# Set working directory 
setwd("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/")

# Libraries required 
library(glmnet)
library(tidyverse)

############################################################################################

### JOIN STRADL

############################################################################################

### KORA PREP FILES

# Get CpG sites common to STRADL waves and GS waves 

# Load meth data for waves 2 and 3 of STRADL 
wave2 <- readRDS("/Cluster_Filespace/Marioni_Group/Rob/Somalogic/wave2-STRADL-mvals.rds")
wave3 <- readRDS("/Cluster_Filespace/Marioni_Group/Rob/Somalogic/wave3-STRADL-mvals.rds")

# Load EPIC array file 
anno <- readRDS("/Cluster_Filespace/Marioni_Group/Daniel/EPIC_AnnotationObject_df.rds")

# Subset EPIC annotation file to probes common to 450k and EPIC array
common_anno <- anno[which(anno$Methyl450_Loci == "TRUE"),]

# Subset methylation files to those in the common annotation file 
wave2 <- wave2[rownames(wave2) %in% rownames(common_anno),]
wave3 <- wave3[rownames(wave3) %in% rownames(common_anno),]

dim(wave2) # 409445    504
dim(wave3) # 398624    306

wave2 <- t(wave2)
wave3 <- t(wave3)


# Subset to 778 people 
# Read in SomaLogic protein data in 778 with DNAm available - pre residualised for age/sex/PCs/plate
prot <- readRDS("/Cluster_Filespace/Marioni_Group/Danni/LBC_proteins_Jan2021/Validation/STRADL_778_residualised.rds")

# Match IDs between proxy and measured PAPPA 
target <- read.delim("/Cluster_Filespace/Marioni_Group/Danni/LBC_proteins_Jan2021/Validation/STRADL_DNAm_target_REM_17April2020.txt")
IDs <- target[c(1,3)]

library(tidyverse)
prot2 <- left_join(prot, IDs, by = "Stradl_id")

overlap <- which(rownames(wave2) %in% prot2$DNAm_id)
length(overlap) # 479 
wave2 <- wave2[overlap,]

overlap <- which(rownames(wave3) %in% prot2$DNAm_id)
length(overlap) # 299
wave3 <- wave3[overlap,]

# Convert to beta 
m <- wave2
m2beta <- function(m) { 
  beta <- 2^m/(2^m + 1)
  return(beta)
}
m <- m2beta(m) # convert m-value to beta-value 

library(imputeTS)
m <- na_mean(m) # Impute NA values

scaled <- apply(m, 2, scale) # scale
rownames(scaled) <- rownames(m)
wave2_s <- scaled


# Convert to beta 
m <- wave3
m2beta <- function(m) { 
  beta <- 2^m/(2^m + 1)
  return(beta)
}
m <- m2beta(m) # convert m-value to beta-value 

library(imputeTS)
m <- na_mean(m) # Impute NA values

scaled <- apply(m, 2, scale) # scale
rownames(scaled) <- rownames(m)
wave3_s <- scaled

# Save out separate files 
saveRDS(wave2_s, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/W2_prep_479.rds")
saveRDS(wave3_s, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/W3_prep_299.rds")



# Join waves together

overlap <- which(rownames(wave3) %in% rownames(wave2))
length(overlap) # 397812

w3 <- wave3[overlap,]

overlap <- which(rownames(wave2) %in% rownames(w3))
length(overlap) # 397812

w2 <- wave2[overlap,]

dim(w2) # 397812    504
dim(w3) # 397812    306

identical(rownames(w2), rownames(w3)) # TRUE

join <- cbind(w2, w3)
dim(join) # 397812    810

join <- t(join)

# Subset to the list of 778 individuals 
list <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Example_KORA_training/list_778_DNAm_id.csv")

overlap <- which(rownames(join) %in% list$DNAm_ID)
join2 <- join[overlap,]
dim(join2) # 778 397812

# Save it to test out KORA pipeline demo 
saveRDS(join2, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Example_KORA_training/Starting_files/methylation_data.rds")


############################################################################################

### JOIN GS

############################################################################################

# GS data load in 

wave3 = readRDS("/Cluster_Filespace/Marioni_Group/GS/GS_methylation/wave3_mvals.rds")

wave1 = readRDS("/Cluster_Filespace/Marioni_Group/GS/GS_methylation/norm_mvals_5087.rds")

dim(wave3)
# 773860   4450

dim(wave1)
# 860926   5087

# Load EPIC array file 
anno <- readRDS("/Cluster_Filespace/Marioni_Group/Daniel/EPIC_AnnotationObject_df.rds")

# Subset EPIC annotation file to probes common to 450k and EPIC array
common_anno <- anno[which(anno$Methyl450_Loci == "TRUE"),]

# Subset methylation files to those in the common annotation file 
wave1 <- wave1[rownames(wave1) %in% rownames(common_anno),]
wave3 <- wave3[rownames(wave3) %in% rownames(common_anno),]

dim(wave1) # 451685   5087
dim(wave3) # 398624   4450

# Join waves together

overlap <- which(rownames(wave3) %in% rownames(wave1))
length(overlap) # 398422

w3 <- wave3[overlap,]

overlap <- which(rownames(wave1) %in% rownames(w3))
length(overlap) # 398422

w1 <- wave1[overlap,]

identical(rownames(w1), rownames(w3)) # TRUE

join <- cbind(w1, w3)
dim(join) # 398422   9537

join <- t(join)

# Subset to a few individuals only for ease 
join_v <- join[c(1:100),c(1:398422)]

# Write this small version out for the CpG site commonality comparison for KORA 
saveRDS(join_v, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Example_KORA_training/Starting_files/methylation_data_CpGs_GS.rds")

# Read in the STRADL file with its own CpGs 
STRADL <- readRDS("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Example_KORA_training/Starting_files/methylation_data.rds")

# read in the GS ones for CpGs 
GS <- readRDS("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Example_KORA_training/Starting_files/methylation_data_CpGs_GS.rds")

# Which sites common to both 

# > dim(STRADL)
# [1]    778 397812
# > dim(GS)
# [1]    100 398422

overlap <- which(colnames(GS) %in% colnames(STRADL))
length(overlap) # 397630

GS <- GS[,overlap]
dim(GS) # 100 397630

overlap2 <- which(colnames(STRADL) %in% colnames(GS))
length(overlap2) # 397630

STRADL <- STRADL[,overlap2]
dim(STRADL) # 778 397630

identical(colnames(GS), colnames(STRADL)) # TRUE

### WRITE OUT THE CPGS WHICH ARE COMMON ACROSS ALL OF STRADL AND GS WAVES HERE:
names <- colnames(GS) %>% as.data.frame()
names(names)[1] <- "Common_sites"
write.csv(names, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_proteins/CpGs_consistent_in_GS_and_STRADL_397630.csv", row.names = F)

# Write out list of 9537
list <- rownames(join) %>% as.data.frame()
names(list)[1] <- "DNAm_ID"
write.csv(list, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_proteins/GS_DNAm_IDs_9537.csv", row.names = F)


############################################################################################

### KORA OVERLAP FILE

############################################################################################


# Load STRADL proteins which are already normalised/residualised according to separate script  
prot <- readRDS("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Normalisation_residualisation_STRADL/Soma_scan_normalised_and_adj_age_sex_PCs_plate_110121.rds")
STRADL <- prot %>% as.data.frame()

STRADL <- STRADL[-1]
names <- colnames(STRADL) %>% as.data.frame()
names(names) <- "UniprotIDs"
write.csv(names, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/STRADL_epitope_IDs.csv")

# Assess overlap between STRADL and KORA protein lists for reference
KORA <- read.csv("/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_proteins/Protein_info.csv")
overlap <- which(colnames(STRADL) %in% KORA$UniProt)
length(overlap) # 894
list <- which(colnames(STRADL) %in% KORA$UniProt)

match <- STRADL[,list]
names <- colnames(match) %>% as.data.frame()
names(names)[1] <- "Matched"

write.csv(names, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/KORA_proteins/KORA_proteins_that_match_STRADL_894.csv", row.names = F)

anno <- readRDS("/Cluster_Filespace/Marioni_Group/Daniel/EPIC_AnnotationObject_df.rds")

saveRDS(anno, "/Cluster_Filespace/Marioni_Group/Danni/Proxies_stradl/Example_KORA_training/Starting_files/annotation_450k_dataset.rds")