swh:1:snp:0fa5e44aa9eaf68dc00949be32686577b750591b
Tip revision: 7f9ab85810fd5d7f3fb0616b57677250b3a7cbf0 authored by caeseriousli on 05 March 2024, 02:24:07 UTC
Update README.md
Update README.md
Tip revision: 7f9ab85
5. Leave One Species Out Predictor with dogs.R
library(dplyr)
library(doParallel)
library(glmnet)
## Average dog methylation data
##########################################################################
############## Fit Dog Lifespan Predictor ONLY ############################
samples = read.csv("DogBreeds.csv",
stringsAsFactors = FALSE)
samples$logLifespanMedianHorvath = log(samples$LifespanMedianHorvath)
samples$logWeight.kg.avg = log(samples$Weight.kg.avg)
dat0 = readRDS("averaged_DogBreeds.RDS")
if(sum(!samples$DogBreed == rownames(dat0)) > 0) {
stop("DogBreed must match dat0 rownames")
}
## Filter CpG
# This comes from filter CG R file
cpgs = readRDS(paste0("/DetectionP_median85_CGs_v11_AddYumanensis.RDS"))
dat0 = dat0[, colnames(dat0) %in% cpgs]
## Adjustment MULTIPLY by 1.3
samples$LifespanMedianHorvath = samples$LifespanMedianHorvath * 1.3
## Construct training test separation variable
outcomeName = "logLifespanMedianHorvath"
leavename = "DogBreed"
temp = data.frame(Order = unique(samples[, leavename])); colnames(temp) = leavename
temp$randos = 1:nrow(temp)
pred.folds = nrow(temp)
samples = samples %>% left_join(temp, by = leavename)
library(doParallel)
registerDoParallel(2)
print(paste0("dim(dat0)", dim(dat0)))
print(dim(samples))
test.set = foreach(i = 1:pred.folds, .combine = rbind, .packages = c('doParallel', 'glmnet')) %dopar% {
manual.folds = NA
species.weights = NA
# print(paste0("Weighting: ", species.weights[1]))
lifespanpredictor = elasticNetTrain(dat0, samples,
outcomeName = outcomeName, selectLambda = "min",
leaveOneOut = i, leaveOutName = "randos", manual.folds = manual.folds,
species.weights = NA)
toreturn = lifespanpredictor$test
toreturn$Nselected = nrow(lifespanpredictor$glmnet.final.nonzero)
return(toreturn)
}
write.csv(test.set, paste0("./dog lifespan predictor.csv"), row.names = FALSE)
######################################################################################
############## Fit Combined Species + DogBreeds Predictor ############################
# leave one clade out
library(dplyr)
library(doParallel)
library(glmnet)
library(ggplot2)
SpeciesTissue = "AdjustLifespanAggressive_"
# SpeciesTissue = "NoMonotremata_"
# SpeciesTissue = "SpeciesTissue_"
# SpeciesTissue = "AddAge_"
# SpeciesTissue = ""
## This string controlls what the outcome is. In this case it's maximum lifespan
outcomeName = "logmaxAgeCaesar"
# outcomeName = "logmaxanAge"
## This string controlls what training-test split to use, leave one clade out?
## one species out? or leave out "small" mammals
## It's further explained in below ifelse statements, each correspond to a supplementary analysis
# leavename = "LeaveSmall"
# leavename = "Order"
# leavename = "OrderFamily"
leavename = "SpeciesLatinName"
# leavename = "LeaveCetacea"
leaveselect = 0
source("./utilities_analysis.R")
# anAge data
anageCaesar <- read.csv("anAgeUpdatedCaesarVersion49.csv",
stringsAsFactors = FALSE)
## load Averaged data produced by "Average CpGs by Species.R". Please run this file first to produce species-averaged data
## Since Mammalian lifespan predictor is trained on species-level data, we need to average all-sample data by species.
if(SpeciesTissue == "SpeciesTissue_") {
load('./SpeciesTissue_Averaged_methCombined.RData')
} else {
load('./Averaged_methCombined.RData')
}
dat0 = Averaged_methCombined[, !grepl("rs", colnames(Averaged_methCombined))] # filter out control probes
## Match KNN samples
dat0 = dat0[!rownames(dat0) %in%
c("Eulemur collaris", "Gerbillus cheesmani", "Sus scrofa minusculus",
"Cephalorhynchus hectori maui", "Rattus norvegicus domestica", "Galea leucoblepharum",
"Paraechinus hypomelas", "Microgale drouhardi", "Microgale thomasi"), ]
if(SpeciesTissue == "NoMonotremata_") {
temp = anageCaesar[anageCaesar$profiled & anageCaesar$Order == "Monotremata", "SpeciesLatinName"]
dat0 = dat0[!rownames(dat0) %in%temp, ]
}
## As described in Li. C. Z. et. al. (2024), filter CpGs by detection p-values
## Please use "CG Screening by DetectionP Value.R" to derive the results first
## The code in "CG Screening by DetectionP Value.R" takes in all Mammalian 40K Array detection p-value output from Sesame normalization pipeline
## And take the median p-value (FDR adjusted) of each probe per species, and keep those with 85% of species Median value >= 0.05
## Rationale: only want to keep probes that work in most of the species.
## In the end, load the "good" CpG list from the RDS file to be used for Elastic Net training code.
cpgs = readRDS(paste0("./DetectionP_median85_CGs_v11_AddYumanensis.RDS"))
dat0 = dat0[, colnames(dat0) %in% cpgs]
## Matching genotype data to phenotype annotations
samples = data.frame(SpeciesLatinName = rownames(dat0))
rm(Averaged_methCombined)
samples = samples %>% left_join(anageCaesar, by = "SpeciesLatinName")
## Adjust lifespan estimates based on Lu T. A. (2022) to be connsistent
samples$maxAgeCaesar = ifelse(samples$SpeciesLatinName %in% c("Homo sapiens", "Mus musculus"),
samples$maxAgeCaesar, samples$maxAgeCaesar*1.3)
samples$maxanAge = ifelse(samples$SpeciesLatinName %in% c("Homo sapiens", "Mus musculus"),
samples$maxanAge, samples$maxanAge*1.3)
samples$logmaxAgeCaesar = log(samples$maxAgeCaesar)
samples$logweightCaesar = log(samples$weightCaesar)
samples$logaveragedMaturity.yrs = log(samples$averagedMaturity.yrs)
samples$logGestation.Incubation..days. = log(samples$Gestation.Incubation..days.)
samples$OrderFamily = paste0(samples$Order, "_", samples$Family)
samples$logmaxanAge = log(samples$maxanAge)
#### Add dog samples ######
dogs = read.csv("./DogBreeds.csv",
stringsAsFactors = FALSE)
dogs$logLifespanMedianHorvath = log(dogs$LifespanMedianHorvath)
dogs$logWeight.kg.avg = log(dogs$Weight.kg.avg)
# ready to merge with dogs
dogs$SpeciesLatinName = dogs$DogBreed
dogs$maxAgeCaesar = dogs$LifespanMedianHorvath
dogs$logmaxAgeCaesar = dogs$logLifespanMedianHorvath
dogs$weightCaesar = dogs$Weight.kg.avg
dogs$logweightCaesar = dogs$logWeight.kg.avg
dogs = dogs[, c("SpeciesLatinName", "maxAgeCaesar", "logmaxAgeCaesar", "weightCaesar", "logweightCaesar")]
samples = samples[, c("SpeciesLatinName", "maxAgeCaesar", "logmaxAgeCaesar", "weightCaesar", "logweightCaesar")]
samples = rbind(samples, dogs)
dogdat0 = readRDS("./averaged_DogBreeds.RDS")
dogdat0 = dogdat0[, !grepl("rs", colnames(dogdat0))]
dogdat0 = dogdat0[, colnames(dogdat0) %in% cpgs]
if(sum(!samples$DogBreed == rownames(dogdat0)) > 0) {
stop("DogBreed must match dat0 rownames")
}
dat0 = rbind(dat0, dogdat0)
#############################
if(outcomeName == "logmaxanAge") {
dat0 = dat0[!is.na(samples$maxanAge), ]
samples = samples[!is.na(samples$maxanAge), ]
if(sum(!samples$SpeciesLatinName == rownames(dat0)) > 0) stop("LatinNames Don't Match")
}
titleName = paste0(mypath, "DogCombined_", outcomeName, "_", SpeciesTissue,
leaveselect,"leaveOne",
leavename, "_")
temp = data.frame(Order = unique(samples[, leavename])); colnames(temp) = leavename
temp$randos = 1:nrow(temp)
pred.folds = nrow(temp)
samples = samples %>% left_join(temp, by = leavename)
## In the paper, for LOCO, leaveselect is set to 2, keep only 2 species from large Mammalian Orders in training set
if(leaveselect > 0 & leavename == "Order") {
# set.seed(123)
if(leaveselect == 2) {
for (i in 1:length(unique(samples[, "randos"]))) {
# myindices = which(samples[, "randos"] == i)
myindices = samples[samples[, "randos"] == i, ]
if(nrow(myindices) >= 20) {
indextemp = which(myindices[, outcomeName] == max(myindices[, outcomeName]) |
myindices[, outcomeName] == min(myindices[, outcomeName]))
# indextemp = c(indextemp, which.min(abs(myindices[, "logmaxAgeCaesar"] - median(myindices[, "logmaxAgeCaesar"]))))
myindices = myindices[indextemp, , drop = FALSE]
# myindices = sample(myindices, leaveselect, replace = FALSE)
samples$randos[samples$SpeciesLatinName %in% myindices$SpeciesLatinName] = 0
}
}
}
## Leave small species out to produce Supplementary S14
} else if(grepl("LeaveSmall", leavename)) {
weightThreshold = 150
samples$randos = ifelse(samples$weightCaesar < weightThreshold, 1, 2)
myfdr = paste0("weightCut", weightThreshold)
pred.folds = 1
} else if(grepl("Leave", leavename)) {
tempname = sapply(strsplit(leavename, "Leave"), "[", c(2))
samples$randos = NA
samples$randos = ifelse(samples$Order == tempname, 1, 2)
pred.folds = 1
}
rm(temp)
## Using a parallelized code to speed up the training process.
## Connvenient function, elasticNetTrain() written to train a glmnet object in accordance to the genotype
## data, outcome, and splits
## For details please see sourced R code source("./utilities_analysis.R")
registerDoParallel(2)
print(paste0("dim(dat0)", dim(dat0)))
print(dim(samples))
test.set = foreach(i = 1:pred.folds, .combine = rbind, .packages = c('doParallel', 'glmnet')) %dopar% {
manual.folds = NA
species.weights = NA
# This function uses glmnet net to train Elastic Net models
lifespanpredictor = elasticNetTrain(dat0, samples,
outcomeName = outcomeName, selectLambda = "min",
leaveOneOut = i, leaveOutName = "randos", manual.folds = manual.folds,
species.weights = NA)
toreturn = lifespanpredictor$test
toreturn$Nselected = nrow(lifespanpredictor$glmnet.final.nonzero)
return(toreturn)
}
write.csv(test.set, paste0("PATH to Results"), row.names = FALSE)