1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
## As described in Li. C. Z. et. al. (2024), filter CpGs by detection p-values
## This code takes in all Mammalian 40K Array detection p-value output from Sesame normalization pipeline
## And take the median p-value (FDR adjusted) of each probe per species, and keep those with 85% of species Median value >= 0.05
## Rationale: only want to keep probes that work in most of the species.
## In the end, save the file to an RDS file to be used for Elastic Net training code.

  
load("./detectionP_combined_sesame.RData")
detectionP_combined_sesame = detectionP_combined_sesame[!grepl("rs", detectionP_combined_sesame$CGid), ]

if(grepl("EPIC450", titleName)) {
  epic = read.csv("~/StuffCaesar/utilities/ProbesSharedWithEPIC.CanonicalManifestMinfi_overlapwith450k.csv")
  detectionP_combined_sesame = detectionP_combined_sesame[detectionP_combined_sesame$CGid %in% epic$IlmnID, ]
} else if(grepl("EPIC", titleName))  {
  epic = read.csv("~/StuffCaesar/utilities/ProbesSharedWithEPIC.CanonicalManifestMinfi.csv")
  detectionP_combined_sesame = detectionP_combined_sesame[detectionP_combined_sesame$CGid %in% epic$IlmnID, ]
}

#cpgs = detectionP_combined_sesame$CGid
#detectionP_combined_sesame = detectionP_combined_sesame[, -1]
#detectionP_combined_sesame = t(detectionP_combined_sesame)
#colnames(detectionP_combined_sesame) = cpgs
combinedsamples = combinedsamples[!combinedsamples$SpeciesLatinName == "", ] 
combinedsamples = combinedsamples[!is.na(combinedsamples$SpeciesLatinName) & !combinedsamples$SpeciesLatinName == "Peromyscus hybrid polionatus maniculatus", ] 

# exclude experiment folders 
## Note: these are the folders that I excluded IN ADDITION TO STEVE'S REMOVE FOLDER
combinedsamples = combinedsamples[!grepl("N65\\.|N70\\.|N74\\.|N84\\.|N86\\.|N87\\.|N88\\.|N89\\.|N62\\.", combinedsamples$Folder), ]
combinedsamples = combinedsamples[!grepl("N02\\.|N25\\.|N39\\.|N112", combinedsamples$Folder), ]
####
# exclude new data and non-human data (TO BE MODIFIED IN THE FUTURE)
combinedsamples = combinedsamples[!grepl("N95|N96|N97|N98|N99|N100|N101|N102|N103|N104|N105|N107|N108|N111|N113", 
                                         combinedsamples$Folder), ]
#
## Add Steve's exclude Dec2021
combinedsamples = combinedsamples[!grepl("N71\\.|N79\\.|N80\\.|N82\\.|N83\\.|N85\\.|N128", combinedsamples$Folder), ]

## ADD only N141
combinedsamples = combinedsamples[!grepl("N138\\.|N139\\.|N140\\.|N142\\.", combinedsamples$Folder), ]

combinedsamples = combinedsamples[!grepl("iPSC|ES", combinedsamples$Tissue), ]
combinedsamples = combinedsamples[combinedsamples$CanBeUsedForAgingStudies == "yes", ]
combinedsamples = combinedsamples[!grepl("Tt x Tt gilli", combinedsamples$SpeciesLatinName), ]

if(grepl("Yumanensis", mytitle)) {
  combinedsamples = combinedsamples[!grepl("yumanensis", combinedsamples$SpeciesLatinName), ]
}
if(grepl("noHydroFibro", mytitle)) {
  combinedsamples = combinedsamples[!(combinedsamples$SpeciesLatinName == "Hydrochoerus hydrochaeris" & 
                                        combinedsamples$Tissue == "Fibroblast"), ]
}

if(grepl("noExperiment", mytitle)) {
  combinedsamples = read.csv("~/StuffCaesar/utilities/combinedsamples_noExperiments.csv", stringsAsFactors = F)
}

###
#combinedsamples = combinedsamples[combinedsamples$Basename %in% colnames(detectionP_combined_sesame), ]
###

xy <- prepDat0(combinedsamples, detectionP_combined_sesame, filterCanBeUsed = TRUE, 
               filterAgeConfidence = FALSE, filterCharacteristics = NULL, filterFolders = NULL,
               filterFewSpecies = FALSE)
combinedsamples <- xy$ys
detectionP_combined_sesame <- xy$x
rm(xy); gc()
detectionP_combined_sesame = 
  detectionP_combined_sesame[, which(colnames(detectionP_combined_sesame) %in% colnames(dat0))]

combinedsamples = combinedsamples %>% left_join(anageCaesar, by = "SpeciesLatinName")

#if(grepl("DetectionP", titleName)) {
#    combinedsamples$SpeciesLatinName = paste0(combinedsamples$SpeciesLatinName, "_", combinedsamples$Tissue)
#}

detectionP_combined_sesame = as.data.frame(detectionP_combined_sesame)
cpgs = colnames(detectionP_combined_sesame)
detectionP_combined_sesame$SpeciesLatinName = combinedsamples$SpeciesLatinName

detectionP_combined_sesame = detectionP_combined_sesame %>% 
  group_by(SpeciesLatinName) %>%
  summarise_at(cpgs, median)
detectionP_combined_sesame = as.data.frame(detectionP_combined_sesame)
rownames(detectionP_combined_sesame) = detectionP_combined_sesame$SpeciesLatinName
detectionP_combined_sesame = detectionP_combined_sesame[, -which(colnames(detectionP_combined_sesame) == "SpeciesLatinName")]
detectionP_combined_sesame = as.data.frame(detectionP_combined_sesame)

cpgs = colnames(detectionP_combined_sesame)
basenames = rownames(detectionP_combined_sesame)
detectionP_combined_sesame = as.matrix(detectionP_combined_sesame)
temp = p.adjust(detectionP_combined_sesame, method = "fdr")
detectionP_combined_sesame = matrix(data = temp, nrow = nrow(detectionP_combined_sesame), byrow = FALSE)
colnames(detectionP_combined_sesame) = cpgs
rownames(detectionP_combined_sesame) = basenames

filterrows = ceiling(nrow(detectionP_combined_sesame) * 0.85)
#filterrows = 140
cpgs = colnames(detectionP_combined_sesame)
cpgs = cpgs[apply(detectionP_combined_sesame, 2, function(x) return(sum(x < 0.05) >= filterrows))]

# FILTER OUT SNPs
cpgs = cpgs[!grepl("rs", cpgs)]

saveRDS(cpgs, file = paste0("YOUR_PATH/DetectionP_median85_CGs_v11.RDS"))