Raw File
---
title:"BrainAge_VidalPineiro_ModelGeneration(XGB; Lifebrain replication)"
author: "DVP"
date: "12/27/2020"
output: html_document
---

# Setup
```{r setup, include=T}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(magrittr)
library(janitor)
library(ggridges)
library(xgboost)
library(mgcv)
library(GGally)
library(fastDummies)
library(gamm4)
library(ggpointdensity)


data_folder=file.path("./data/LifeBrain",paste0("noExcl_resCohort_outlierremove"))

try(dir.create(data_folder))
options(bitmapType = "cairo")

raw_folder = list()
raw_folder[["LCBC"]]="./data-raw/BrainAge/LCBC/tabulated_data"
raw_folder[["Betula"]]="./data-raw/BrainAge/Betula/tabulated_data"
raw_folder[["CamCan"]]="./data-raw/BrainAge/CamCan/tabulated_data"
raw_folder[["BaseII"]]="./data-raw/BrainAge/BaseII/tabulated_data"
raw_folder[["UB_ALL"]]="./data-raw/BrainAge/UB_ALL/tabulated_data"
raw_folder[["AIBL"]]="./data-raw/BrainAge/AIBL/tabulated_data"

# where to place results

# track submitted jobs
squeue = function(user, job) {
  df.squeue = system(paste0("squeue --name=", job," -u ",user), intern = T) %>% 
      strsplit(., " +") %>% 
      simplify2array() %>% 
      t() %>% 
      as.data.frame()
  return(df.squeue)
  
}

# read stats files from dataset dirs
read_mri_stats = function(site, file) {
    df.stats = read.table(file.path(raw_folder[[site]], 
                                    "mri_stats",
                                    file), 
                header = T, 
                stringsAsFactors = F)
}

# removes appending and separates according to strings from a filename
paste_files = function(df, vector) {
  terms <- paste0(vector, collapse="|");   
  regex <- paste0("(?<=", terms, ")"); 
  modality =  df$files %>% 
    tools::file_path_sans_ext() %>% 
    strsplit(., regex, perl = T) %>% 
    simplify2array() %>% 
    .[2,]
  return(modality)
}

# remove longitudinal data with too close gaps and removed repeated cross/sectional data
remove_too_short = function(df, gap){
  tmp= df %>% group_by(CrossProject_ID) %>% summarise(n = n(),
                                                    AgeBsl = min(age),
                                                    AgeGap = max(age) - min(age))
  df %<>% 
    left_join(., tmp) 
  
  tmp2 = df %>% 
    filter(!n == 1 & AgeGap == 0) %>% 
    select(CrossProject_ID, Folder)
  idx = tmp2$Folder[tmp2 %>% select(CrossProject_ID) %>% duplicated()]
  
  # remove those with short gap amd duplicated baseline
  df = df %>% 
    filter(!(!n == 1 & AgeGap < gap & !age == AgeBsl)) %>% 
    filter(!Folder %in% idx) %>% 
    select(-c(n, AgeBsl, AgeGap))
  return(df)
}
```

# Data Preprocessing (commong steps)
### LCBC data - merge
```{r LCBC-merge data}
site = "LCBC"
if(!file.exists(file.path(raw_folder[[site]],"Raw_merged.Rda"))){
  df = list()
  # a) load and merge mri_stats #
  df$files = list.files(file.path(raw_folder[[site]], "mri_stats"))
  df$stats = lapply(df$files, read_mri_stats, site = site)
  df$stats = lapply(df$stats, 
                    function(x) { colnames(x)[1] = "imageid"; x})
  df$stats = lapply(df$stats, 
                    function(x){x = x %>% separate("imageid", c("Folder"), sep= ".long.base_", remove = F);x })
  vector <- c("cort.", "subc.") 
  df$modality = paste_files(df, vector)
  for(i in 1:length(df$modality)) {
    idx = which(!names(df$stats[[i]]) %in% c("imageid", "Folder"))
    names(df$stats[[i]])[idx] = paste(names(df$stats[[i]])[idx], df$modality[i], sep = "_")
  }
  df$merged = df$stats %>% reduce(full_join)

  # open sociodemographics. 
  load(file.path(raw_folder[[site]], "databases", "MOAS.RData"))
  df$db = MOAS %>% 
    select(!starts_with(c("MRI_aparc", "MRI_aseg"))) %>% 
    remove_empty("cols") %>% 
    filter(!Site_Name == "noMRI") %>% 
    mutate(sex = if_else(Sex == "Male",1,0),
           cohort = "LCBC", 
           CrossProject_ID = as.character(CrossProject_ID))
  rm(MOAS)
  # probably need to curate this a little bit
  df$join = df$db %>% 
    filter(Age > 18 & !is.na(Age)) %>% 
    select(CrossProject_ID, 
           Folder,
           Age,
           sex,
           Site_Name,
           cohort)  
  names(df$join) = c("CrossProject_ID",
                     "Folder",
                     "age",
                     "sex",
                     "Site",
                     "cohort")
                         
  df$export = inner_join(df$merged, df$join) 
  # save
  save(df, file = file.path(raw_folder[[site]],"Raw_merged.Rda")) 
} 
```

### Betula data - merge
```{r Betula-merge data}
site = "Betula"
if(!file.exists(file = file.path(raw_folder[[site]],"Raw_merged.Rda"))) {
  df = list()
  # a) load and merge mri_stats #
  df$files = list.files(file.path(raw_folder[[site]], "mri_stats"))
  df$stats = lapply(df$files, read_mri_stats, site = site)
  df$stats = lapply(df$stats, 
                    function(x) { colnames(x)[1] = "imageid"; x})
  df$stats = lapply(df$stats, 
                    function(x){x = x %>% separate("imageid", c("Folder"), sep= ".long.", remove = F);x })
  vector <- c("cort.", "subc.") 
  df$modality = paste_files(df, vector)
  for(i in 1:length(df$modality)) {
    idx = which(!names(df$stats[[i]]) %in% c("imageid", "Folder"))
    names(df$stats[[i]])[idx] = paste(names(df$stats[[i]])[idx], df$modality[i], sep = "_")
  }
  df$merged = df$stats %>% reduce(full_join)

  # open sociodemographics. 
  df$db = read.csv(file.path(raw_folder[[site]], "databases", "longData_LifeBrain.csv")) %>% 
    select(-(contains(c("Hippocampus", "Estim"))))
  
  tmp = readxl::read_xlsx(file.path(raw_folder[[site]], "databases","Exclusions_Betula_T5_and_T6.xlsx"),
                    sheet = 2) %>%
    mutate(Folder = paste0("sub-",T_unique_ID),
           Exclusion = `Include=1;exclude=0`) %>% 
    select(Folder, Exclusion)
  df$db %<>% left_join(., tmp) %>% 
    mutate(sex = if_else(Sex == 2,1,0),
           cohort = "Betula",
           site = "Betula")
  
  # probably need to curate this a little bit
  df$join = df$db %>% 
    filter(age_MRI > 18 & !is.na(age_MRI) & Exclusion == 1) %>% 
    select(Betula_ID, 
           Folder,
           age_MRI,
           sex,
           site,
           cohort)  
  names(df$join) = c("CrossProject_ID",
                     "Folder",
                     "age",
                     "sex",
                     "Site",
                     "cohort")
                         
  df$export = inner_join(df$merged, df$join) 
  # save
  save(df, file = file.path(raw_folder[[site]],"Raw_merged.Rda")) 
}

```

### BaseII - merge
```{r BaseII-merge data}
site = "BaseII"
if(!file.exists(file.path(raw_folder[[site]],"Raw_merged.Rda"))) {
  df = list()
  # a) load and merge mri_stats #
  df$files = list.files(file.path(raw_folder[[site]], "mri_stats"))
  df$stats = lapply(df$files, read_mri_stats, site = site)
  df$stats = lapply(df$stats, 
                    function(x) { colnames(x)[1] = "imageid"; x})
  df$stats = lapply(df$stats, 
                    function(x){x = x %>% separate("imageid", c("Folder"), sep= ".long.", remove = F);x })
  vector <- c("cort.", "subc.") 
  df$modality = paste_files(df, vector)
  for(i in 1:length(df$modality)) {
    idx = which(!names(df$stats[[i]]) %in% c("imageid", "Folder"))
    names(df$stats[[i]])[idx] = paste(names(df$stats[[i]])[idx], df$modality[i], sep = "_")
  }
  df$merged = df$stats %>% reduce(full_join)

  # open sociodemographics. 
  df$db = read.csv(file.path(raw_folder[[site]], "databases", "longData_LifeBrain.csv"), sep = "\t") %>% 
    mutate(sex = if_else(sex == "M",1,0),
           cohort = "BaseII",
           site = "BaseII")
  
  # probably need to curate this a little bit
  df$join = df$db %>% 
    filter(Age_MR > 18 & !is.na(Age_MR)) %>% 
    select(participant_id, 
           ID,
           Age_MR,
           sex,
           site,
           cohort)  
  names(df$join) = c("CrossProject_ID",
                     "Folder",
                     "age",
                     "sex",
                     "Site",
                     "cohort")
                         
  df$export = inner_join(df$merged, df$join) 
  # save
  save(df, file = file.path(raw_folder[[site]],"Raw_merged.Rda")) 
}

```


### CamCan - merge
```{r CamCan-merge data}
site = "CamCan"
if(!file.exists(file.path(raw_folder[[site]],"Raw_merged.Rda"))) {
  df = list()
  # a) load and merge mri_stats #
  df$files = list.files(file.path(raw_folder[[site]], "mri_stats"))
  df$stats = lapply(df$files, read_mri_stats, site = site)
  df$stats = lapply(df$stats, 
                    function(x) { colnames(x)[1] = "imageid"; x})
  df$stats = lapply(df$stats, 
                    function(x){x = x %>% separate("imageid", c("Folder"), sep= ".long.", remove = F);x })
  vector <- c("cort.", "subc.") 
  df$modality = paste_files(df, vector)
  for(i in 1:length(df$modality)) {
    idx = which(!names(df$stats[[i]]) %in% c("imageid", "Folder"))
    names(df$stats[[i]])[idx] = paste(names(df$stats[[i]])[idx], df$modality[i], sep = "_")
  }
  df$merged = df$stats %>% reduce(full_join)

  # open sociodemographics. 
  df$db = read.csv(file.path(raw_folder[[site]], "databases", "longData_LifeBrain.csv")) %>% 
    mutate(sex = if_else(Sex == "MALE",1,0),
           cohort = "CamCan",
           site = "CamCan")
           #site = if_else(MT_TR == 30, "CamCan30", "CamCan50"))
       
  df$join = df$db %>% 
    filter(Age_MR > 18 & !is.na(Age_MR)) %>% 
    select(CCID, 
           Folder,
           Age_MR,
           sex,
           site,
           cohort)  

  names(df$join) = c("CrossProject_ID",
                     "Folder",
                     "age",
                     "sex",
                     "Site",
                     "cohort")
                         
  df$export = inner_join(df$merged, df$join) 
  # save
  save(df, file = file.path(raw_folder[[site]],"Raw_merged.Rda"))
}

```

### UB - merge
```{r UB-merge data}
site = "UB_ALL"
if(!file.exists(file.path(raw_folder[[site]],"Raw_merged.Rda"))) {
  df = list()
  # a) load and merge mri_stats #
  df$files = list.files(file.path(raw_folder[[site]], "mri_stats"))
  df$stats = lapply(df$files, read_mri_stats, site = site)
  df$stats = lapply(df$stats, 
                    function(x) { colnames(x)[1] = "imageid"; x})
  df$stats = lapply(df$stats, 
                    function(x){x = x %>% separate("imageid", c("Folder"), sep= ".long.", remove = F);x })
  vector <- c("cort.", "subc.") 
  df$modality = paste_files(df, vector)
  for(i in 1:length(df$modality)) {
    idx = which(!names(df$stats[[i]]) %in% c("imageid", "Folder"))
    names(df$stats[[i]])[idx] = paste(names(df$stats[[i]])[idx], df$modality[i], sep = "_")
  }
  df$merged = df$stats %>% reduce(full_join)

  # open sociodemographics. 
  df$db = read.csv(file.path(raw_folder[[site]], "databases",  "Bareclona_IDlink.tsv"), sep = "\t") %>% 
    mutate(sex = if_else(Sex == "MALE",1,0),
           Site = "Bcn",
           cohort = "Bcn")

    df$join = df$db %>% 
    filter(Age_MR > 18 & !is.na(Age_MR)) %>% 
    select(ID, 
           Folder,
           Age_MR,
           sex,
           Site,
           cohort)  

  names(df$join) = c("CrossProject_ID",
                     "Folder",
                     "age",
                     "sex",
                     "Site",
                     "cohort")
                         
  df$export = inner_join(df$merged, df$join) 
  
  # save
  save(df, file = file.path(raw_folder[[site]],"Raw_merged.Rda")) 
}

```



### AIBL - merge
```{r AIBL-merge data}
site = "AIBL"
if(!file.exists(file.path(raw_folder[[site]],"Raw_merged.Rda"))) {
  df = list()
  # a) load and merge mri_stats #
  df$files = list.files(file.path(raw_folder[[site]], "mri_stats"))
  df$stats = lapply(df$files, read_mri_stats, site = site)
  df$stats = lapply(df$stats, 
                    function(x) { colnames(x)[1] = "imageid"; x})
  df$stats = lapply(df$stats, 
                    function(x){x = x %>% separate("imageid", c("Folder"), sep= ".long.", remove = F);x })
  vector <- c("cort.", "subc.") 
  df$modality = paste_files(df, vector)
  for(i in 1:length(df$modality)) {
    idx = which(!names(df$stats[[i]]) %in% c("imageid", "Folder"))
    names(df$stats[[i]])[idx] = paste(names(df$stats[[i]])[idx], df$modality[i], sep = "_")
  }
  df$merged = df$stats %>% reduce(full_join)

  # open sociodemographics. 
  tmp = read.csv(file.path(raw_folder[[site]], "databases",  "aibl_coredatabase.csv"), 
                   sep = "\t", 
                   stringsAsFactors = F)
  tmp.fs = read.csv(file.path(raw_folder[[site]], "databases",  "aibl_FieldStrength.csv"), 
                   sep = ";", 
                   stringsAsFactors = F) %>% 
    select(RID, Skanner, VISCODE) %>% unique()
  df$db =left_join(tmp, tmp.fs) %>% 
    mutate(cohort = "AIBL",
           site = Skanner,
           RID = as.character(RID)) %>% 
    separate(Folder, "Folder", sep = ".long.")
  # demented at any point?
  idx.dem = df$db %>% filter(!DIAGNOSIS == "nc") %>% .$RID %>% unique()
  df$db %<>% mutate(any_dem = if_else(RID %in% idx.dem, 1,0))
  
  df$join = df$db %>% 
  filter(!is.na(AGEESTIMATE) & any_dem == 0) %>% 
  select(RID, 
         Folder,
         AGEESTIMATE,
         sex,
         site,
         cohort)  

  names(df$join) = c("CrossProject_ID",
                     "Folder",
                     "age",
                     "sex",
                     "Site",
                     "cohort")
                         
  df$export = inner_join(df$merged, df$join) 
  # save
  save(df, file = file.path(raw_folder[[site]],"Raw_merged.Rda")) 
}

```

## Merge - all
```{r Open and merge all info}

if (!file.exists(file.path(data_folder, "all_merged.Rda"))) {
  sites = names(raw_folder)
  # decide whether to remove anything
  db = list()
  db = lapply(sites, function(x) {load(file.path(raw_folder[[x]],"Raw_merged.Rda")); db[x] = df$export})
  names(db) = sites
  df = db %>% reduce(rbind) %>% 
    filter(!is.na(sex))
  
  # get info on imaging data
  mris = c("volume", "area", "gwc", "thickness", "intensity")
  noT1w = df %>% dplyr::select(!contains(mris)) %>% names()
  T1w_vars = df %>% dplyr::select(contains(mris)) %>% names()
  
  # remove duplicated data - change names to vars
  df.Harmonize = read.csv("data/Harmonize.csv", stringsAsFactors = F)
  idx =df.Harmonize$LifebrainOnly == 1
  rm_vars =T1w_vars[!T1w_vars %in% df.Harmonize$LifeBrain[idx]]
  df %<>% dplyr::select(-rm_vars)
  # use consensus names
  names(df)[names(df) %in% T1w_vars] = 
    df.Harmonize$ConensusName[match(names(df)[names(df) %in% T1w_vars], df.Harmonize$LifeBrain)]
  # get new T1w vars
  T1w_vars = names(df)[!names(df) %in% noT1w]
  
  # save
  save(df, T1w_vars, noT1w, file = file.path(data_folder, "all_merged.Rda"))
  save(T1w_vars,file=(file.path(data_folder,"T1w_vars.Rda"))) 
} else {
  load(file.path(data_folder, "all_merged.Rda"))
}

```

## Preprocessing T1w data

```{r preprocess T1w data, echo=T}

if (!file.exists(file.path(data_folder,"all_preproc.Rda"))) {
  # a) flag and remove outlier observations
  df.isoutlier = matrix(0, length(df$imageid), length(T1w_vars)) %>% as.data.frame()
  rownames(df.isoutlier) = df$imageid
  names(df.isoutlier) = T1w_vars
  devfactor = 4
  df.metric = data.frame(vars = T1w_vars,
                            SD = df %>% summarise_at(T1w_vars, sd) %>% as.numeric(),
                              X = df %>% summarise_at(T1w_vars, mean) %>% as.numeric())
  for (vars in df.metric$vars) {
    idx = which(df.metric$vars == vars)
    thr = df.metric[idx,"SD"]*devfactor
    iid = which(!between(df[,vars], df.metric[idx,"X"]-thr,df.metric[idx,"X"]+thr))
    if (!is_empty(iid)) df.isoutlier[iid,which(names(df.isoutlier) == vars)] = 1
  }
  
  rm.subjs = rownames(df.isoutlier)[which(rowSums(df.isoutlier) > length(T1w_vars)*.05)]
  print(paste("removing", length(rm.subjs), "observations"))
  df = df %>% filter(!imageid %in%rm.subjs)
  
  ## b) remove longitudinal data with short follow ups
  gap = 5/12
  df = remove_too_short(df, gap) # currently around 100 observations are lost
  
  # get infor on timepoint and age 
  tmp= df %>% group_by(CrossProject_ID,cohort) %>% summarise(n = n(),
                                                      AgeBsl = min(age),
                                                      AgeGap = max(age) - min(age))
  df %<>% 
      left_join(., tmp) %>% 
    mutate(Time = age - AgeBsl)
  
  
  ## c) residualize data by cohort (extract Site coefs based on GAMM modeling 
  df <- df %>% dummy_cols(select_columns = "Site",remove_most_frequent_dummy = T)
  df.pp.fit = 
    df %>% 
      pivot_longer(cols = T1w_vars,
                 names_to = "MRI", 
                 values_to = "value") %>% 
    group_by(MRI) %>% 
    do(fit = gamm4(value ~ s(age, k = 8) +
                    Site_TrioTim + 
                    Site_Avanto +
                    Site_Verio + 
                    Site_BaseII + 
                    Site_Betula + 
                    Site_CamCan + 
                    Site_ousAvanto + 
                    Site_ousPrisma +
                    Site_Bcn ,
                    random=~(1|CrossProject_ID), 
                    data = .))
      
    # extract coefficients
  df.coef = sapply(df.pp.fit$fit, function(x) summary(x$gam)$p.coeff[-1]) %>% 
    simplify2array() %>% 
    as.data.frame()
  names(df.coef) = df.pp.fit$MRI
  
  ## apply coefficients to whole dataset / might not be too efficient
  for (vars in names(df.coef)) {
    print(vars)
    for (coef in rownames(df.coef)) {
    df = df %>% mutate(!!vars := get(vars) - get(coef)*df.coef[coef,vars])
    }
  }
  
  df %<>% select(-starts_with("Site_"))
  
  
  ## d) scale data (again use training dataset as reference)
  # get mean and sd for reference sample
  df.pp <- df %>% filter(n == 1)
  df.scale = data.frame(T1w_vars,
                        mean = df.pp %>% summarise_at(T1w_vars, mean, na.rm = T) %>% t(),
                        sd = df.pp %>% summarise_at(T1w_vars, sd, na.rm = T) %>% t())
  
  ## apply coefficients to whole dataset / might not be to efficient
  for (vars in rownames(df.scale)) {
    print(vars)
    df = df %>% mutate(!!vars := (get(vars) - df.scale[vars, "mean"]) / df.scale[vars, "sd"])
  }
  
  
  # save
  save(df, 
       df.scale,
       df.coef,
       T1w_vars, 
       noT1w,
       file = file.path(data_folder,"all_preproc.Rda"))
  } else {
  load(file.path(data_folder,"all_preproc.Rda"))
}

```

### Supplementary Table. Info for Demographics Lifebrain
```{r explore data}

print("explore train data")
df %>% group_by(cohort) %>% 
  filter(n == 1) %>% 
  summarise(sdAge =sd(AgeBsl),
            n = n(),
            age = mean(AgeBsl),
            agem = min(AgeBsl),
            ageM = max(AgeBsl))
df %>% group_by(cohort) %>% 
  filter(n == 1) %>% 
  count(sex)
df %>% filter(n == 1) %>% 
  summarise(sdAge =sd(AgeBsl),
            n = n(),
            age = mean(AgeBsl),
            agem = min(AgeBsl),
            ageM = max(AgeBsl))
df %>% filter(n == 1) %>% 
  count(sex)

print("explore test data") 
max(df$Time)

tmp= df %>% filter(!n == 1) %>% 
   group_by(cohort,CrossProject_ID) %>% 
        summarise(n = n(),
        AgeBsl = min(AgeBsl),
        Time = max(Time),
        sex = first(sex))
c("Sex: Test:", table(tmp$sex))
tmp %>% group_by(cohort) %>% count(sex)


tmp %>% ungroup() %>%  
  summarise(sdAge =sd(AgeBsl),
           Nobs = mean(n),
            SDobs =sd(n),
            n = n(),
            age = mean(AgeBsl),
            agem = min(AgeBsl),
            ageM = max(AgeBsl),
           TimeM = mean(Time),
           TimeSD = sd(Time))
  
tmp %>% group_by(cohort) %>% 
  summarise(sdAge =sd(AgeBsl),
           Nobs = mean(n),
            SDobs =sd(n),
            n = n(),
            age = mean(AgeBsl),
            agem = min(AgeBsl),
            ageM = max(AgeBsl),
           TimeM = mean(Time),
           TimeSD = sd(Time))
```

### Supplementary Fig. X Lifebrain sample distribution
```{r Fig1b}
## plot train and dataset with age
gs = ggplot() +
  geom_density(df.Train[[1]], mapping = aes(x = age, y = -..density..), fill="#66a61e", color="#66a61e", size =1.5, alpha =.8) + 
  geom_label( aes(x=86, y=-0.02, label="Training set"), color="#66a61e", size =4) +
  geom_density(df.Test[[1]], mapping = aes(x = age, y = ..density..), fill="#1b9e77", color="#1b9e77", size =1.5, alpha =.8) + 
  geom_label( aes(x=86, y=0.02, label="Test set"), color="#1b9e77", size =4) +
  
  geom_hline(yintercept = 0, color="#1b9e77", size = 1) +
    theme_classic() +
    theme(legend.position = 'none',
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20),
        #axis.line.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.text.x = element_blank()) + 
  xlab("Chronological Age") +
  ylab("") +
  coord_flip() + 
  scale_x_continuous(expand = c(0,0)) + 
  scale_y_continuous(expand = expansion(add = c(.009,0.001)))

ggsave("figures/sample_distribution_lifebrain.png", 
       dpi = 500, 
       plot = gs,
       width = 10,
       height = 10, 
       units = "cm")
```

### Supplementary Fig X density distribution lifebrain cohorts
```{r ridges}
colorscale = c('#1b9e77','#d95f02','#7570b3','#e7298a','#66a61e','#e6ab02')

gs = ggplot(df %>% filter(n == 1), 
       aes(x = age, y = cohort, fill = cohort)) +
  geom_density_ridges2(scale = 1.6) +
  theme_classic() +
    theme(legend.position = 'none',
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20), 
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        plot.title = element_text(size = 20, hjust = 0.5, vjust = 0)) +
  scale_y_discrete(expand = c(0,0), label = c("AIBL", "Base-II", "UB", "Betula", "Cam-CAN", "LCBC")) + 
  scale_x_continuous(expand = c(0,0), name = "Chronological Age") +
  scale_fill_manual(values = colorscale) +
  ggtitle("Training set")

ggsave("figures/sample_distribution_lifebrain_train_ridge.png", 
       dpi = 500, 
       plot = gs,
       width = 10,
       height = 10, 
       units = "cm")

gs = ggplot(df %>% filter(!n == 1), 
       aes(x = age, y = cohort, fill = cohort)) +
  geom_density_ridges2(scale = 1.6) +
  theme_classic() +
    theme(legend.position = 'none',
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20), 
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        plot.title = element_text(size = 20, hjust = 0.5, vjust = 0)) +
  scale_y_discrete(expand = c(0,0), label = c("AIBL", "Base-II", "UB", "Betula", "Cam-CAN", "LCBC")) + 
  scale_x_continuous(expand = expansion(add = c(0,2)), name = "Chronological Age") +
  scale_fill_manual(values = colorscale) +
  ggtitle("Test set")

ggsave("figures/sample_distribution_lifebrain_test_ridge.png", 
       dpi = 500, 
       plot = gs,
       width = 10,
       height = 10, 
       units = "cm")
```


# ML - fitting

### Select Train and Test 
Separate train and test data. That is separate longitudinal and cross-sectional data
```{r train and test data}

if (!file.exists(file.path(data_folder, "all_TrainTest.Rda"))) {
  # define lists
  df.Train = df.Test = label.train = label.test = data.train = data.test = dtrain = dtest = list()
  
  df.Train[[1]] <- df %>% filter(n == 1)
  df.Test[[1]] <- df %>% filter(!n == 1)
  
  data.train[[1]] = df.Train[[1]][, T1w_vars] %>% as.matrix()
  label.train[[1]] = df.Train[[1]]$age %>% as.matrix()
  data.test[[1]] = df.Test[[1]][, T1w_vars] %>% as.matrix() %>% as.matrix()
  label.test[[1]] = df.Test[[1]]$age %>% as.matrix()
  
  # create xgb matrices
  dtrain[[1]] <- xgb.DMatrix(data = data.train[[1]],
                             label = label.train[[1]]) 
  dtest[[1]] <- xgb.DMatrix(data = data.test[[1]],
                            label=label.test[[1]])
  
  save(df.Train, 
       label.train, 
       data.train, 
       df.Test, 
       label.test, 
       data.test, 
       dtrain, 
       dtest,
       file = file.path(data_folder, "all_TrainTest.Rda"))
} else {
  load(file.path(data_folder, "all_TrainTest.Rda"))
}
```

### Explore hyperparameter spaces with cross-validation
```{r Hyper_parameter_search,  eval=F, echo=T}
set.seed(1234)
it=50
  
if(!file.exists(file.path(data_folder,"RandomHyperParameterSearchCV.Rda"))) {
  xgb_grid_1 = expand.grid(
                  nrounds = seq(100, 600,50),
                  eta = c(.01, .05, .1, .15, .2),
                  max_depth = seq(2, 8,1),
                  gamma = seq(0.5,1.5,0.5),
                  min_child_weight=seq(1,4),
                  rmse =NaN,
                  Nrmse=NaN,
                  train=NaN,
                  idx=NaN
                  )
  
  save(xgb_grid_1, file = file.path(data_folder,"RandomHyperParameterSearchCV.Rda"))
} else {
  load(file.path(data_folder,"RandomHyperParameterSearchCV.Rda"))
}
  idx = sample(dim(xgb_grid_1)[1],it)
  ii = which(!(1:it %in% xgb_grid_1$idx))
  # parameters
  for (i in ii) {
    print(i)
    eta = xgb_grid_1$eta[idx[i]]
    max_depth=xgb_grid_1$max_depth[idx[i]]
    gamma = xgb_grid_1$gamma[idx[i]]
    min_child_weight = xgb_grid_1$min_child_weight[idx[i]]
    nrounds = xgb_grid_1$nrounds[idx[i]]
    (system(paste("sbatch scripts/LifeBrain/RandomHyperParameterSearchCV.sh", 
                 eta, 
                 max_depth, 
                 gamma, 
                 min_child_weight, 
                 nrounds,
                 i,
                 idx[i],
                 data_folder,
                 sex_split = F,
                 sep = " ")))
  Sys.sleep(1)  
  }
  
  
  df.squeue = squeue("p274-didacvp","HyperParameterSearch")
  while (length(df.squeue$V1) > 1) {
    Sys.sleep(120) 
    print("script running on sbatch")
    df.squeue = squeue("p274-didacvp","HyperParameterSearch")
  }
  
  # reload 
  load(file.path(data_folder,"RandomHyperParameterSearchCV.Rda"))
  ii = which(!(1:it %in% xgb_grid_1$idx))
  if(is_empty(ii)) {
    xgb_grid_1 = xgb_grid_1 %>% filter(!is.na(rmse))
    save(xgb_grid_1, file = "data/RandomHyperParameterSearchCV_all.Rda")
  } else { 
    disp("something wrong with randomized search")
  }
  
  xgb_grid_1 %>% arrange(rmse) %>% head()
  
```

### Apply CV (XGB.CV) of winning model 
Test model with own dataset. 
```{r xgbcv}
if(!file.exists(file.path(data_folder, "xgbcv.CV.Rda"))) {
  
  eta = xgb_grid_1[which.min(xgb_grid_1$rmse),"eta"]
  max_depth=xgb_grid_1[which.min(xgb_grid_1$rmse),"max_depth"]
  gamma = xgb_grid_1[which.min(xgb_grid_1$rmse),"gamma"]
  min_child_weight = xgb_grid_1[which.min(xgb_grid_1$rmse),"min_child_weight"]
  nrounds = xgb_grid_1[which.min(xgb_grid_1$rmse),"nrounds"]
  
  (system(paste("sbatch scripts/LifeBrain/BrainAgeCV.sh", 
                   eta, 
                   max_depth, 
                   gamma, 
                   min_child_weight, 
                   nrounds,
                   data_folder,
                   sex_split = F, 
                   sep = " ")))
  
  df.squeue = squeue("p274-didacvp","BrainAgeCV")
  
  while (length(df.squeue$V1) > 1) {
    Sys.sleep(120) 
    print("script running on sbatch")
    df.squeue = squeue("p274-didacvp","BrainAgeCV")
  }
} 

load(file.path(data_folder, "xgbcv.CV.Rda"))
```

# Plotting and predicting
```{r xgboost cv plotts and summary  stats}

df.out = data.frame(
  CrossProject_ID = df.Train[[1]]$CrossProject_ID,
  Folder = df.Train[[1]]$Folder,
  BA = xgbcv$pred,
  age = label.train[[1]],
  sex = df.Train[[1]]$sex)


lm.age = lm(BA ~ age, data = df.out)
summary(lm.age)
cor.test(df.out$BA, df.out$age)
err <- mean(abs(df.out$BA - df.out$age))
rmse = sqrt(sum((df.out$BA - df.out$age)^2) / length(df.out$BA))
age.bias <- cor.test(df.out$age, (df.out$BA - df.out$age))$estimate
print(paste("mean absolute error (MAE)=", round(err,2)))
print(paste("root mean square error=",round(rmse,2)))
print(paste("r-sq =", round(summary(lm.age)$r.squared, 2)))
print(paste("age.bias =", age.bias))
```

### Plot for Lifebrain - training CV brain age fitting (not included)
```{r plot CA vs BA}
gs = ggplot(data = df.out,
       mapping = aes(x = age,
                     y = BA)) + 
  geom_pointdensity(adjust = 4) + 
  geom_abline(intercept = 0, 
              slope = 1, 
              colour = "grey60", 
              linetype = 4,
              size = 1.5) + 
  geom_smooth(method = "lm", 
              color = "#1b9e77",
              size = 1.5) + 
  geom_smooth(color = "#d95f02",
              size = 1.5) +
  theme_classic() + 
  theme(legend.position = 'none',
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)) +
  scale_color_viridis_c() +
  ylab("Brain Age") +
  xlab("Chronological Age")

ggsave("figures/TrainXGV.Lifebrain.png", 
       dpi = 500, 
       plot = gs,
       width = 10,
       height = 10, 
       units = "cm")
```

### Bias Correction

```{r Bias Correction}
# Bias correction - linear  and GAM
sm.rel = gam(BA ~ s(age, k = 8), data = df.out) 
df.out %<>% 
  mutate(BAG = BA - age, 
         ExpAge = lm.age$coefficients[[1]] + age*lm.age$coefficients[[2]],
         BAG_corr = BA - ExpAge, 
         BAG_corr_gam = sm.rel$residuals)

ggpairs(df.out %>% 
          select(BA, 
                 age, 
                 BAG, 
                 ExpAge, 
                 BAG_corr,
                 BAG_corr_gam), 
        progress = F)
```


## XGB_Predict
```{r xgboost predict}
if(!file.exists(file.path(data_folder, "xgbcv.full.Rda"))) {
  eta = xgb_grid_1[which.min(xgb_grid_1$rmse),"eta"]
  max_depth=xgb_grid_1[which.min(xgb_grid_1$rmse),"max_depth"]
  gamma = xgb_grid_1[which.min(xgb_grid_1$rmse),"gamma"]
  min_child_weight = xgb_grid_1[which.min(xgb_grid_1$rmse),"min_child_weight"]
  nrounds = xgb_grid_1[which.min(xgb_grid_1$rmse),"nrounds"]
  
  (system(paste("sbatch scripts/LifeBrain/BrainAgeFull.sh", 
                   eta, 
                   max_depth, 
                   gamma, 
                   min_child_weight, 
                   nrounds,
                   data_folder,
                   sex_split = F,
                   sep = " ")))
  
  df.squeue = squeue("p274-didacvp","BrainAgeFull")
  while (length(df.squeue$V1) > 1) {
    Sys.sleep(120) 
    print("script running on sbatch")
    df.squeue = squeue("p274-didacvp","BrainAgeFull")
  }
} 

load(file.path(data_folder, "xgbcv.full.Rda"))
pred <- predict(bst, data.test[[1]])

#view variable importance plot
xgb.dump(bst, with_stats = TRUE, file.path(data_folder,'bst.model.dump.txt'))
mat <- xgb.importance (feature_names = T1w_vars,model = bst)
xgb.plot.importance (importance_matrix = mat, top = 15, rel_to_first = T, left_margin = 15)
xgb.ggplot.deepness(bst)
suppressMessages(xgb.plot.multi.trees(bst, fill = T))
```


## Fig SI Predict Lifebrain 
```{r test data info}

df.pred = data.frame(
  CrossProject_ID = df.Test[[1]]$CrossProject_ID,
  BA = pred,
  age = label.test[[1]], 
  Folder = df.Test[[1]]$Folder)


gs = ggplot(data = df.pred,
       mapping = aes(x = age,
                     y = BA)) + 
  geom_line(mapping = aes(group = CrossProject_ID), size = .3, color = "grey50") +
  geom_pointdensity(adjust = 4) + 
  geom_abline(intercept = 0, 
              slope = 1, 
              colour = "grey60", 
              linetype = 4,
              size = 1.5) + 
  geom_smooth(method = "lm", 
              color = "#1b9e77",
              size = 1.5) + 
  geom_smooth(color = "#d95f02",
              size = 1.5) +
  theme_classic() + 
  theme(legend.position = 'none',
        axis.text = element_text(size = 16),
        axis.title = element_text(size = 20)) +
  scale_color_viridis_c() +
  ylab("Brain Age") +
  xlab("Chronological Age")

ggsave("figures/TestXGV.Lifebrain.png", 
       dpi = 500, 
       plot = gs,
       width = 10,
       height = 10, 
       units = "cm")
```

## table features. get info variance explained in lifebrain
```{r}
## save age effects of features 
if(!file.exists(file.path(data_folder,"age.effects.Rda"))) {
  gam.age = df.Train[[1]] %>% 
    pivot_longer(T1w_vars, 
                 names_to = "features",
                 values_to = "value") %>% 
    group_by(features) %>% 
    do(fit = gam(value ~ s(age), data = ., method = 'REML'))  
  
  df.age = data.frame(
    feature =gam.age$features,
    r.sq = sapply(gam.age$fit, function(x) summary(x)$r.sq) %>% 
            simplify2array())
  
  save(gam.age, df.age, file = file.path(data_folder,"age.effects.Rda"))
} else {
  load(file = file.path(data_folder,"age.effects.Rda"))
}

write.csv(df.age, file = "figures/age_effects_Lifebrain.csv")
```


## Summarized model info. 
```{r get brain age stats}

lm.Age = lm(BA ~ age, data = df.pred)
summary(lm.Age)
cor.test(df.pred$BA, df.pred$age)
err <- mean(abs(pred - label.test[[1]]))
rmse = sqrt(sum((pred - label.test[[1]])^2) / length(pred))
age.bias <- cor.test(df.pred$age, (df.pred$BA - df.pred$age))$estimate

print(paste("mean absolute error (MAE)=", round(err,2)))
print(paste("root mean square error=",round(rmse,2)))
print(paste("r-sq =", round(summary(lm.Age)$r.squared, 2)))
print(paste("age.bias =", age.bias))
```

### Bias correction Test
```{r}
sm.rel = gam(BA ~ s(age, k = 8), data = df.out) 
lm.Age2 = lm(BA ~ poly(age,2, raw = T), data = df.out)
lm.Age = lm(BA ~ age, data = df.out)

df.pred %<>% 
  mutate(BAG = BA - age, 
         ExpAge = lm.Age$coefficients[[1]] + age*lm.Age$coefficients[[2]],
         ExpAge2 = lm.Age2$coefficients[[1]] + age*lm.Age2$coefficients[[2]] + age^2*lm.Age2$coefficients[[3]],
         BAG_corr = BA - ExpAge, 
         BAG_corr2 = BA - ExpAge2, 
         BAG_corr_gam = BA - predict(sm.rel, df.pred))

ggpairs(df.pred %>% 
          select(BA, 
                 age, 
                 BAG, 
                 ExpAge, 
                 BAG_corr,
                 BAG_corr2,
                 BAG_corr_gam), 
        progress = F)

# note that results are almost equivalent as when using the cross-validated approach
```



# Saving Resuøts
merge data.frame and save results
```{r Saving and Knitting}
# vars to joing
vars = c("Folder",
         "BA",
         "BAG",
         "ExpAge",
         "BAG_corr",
         "BAG_corr_gam")    
df.Test[[1]] = 
  left_join(df.Test[[1]], 
            df.pred %>% select(vars))
df.Train[[1]] = 
  left_join(df.Train[[1]], 
            df.out %>% select(vars))

save(df.Test, df.Train, file = file.path(data_folder, "ResultsBrainAge.Rda"))
#rmarkdown::render("LifeBrain_BrainAge.Rmd", "html_document")
```


#scaling
```{r scaling}
#
load(file.path(data_folder, "ResultsBrainAge.Rda"))

## apply scaling according to Smith 2.8 formula in Smith Neuroimage 2019.


set = c(rep("Test", length(df.Test[[1]]$BAG_corr_gam)),
        rep("Train", length(df.Train[[1]]$BAG_corr_gam)))
delta = c(df.Test[[1]]$BAG_corr_gam,df.Train[[1]]$BAG_corr_gam)
age = c(df.Test[[1]]$age,df.Train[[1]]$age)

# Y0
scaled_age = ((age - min(age)) / (max(age) - min (age)))
# eq.18 (log(|delta|)) = D0 + lambda*scaled_aged
mod = lm(log(abs(delta)) ~ scaled_age)
summary(mod)

# lambda =  exp(lambda) - 1
lambda = exp(coef(mod)[["scaled_age"]])-1
mean(abs(delta))*lambda

# eq.17 delta = delta0(1 + lamda*scaled_age)
delta_0 <- delta / (1 + lambda * scaled_age)

par(mfrow=c(1,3)) 
plot(age, delta)
plot(age, delta_0)
plot(delta, delta_0)


df.Test[[1]]$delta_0 = delta_0[set == "Test"]
df.Train[[1]]$delta_0 = delta_0[set == "Train"]

save(df.Test, 
     df.Train, 
     file = file.path(data_folder, "ResultsBrainAge.Rda"))

```

back to top