# data_import.R
#' Loading the panel, the single-cell data and exploring normalisation options

# Load adjusted prepData function to allow for more metadata (age, gender, etc)
source("./Custom_functions/prepData_adjusted.R") 

### Load panel (it has the metal names swapped so we need to change them):
#' Load the antibody panel and attach the channel names used in the FCS files.
#'
#' Reads "CP_CyTOF_panel.csv" and "CP_CyTOF_panel_names.csv" from
#' `settings$experiment_dir` and joins them on the numeric part of their
#' `fcs_colname` columns.
#'
#' @param settings List with at least `experiment_dir` (directory holding both
#'   CSV files).
#' @return A data.table with columns `fcs_colname` (taken from the panel-names
#'   file), `antigen` and `marker_class`.
load_panel <- function(settings) {
    # Panel channel names are "<digits><letters>"; the names file uses
    # "<letters><digits><letters>" (presumably e.g. 89Y vs Y89Di -- confirm
    # against the CSV files).
    digits_letters <- '^([[:digit:]]+)([[:alpha:]]+)$'
    letters_digits_letters <- '^([[:alpha:]]+)([[:digit:]]+)([[:alpha:]]+)$'

    panel_dt <- fread(file.path(settings$experiment_dir, "CP_CyTOF_panel.csv"))
    panel_dt[, `:=`(
        name1 = sub(digits_letters, '\\1', fcs_colname),
        name2 = sub(digits_letters, '\\2', fcs_colname)
    )]

    names_dt <- fread(file.path(settings$experiment_dir,
                                "CP_CyTOF_panel_names.csv"))
    names_dt[, `:=`(
        name1 = sub(letters_digits_letters, '\\1', fcs_colname),
        name2 = sub(letters_digits_letters, '\\2', fcs_colname),
        name3 = sub(letters_digits_letters, '\\3', fcs_colname)
    )]
    # sub() returns its input untouched when the pattern does not match, so
    # this drops every channel name that does not follow the expected layout.
    names_dt <- names_dt[fcs_colname != name1]

    # Join on the numeric part: `name1` of the panel, `name2` of the names file.
    joined <- merge(panel_dt, names_dt, by.x = "name1", by.y = "name2")

    # Keep the channel name coming from the names file under its plain name.
    joined[, .(fcs_colname = fcs_colname.y, antigen, marker_class)]
}


#################
### LOAD DATA ###

#' Read FCS files one by one and down-sample the largest samples.
#'
#' Every file is read once. Samples holding more cells than the 75th percentile
#' of the per-file cell counts are randomly reduced to that number of cells
#' (uses the R random number generator; set a seed beforehand for
#' reproducibility). All full-size frames are held in memory until the
#' down-sampling step.
#'
#' @param files Character vector of FCS file paths.
#' @return A flowSet whose sample names are the file paths.
.read_subsampled_flowset <- function(files) {
    frames <- lapply(files, function(file) {
        cat("Reading File", file, "\n", sep = " ")
        # Same reader options as the read.flowSet() path so that both paths
        # return untransformed, untruncated intensities.
        read.FCS(filename = file, transformation = FALSE,
                 truncate_max_range = FALSE)
    })
    names(frames) <- files

    ncells <- vapply(frames, function(frame) nrow(frame), numeric(1))
    # quantile() may return a fractional value; sampling needs a whole number.
    n_subset <- floor(unname(quantile(ncells, 0.75)))

    frames <- lapply(frames, function(frame) {
        if (nrow(frame) > n_subset) {
            frame <- frame[sample.int(nrow(frame), size = n_subset), ]
        }
        print(dim(frame))
        frame
    })
    as(frames, "flowSet")
}

#' Find the FCS file belonging to each patient.
#'
#' A file belongs to a patient when its file name (not its directory) contains
#' the patient id as a literal substring.
#'
#' @param patient_ids Vector of patient identifiers.
#' @param files Character vector of FCS file paths.
#' @return Character vector as long as `patient_ids` holding the matching path,
#'   or `NA` when no file matches. Stops when an id matches several files.
.match_sample_files <- function(patient_ids, files) {
    file_names <- basename(files)
    vapply(as.character(patient_ids), function(id) {
        if (is.na(id)) {
            return(NA_character_)
        }
        hits <- files[grepl(id, file_names, fixed = TRUE)]
        if (length(hits) > 1) {
            stop("Patient '", id, "' matches more than one FCS file: ",
                 paste(hits, collapse = ", "), call. = FALSE)
        }
        if (length(hits) == 0) NA_character_ else hits
    }, character(1), USE.NAMES = FALSE)
}

#' Load the FCS files and sample metadata of one tissue.
#'
#' @param settings List with:
#'   * `tissue`: one of "cp", "septum" or "blood";
#'   * `experiment_dir`: root directory of the experiment;
#'   * `norm`: logical, read from the "normalised" (TRUE) or "original"
#'     (FALSE) sub-folder;
#'   * `subsetting`: logical, down-sample large samples (see
#'     `.read_subsampled_flowset()`).
#' @return A list with `fs` (flowSet) and `md` (data.table of per-sample
#'   metadata, including `file_name`, `batch_id`, `age`, `gender` and
#'   `pmd_hours`).
load_data <- function(settings) {
    tissue <- settings$tissue
    # Everything that differs between tissues lives in this table.
    cfg <- switch(
        tissue,
        cp = list(folder = "Cp", md_tag = "cp", batch_tag = "cp",
                  # control6 has no cells
                  excluded_ids = "2019-110",
                  require_all_files = TRUE),
        septum = list(folder = "Spt", md_tag = "septum", batch_tag = "spt",
                      excluded_ids = character(0),
                      require_all_files = FALSE),
        blood = list(folder = "Bld", md_tag = "blood", batch_tag = "bld",
                     excluded_ids = character(0),
                     require_all_files = FALSE),
        stop("Unknown tissue '", tissue,
             "': expected 'cp', 'septum' or 'blood'.", call. = FALSE)
    )
    exp_dir <- settings$experiment_dir

    ## FCS files:
    fcs_dir <- file.path(exp_dir, "Pregated_fcs", cfg$folder,
                         if (settings$norm) "normalised" else "original")
    files <- list.files(fcs_dir, pattern = "\\.fcs$", full.names = TRUE)
    # Filter on the file name only, so that a directory that happens to contain
    # "ref" or a patient id cannot exclude every file.
    file_names <- basename(files)
    excluded <- grepl("ref", file_names, fixed = TRUE) # exclude ref
    for (id in cfg$excluded_ids) {
        excluded <- excluded | grepl(id, file_names, fixed = TRUE)
    }
    files <- files[!excluded]
    if (length(files) == 0) {
        stop("No FCS files to load in ", fcs_dir, call. = FALSE)
    }

    if (isTRUE(settings$subsetting)) {
        fs <- .read_subsampled_flowset(files)
    } else {
        fs <- read.flowSet(files = files,
                           transformation = FALSE, truncate_max_range = FALSE)
    }

    ## Metadata:
    md <- fread(file.path(exp_dir, "Clinical_info",
                          paste0("CP_CyTOF_metadata_", cfg$md_tag, ".csv")))
    if (length(cfg$excluded_ids) > 0) {
        md <- md[!(patient_id %in% cfg$excluded_ids)]
    }
    matched_files <- .match_sample_files(md$patient_id, files)
    md[, file_name := matched_files]
    unmatched_ids <- md$patient_id[is.na(md$file_name)]
    if (length(unmatched_ids) > 0) {
        if (cfg$require_all_files) {
            stop("No FCS file found for patient(s): ",
                 paste(unmatched_ids, collapse = ", "), call. = FALSE)
        }
        # Keep only metadata of patients that have a sample for this tissue.
        md <- md[!is.na(file_name)]
    }

    batch <- fread(file.path(exp_dir,
                             paste0("CP_CyTOF_batches_", cfg$batch_tag, ".csv")))
    setnames(batch, "barcode", "batch_id")
    info <- fread(file.path(exp_dir, "Clinical_info",
                            paste0("CP_CyTOF_clinical_info_selection_",
                                   cfg$md_tag, ".csv")))
    info <- info[, c("donor", "age", "gender", "pmd_hours")]

    # Combine metadata of fixed effects (inner joins: patients missing from the
    # batch or clinical table are dropped from `md`).
    md <- merge(md, batch, by = "patient_id")
    md <- merge(md, info, by.x = "patient_id", by.y = "donor")

    list(fs = fs, md = md)
}


#####################
### NORMALISATION ###

#' Batch-normalise the pregated FCS files of one tissue with BatchAdjust().
#'
#' Writes the list of channels to adjust to
#' "<experiment_dir>/channels_to_adjust.txt" and runs BatchAdjust() on
#' "<experiment_dir>/Pregated_fcs/<tissue folder>/", writing the result to its
#' "normalised" sub-folder.
#'
#' @param settings List with `experiment_dir` and `tissue` ("cp", "septum" or
#'   "blood").
#' @param exclude Antigens whose channels must not be adjusted.
#' @param method,transformation Passed on to BatchAdjust().
#' @param panel Optional panel table with columns `fcs_colname` and `antigen`.
#'   When `NULL`, a data frame called `panel` visible from the environment this
#'   function was defined in is used (legacy behaviour); if there is none, the
#'   panel is loaded with `load_panel(settings)`.
#' @return The value returned by BatchAdjust().
normalise <- function(settings, exclude = c('CD39'), method="95p", transformation=FALSE, panel = NULL) {
    # Choose tissue folder name; fail early instead of building bogus paths.
    folder_name <- switch(
        settings$tissue,
        cp = 'Cp',
        septum = 'Spt',
        blood = 'Bld',
        stop("Unknown tissue '", settings$tissue,
             "': expected 'cp', 'septum' or 'blood'.", call. = FALSE)
    )

    if (is.null(panel)) {
        legacy_panel <- get0("panel", envir = parent.env(environment()),
                             ifnotfound = NULL)
        panel <- if (is.data.frame(legacy_panel)) {
            legacy_panel
        } else {
            load_panel(settings)
        }
    }

    source("./Custom_functions/BatchAdjust.R")

    # Create list of channels to adjust:
    channels_file <- paste0(settings$experiment_dir, "/channels_to_adjust.txt")
    panel_dt <- as.data.table(panel)
    panel_dt <- panel_dt[!(antigen %in% exclude), ] # Exclude unreliable markers
    fwrite(panel_dt[, .(fcs_colname)], channels_file, col.names = FALSE)

    # The directory strings are passed exactly as before (trailing slash on
    # `basedir`, none on `outdir`) in case BatchAdjust() depends on them.
    BatchAdjust(
        basedir = paste0(settings$experiment_dir, "/Pregated_fcs/",
                         folder_name, "/"),
        outdir = paste0(settings$experiment_dir, "/Pregated_fcs/",
                        folder_name, "/normalised"),
        channelsFile = channels_file,
        batchKeyword = paste0("_", folder_name),
        anchorKeyword = "ref",
        method = method,
        transformation = transformation,
        addExt = NULL,
        plotDiagnostics = TRUE)
}


#' Loads panel, fcs files and metadata.
#'
#' @param settings List of experiment settings; it is passed on to
#'   `load_panel()`, `normalise()` and `load_data()`. `settings$norm` decides
#'   whether batch normalisation is run before loading.
#' @return A list with `panel` (data.frame), `flowset` (flowSet) and `metadata`
#'   (per-sample metadata table).
load_experiment <- function(settings) {
    ### Load panel (marker_class varies: use latest version; or use cellpop-specific panel)
    panel <- load_panel(settings)

    ### NORMALISATION ###
    if (isTRUE(settings$norm)) {
        # `settings` has no default in normalise(), so it must be passed.
        normalise(settings, exclude = c('CD39'), method = "95p",
                  transformation = FALSE)
    }

    ### Load fcs data into a flowset and metadata:
    # read.flowSet(), by default, may transform the marker intensities and
    # remove cells with extreme positive values. This behavior can be controlled
    # with arguments transformation and truncate_max_range, respectively.
    # Apply function to each tissue:
    data <- load_data(settings)
    fs <- data$fs
    md <- data$md

    # Check that all panel columns are in the flowSet object:
    missing_channels <- setdiff(panel$fcs_colname, colnames(fs))
    if (length(missing_channels) > 0) {
        warning("Panel channels missing from the flowSet: ",
                paste(missing_channels, collapse = ", "), call. = FALSE)
    }

    list(
        panel = as.data.frame(panel),
        flowset = fs,
        metadata = md
    )
}


#' Build the single-cell object from a loaded experiment.
#'
#' @param experiment List with `metadata`, `flowset` and `panel`, as returned
#'   by `load_experiment()`. `metadata` must hold the columns `condition`,
#'   `sample_id` and `file_name`.
#' @return The object returned by `prepData_adjusted()`.
construct_sce <- function(experiment) {
    md <- experiment$metadata
    fs <- experiment$flowset
    panel <- experiment$panel

    ### Specify levels for conditions & sample IDs to assure desired ordering:
    condition_levels <- c("control", "ms", "ad")
    unknown_conditions <- setdiff(unique(as.character(md$condition)),
                                  c(condition_levels, NA))
    if (length(unknown_conditions) > 0) {
        # factor() below turns these into NA.
        warning("Conditions outside control/ms/ad become NA: ",
                paste(unknown_conditions, collapse = ", "), call. = FALSE)
    }
    md$condition <- factor(md$condition, levels = condition_levels)
    md$sample_id <- factor(md$sample_id,
                           levels = md$sample_id[order(md$condition, md$sample_id)])

    ### Construct SingleCellExperiment (default arcsinh transformation of
    # marker expressions with a cofactor of 5).
    # Do the following manually because of error in function
    # when "merging" fs and md, otherwise sample names are wrong:
    # put the metadata rows in the same order as the samples of the flowSet.
    fcs_files <- c(keyword(fs, "FILENAME"))
    row_idx <- match(fcs_files, md$file_name)
    if (anyNA(row_idx)) {
        stop("No metadata row for FCS file(s): ",
             paste(unlist(fcs_files[is.na(row_idx)]), collapse = ", "),
             call. = FALSE)
    }
    # Explicit row indexing: works for data.table and plain data.frame alike.
    md <- md[row_idx, ]

    ### Construct sce:
    prepData_adjusted(fs, panel, md)#, features = panel$fcs_colname)
}
# End of data_import.R