swh:1:snp:18ef956b602668aadd1027bd3add90630713e6b5
Raw File
Tip revision: 44f72a1e27db1bafefb159cdc26bc71f3691bf54 authored by Behzad Yaghmaeian Salmani on 07 March 2024, 16:41:19 UTC
First commit
Tip revision: 44f72a1
human_mouse_integration.R

suppressPackageStartupMessages({
  library(Seurat)
  library(Matrix)
  library(stringr)
  library(sctransform)
  library(future)
  require(scales)
  library(RColorBrewer)
  library("readxl")
  library(dplyr)
  library(dendextend)
  library(patchwork)
  library(ggplot2)
  library(Orthology.eg.db)
  library(org.Mm.eg.db)
  library(org.Hs.eg.db)
})

###  re-analysis of human data Kamath et al. NatureNeuroscience 2022

### Generation of human dataset

# data (count matrix, barcodes and features) downloaded from: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE178265
# GSE178265_Homo_bcd.tsv.gz	
# GSE178265_Homo_features.tsv.gz	
# GSE178265_Homo_matrix.mtx.gz

# data files renamed and seurat object made. 

km <- Read10X("/path/to/files/data")

km <- CreateSeuratObject(km, project = 'km')

# metadata_PD tsv file downloaded from:
# https://singlecell.broadinstitute.org/single_cell/study/SCP1768/single-cell-genomic-profiling-of-human-dopamine-neurons-identifies-a-population-that-selectively-degenerates-in-parkinsons-disease-single-nuclei-data#study-download

# this is the metadata of all nuclei from all species in this study. 

metadata <- read.table(file = '/path/to/dir/METADATA_PD.tsv', sep = '\t', header = T)

# get cell barcodes from the km Seurat object and subset the metadata based on that:

barcodes <- FetchData(km, vars = 'ident')

# turn barcodes' row names into a vector:
cellbc <- row.names(barcodes)

# subset metadata based on this vector:

metadf <- metadata[metadata$NAME %in% cellbc, ] 

write.table(metadf, file="/path/to/dir/Metadata_human.tsv", 
            quote=FALSE, sep='\t', col.names = TRUE)

# it's only human now:
unique(metadf$species__ontology_label )
#  [1] "Homo sapiens"

# set the barcodes (NAME column) as the row names of the metadf: 
row.names(metadf) <- metadf$NAME 

km <- AddMetaData(object = km, metadata = metadf, col.name = NULL)

km <- PercentageFeatureSet(km, "^MT-", col.name = "percent_mito", assay = "RNA")

km <- PercentageFeatureSet(km, "^RP[SL]", col.name = "percent_ribo", assay = "RNA")

km <- CellCycleScoring(km, g2m.features = cc.genes$g2m.genes, s.features = cc.genes$s.genes)

km$UMIperGene <- km$nCount_RNA / km$nFeature_RNA

### gene & cell filtering 

selected_cells <- WhichCells(km, expression = nCount_RNA >= 650 & percent_mito <= 10 & UMIperGene >= 1.2 )

selected_genes <- rownames(km)[Matrix::rowSums(km) > 1]

km <- subset(km, features = selected_genes, cells = selected_cells)

# remove MALAT1

km <- km[ ! grepl("MALAT1", rownames(km)), ]

saveRDS(km, "/path/to/dir/km.rds")

### subset dataset based on condition & organ 

table(km$disease__ontology_label )   
#  Lewy body dementia       normal      Parkinson disease 
#              67466        231530                135344 

# first, subset normal:

km <- SetIdent(km, value = 'disease__ontology_label')

kmsub1 <- subset(km, idents = 'normal')

kmsub1 <- SetIdent(kmsub1, value = 'organ__ontology_label')

kmsub1 <- subset(kmsub1, idents = 'substantia nigra pars compacta')

saveRDS(kmsub1, "/path/to/dir/km_cntl.rds")

### subset diseased individuals 

kmsub2 <- subset(km, idents = c('Parkinson disease', 'Lewy body dementia'))

kmsub2 <- SetIdent(kmsub2, value = 'organ__ontology_label')

kmsub2 <- subset(kmsub2, idents = 'substantia nigra pars compacta')

saveRDS(kmsub2, "/path/to/dir/km_PDLBD.rds")


### integration of control SNpc dataset 

# load all data from previous step 
mydata <- readRDS("/path/to/dir/km_cntl.rds")

# split the dataset by donor_id
my.list <- SplitObject(mydata, split.by = 'donor_id')
gc()
# normalize and identify variable features for each dataset independently
my.list <- lapply(X = my.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

gc()

# select features that are repeatedly variable across datasets for integration nfeatures = 2000 default 
features <- SelectIntegrationFeatures(object.list = my.list, nfeatures = 2000)

gc()

# scale and run PCA on each object in the list. 

my.list <- lapply(X = my.list, FUN = function(x) {
  x <- ScaleData(x, features = features, verbose = FALSE)
  x <- RunPCA(x, features = features, verbose = FALSE)
})

gc()

# Perform integration
my.anchors <- FindIntegrationAnchors(object.list = my.list, reduction = "rpca", dims = 1:50)

rm(my.list)

gc()

# creates an 'integrated' data assay 
mydata <- IntegrateData(anchorset = my.anchors, dims = 1:50)

gc()

# Perform integrated analysis
DefaultAssay(mydata) <- "integrated"

# Run the standard workflow for visualization and clustering
mydata <- ScaleData(mydata, verbose = FALSE)

mydata <- RunPCA(mydata, npcs = 50, verbose = FALSE)

tiff(file = "/path/to/dir/elbowplot.tiff", 
     units="cm", width = 50, height = 50, res = 300)

ElbowPlot(mydata, reduction = "pca", ndims = 50) + ggtitle("km_Cntl integrated")

dev.off()

gc()

mydata <- RunUMAP(mydata, reduction = "pca", dims = 1:30)

mydata <- FindNeighbors(mydata, reduction = "pca", dims = 1:30)

mydata <- FindClusters(mydata, resolution = seq(0.1, 1.5, by=0.1), n.start = 100, n.iter = 100)

saveRDS(mydata, "/path/to/dir/km_cntl_integ_snpc.rds")

print(sessionInfo())


### integration of PD LBD dataset 

# load all data from previous step 
mydata <- readRDS("/path/to/dir/km_PDLBD.rds")

print(mydata)

# split the dataset by donor_id
my.list <- SplitObject(mydata, split.by = 'donor_id')
gc()
# normalize and identify variable features for each dataset independently
my.list <- lapply(X = my.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

gc()

# select features that are repeatedly variable across datasets for integration nfeatures = 2000 default 
features <- SelectIntegrationFeatures(object.list = my.list, nfeatures = 2000)

gc()

# scale and run PCA on each object in the list. 

my.list <- lapply(X = my.list, FUN = function(x) {
  x <- ScaleData(x, features = features, verbose = FALSE)
  x <- RunPCA(x, features = features, verbose = FALSE)
})

gc()

### Perform integration
my.anchors <- FindIntegrationAnchors(object.list = my.list, reduction = "rpca", dims = 1:50)

rm(my.list)

gc()

#  creates an 'integrated' data assay 
mydata <- IntegrateData(anchorset = my.anchors, dims = 1:50)

gc()

# Perform integrated analysis
DefaultAssay(mydata) <- "integrated"

# Run the standard workflow for visualization and clustering
mydata <- ScaleData(mydata, verbose = FALSE)

mydata <- RunPCA(mydata, npcs = 50, verbose = FALSE)

tiff(file = "/path/to/dir/elbowplot.tiff", 
     units="cm", width = 50, height = 50, res = 300)

ElbowPlot(mydata, reduction = "pca", ndims = 50) + ggtitle("km_PDLBD integrated")

dev.off()

gc()

mydata <- RunUMAP(mydata, reduction = "pca", dims = 1:30)

mydata <- FindNeighbors(mydata, reduction = "pca", dims = 1:30)

mydata <- FindClusters(mydata, resolution = seq(0.1, 1.5, by=0.1), n.start = 100, n.iter = 100)

saveRDS(mydata, "/path/to/dir/km_PDLBD_integrated.rds")

print(sessionInfo())


###  integration of control with PDLBD 
# default assay was set to integrated for both control and PDLBD objects, 
# to enable use of the the already calculated PCA in each. 
# Reciprocal PCA (RPCA) was used. 

cnt <- readRDS("/path/to/dir/km_cntl_integ_snpc.rds")

DefaultAssay(cnt) <- 'integrated'

pd <- readRDS("/path/to/dir/km_PDLBD_integrated.rds")

DefaultAssay(pd) <- 'integrated'

# Perform integration

my.anchors <- FindIntegrationAnchors(object.list = list(cnt, pd), reduction = "rpca", dims = 1:50)

rm(cnt)
rm(pd)
gc()

#  create an 'integrated' data assay
mydata <- IntegrateData(anchorset = my.anchors)

gc()

# Perform integrated analysis
DefaultAssay(mydata) <- "integrated"

# Run the standard workflow for visualization and clustering
mydata <- ScaleData(mydata, verbose = FALSE)

mydata <- RunPCA(mydata, npcs = 50, verbose = FALSE)

tiff(file = "/path/to/dir/elbowplot.tiff", 
     units="cm", width = 40, height = 40, res = 300)

ElbowPlot(mydata, reduction = "pca", ndims = 50) + ggtitle("All integrated")

dev.off()

gc()

mydata <- RunUMAP(mydata, reduction = "pca", dims = 1:30)

mydata <- FindNeighbors(mydata, reduction = "pca", dims = 1:30)

mydata <- FindClusters(mydata, resolution = seq(0.1, 1.5, by=0.1), n.start = 100, n.iter = 100)

saveRDS(mydata, "/path/to/dir/km_All_integrated.rds")

### subset DA clusters based on canonical dopaminergic markers expression 
# clusters #4 and #10 at integrated_snn_res.0.1 resolution are dopaminergic

mydata <- SetIdent(mydata, value = 'integrated_snn_res.0.1')

km_DA <- subset(mydata, idents = c(4, 10))

table(km_DA$Status )

#  Ctrl   LBD    PD 
# 17039  4642  3322 

saveRDS(km_DA, "/path/to/dir/kmAll_DA.rds")

print(sessionInfo())


####### Integration of human and mouse dataset

# counts matrix of 'RNA' assay from human DA dataset
km_DA <- readRDS("/path/to/dir/kmAll_DA.rds")

DefaultAssay(km_DA) <- 'RNA'

Hscounts <- GetAssayData(km_DA, slot = 'counts')

write.table(Hscounts, file= '/path/to/dir/Hs_counts.tsv', sep='\t', col.names = T)

# gene map function 
mapfun <- function(mousegenes){
  gns <- mapIds(org.Mm.eg.db, mousegenes, "ENTREZID", "SYMBOL")
  mapped <- select(Orthology.eg.db, gns, "Homo_sapiens","Mus_musculus")
  naind <- is.na(mapped$Homo_sapiens)
  hsymb <- mapIds(org.Hs.eg.db, as.character(mapped$Homo_sapiens[!naind]), "SYMBOL", "ENTREZID")
  out <- data.frame(Mouse_symbol = mousegenes, mapped, Human_symbol = NA)
  out$Human_symbol[!naind] <- hsymb
  out
}

# keys = genes symbols 
z <- keys(org.Mm.eg.db, "SYMBOL")

Hs_Mm_mappedgenes <- mapfun(z)

write.csv(Hs_Mm_mappedgenes, '/path/to/dir/Hs_Mm_mappedgenes.csv', row.names = TRUE)

# create the human count matrix, with Orthologous mouse gene IDs
mmhs <- read.csv('/path/to/dir/Hs_Mm_mappedgenes.csv', sep = ',')

#  Row names in the metadata need to match the column names of the counts matrix, header = T

Hscounts <- read.table(file = '/path/to/dir/Hs_counts.tsv', sep = '\t', header = T) 

jgenes <- intersect(mmhs$Human_symbol, rownames(Hscounts))

# two subsets, genes with / without matching mouse symbol 
mdf <- Hscounts[rownames(Hscounts) %in% jgenes, ]

mdf2 <- Hscounts[!(rownames(Hscounts) %in% jgenes), ]

mmgenes <- mmhs[mmhs$Human_symbol %in% jgenes, ]

# re-arrange columns 
mmgenes <- mmgenes[, c(2, 5)]

# re-order mmgenes based on mdf rows:
mmgenes <- mmgenes[match(rownames(mdf), mmgenes$Human_symbol ),]

identical(rownames(mdf), mmgenes$Human_symbol )
# [1] TRUE

mdf <- cbind(mdf, mmgenes)

rownames(mdf) <- mdf$Mouse_symbol

mdf <- mdf[, -c(25004:25005)] 

# now the second part of the count matrix df: mdf2

# create a new column based on row names to modify them and reuse as row names 
mdf2$c1 <- rownames(mdf2) 

identical(rownames(mdf2), mdf2$c1 )
#[1] TRUE

mdf2$c1 <- tolower(mdf2$c1 ) 

# First letter to upper case:
firstup <- function(x) {
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

mdf2$c1 <- firstup(mdf2$c1 ) 

mdf2$c1 <- make.unique(mdf2$c1 ) 

mdf2$c1  <-  gsub('\\.', '-', mdf2$c1 )

rownames(mdf2) <- NULL

rownames(mdf2) <- mdf2$c1 

mdf2$c1 <- NULL 

# join the two data frames to create the whole count matrix based on mouse orthologs and unique human genes

merged_mdf <- rbind.data.frame(mdf, mdf2)

dim(merged_mdf)
#[1] 38945 25003

#same dim as the original count matrix
dim(Hscounts)
# [1] 38945 25003

write.table(merged_mdf, file= '/path/to/dir/Hs_counts_Mm.tsv', sep='\t', col.names = T)

# pull the metadata info from Seurat object 
metadata <- km_DA@meta.data 
dim(metadata)
#  [1] 25003    44

colnames(metadata)

# remove extra metdata columns 
# remove all "integrated_snn_res...." columns:
to.go <- sapply(grep('snn', colnames(metadata), value = T),
                function(x) c(paste(x, collapse = ",")))

for(i in to.go) {
  metadata[[i]] <- NULL
}

# remove "Name" and other unwanted columns:
to.go <- c('seurat_clusters', 'NAME', 'libname', 'species', 'organ', 'library_preparation_protocol', 
           'orig.ident', 'disease' )

for(i in to.go) {
  metadata[[i]] <- NULL
}

# set donor_ID as orig.ident:
metadata$orig.ident <- metadata$donor_id  

identical(metadata$donor_id, metadata$orig.ident  )
# [1] TRUE

metadata$donor_id <- NULL 

metadata <- metadata[, c(21, 1:20)] 

colnames(metadata)

# re-arrange metadata to make it more compatible with mouse metadata and vice versa 

metadata$species <- 'Homo_sapiens' 

metadata$species__ontology_label <- NULL

metadata <- metadata[, c(1:3, 21, 4:20)] 

metadata$disease__ontology_label <- NULL

metadata$lib_prep_protocol <- metadata$library_preparation_protocol__ontology_label 

metadata$library_preparation_protocol__ontology_label <- NULL 

metadata <- metadata[, c(1:4, 7, 11, 14:20, 5, 6, 8:10, 12, 13)] 

colnames(metadata)

# load the counts.tsv file, generated above:  

counts <- read.table(file = '/path/to/dir/Hs_counts_Mm.tsv', sep = '\t', header = T) 

# in the counts df, from which matrix.mtx is generated, cell IDs have a <.> but, 
# in metadata df, cell IDs have <-> instead. so for compatibility, <.>  must be changed to <->

colnames(counts) <-  gsub('\\.', '-', colnames(counts) )

identical(rownames(metadata), colnames(counts))
#  [1] TRUE

# re-write the new, updated counts table 
write.table(counts, file= '/path/to/dir/HsMmcounts.tsv', sep='\t', col.names = T)

# generate the 3 files required for Read10x() (in a new directory) 
write(x = rownames(counts), file = '/path/to/new_dir/features.tsv', sep = '\t')

write(x = colnames(counts), file = '/path/to/new_dir/barcodes.tsv', sep = '\t')

mat <- data.matrix(counts)
sp.mat <- Matrix(mat, sparse = T)
writeMM(obj = sp.mat, file = '/path/to/new_dir/matrix.mtx')

# in the written features.tsv file, there's only one column with gene names
#  but read10x(), takes column #2 for gene names by default, so gene.column = 1          

HsDA <- Read10X("/path/to/new_dir", gene.column = 1, cell.column = 1)

# Create Seurat Object for human dopaminergic data
HsDA <- CreateSeuratObject(HsDA, meta.data = metadata, project = 'HsDA')

View(HsDA@meta.data)

saveRDS(HsDA, '/path/to/dir/HsDA.rds')

### load the mouse mDA dataset generated in mDA.R 

mda <- readRDS("/path/to/dir/mDA.rds")

DefaultAssay(mda) <- 'RNA'

# get the counts matrix of 'RNA' assay from mouse mDA dataset

Mmcounts <- GetAssayData(mda, slot = 'counts')

write.table(Mmcounts, file= '/path/to/dir/Mm_counts.tsv', sep='\t', col.names = F)

#  change sparse matrix to dataframe

Mmcounts <- as.data.frame(as.matrix(Mmcounts))

dim(Mmcounts)
# [1] 26497 33052

# pull the metadata info from Seurat object 
metadata <- mda@meta.data 
dim(metadata)
#  [1] 33052    64

colnames(metadata)

# function to remove all "integrated_snn_res...." columns:

to.go <- sapply(grep('snn', colnames(metadata), value = T),
                function(x) c(paste(x, collapse = ",")))

for(i in to.go) {
  metadata[[i]] <- NULL
}

to.go <- sapply(grep('kmeans', colnames(metadata), value = T),
                function(x) c(paste(x, collapse = ",")))

for(i in to.go) {
  metadata[[i]] <- NULL
}

metadata <- metadata[, -c(13:30)] 

colnames(metadata)

dim(metadata)
# 33052    16

### Create Seurat Object from the counts df and the metadata df: 

# Row names in the metadata need to match the column names of the counts matrix

identical(rownames(metadata), colnames(Mmcounts))
#  [1] TRUE

MmDA <- CreateSeuratObject(counts = Mmcounts, meta.data = metadata, project = 'MmDA')

View(MmDA@meta.data )

# re-arrange metadata to make it more compatible for integration 

MmDA$species <- 'Mus_musculus' 

MmDA@meta.data <- MmDA@meta.data[, c(1:3, 17, 4:16)]

MmDA$UMIperGene <- MmDA$UMIsPerGene 

MmDA$UMIsPerGene <- NULL

MmDA$Status <- MmDA$condition  

MmDA$condition <- NULL

MmDA$GenesPerUMI <- NULL 

MmDA$sex <- 'female'

MmDA@meta.data <- MmDA@meta.data[, c(1:4, 17, 16, 6:10, 15, 5, 11:14)]  

names(MmDA@meta.data )

saveRDS(MmDA, '/path/to/dir/MmDA.rds')


### integration 

# load human & mouse datasets generated above
MmDA <- readRDS('/path/to/dir/MmDA.rds')

HsDA <- readRDS('/path/to/dir/HsDA.rds')

mylist <- list(HsDA, MmDA)

# normalize and identify variable features for each dataset independently
mylist <- lapply(X = mylist, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = mylist)

myanchors <- FindIntegrationAnchors(object.list = mylist, anchor.features = features)

rm(MmDA)
rm(HsDA)
gc()

#  create the 'integrated' data assay
mydata <- IntegrateData(anchorset = myanchors)

# Perform integrated analysis
DefaultAssay(mydata) <- "integrated"

# Run the standard workflow for visualization and clustering
mydata <- ScaleData(mydata, verbose = FALSE)

mydata <- RunPCA(mydata, npcs = 50, verbose = FALSE)

tiff(file = "/path/to/dir/elbowplot.tiff", 
     units="cm", width = 30, height = 30, res = 300)

ElbowPlot(mydata, reduction = "pca", ndims = 50) + ggtitle("HsMm integrated")

dev.off()

mydata <- RunUMAP(mydata, reduction = "pca", dims = 1:30)

mydata <- FindNeighbors(mydata, reduction = "pca", dims = 1:30)

mydata <- FindClusters(mydata, resolution = seq(0.1, 1.5, by=0.1), n.start = 100, n.iter = 100)

## create a new metadata column for both status and sample ID
mydata <- SetIdent(mydata, value = 'Status')

mydata$status.sample <- paste(Idents(mydata), mydata$orig.ident, sep = "_")

saveRDS(mydata, "/path/to/dir/DAHsMmIntegrated.rds")

### figure 8 

ord <- c("Ctrl", "PD", "LBD", "intact", "lesion")

mydata$Status <- factor(mydata$Status, levels = ord ) 

p <-  DimPlot(mydata, split.by = 'Status', group.by = 'Status', ncol = 3) + coord_fixed()

tiff(file = "/path/to/dir/fig8.tiff", 
     units="cm", width=50, height=25, res=300)

p + theme(legend.text = element_text(size = 18, face = 'bold'), 
          strip.text.x = element_text(size = 18, face = "bold"))

dev.off()

DimPlot(mydata, group.by = 'Status') + coord_fixed()

print(sessionInfo())









back to top