--- title: "Patch-seq QC assessment for manuscript" output: html_notebook --- Use the package patchSeqQC to assign QC scores (and in particular a contamination score) for each cell. To do this, we load the patchSeqQC library and then follow the relevant aspects of the tutorial here: https://github.com/PavlidisLab/patchSeqQC. The basic idea is as follows: (1) define markers for different broad classes, (2) determine expression of these genes in a reference data set, and (3) see how these genes are expressed in patch-seq data to assess contamination. The advantage of this method is it can be done independent of cluster calls, so long as you have a reliable set of marker genes for different broad classes. This requires both FACS and Patch-seq data to run. It is important to note that this analysis is based almost entirely on the the Shreejoy Trapathy's `patchSeqQC` library linked above, and which has now been published here: https://www.frontiersin.org/articles/10.3389/fnmol.2018.00363/full. Please be sure to cite this paper! ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` First load the required libraries and set the working directory. (NOTE: Probably most of these aren't actually needed) ```{r load_libraries} suppressPackageStartupMessages({ library(feather) library(dplyr) library(bindrcpp) library(viridisLite) library(ggplot2) library(mfishtools) # devtools::install_github("AllenInstitute/mfishtools") library(patchseqtools) # devtools::install_github("AllenInstitute/patchseqtools") library(patchSeqQC) # devtools::install_github('PavlidisLab/patchSeqQC') library(scrattch.io) # devtools::install_github("AllenInstitute/scrattch.io") }) options(stringsAsFactors = FALSE) patchDir = "\\\\allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/patch_seq/star/mouse_patchseq_VISp_20210403_collapsed40_cpm/" facsDir = "\\\\allen/programs/celltypes/workgroups/rnaseqanalysis/shiny/facs_seq/mouse_V1_ALM_20180520/" outputFolder = "\\\\allen/programs/celltypes/workgroups/hct/HCT_RNAseq/Jeremy/patchseq_analysis/mouse_IVSCC_methods_paper/" setwd(outputFolder) ``` Load and subsample the FACs data to 100 cells per cluster, excluding all ALM and outlier clusters. Note that the `anno.feather` and `data.feather` files for the FACS data below can be generated by following the `Code 1. Download and prepare the data` script in the `VENcelltypes` R library available here: https://github.com/AllenInstitute/L5_VEN. Just follow the instructions for mouse VISp (the other parts can be ignored). ```{r load FACS 1} # Read in the data --> NOTE, this can be generated from the website Samp.dat <- read_feather(paste(facsDir,"anno.feather",sep="")) Expr.dat <- feather(paste(facsDir,"data.feather",sep="")) # FPKM Samp.dat <- Samp.dat[match(Expr.dat$sample_id,Samp.dat$sample_id),] # Define a second annotation and data file with all VISp clusters ld <- sort(unique(Samp.dat$cluster_label)) useClust2 <- ld for (val in c("ALM","Batch Grouping","Doublet","High Intron","Low Quality")) useClust2 <- as.character(useClust2[!grepl(val,useClust2)]) kpSamp2 <- subsampleCells(Samp.dat$subclass_label,100) kpSamp2 <- kpSamp2&is.element(Samp.dat$cluster_label,useClust2) gc() ``` ```{r load FACS 2} annoFACs2 <- Samp.dat[kpSamp2,] datFACs2 <- as.matrix(Expr.dat[kpSamp2,names(Expr.dat)!="sample_id"]) rownames(datFACs2) <- annoFACs2$sample_id datFACs2 <- t(datFACs2) annoFACs2$subclass_label = make.names(annoFACs2$subclass_label) # Define class labels classBr <- annoFACs2$subclass_label classBr[annoFACs2$class_label!="Non-Neuronal"] = annoFACs2$class_label[annoFACs2$class_label!="Non-Neuronal"] classBr <- factor(classBr) clustersF <- factor(annoFACs2$subclass_label) gc() ``` Load the Patch-seq data and filter to only include those cells that will be used in the IVSCC methods manuscript. This includes cells from Vip, Pvalb, Sst, and Rbp4 Cre lines. ```{r load patchseq} # Read in the data Samp.datp <- read_feather(paste(patchDir,"anno.feather",sep="")) Expr.datp <- feather(paste(patchDir,"data.feather",sep="")) # FPKM Samp.datp <- Samp.datp[match(Expr.datp$sample_id,Samp.datp$sample_id),] # Read in sample subset and check to make sure they are all there cells_ivscc <- read.csv("cells_for_IVSCC_methods_all.csv") # "cells_for_IVSCC_methods.csv" print(c(length(as.character(cells_ivscc$specimen_id)),length(intersect(as.character(cells_ivscc$specimen_id),Samp.datp$spec_id_label)))) # Filter data kpSampP <- match(as.character(cells_ivscc$specimen_id),Samp.datp$spec_id_label) #1:dim(Samp.datp)[1] # is.element(Samp.datp$collection_label,c("Patch-seq Production","Patch-seq Pre-production")) # Use all cells annoPat_all<- Samp.datp[kpSampP,] annoPat_all$dendcluster_color = annoPat_all$cluster_color # I don't know why/if we need this line of code... datPat_all <- as.matrix(Expr.datp[kpSampP,names(Expr.datp)!="sample_id"]) rownames(datPat_all) <- annoPat_all$sample_id datPat_all <- t(datPat_all) gc() ``` Define marker genes for each broad class and contamination class (use 50 total for now). These are selected using some reasonable approach which involves a combination of median expression per class and proportion of cells per class expressing a given gene. The approach is specifically what was published in the `patchSeqQC` library, with the only difference being that the specific reference data set was changed to our mouse VISp/ALM data set. Here is the description of on and off markers: 'On markers, are genes that are highly expressed in the cell type of interest with enriched expression relative to other cell types. The second class, Off markers, are expected to be expressed at low levels in a given patch-seq cell type.' For this analysis, we build "on" markers by subclass, and "off" markers by subclass for Non-neuronal cells and by class for neuronal cells. This approximates what was done in Shreejoy's paper. ```{r define marker genes} markers <- defineClassMarkers(datFACs2,clustersF,classBr,numMarkers = 50) allMarkers <- unique(unlist(markers)) markerTable <- NULL for (i in 1:length(markers)) markerTable <- cbind(markerTable,markers[[i]]) colnames(markerTable) <- names(markers) write.csv(markers, paste0(outputFolder,"class_markers.csv"),row.names=FALSE) ``` Format the data for FACS and patch-seq. ```{r Format the data} # Format the data for FACS and patch-seq tmp <- datFACs2 rownames(tmp) <- make.names(rownames(tmp)) facs_df <- as.data.frame(t(tmp[allMarkers,])+1) facs_df$sample_id <- rownames(facs_df) facs_df$major_type <- as.character(classBr) facs_df$contam_type <- as.character(clustersF) tmp <- datPat_all rownames(tmp) <- make.names(rownames(tmp)) pat_df <- as.data.frame(t(tmp[allMarkers,annoPat_all$sample_id])+1) pat_df$sample_id <- rownames(pat_df) ``` Define which subclass each patch-seq cell is assigned to, based on maximal marker expression. ```{r Calculate contamination} nm <- names(markers) isOn <- substr(nm,nchar(nm)-2,nchar(nm))=="_on" useThese <- nm[isOn&(!is.element(nm,paste0(nm,"_on")))] useThese <- setdiff(useThese,c("CR_on","Meis2_on")) # These types are rare and unlikely to be actually patched. subclassDat <- calcContamAllTypes(pat_df, markers[useThese]) # Identify subclass based on marker gene expression subclass <- colnames(subclassDat)[subclassDat %>% apply(1,which.max)] subclass <- gsub("_on","",subclass) pat_df$contam_type <- subclass tmp2 = match(pat_df$contam_type,annoFACs2$subclass_label) pat_df$major_type <- as.character(classBr)[tmp2] pat_df$contam_type <- paste0(pat_df$contam_type,"_on") ``` ### Calculate the QC metrics This is also a wrapper function for `patchSeqQC` which includes quality score, contamination score, and marker gene sum (both normalized and un-normalized). Here are some approximate definitions for these things: * norm_marker_sum - This is a measure of how much expression of expected marker genes are found in a patch-seq cell relative to what is seen in the FACs data from which the markers were derived (more detail below.) * contam_sum - the contamination index for cell c (of type A), reflects off-target contamination across multiple broad cell types * quality_score - "we correlated each patch-seq sample's expression of “on” and “off” marker genes with the average expression profile of dissociated cells of the same type' (Details on norm marker sum: "We summarized the expression of multiple cell type specific markers specific to cell type B (MarkersB), in a cell c of type A as: Mc_A, B=∑m∈MarkersBlog2(cm). Where cm denotes the normalized expression of marker gene m in cell c. We further used the dissociated-cell reference data to quantify how much marker expression of cell type B's markers one would typically expect in cells of type A as: dA_B=mediantypeA(Mc_A, B). Reflecting the median marker expression of cell type B's markers in dissociated cells of type A.") ```{r caclulate QC metrics} qcMetrics <- calculatePatchSeqQCMetrics2(pat_df,facs_df,markers) # We are using NMS score of 0.4 as a pass/fail call qcMetrics$QC_pass <- c(TRUE,FALSE)[(qcMetrics$marker_sum_norm<0.40)+1] ``` ### Plot subclass vs. NMS pass First group by NMS pass/fail. ```{r prep data for plot} # Merge sparse subclasses class2 <- subclass class2[is.element(class2,c("Serpinf1","Sncg","Lamp5"))] = "Inh_other" class2[is.element(class2,c("L2.3.IT","L6.CT","L6.IT","NP"))] = "Exc_other" class2 <- factor(class2,levels=c("Pvalb","Sst","Vip","Inh_other","L4","L5.IT","L5.PT","Exc_other")) geno <- factor(cells_ivscc$Cre, levels=c("Pvalb","Sst","Vip","Rbp4")) datPlot <- data.frame(cell=annoPat_all$spec_id_label, NMS_pass=factor(qcMetrics$QC_pass,levels=c(TRUE,FALSE)), genotype=geno, subclass=class2) head(datPlot) ``` ```{r plot subclass assignments by NMS score, fig.width=7,fig.height=9} g1 <- ggplot(datPlot, aes(x= subclass, group=NMS_pass)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..,accuracy = 0.1), y= ..prop.. ), stat= "count", size=3, vjust = -.5) + labs(y = "Percent", fill="subclass") + facet_grid(genotype ~ NMS_pass) + scale_y_continuous(labels = scales::percent, expand = expansion(mult = c(0,0.1), add = 0)) + scale_x_discrete(expand = expansion(add = 1)) + theme(axis.text.x = element_text(angle = 90)) g1 ggsave("subclass_plot.pdf",height=9,width=8) ``` Now subset for GABA cells only and bin by 0.1 bins. ```{r prep data for plot GABA} kp <- (geno!="Rbp4") match <- as.character(geno)==as.character(class2) datPlot <- data.frame(cell=annoPat_all$spec_id_label, NMS_score=floor(10*qcMetrics$marker_sum_norm)/10, genotype=geno, match=match)[kp,] head(datPlot) tabPlot <- table(datPlot$NMS_score,datPlot$match,droplevels(datPlot$genotype)) for (i in 1:3) tabPlot[,,i] <- tabPlot[,,i]/rowSums(tabPlot[,,i]) write.csv(tabPlot[,2,],"heatmap_values.csv") ``` ```{r plot GABA results, fig.height=8,fig.width=5} out <- tabPlot[,2,] out <- out[dim(out)[1]:1,] col <- WGCNA::blueWhiteRed(50)[30:50] gplots::heatmap.2(out,scale="none", Rowv = NA, Colv=NA, col=col, cellnote=round(100*out), notecol="black",trace="none",notecex=0.7,margins=c(5,15)) pdf("subclass_heatmap.pdf") gplots::heatmap.2(out,scale="none", Rowv = NA, Colv=NA, col=col, cellnote=round(100*out), notecol="black",trace="none",notecex=0.7,margins=c(8,20)) dev.off() ``` ```{r session info} sessionInfo() ```