---
title: "Adult scRNAseq datasets analysis"
subtitle: "GL60(01+04) + run_3"
author: "Renaud Mevel"
output:
  html_document:
    self_contained: yes
    toc: true
    toc_float: true
    df_print: paged
    number_sections: false
editor_options: 
  chunk_output_type: console
---
```{r setup, echo=FALSE, message=FALSE, results='hide'}
library(knitr)
knitr::opts_chunk$set(cache=TRUE, error=FALSE, cache.lazy = TRUE)
```

## Objective

Analysis of 3 single cell RNAseq datasets generated on the 10x platform, including:  
* pre-processing of the 3 individual runs  
* assignment of cells to MULTIseq tag oligos used during sample multiplexing  
* integration of the 3 runs into a single object  
* dimension reduction, clustering  
  
    
Raw sequencing files and processed gene expression matrices have been deposited in the NCBI Gene Expression Omnibus under the accession number GSE15194.  
    
  
Directories need to be adapted throughout the scripts.
    
## Prepare the environment

```{r , warning=FALSE, message=FALSE}
# Data wrangling
library(plyr)
library(dplyr)
library(tidyverse)
library(data.table)

# Plots
library(gridExtra)
library(ggpubr)

# sc
library(Seurat)
library(sctransform)
library(MAST)
library(org.Mm.eg.db)
library(DoubletFinder)
library(ggbeeswarm)

# GO
library(gprofiler2)

# Palettes
library(pals)
pal25 <- as.character(pals::cols25(n=25))
kel <- as.character(pals::kelly(n=22)[3:22])
pal.trt <- c("#a1e186", "#b9006e")
pal.rfp <- c("#ea4749", "#479bea")
pal.runs <- c("#ec0016", "#ffc554", "#20a4ff")
pal.lobe = c("#272873", "#45A5A7", "#cb5155")
pal.pop <- 
  c( 
    "#7a0177", "#dd3497",    # cas AP
    "#f768a1", "#fa9fb5",    # cas DLP
    "#fc9272", "#cb181d",    # cas
    "#CC6677", "#AA4466",    # cas VP
    "#081d58", "#225ea8",    # hn AP
    "#7fcdbb", "#7fb8cd",    # hn DLP
    "#67a9cf", "#b2d3e7"     # hn VP
    )

pal.pop2 <- 
  c( 
    "#7a0177", "#dd3497",    # cas AP
    "#f768a1", "#fa9fb5",    # cas DLP
    "#CC6677", "#AA4466",    # cas VP
    "#8294B7", "#3F5C91",    # hn AP
    "#75BBB4", "#468C84",    # hn DLP
    "#77C7E6", "#3A92C7"     # hn VP
    )

pal.cl <- c(
  "#B2B8E0", #LumA
  "#4A6FE3", #LumB
  "#1037AA", #LumC
  "#D33F6A", #LumD
  "#EF9708", #LumE
  "#F0B98D", #LumF
  "#8DD593"  #Basal
  )

# Directories
setwd(dir = "~/set-directory/")
pdf.dir <- "~/set-directory/"
fig.dir <- "~/set-directory/"

# Functions
source("Adult_functions.R")

# Seed
set.seed(1)
```

## Run 1: run_1

### Load 10x

```{r}
# This contains the  output of cell ranger count
cr <- Read10X(data.dir = file.path("raw_data/run_1/filtered_feature_bc_matrix/")) # NOVASEQ RUN
sc <- CreateSeuratObject(counts = cr, project = "run_1", assay = "RNA")
```

### Load multiseq

```{r}
bar.table01 <- readRDS(file = file.path("raw_data/run_1/multiseq_count_matrix/bar.table.rds"))
```

Transform and plot data
```{r}
# transform
tbar <- t(bar.table01)
tbar <- tbar[1:8, ]

htos01 <- as.data.frame(tbar)

# make long format and plot log read counts
df.long01 <- as.data.frame(tbar) %>% 
  tibble::rownames_to_column(var = "tag") %>% 
  tidyr::gather(cell_barcode, value, -tag) %>% 
  dplyr::mutate(Log = log10(value+1))

ggplot(df.long01, aes(x=cell_barcode)) + 
  geom_point(aes(y=Log, colour = factor(tag)), alpha = 0.5) + 
  theme(panel.grid.minor = element_blank(),
        axis.text.x=element_blank()) +
  facet_grid(~tag) +
  scale_colour_brewer(palette = "Paired") +
  ggtitle("Run 1: MULTIseq read count") +  
  ylab("Log10 of tag read count") +
  xlab("Unique cell Barcodes") +
  labs(color='tags')
```

Add hto data
```{r}
# rename
colnames(htos01) <- paste0(colnames(htos01), "-1", sep = "")

# Add HTO data as a new assay independent from RNA
sc[["HTO"]] <- CreateAssayObject(counts = htos01)

# Number of cells
cat("Number of cells:", length(colnames(sc)))
```

### Filter

Filter-out low quality cells. Keep cells which have more than 750 genes, less than 7500 UMIs, and less than 10% mitochondrial transcripts.
```{r}
sc[["percent.mt"]] <- PercentageFeatureSet(sc, pattern = "^mt-")
plot1 <- VlnPlot(sc, features = c("percent.mt"))
plot2 <- VlnPlot(sc, features = c("percent.mt"), y.max = 10)
plot1+plot2

VlnPlot(sc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2

sc <- subset(sc, subset = nFeature_RNA > 750 & nFeature_RNA < 7500 & percent.mt < 10)
VlnPlot(sc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2

cat("Number of cells:", length(colnames(sc)))
```

### Demultiplex cells

Normalize HTO data, here we use centered log-ratio (CLR) transformation. Then use `HTODemux` to find thresholds.
```{r}
DefaultAssay(object = sc) <- "HTO"
sc <- NormalizeData(sc, assay = "HTO", normalization.method = "CLR")

# 0.99
sc <- HTODemux(sc, assay = "HTO", positive.quantile = 0.99, kfunc = "kmeans")

# Print numbers of cells per classification
table(sc$HTO_classification.global)
table(sc$hash.ID)
```

Show barcodes in UMAP space. Some cells appear to be wrongly classified. We will deal with them later by applying a stricter threshold determined empirically to remove cells with low barcode counts.
```{r}
sc <- RunUMAP(sc, assay = "HTO", features = rownames(sc@assays$HTO))
DimPlot(sc, reduction = "umap", cols = pal25, pt.size = 1)
ggsave(filename = file.path(fig.dir, "0_individual_runs/Run1_UMAP_tags_unfiltered.png"), device = "png", width = 6, height = 4, dpi = 300)
```

Note: the labelling of Barcode 3 has failed, meant to label DLP_HN_RFP+ cells. This population will be included later from the 3rd run ('run_3').
  
  
Add the multiseq tag data to sc object
```{r}
# add metadata under tag name
sc <- AddMetaData(object = sc, metadata = sc$hash.ID, col.name = "tag")
```

### Metadata

Add population/sample name to the tag information
```{r}
#read in metadata
meta <- read.delim(file.path("metadata/metadata_run1.txt"), "\t", header = T, stringsAsFactors = F)
meta <- dplyr::filter(meta, run_id == "run_1")
meta <- dplyr::rename(meta, final.ids = bar_id)

#extract cell ids/htos
df.tags <- as.data.frame(sc$tag)

#make the datafrane
df.pop <- tibble::rownames_to_column(.data = df.tags)
df.pop <- dplyr::rename(df.pop, final.ids = `sc$tag`)
df.pop <- left_join(df.pop, meta, "final.ids")

df.sa <- dplyr::select(df.pop, rowname, population)
table(df.sa$population)

df.trt <- dplyr::select(df.pop, rowname, treatment)
table(df.trt$treatment)

df.lobe <- dplyr::select(df.pop, rowname, lobe)
table(df.lobe$lobe)

df.rfp <- dplyr::select(df.pop, rowname, RFP)
table(df.rfp$RFP)

#make right vector to add metadata
df.sa <- tibble::column_to_rownames(.data = df.sa, var = "rowname")
df.trt <- tibble::column_to_rownames(.data = df.trt, var = "rowname")
df.lobe <- tibble::column_to_rownames(.data = df.lobe, var = "rowname")
df.rfp <- tibble::column_to_rownames(.data = df.rfp, var = "rowname")

#add metadata
sc <- AddMetaData(object = sc, metadata = df.sa, col.name = "population")
sc <- AddMetaData(object = sc, metadata = df.trt, col.name = "treatment")
sc <- AddMetaData(object = sc, metadata = df.lobe, col.name = "lobe")
sc <- AddMetaData(object = sc, metadata = df.rfp, col.name = "rfp")
```

### Manual filtering

UMAP of singlets in barcode space reveals some likely wrong clustering.
```{r}
tmp <- sc
DefaultAssay(tmp) <- "HTO"
Idents(tmp) <- "HTO_classification.global"
tmp <- subset(tmp, idents = "Singlet")
Idents(tmp) <- "hash.ID"
table(tmp$HTO_classification.global)
table(tmp$hash.ID)
tmp <- RunUMAP(tmp, assay = "HTO", features = rownames(tmp@assays$HTO))
DimPlot(tmp, reduction = "umap", cols = pal25, pt.size = 1)
rm(tmp)
```

*Cut-offs found by HTODemux:*  
* Cutoff for Bar1 : 32 reads  
* Cutoff for Bar2 : 122 reads  
* Cutoff for Bar3 : 0 reads  
* Cutoff for Bar4 : 46 reads  
* Cutoff for Bar5 : 19 reads  
* Cutoff for Bar6 : 22 reads  
* Cutoff for Bar7 : 6 reads  
* Cutoff for Bar8 : 5 reads  
  
  
*Manual cut-offs:*  
* Bar1: 35  
* Bar2: 125  
* Bar3: exclude  
* Bar4: 50  
* Bar5: 30  
* Bar6: 30  
* Bar7: 25  
* Bar8: 25  
  
  
Apply empirical thresholds:
```{r}
DefaultAssay(sc) <- "HTO"

Bar1 <- subset(sc, idents = "Bar1", subset = Bar1>35, slot = "counts")
Bar2 <- subset(sc, idents = "Bar2", subset = Bar2>125, slot = "counts")
Bar4 <- subset(sc, idents = "Bar4", subset = Bar4>50, slot = "counts") #
Bar5 <- subset(sc, idents = "Bar5", subset = Bar5>30, slot = "counts")
Bar6 <- subset(sc, idents = "Bar6", subset = Bar6>30, slot = "counts") #
Bar7 <- subset(sc, idents = "Bar7", subset = Bar7>25, slot = "counts") #
Bar8 <- subset(sc, idents = "Bar8", subset = Bar8>25, slot = "counts") #

sc <- merge(Bar1, y = c(Bar2,Bar4,Bar5,Bar6,Bar7,Bar8))

# Number of cells per classification
table(sc$HTO_classification.global)
table(sc$hash.ID)

# UMAP in barode space
sc <- RunUMAP(sc, assay = "HTO", features = rownames(sc@assays$HTO))
DimPlot(sc, reduction = "umap", cols = pal25)

cat("Number of cells:", length(colnames(sc)))
```

### DoubletFinder

The use of Multiseq barcodes enables the identification of most doublets. However, because the labelling did not work well for one population (Barcode 3), we are likely missing certain barcodes. To further clean the dataset of additional doublets, we apply the [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) package.
```{r}
## Pre-process  -------------------------------------------------------------------------------------------------------------
sc.d1 <- sc
DefaultAssay(sc.d1) <- "RNA"
sc.d1 <- NormalizeData(sc.d1, normalization.method = "LogNormalize", scale.factor = 1e4)
sc.d1 <- FindVariableFeatures(sc.d1, selection.method = "vst", nfeatures = 3000, verbose = FALSE)
sc.d1 <- ScaleData(sc.d1, verbose = FALSE)
sc.d1 <- RunPCA(sc.d1, npcs = 50, verbose = T)
sc.d1 <- RunUMAP(sc.d1, reduction = "pca", dims = 1:50, seed.use = 1)
sc.d1 <- FindNeighbors(sc.d1, reduction = "pca", dims = 1:50, k.param = 20L)
sc.d1 <- FindClusters(sc.d1, resolution = 0.8)
DimPlot(sc.d1, cols = pal25) 

## pK Identification (no ground-truth) ---------------------------------------------------------------------------------------
sweep.res.list_sc.d1 <- paramSweep_v3(sc.d1, PCs = 1:50, sct = FALSE)
sweep.stats_sc.d1 <- summarizeSweep(sweep.res.list_sc.d1, GT = FALSE)
bcmvn_sc.d1 <- find.pK(sweep.stats_sc.d1)

## Plot results of pK Identification
pK=as.numeric(as.character(bcmvn_sc.d1$pK))
BCmetric=bcmvn_sc.d1$BCmetric
pK_choose = pK[which(BCmetric %in% max(BCmetric))]

par(mar=c(5,4,4,8)+1,cex.main=1.2,font.main=2)
plot(x = pK, y = BCmetric, pch = 16,type="b", col = "blue",lty=1)
abline(v=pK_choose,lwd=2,col='red',lty=2)
title("The BCmvn distributions")
text(pK_choose,max(BCmetric),as.character(pK_choose),pos = 4,col = "red")

## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
annotations <- sc.d1@meta.data$tag
homotypic.prop <- modelHomotypic(annotations)           
nExp_poi <- round(0.05*length(colnames(sc.d1)))  ## Assuming 5% doublet 
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
sc.d1 <- doubletFinder_v3(sc.d1, PCs = 1:50, pN = 0.25, pK = 0.02, nExp = nExp_poi.adj, reuse.pANN = FALSE, sct = FALSE)
table(sc.d1$DF.classifications_0.25_0.02_25)
DimPlot(sc.d1, group.by = "DF.classifications_0.25_0.02_25", cols = pal25)

## Extract cell ids of Singlets ----------------------------------------------------------------------------------------------
Idents(sc.d1) <- "DF.classifications_0.25_0.02_25"
sc.d1 <- subset(sc.d1, idents = "Singlet")
singlets <- names(sc.d1$DF.classifications_0.25_0.02_25)
```

### Singlets

```{r}
# keep singlets only based on DoubletFinder
# (or not singlets <- names(sc$HTO_classification.global)) // delete
sc <- subset(sc, cells = singlets)
cat("Number of cells:", length(colnames(sc)))

# make a copy at this stage for future use if needed
sc01_save <- sc

sc01 <- sc
DefaultAssay(sc01) <- "RNA"
Idents(sc01) <- "population"

# umap on tags after removing negatives and doublets
sc.umap01<- RunUMAP(sc01, assay = "HTO", features = rownames(sc01@assays$HTO))
DimPlot(sc.umap01, group.by = "tag", cols = pal25, pt.size = 1) 
#ggsave(filename = file.path(pdf.dir, "run_1_UMAP_tags_filtered.pdf"), width = 10, height = 6)
ggsave(filename = file.path(fig.dir, "0_individual_runs/Run1_UMAP_tags_filtered.png"), device = "png", width = 6, height = 4, dpi = 300)
```

Remove unwanted objects
```{r}
rm(sc,Bar1,Bar2,Bar4,Bar5,Bar6,Bar7,Bar8)
rm(singlets,sc.d1,sweep.res.list_sc.d1,sweep.stats_sc.d1,bcmvn_sc.d1,pK,BCmetric,pK_choose,annotations,homotypic.prop,nExp_poi,nExp_poi.adj)
```

## Run 2: run_2

### Load 10x

```{r}
# This contains the  output of cell ranger count
cr <- Read10X(data.dir = file.path("raw_data/run_2/filtered_feature_bc_matrix/")) # NOVASEQ RUN
sc <- CreateSeuratObject(counts = cr, project = "run_2", assay = "RNA")
```

### Load multiseq

```{r}
bar.table04 <- readRDS(file = file.path("raw_data/run_2/multiseq_count_matrix/bar.table.rds"))
```

Transform and plot data
```{r}
# transform
tbar <- t(bar.table04)
tbar <- tbar[1:12, ]

htos04 <- as.data.frame(tbar)
  
# make long format and plot log read counts
df.long04 <- as.data.frame(tbar) %>% 
  tibble::rownames_to_column(var = "tag") %>% 
  tidyr::gather(cell_barcode, value, -tag) %>% 
  dplyr::mutate(Log = log10(value+1))

ggplot(df.long04, aes(x=cell_barcode)) + 
  geom_point(aes(y=Log, colour = factor(tag)), alpha = 0.5) + 
  theme(panel.grid.minor = element_blank(),
        axis.text.x=element_blank()) +
  facet_grid(~tag) +
  scale_colour_brewer(palette = "Paired") +
  ggtitle("Run 2: MULTIseq read count") +
  ylab("Log10 of tag read count") +
  xlab("Unique cell Barcodes") +
  labs(color='tags')
```

Add hto data
```{r}
# rename
colnames(htos04) <- paste0(colnames(htos04), "-1", sep = "")

# Add HTO data as a new assay independent from RNA
sc[["HTO"]] <- CreateAssayObject(counts = htos04)

# Number of cells
cat("Number of cells:", length(colnames(sc)))
```

### Filter

Filter-out low quality cells. Keep cells which have more than 750 genes, less than 7500 UMIs, and less than 10% mitochondrial transcripts.
```{r}
sc[["percent.mt"]] <- PercentageFeatureSet(sc, pattern = "^mt-")
plot1 <- VlnPlot(sc, features = c("percent.mt"))
plot2 <- VlnPlot(sc, features = c("percent.mt"), y.max = 10)
plot1+plot2

VlnPlot(sc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2

sc <- subset(sc, subset = nFeature_RNA > 750 & nFeature_RNA < 7500 & percent.mt < 10)
VlnPlot(sc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2

# Number of cells
cat("Number of cells:", length(colnames(sc)))
```

### Demultiplex cells

Normalize HTO data, here we use centered log-ratio (CLR) transformation. Then use `HTODemux` to find thresholds.
```{r}
DefaultAssay(object = sc) <- "HTO"
sc <- NormalizeData(sc, assay = "HTO", normalization.method = "CLR")

# 0.99
sc <- HTODemux(sc, assay = "HTO", positive.quantile = 0.99, kfunc = "kmeans")

# Print numbers of cells per classification
table(sc$HTO_classification.global)
table(sc$hash.ID)
```

Show barcodes in UMAP space. Some cells appear to be wrongly classified. We will deal with them later by applying a stricter threshold determined empirically to remove cells with low barcode counts.
```{r}
sc <- RunUMAP(sc, assay = "HTO", features = rownames(sc@assays$HTO))
DimPlot(sc, reduction = "umap", cols = pal25, pt.size = 1)
ggsave(filename = file.path(fig.dir, "0_individual_runs/Run2_UMAP_tags_unfiltered.png"), device = "png", width = 6, height = 4, dpi = 300)
```

Note: the labelling of Barcode 3 has failed here too, meant to label DLP_HN_RFP+ cells. This population will be included later from the 3rd run ('run_3').
  
  
Add the multiseq tag data to sc object
```{r}
# add metadata under tag name
sc <- AddMetaData(object = sc, metadata = sc$hash.ID, col.name = "tag")
```

### Metadata

Add population/sample name to the tag information
```{r}
#read in metadata
meta <- read.delim(file.path("metadata/metadata_run2.txt"), "\t", header = T, stringsAsFactors = F)
meta <- dplyr::filter(meta, run_id == "run_2")
meta <- dplyr::rename(meta, final.ids = bar_id)

#extract cell ids/htos
df.tags <- as.data.frame(sc$tag)

#make the datafrane
df.pop <- tibble::rownames_to_column(.data = df.tags)
df.pop <- dplyr::rename(df.pop, final.ids = `sc$tag`)
df.pop <- left_join(df.pop, meta, "final.ids")

df.sa <- dplyr::select(df.pop, rowname, population)
table(df.sa$population)

df.trt <- dplyr::select(df.pop, rowname, treatment)
table(df.trt$treatment)

df.lobe <- dplyr::select(df.pop, rowname, lobe)
table(df.lobe$lobe)

df.rfp <- dplyr::select(df.pop, rowname, RFP)
table(df.rfp$RFP)

#make right vector to add metadata
df.sa <- tibble::column_to_rownames(.data = df.sa, var = "rowname")
df.trt <- tibble::column_to_rownames(.data = df.trt, var = "rowname")
df.lobe <- tibble::column_to_rownames(.data = df.lobe, var = "rowname")
df.rfp <- tibble::column_to_rownames(.data = df.rfp, var = "rowname")

#add metadata
sc <- AddMetaData(object = sc, metadata = df.sa, col.name = "population")
sc <- AddMetaData(object = sc, metadata = df.trt, col.name = "treatment")
sc <- AddMetaData(object = sc, metadata = df.lobe, col.name = "lobe")
sc <- AddMetaData(object = sc, metadata = df.rfp, col.name = "rfp")
```

### Manual filtering

UMAP of singlets in barcode space reveals some likely wrong clustering.
```{r}
tmp <- sc
DefaultAssay(tmp) <- "HTO"
Idents(tmp) <- "HTO_classification.global"
tmp <- subset(tmp, idents = "Singlet")
Idents(tmp) <- "hash.ID"
table(tmp$HTO_classification.global)
table(tmp$hash.ID)
tmp <- RunUMAP(tmp, assay = "HTO", features = rownames(tmp@assays$HTO))
DimPlot(tmp, reduction = "umap", cols = pal25, pt.size = 1)
rm(tmp)
```

*Cut-offs found by HTODemux:*  // UPDATE VALUES
Cutoff for Bar1 : 33 reads  
Cutoff for Bar2 : 70 reads  
Cutoff for Bar3 : 1 reads  
Cutoff for Bar4 : 75 reads  
Cutoff for Bar5 : 93 reads  
Cutoff for Bar6 : 148 reads  
Cutoff for Bar7 : 30 reads  
Cutoff for Bar8 : 17 reads  
Cutoff for Bar9 : 24 reads  
Cutoff for Bar10 : 12 reads  
Cutoff for Bar11 : 19 reads  
Cutoff for Bar12 : 17 reads  
  
  
*Manual cut-offs:*  
* Bar1: 35  
* Bar2: 75  
* Bar3: exclude  
* Bar4: 80  
* Bar5: 100  
* Bar6: 150  
* Bar7: 35  
* Bar8: 25  
* Bar9: 25  
* Bar10: 20  
* Bar11: 25  
* Bar12: 25  
  
  
Apply empirical thresholds:
```{r}
DefaultAssay(sc) <- "HTO"

Bar1 <- subset(sc, idents = "Bar1", subset = Bar1>35, slot = "counts")
Bar2 <- subset(sc, idents = "Bar2", subset = Bar2>75, slot = "counts")
Bar4 <- subset(sc, idents = "Bar4", subset = Bar4>80, slot = "counts")
Bar5 <- subset(sc, idents = "Bar5", subset = Bar5>100, slot = "counts")
Bar6 <- subset(sc, idents = "Bar6", subset = Bar6>150, slot = "counts")
Bar7 <- subset(sc, idents = "Bar7", subset = Bar7>35, slot = "counts")
Bar8 <- subset(sc, idents = "Bar8", subset = Bar8>25, slot = "counts")
Bar9 <- subset(sc, idents = "Bar9", subset = Bar9>25, slot = "counts")
Bar10 <- subset(sc, idents = "Bar10", subset = Bar10>20, slot = "counts")
Bar11 <- subset(sc, idents = "Bar11", subset = Bar11>25, slot = "counts")
Bar12 <- subset(sc, idents = "Bar12", subset = Bar12>25, slot = "counts")

sc <- merge(Bar1, y = c(Bar2,Bar4,Bar5,Bar6,Bar7,Bar8,Bar9,Bar10,Bar11,Bar12))

# Number of cells per classification
table(sc$HTO_classification.global)
table(sc$hash.ID)

# UMAP in barode space
sc <- RunUMAP(sc, assay = "HTO", features = rownames(sc@assays$HTO))
DimPlot(sc, reduction = "umap", cols = pal25)

# Number of cells
cat("Number of cells:", length(colnames(sc)))s
```

### DoubletFinder

The use of Multiseq barcodes enables the identification of most doublets. However, because the labelling did not work well for one population (Barcode 3), we are likely missing certain barcodes. To further clean the dataset of additional doublets, we apply the [DoubletFinder](https://github.com/chris-mcginnis-ucsf/DoubletFinder) package.
```{r}
## Pre-process  -------------------------------------------------------------------------------------------------------------
sc.d4 <- sc
DefaultAssay(sc.d4) <- "RNA"
sc.d4 <- NormalizeData(sc.d4, normalization.method = "LogNormalize", scale.factor = 1e4)
sc.d4 <- FindVariableFeatures(sc.d4, selection.method = "vst", nfeatures = 3000, verbose = FALSE)
sc.d4 <- ScaleData(sc.d4, verbose = FALSE)
sc.d4 <- RunPCA(sc.d4, npcs = 50, verbose = T)
sc.d4 <- RunUMAP(sc.d4, reduction = "pca", dims = 1:50, seed.use = 1)
sc.d4 <- FindNeighbors(sc.d4, reduction = "pca", dims = 1:50, k.param = 20L)
sc.d4 <- FindClusters(sc.d4, resolution = 0.8)
DimPlot(sc.d4, cols = pal25) 

## pK Identification (no ground-truth) ---------------------------------------------------------------------------------------
sweep.res.list_sc.d4 <- paramSweep_v3(sc.d4, PCs = 1:50, sct = FALSE)
sweep.stats_sc.d4 <- summarizeSweep(sweep.res.list_sc.d4, GT = FALSE)
bcmvn_sc.d4 <- find.pK(sweep.stats_sc.d4)

## Plot results of pK Identification
pK=as.numeric(as.character(bcmvn_sc.d4$pK))
BCmetric=bcmvn_sc.d4$BCmetric
pK_choose = pK[which(BCmetric %in% max(BCmetric))]

par(mar=c(5,4,4,8)+1,cex.main=1.2,font.main=2)
plot(x = pK, y = BCmetric, pch = 16,type="b", col = "blue",lty=1)
abline(v=pK_choose,lwd=2,col='red',lty=2)
title("The BCmvn distributions")
text(pK_choose,max(BCmetric),as.character(pK_choose),pos = 4,col = "red")

## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
annotations <- sc.d4@meta.data$tag
homotypic.prop <- modelHomotypic(annotations)           
nExp_poi <- round(0.05*length(colnames(sc.d4)))  ## Assuming 5% doublet 
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))

## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
## Use pK of 0.01 as it corresponds to the second maxima after 0.25, which is too high and performs poorly. 
## 0.01 accurately identifies a small talk of cells clustering with basal cells and expressing high levels of Pbsn.
sc.d4 <- doubletFinder_v3(sc.d4, PCs = 1:50, pN = 0.25, pK = 0.01, nExp = nExp_poi.adj, reuse.pANN = FALSE, sct = FALSE)
table(sc.d4$DF.classifications_0.25_0.01_95)
DimPlot(sc.d4, group.by = "DF.classifications_0.25_0.01_95", cols = pal25)

## Extract cell ids of Singlets ----------------------------------------------------------------------------------------------
Idents(sc.d4) <- "DF.classifications_0.25_0.01_95"
sc.d4 <- subset(sc.d4, idents = "Singlet")
singlets <- names(sc.d4$DF.classifications_0.25_0.01_95)
```

### Singlets

```{r}
# keep singlets only based on DoubletFinder
# (or not singlets <- names(sc$HTO_classification.global)) // delete
sc <- subset(sc, cells = singlets)

# Number of cells
cat("Number of cells:", length(colnames(sc)))

# make a copy at this stage for future use if needed
sc04_save <- sc

sc04 <- sc
DefaultAssay(sc04) <- "RNA"
Idents(sc04) <- "population"

# umap on tags after removing negatives and doublets
sc.umap04<- RunUMAP(sc04, assay = "HTO", features = rownames(sc04@assays$HTO))
DimPlot(sc.umap04, group.by = "tag", cols = pal25, pt.size = 1) 
#ggsave(filename = file.path(pdf.dir, "run_2_UMAP_tags_filtered.pdf"), width = 10, height = 6)
ggsave(filename = file.path(fig.dir, "0_individual_runs/Run2_UMAP_tags_filtered.png"), device = "png", width = 6, height = 4, dpi = 300)
```

Remove unwanted objects
```{r}
rm(sc,cr,Bar1,Bar2,Bar4,Bar5,Bar6,Bar7,Bar8,Bar9,Bar10,Bar11,Bar12,tbar)
rm(plot1,plot2)
rm(singlets,sc.d4,sweep.res.list_sc.d4,sweep.stats_sc.d4,bcmvn_sc.d4,pK,BCmetric,pK_choose,annotations,homotypic.prop,nExp_poi,nExp_poi.adj)
```

## Run 3: run_3

### Load 10x

```{r}
# This contains the  output of cell ranger count
cr <- Read10X(data.dir = file.path("raw_data/run_3/filtered_feature_bc_matrix/")) # HISEQ RUN
sc <- CreateSeuratObject(counts = cr, project = "run_3", assay = "RNA")
```

### Load multiseq

```{r}
bar.table64 <- readRDS(file = file.path("raw_data/run_3/multiseq_count_matrix/bar.table.rds"))
```

Transform and plot data
```{r}
# transform
tbar <- t(bar.table64)
tbar <- tbar[1:6, ]

htos64 <- as.data.frame(tbar)

# make long format and plot log read counts
df.long64 <- as.data.frame(tbar) %>% 
  tibble::rownames_to_column(var = "tag") %>% 
  tidyr::gather(cell_barcode, value, -tag) %>% 
  dplyr::mutate(Log = log10(value+1))

ggplot(df.long64, aes(x=cell_barcode)) + 
  geom_point(aes(y=Log, colour = factor(tag)), alpha = 0.5) + 
  theme(panel.grid.minor = element_blank(),
        axis.text.x=element_blank()) +  # turn off minor grid
  facet_grid(~tag) +
  scale_colour_brewer(palette = "Paired") +
  ggtitle("Run 3: MULTIseq read count") +
  ylab("Log10 of tag read count") +
  xlab("Unique cell Barcodes") +
  labs(color='tags')
```

Add hto data
```{r}
# rename
colnames(htos64) <- paste0(colnames(htos64), "-1", sep = "")

# Add HTO data as a new assay independent from RNA
sc[["HTO"]] <- CreateAssayObject(counts = htos64)

# Number of cells
cat("Number of cells:", length(colnames(sc)))
```

### Filter

Filter-out low quality cells
```{r}
sc[["percent.mt"]] <- PercentageFeatureSet(sc, pattern = "^mt-")
plot1 <- VlnPlot(sc, features = c("percent.mt"))
plot2 <- VlnPlot(sc, features = c("percent.mt"), y.max = 10)
plot1+plot2

VlnPlot(sc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2

sc <- subset(sc, subset = nFeature_RNA > 750 & nFeature_RNA < 7500 & percent.mt < 10)
VlnPlot(sc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(sc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1+plot2

# Number of cells
cat("Number of cells:", length(colnames(sc)))
```

### Demultiplex cells

Normalize HTO data, here we use centered log-ratio (CLR) transformation
```{r}
DefaultAssay(object = sc) <- "HTO"
sc <- NormalizeData(sc, assay = "HTO", normalization.method = "CLR")

# 0.99
sc <- HTODemux(sc, assay = "HTO", positive.quantile = 0.99, kfunc = "kmeans")

# Print numbers of cells per classification
table(sc$HTO_classification.global)
table(sc$hash.ID)
```

Show barcodes in UMAP space. Few cells appear to be wrongly classified. We will deal with them later by applying a stricter threshold determined empirically to remove cells with low barcode counts.
```{r}
sc <- RunUMAP(sc, assay = "HTO", features = rownames(sc@assays$HTO))
DimPlot(sc, reduction = "umap", cols = pal25, pt.size = 1)
ggsave(filename = file.path(fig.dir, "0_individual_runs/Run3_UMAP_tags_unfiltered.png"), device = "png", width = 6, height = 4, dpi = 300)
```

Add tag data to sc object
```{r}
# add metadata under tag name
sc <- AddMetaData(object = sc, metadata = sc$hash.ID, col.name = "tag")
```

### Metadata

Add population/sample name to the tag information
```{r}
#read in metadata
meta <- read.delim(file.path("metadata/metadata_run3.txt"), "\t", header = T, stringsAsFactors = F)
meta <- dplyr::filter(meta, run_id == "run_3")
meta <- dplyr::rename(meta, final.ids = bar_id)

#extract cell ids/htos
df.tags <- as.data.frame(sc$tag)

#make the datafrane
df.pop <- tibble::rownames_to_column(.data = df.tags)
df.pop <- dplyr::rename(df.pop, final.ids = `sc$tag`)
df.pop <- left_join(df.pop, meta, "final.ids")

df.sa <- dplyr::select(df.pop, rowname, population)
table(df.sa$population)

df.trt <- dplyr::select(df.pop, rowname, treatment)
table(df.trt$treatment)

df.lobe <- dplyr::select(df.pop, rowname, lobe)
table(df.lobe$lobe)

df.rfp <- dplyr::select(df.pop, rowname, RFP)
table(df.rfp$RFP)

#make right vector to add metadata
df.sa <- tibble::column_to_rownames(.data = df.sa, var = "rowname")
df.trt <- tibble::column_to_rownames(.data = df.trt, var = "rowname")
df.lobe <- tibble::column_to_rownames(.data = df.lobe, var = "rowname")
df.rfp <- tibble::column_to_rownames(.data = df.rfp, var = "rowname")

#add metadata
sc <- AddMetaData(object = sc, metadata = df.sa, col.name = "population")
sc <- AddMetaData(object = sc, metadata = df.trt, col.name = "treatment")
sc <- AddMetaData(object = sc, metadata = df.lobe, col.name = "lobe")
sc <- AddMetaData(object = sc, metadata = df.rfp, col.name = "rfp")
```

### Manual filtering

UMAP of singlets in barcode space shows good separation between conditions, so we don't add an extra manual filtering step.
```{r}
tmp <- sc
DefaultAssay(tmp) <- "HTO"
Idents(tmp) <- "HTO_classification.global"
tmp <- subset(tmp, idents = "Singlet")
Idents(tmp) <- "hash.ID"
table(tmp$HTO_classification.global)
table(tmp$hash.ID)
tmp <- RunUMAP(tmp, assay = "HTO", features = rownames(tmp@assays$HTO))
DimPlot(tmp, reduction = "umap", cols = pal25, pt.size = 1)
rm(tmp)
```

### Singlets

```{r}
# keep singlets only based on DoubletFinder
Idents(sc) <- "HTO_classification.global"
sc <- subset(sc, idents = "Singlet")

# Number of cells
cat("Number of cells:", length(colnames(sc)))

# make a copy at this stage for future use if needed
sc64_save <- sc

sc64 <- sc
DefaultAssay(sc64) <- "RNA"
Idents(sc64) <- "population"

# umap on tags after removing negatives and doublets
sc.umap64<- RunUMAP(sc64, assay = "HTO", features = rownames(sc64@assays$HTO))
DimPlot(sc.umap64, group.by = "tag", cols = pal25, pt.size = 1) 
#ggsave(filename = file.path(pdf.dir, "run_3_UMAP_tags_filtered.pdf"), width = 10, height = 6)
ggsave(filename = file.path(fig.dir, "0_individual_runs/Run3_UMAP_tags_filtered.png"), device = "png", width = 6, height = 4, dpi = 300)
```

Remove unwanted objects
```{r}
rm(sc)
```


## A. Seurat Integration

joseph_export
```{r}
joseph_export <- list(sc01, sc04, sc64)
saveRDS(joseph_export, file = file.path("r_save/non_integrated_export.rds"))
```


Prepare objects for integration. Normalise each object and find variable features:
```{r}
# run 1
sc.n1 <- sc01
DefaultAssay(sc.n1) <- "RNA"
sc.n1 <- NormalizeData(sc.n1, normalization.method = "LogNormalize", scale.factor = 1e4)
sc.n1 <- FindVariableFeatures(sc.n1, selection.method = "vst", nfeatures = 3000, verbose = FALSE)

# run 2
sc.n4 <- sc04
DefaultAssay(sc.n4) <- "RNA"
sc.n4 <- NormalizeData(sc.n4, normalization.method = "LogNormalize", scale.factor = 1e4)
sc.n4 <- FindVariableFeatures(sc.n4, selection.method = "vst", nfeatures = 3000, verbose = FALSE)

# run 3
sc.n64 <- sc64
DefaultAssay(sc.n64) <- "RNA"
sc.n64 <- NormalizeData(sc.n64, normalization.method = "LogNormalize", scale.factor = 1e4)
sc.n64 <- FindVariableFeatures(sc.n64, selection.method = "vst", nfeatures = 3000, verbose = FALSE)
```

### Integrate

Perform integration:
```{r}
prostate.list <- list(sc.n1, sc.n4, sc.n64)

# Find anchors
prostate.anchors <- FindIntegrationAnchors(object.list = prostate.list, dims = 1:50, anchor.features = 2000)

# Integrate
prostate.integrated <- IntegrateData(anchorset = prostate.anchors, dims = 1:50)

# copy to new object
sci <- prostate.integrated

# remove unwanted variables
rm(prostate.anchors,prostate.list)

# number of variable features in the integrated dataset
length(sci[["integrated"]]@var.features)
```

### Filter

Run 1 contains few castrated cells, which we kept until this point to perform the integration of the 3 datasets. However, these cells were sorted from the whole castrated prostate and we are not able to assign them to a specific lobe. For clarity and consistency, we remove these x cells from the analysis.
```{r}
Idents(sci) <- "lobe"
sci <- subset(sci, idents = c("AP", "DLP", "VP"))
Idents(sci) <- "population"
```

Change ident levels
```{r}
sci$lobe <- factor(x = sci$lobe, levels = c("AP", "DLP", "VP"))
```

### Scale RNA

Normalise/Scale RNA for later downstream use.
```{r}
DefaultAssay(sci) <- "RNA"

sci <- NormalizeData(sci, normalization.method = "LogNormalize", scale.factor = 1e4, assay = "RNA")
sci <- FindVariableFeatures(sci, selection.method = "vst", nfeatures = 3000, verbose = FALSE, assay = "RNA")
sci <- ScaleData(sci, verbose = T, assay = "RNA", features = rownames(sci))
```

## B. All cells

### UMAP/PCA

**Dim. red.**  
Note: might have to optimise, especially regarding the Ly6d/Upk3a cluster...  
```{r}
DefaultAssay(sci) <- "integrated"

sci <- ScaleData(sci, verbose = FALSE)
sci <- RunPCA(sci, npcs = 50, verbose = T)
sci <- RunUMAP(sci, reduction = "pca", dims = 1:50, seed.use = 1)
sci <- FindNeighbors(sci, reduction = "pca", dims = 1:50, k.param = 20L)
sci <- FindClusters(sci, resolution = 0.4)

DimPlot(sci, reduction = "umap", cols = pal25, label = TRUE, pt.size = 0.5)
```

### Visualise

Visualise clusters
```{r}
DimPlot(sci, reduction = "umap", cols = pal25, label = TRUE, pt.size = 1, label.size = 5) + ggtitle("All cells - Clusters")
ggsave(filename = file.path(pdf.dir, "Integrated_UMAP_clus_allcells.pdf"), width = 15, height = 10)
```

Visualise integration
```{r}
DimPlot(sci, reduction = "umap", pt.size = 1, group.by = "orig.ident", cols = pal.runs)
ggsave(filename = file.path(pdf.dir, "Integrated_UMAP_integration_allcells.pdf"), width = 15, height = 10)
```

Populations
```{r}
DimPlot(sci, group.by="population", pt.size=1, cols=pal.pop)
```

Visualise populations
```{r}
p1=DimPlot(sci, group.by="population", pt.size=.8, cols=pal.pop) + ggtitle("Sorted populations")
p2=DimPlot(sci, group.by="treatment", pt.size=.8, cols=pal.trt) + ggtitle("Hormonal status")
p3=DimPlot(sci, group.by = "lobe", pt.size=.8, cols=pal.lobe) + ggtitle("Lobes")
p4=DimPlot(sci, group.by = "rfp", pt.size=.8, cols=pal.rfp) + ggtitle("RFP status")
(p1+p2)/(p3+p4)
ggsave(filename = file.path(pdf.dir, "Allcells_UMAP_meta_allcells.pdf"), width = 20, height = 15)
```

Visualise gene expression
```{r}
DefaultAssay(sci) <- "RNA"

FeaturePlot(sci, "Msmb")
FeaturePlot(sci, "Pecam1")

FeaturePlot(sci, c("Ly6d", "Foxq1", "Upk3a", "Nupr1"))
FeaturePlot(sci, c("Top2a", "Mcm6"))
FeaturePlot(sci, c("Runx1", "Nkx3-1"))
FeaturePlot(sci, c("Krt5", "Trp63"))
FeaturePlot(sci, c("Ly6d", "Tacstd2"))
FeaturePlot(sci, c("Pbsn", "Nkx3-1"))
```

Plot features
```{r}
VlnPlot(sci, group.by = "population", ncol = 1, pt.size = 0.05, cols = pal25,
        features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))

VlnPlot(sci, group.by = "population", ncol = 1, pt.size = 0.05, cols = pal25,
        features = c( "nCount_RNA"))
```

### Identify clusters

Plot some genes
```{r}
DefaultAssay(sci) <- "RNA"
FeaturePlot(sci, features = c("Epcam", "Svs4", "Upk3a", "Ly6d", "Col1a1", "Vim"))
FeaturePlot(sci, features = c("Cd52", "Ptprc", "Itgam", "Pecam1", "Cd74", "Cd3d"))
```

Find markers
```{r, eval=FALSE}
all.markers <- FindAllMarkers(sci, only.pos = TRUE, 
                              min.pct = 0.50, logfc.threshold = 0.25, 
                              test.use = "MAST")

top.all <- all.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
```

Plot single gene
```{r}
FeaturePlot(sci, features = c("Svs4"))
```
  
  
Hemato: Ptprc, Cd52, Itgam, Cd74, Runx3  
Fibro: Vim, Col1a1, Mgp, Acta2  
Endo: Pecam1, Vim
Cycling: Top2a, Mki67  
Bladder: Upk3a, Ly6d, Ivl, S100a6  
Seminal vesicles: Svs4, Svs5  
  
  
*Dotplot*
```{r}
DefaultAssay(sci) <- "RNA"

epi.markers.to.plot <- 
  c(
    "Epcam", "Krt8", "Cd24a",
    "Sbp", "Spink1","Abo",
    "Psca", "Krt4", "Wfdc2",
    "Tgm4", "Nkx3-1", "Pbsn",
    "Lmo7", "Clu", "Krt19",
    "Ang5", "Crym", "Piezo2",
    "Trp63", "Krt5", "Krt14",
    
    "Vim", "Ptprc", "Cd74", "Itgam",
     "Cd3d", "Pecam1", "Col1a1", "Svs4"
    )

DotPlot(sci, 
        features = rev(epi.markers.to.plot), 
        group.by = "seurat_clusters",
        cols = c("darkblue", "tomato"), dot.scale = 8) + RotatedAxis()

ggsave(filename = file.path(pdf.dir, "Allcells_dotplot.pdf"), width = 8, height = 6, device = "pdf")
```

### PaperFigures

UMAPs
```{r}
# Clusters
DimPlot(sci, reduction = "umap", cols = kel, pt.size = 0.4)
ggsave(filename = file.path(fig.dir, "1_integration/Allcells_UMAP_clusters.png"), device = "png", width = 6, height = 4, dpi = 300)

# Runs
## combined
DimPlot(sci, reduction = "umap", pt.size = 0.4, group.by = "orig.ident", cols = pal.runs)
ggsave(filename = file.path(fig.dir, "1_integration/Allcells_UMAP_runs.png"), device = "png", width = 6, height = 4, dpi = 300)
## split
DimPlot(sci, reduction = "umap", pt.size = 0.4, group.by = "orig.ident", cols = pal.runs, split.by = "orig.ident")
ggsave(filename = file.path(fig.dir, "1_integration/Allcells_UMAP_runs_split.png"), device = "png", width = 12, height = 4, dpi = 300)
```

Dotplot
```{r}
# res 0.4 - relevel clusters
sci$seurat_clusters <- factor(sci$seurat_clusters, 
                            levels=c("6", "2", "3", "4", "14", "10", 
                                     "0", "1", "5", "8",
                                     "7", "15", "13", "9", "11", "12"))

# Plot
DefaultAssay(sci) <- "RNA"
markers.to.plot <- 
  c(
    "Epcam", "Krt8", "Cd24a",
    "Spink1", "Krt19",
    "Tacstd2", "Psca", "Krt4",
    "Tgm4", "Nkx3-1", "Pbsn", "Msmb",
    "Piezo2",
    "Trp63", "Krt5", "Krt14",
    
    "Vim", "Ptprc", "Cd74", "Itgam",
     "Cd3d", "Pecam1", "Col1a1", "Svs2", "Pax2"
    )

DotPlot(sci, 
        features = rev(markers.to.plot), 
        group.by = "seurat_clusters",
        cols = c("darkblue", "tomato"), dot.scale = 5) + RotatedAxis() + FontSize(x.text = 12, y.text = 12)

ggsave(filename = file.path(fig.dir, "1_integration/Allcells_dotplot.png"), device = "png", width = 8, height = 6, dpi = 300)
```

Gene expression
```{r}
source("r_scripts/functions.R")
# Extract UMAPs coordinates
sci.UMAP <- as.data.frame(Embeddings(sci[["umap"]]))
sci.UMAP <- sci.UMAP %>% 
  dplyr::mutate(CellBarcode = colnames(sci),
                Clusters = sci$seurat_clusters,
                Population = sci$population,
                Treatment = sci$treatment,
                Lobe = sci$lobe,
                RFP = sci$rfp)

# Runx1
gene <- "Runx1"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Ptprc
gene <- "Ptprc"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Col1a1
gene <- "Col1a1"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Pecam1
gene <- "Pecam1"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cd74
gene <- "Cd74"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Svs4
gene <- "Svs4"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Svs4
gene <- "Svs2"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cd3g
gene <- "Cd3g"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Vim
gene <- "Vim"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Retnla
gene <- "Retnla"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Ccl4
gene <- "Ccl4"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cdh5
gene <- "Cdh5"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Epcam
gene <- "Epcam"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt8
gene <- "Krt8"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt5
gene <- "Krt5"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cd24a
gene <- "Cd24a"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Pax2
gene <- "Pax2"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Foxi1
gene <- "Foxi1"
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sci, sci.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("1_integration/Allcells_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

```


### Save/Load

```{r, eval=FALSE}
save.image(file = file.path("r_save/seurat_integration.RData"))
```

```{r, eval=FALSE}
saveRDS(sci, file = file.path("r_save/sci_integrated.rds"))
```

Load if need to start here
```{r, eval=FALSE}
sci <- readRDS(file.path("r_save/sci_integrated.rds"))
```

## C. Epithelial cells

### Filter

*Keep prostate epithelial clusters*
```{r}
Idents(sci) <- "seurat_clusters"
sce <- subset(sci, idents = c(0:6,8,10,14), invert = FALSE)
DimPlot(sce, reduction = "umap", cols = kel, label = TRUE, pt.size = 0.5)
```

### PCA/UMAP

**Dim. red.**
```{r}
DefaultAssay(sce) <- "integrated"
sce <- RunPCA(sce, npcs = 50, verbose = T, assay = "integrated")
sce <- RunUMAP(sce, reduction = "pca", dims = 1:50, seed.use = 1, assay = "integrated", min.dist = 0.3)
sce <- FindNeighbors(sce, reduction = "pca", dims = 1:50, assay = "integrated", k.param = 20L)
sce <- FindClusters(sce, resolution = 0.3)

DimPlot(sce, reduction = "umap", cols = kel, label = TRUE, pt.size = 0.5)
```

Save original seurat_clusters in a separate object
```{r}
seu_clus <- sce$integrated_snn_res.0.3
#sce <- AddMetaData(object = sce, metadata = seu_clus, col.name = "orig_seu_clus")
```


### Clustree

Use Clustree to evaluate clustering. We should 0.3 as this identifies 6 distinct luminal clusters, separated from 3 basal cell clusters. 
```{r}
library(clustree)

scc <- FindClusters(
  sce,
  reduction = "pca", 
  dims = 1:50,
  k.param = 20L,
  resolution = seq(0, 1, 0.1),
  save.SNN = TRUE,
  print.output = FALSE)

clustree(scc)
DimPlot(object = scc, reduction = "umap", label=TRUE, group.by = "integrated_snn_res.0.6")
```

### Plot data

Visualise clusters
```{r}
DimPlot(sce, reduction = "umap", cols = pal25, label = TRUE, pt.size = 1, label.size = 5) + ggtitle("Epithelial cells - Clusters")
ggsave(filename = file.path(pdf.dir, "PE_UMAP_clusters_unlabelled.pdf"), width = 15, height = 10)
```

Visualise integration
```{r}
DimPlot(sce, reduction = "umap", pt.size = 1, group.by = "orig.ident", cols = pal.runs)
ggsave(filename = file.path(pdf.dir, "PE_UMAP_runs.pdf"), width = 15, height = 10)
```

Populations
```{r}
DimPlot(sce, group.by="population", pt.size=1, cols=pal.pop)
```

Visualise populations
```{r}
p11=DimPlot(sce, group.by="population", pt.size=.8, cols=pal.pop) + ggtitle("Sorted populations")
p12=DimPlot(sce, group.by="treatment", pt.size=.8, cols=pal.trt) + ggtitle("Hormonal status")
p13=DimPlot(sce, group.by = "lobe", pt.size=.8, cols=pal.lobe) + ggtitle("Lobes")
p14=DimPlot(sce, group.by = "rfp", pt.size=.8, cols=pal.rfp) + ggtitle("RFP status")
(p11+p12)/(p13+p14)
ggsave(filename = file.path(pdf.dir, "PE_UMAP_meta.pdf"), width = 20, height = 15)
```

Individual plots
```{r, eval=FALSE}
DimPlot(sce, group.by="population", pt.size=.8, cols=pal.pop) + ggtitle("Sorted populations")
DimPlot(sce, group.by="treatment", pt.size=.8, cols=pal.trt) + ggtitle("Hormonal status")
DimPlot(sce, group.by = "lobe", pt.size=.8, cols=pal.lobe) + ggtitle("Lobes")
DimPlot(sce, group.by = "rfp", pt.size=.8, cols=pal.rfp) + ggtitle("RFP status")
```

Plot Runx1 vs RFP
```{r}
DefaultAssay(sce) <- "RNA"
g1 <- FeaturePlot(sce, "Runx1")
g2 <- DimPlot(sce, group.by = "rfp", pt.size=.5, cols=pal.rfp) + ggtitle("RFP")
g1+g2
```

Bac-a-sable: Plot single gene
```{r}
FeaturePlot(sce, "Tacstd2")
```

Bac-a-sable: Plot 2 genes
```{r, eval=FALSE}
FeaturePlot(sce, c("Runx1", "Nkx3-1"))
```

### Re-label clusters

Plot clusters
```{r}
DimPlot(sce, reduction = "umap", cols = pal25, label = TRUE, pt.size = 0.5)
```
  
New cluster IDs
```{r}
in_silico_clusters <- c(
  "Bas",     #0
  "Bas",     #1
  "Lum-A",   #2
  "Lum-D",   #3
  "Lum-B",   #4
  "Bas",     #5
  "Lum-E",   #6
  "Lum-F",   #7
  "Lum-C"    #8
  )

names(in_silico_clusters) <- levels(sce$seurat_clusters)
sce <- RenameIdents(sce, in_silico_clusters)

#relevel
relevel <- c(
  "Lum-A",   #2
  "Lum-B",   #4
  "Lum-C",   #8
  "Lum-D",   #3
  "Lum-E",   #6
  "Lum-F",   #7

  "Bas"
  )

Idents(sce) <- factor(Idents(sce), levels = relevel)
sce$in_silico_clusters <- Idents(sce)

DimPlot(sce, reduction = "umap", cols = pal.cl, label = TRUE, repel = TRUE, pt.size = .8)
ggsave(filename = file.path(pdf.dir, "PE_UMAP_clusters.pdf"), width = 9, height = 6, device = "pdf")
```

### Save/Load PE

```{r}
saveRDS(sce, file = file.path("r_save/sce_integrated.rds"))
```

Load if need to start here
```{r, eval=FALSE}
sce <- readRDS(file.path("r_save/sce_integrated.rds"))
```

### FindMarkers

Load if exists already
```{r, eval=FALSE}
epi.markers <- readRDS(file.path("r_save/epi.markers.rds"))
```

Use **MAST** algorithm.
```{r, eval=FALSE}
epi.markers <- FindAllMarkers(
  sce, 
  test.use = "MAST",
  assay = "RNA",
  min.pct = 0.25, 
  logfc.threshold = 0.25,
  only.pos = FALSE)

saveRDS(epi.markers, file = file.path("r_save/epi.markers.rds"))

# Top positive markers
top5.epi <- 
  epi.markers %>% 
  group_by(cluster) %>% 
  filter(avg_logFC>0) %>% 
  top_n(n=5, wt=avg_logFC)

top10.epi <- 
  epi.markers %>% 
  group_by(cluster) %>% 
  filter(avg_logFC>0) %>% 
  top_n(n=10, wt=avg_logFC)

# write csv
write.table(x = epi.markers, file = paste0("r_export/", "epi.markers.txt"), 
            sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
```

Lum-D vs Lum-ABC
```{r}
D_vs_ABC <- FindMarkers(
  sce,
  ident.1 = c("Lum-D"),
  ident.2 = c("Lum-A", "Lum-B", "Lum-C"), 
  test.use = "MAST",
  assay = "RNA",
  min.pct = 0.25, 
  logfc.threshold = 0.25,
  only.pos = FALSE)

write.table(x = D_vs_ABC, file = paste0("r_export/", "D_vs_ABC.txt"), 
            sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
```


### Cell cycle phases

Tutorial here: https://satijalab.org/seurat/v3.0/cell_cycle_vignette.html  
```{r}
# Marker list from Tirosh et al, 2015. Segregate this list into markers of G2/M phase and markers of S phase.0
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes

# Function to convert each gene to mouse nomenclature by changing case (checked that gene names exist in ensembl too)
firstup <- function(x) { 
  x <- tolower(x)
  substr(x, 1, 1) <- toupper(substr(x, 1, 1))
  x
}

s.genes <- firstup(s.genes)
g2m.genes <- firstup(g2m.genes)
```

Make sure to specify the "assay" parameter 
```{r}
cc <- CellCycleScoring(
  sce,
  s.features = s.genes,
  g2m.features = g2m.genes,
  assay = 'RNA',
  set.ident = TRUE
)

head(cc[[]])
```

```{r}
#umap1 after
pal.cc <- c("#4B6EB3", "#FFCC4C", "#FF7A7D") #cell cycle phases

DimPlot(cc, reduction = "umap.sc", group.by = "Phase", cols=pal.cc, pt.size=0.8)
umap1 <- umap1 + NoLegend() + NoAxes()
ggsave(umap1, filename = file.path(fig.dir, "2_all_cells/Allcells_UMAP_after_ccReg_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)
```


Cell cycle phase
```{r}
DimPlot(object = sce, reduction = "umap", group.by = "Phase", label=TRUE, label.size = 5, pt.size = 1, cols=pal.cc)
ggsave(filename = file.path(fig.dir, "3_main_cluster/Main_UMAP_cellcycle.png"), device = "png", width = 6, height = 4, dpi = 300)
DimPlot(object = sce, reduction = "umap", group.by = "Phase", pt.size = 1, cols=pal.cc) + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "3_main_cluster/Main_UMAP_cellcycle_nolegend.png"), device = "png", width = 7, height = 5, dpi = 300)
```

Cell cycle phase per day
```{r}
statcc <- 
  meta %>% 
  group_by(sample, Phase) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

ggplot(statcc, aes(x = sample, y = perc, fill = Phase, label = count)) +
  geom_bar(stat="identity", width = 0.5) +
  labs(x="Time point", y="Percentage", fill="Phase",
       title = "Proportion of cycling cells per days") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

Cell cycle phase per cluster
```{r}
statcc <- 
  sce@meta.data %>% 
  group_by(in_silico_clusters, Phase) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))
  
ggplot(statcc, aes(x = in_silico_clusters, y = perc, fill = factor(Phase, levels=c("G1", "S", "G2M")), label = count)) +
  geom_bar(stat="identity", width = 0.8) +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = pal.cc) +
  labs(x="Clusters", y="Percentage", fill="",
       title = "Proportion of cycling cells per clusters") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

ggsave(filename = file.path(fig.dir, "3_main_cluster/Meta_ccPhase_per_cluster.png"), device = "png", width = 4, height = 3, dpi = 300)
```

## D. Loom Export

Export to loom file to generate PAGA and new UMAP initiated on PAGA.

All epithelial cells
```{r}
library(loomR)

scl <- sce
DefaultAssay(scl) <- "integrated"
scl@project.name <- "mevel_et_al_sc_adult"

# remove useless slots
scl$nCount_HTO <- NULL
scl$nFeature_HTO <- NULL
scl$HTO_maxID <- NULL
scl$HTO_secondID <- NULL
scl$HTO_margin <- NULL
scl$HTO_classification <- NULL
scl$HTO_classification.global <- NULL
scl$hash.ID <- NULL
scl$tag <- NULL
scl$integrated_snn_res.0.3 <- NULL

# add cell id in metadata
df.cell <- data.frame(Cell = colnames(sce))
rownames(df.cell) <- df.cell$Cell
scl <- AddMetaData(object = scl, metadata = df.cell, col.name = "Cell")

# keep only useful assay
seu.full <- DietSeurat(
  scl,
  counts = TRUE,
  data = TRUE,
  scale.data = TRUE,
  features = NULL,
  assays = "integrated",
  dimreducs = c("umap", "pca"),
  graphs = c("integrated_nn", "integrated_snn")
)

# as loom
loom.scl <- as.loom(scl)
loom.scl$close_all()
rm(loom.scl)
```

Luminal only
```{r}
library(loomR)

# filter
Idents(sce) <- "in_silico_clusters"
scl <- subset(sce, idents = c("Bas"), invert = TRUE)

# make integrated default
DefaultAssay(scl) <- "integrated"
scl@project.name <- "mevel_et_al_sc_adult_luminal"

# remove useless slots
scl$nCount_HTO <- NULL
scl$nFeature_HTO <- NULL
scl$HTO_maxID <- NULL
scl$HTO_secondID <- NULL
scl$HTO_margin <- NULL
scl$HTO_classification <- NULL
scl$HTO_classification.global <- NULL
scl$hash.ID <- NULL
scl$tag <- NULL
scl$integrated_snn_res.0.3 <- NULL

# add cell id in metadata
df.cell <- data.frame(Cell = colnames(sce))
rownames(df.cell) <- df.cell$Cell
scl <- AddMetaData(object = scl, metadata = df.cell, col.name = "Cell")

# keep only useful assay
seu.full <- DietSeurat(
  scl,
  counts = TRUE,
  data = TRUE,
  scale.data = TRUE,
  features = NULL,
  assays = "integrated",
  dimreducs = c("umap", "pca"),
  graphs = c("integrated_nn", "integrated_snn")
)

# as loom
loom.scl <- as.loom(scl)
loom.scl$close_all()
rm(loom.scl)
```

## E. Metadata metrics

Store metadata
```{r}
meta = sce@meta.data
```

#### Numbers
  
*Per run*
```{r}
stat <- 
  meta %>% 
  group_by(orig.ident) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# Plot
ggplot(stat, aes(x=orig.ident, y=count, fill=orig.ident)) +
  geom_bar(stat = "identity", width = 0.4) +
  geom_text(aes(y=count,label = count, hjust=0)) +
  labs(x="Run ID", y="Number of cells", 
       title = "Number of cells per 10x run") +
  scale_fill_manual(values = pal.runs) +
  scale_y_continuous(limits=c(0, max(stat$count) + 0.05*max(stat$count))) +
   theme(
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*Per tagged population*
```{r}
stat <- 
  meta %>% 
  group_by(population) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# Plot
ggplot(stat, aes(x=population, y=count, fill=population)) +
  geom_bar(stat = "identity", width = 0.4) +
  scale_fill_manual(values = pal.pop) +
  labs(x="Run ID", y="Number of cells", 
       title = "Number of cells per population") +
   theme(
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*Per RFP*
```{r}
stat <- 
  meta %>% 
  group_by(rfp) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# Plot
ggplot(stat, aes(x=rfp, y=count, fill=rfp)) +
  geom_bar(stat = "identity", width = 0.4) +
  scale_fill_manual(values = pal.rfp) +
  labs(x="Run ID", y="Number of cells", 
       title = "Number of cells per RFP status") +
   theme(
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*Per Treatment*
```{r}
stat <- 
  meta %>% 
  group_by(treatment) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# Plot
ggplot(stat, aes(x=treatment, y=count, fill=treatment)) +
  geom_bar(stat = "identity", width = 0.4) +
  scale_fill_manual(values = pal.trt) +
  labs(x="Run ID", y="Number of cells", 
       title = "Number of cells per Hormonal Status") +
   theme(
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*Per Lobe*
```{r}
stat <- 
  meta %>% 
  group_by(lobe) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# Plot
ggplot(stat, aes(x=lobe, y=count, fill=lobe)) +
  geom_bar(stat = "identity", width = 0.4) +
  scale_fill_manual(values = pal.lobe) +
  labs(x="Run ID", y="Number of cells", 
       title = "Number of cells per lobe") +
   theme(
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

#### Proportions

*Intact/Regressed Per In Silico Cluster* - Proportion are not really meaningful due to pre-sort!
```{r}
stat <- 
  meta %>% 
  group_by(in_silico_clusters, treatment) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

ggplot(stat, aes(x = in_silico_clusters, y = perc, fill = treatment, label = count)) +
  geom_bar(stat="identity", width = 0.5) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = pal.trt) +
  labs(x=NULL, y="Percentage", fill="Treatment",
       title = "Proportion of intact/regressed cells per in silico cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

# numbers
ggplot(stat, aes(x = in_silico_clusters, y = count, fill = treatment)) +
  geom_bar(stat="identity", position = "stack", width = 0.5) +
  scale_fill_manual(values = pal.trt) +
  labs(x=NULL, y="Cell number", fill="Treatment",
       title = "Number of intact/regressed cells per in silico cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*RFP Hi/Lo Per In Silico Cluster* - Proportion are not really meaningful due to pre-sort!
```{r}
stat <- 
  meta %>% 
  group_by(in_silico_clusters, rfp) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# percentage
ggplot(stat, aes(x = in_silico_clusters, y = perc, fill = rfp, label = count)) +
  geom_bar(stat="identity", width = 0.5) +
  geom_text(size = 4, position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = pal.rfp) +
  labs(x=NULL, y="Percentage", fill="RFP",
       title = "Proportion of RFP Hi/Lo cells per in silico cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

# numbers
ggplot(stat, aes(x = in_silico_clusters, y = count, fill = rfp)) +
  geom_bar(stat="identity", position = "stack", width = 0.5) +
  scale_fill_manual(values = pal.rfp) +
  labs(x=NULL, y="Cell number", fill="RFP",
       title = "Number of cells from each RFP sorted population") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*Population Per In Silico Cluster* - Proportion are not really meaningful due to pre-sort!
```{r}
stat <- 
  meta %>% 
  group_by(in_silico_clusters, population) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# percentage
ggplot(stat, aes(x = in_silico_clusters, y = perc, fill = population, label = count)) +
  geom_bar(stat="identity", width = 0.5) +
  scale_fill_manual(values = pal.pop) +
  labs(x=NULL, y="Percentage", fill="Sorted population",
       title = "Population Per In Silico Cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

# numbers
ggplot(stat, aes(x = in_silico_clusters, y = count, fill = population)) +
  geom_bar(stat="identity", position = "stack", width = 0.5) +
  scale_fill_manual(values = pal.pop) +
  labs(x=NULL, y="Cell number", fill="Sorted populations",
       title = "Population Per In Silico Cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

*Population Per In Silico Cluster* - Proportion are not really meaningful due to pre-sort!
```{r}
stat <- 
  meta %>% 
  group_by(in_silico_clusters, lobe) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

# percentage
ggplot(stat, aes(x = in_silico_clusters, y = perc, fill = lobe, label = count)) +
  geom_bar(stat="identity", width = 0.5) +
  scale_fill_manual(values = pal.lobe) +
  labs(x=NULL, y="Percentage", fill="Sorted population - lobe",
       title = "Population Per In Silico Cluster") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```


*Population per run* - Proportion are not really meaningful due to pre-sort!
```{r}
stat <- 
  meta %>% 
  group_by(orig.ident, population) %>% 
  summarise(count=n()) %>% 
  complete(population, fill = list(count = 0)) %>% 
  mutate(perc=(count/sum(count)*100))

# percentage
ggplot(stat, aes(x = orig.ident, y = perc, fill = population, label = count)) +
  geom_bar(stat="identity", width = 0.5) +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  scale_fill_manual(values = pal.pop) +
  labs(x=NULL, y="Percentage", fill="Sorted populations",
       title = "Proportion of cells from each sorted population per 10x run") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

# numbers
ggplot(stat, aes(x = orig.ident, y = count, fill = population)) +
  geom_bar(stat="identity", position = "stack", width = 0.5) +
  scale_fill_manual(values = pal.pop) +
  labs(x=NULL, y="Cell number", fill="Sorted populations",
       title = "Number of cells from each sorted population per 10x run") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

# numbers / per population / facet by run
ggplot(stat, aes(x = population, y = count, fill = population)) +
  geom_bar(stat="identity", width = 0.6) +
  geom_text(aes(y=count,label = count, hjust=-0.2)) +
  scale_fill_manual(values = pal.pop) +
  scale_y_continuous(limits=c(0, max(stat$count) + 0.1*max(stat$count))) +
  labs(x=NULL, y="Cell number", fill="Sorted populations",
       title = "Number of cells from each sorted population per 10x run") +
  facet_grid(.~orig.ident) +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

# Pooled runs
ggplot(stat, aes(x = population, y = count, fill = population)) +
  geom_bar(stat="identity", width = 0.5) +
  scale_fill_manual(values = pal.pop) +
  labs(x=NULL, y="Cell number", fill="Sorted populations",
       title = "Number of cells from each sorted population") +
   theme(
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()
```

### Sankey

```{r}
library(alluvial)

df.allu <- meta %>% 
  filter(in_silico_clusters != "Bas") %>% 
  tibble::rownames_to_column( "cell_id") %>%
  group_by(in_silico_clusters, treatment) %>%
  dplyr::summarize(counts = n()) %>%
  ungroup() %>%
  dplyr::arrange(desc(counts))

alluvial(
  dplyr::select(df.allu, treatment, in_silico_clusters),
  freq = df.allu$counts,
  border="grey80",
  alpha = 0.8,
  blocks=TRUE
)
```

```{r}
library(ggalluvial)
library(ggfittext)

df.allu <- meta %>% 
  #filter(in_silico_clusters != "Bas") %>% 
  tibble::rownames_to_column( "cell_id") %>%
  group_by(in_silico_clusters, treatment, lobe, rfp) %>%
  dplyr::summarize(counts = n()) %>%
  ungroup() %>%
  dplyr::arrange(desc(counts))

# Lobes / geom label
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = rfp)) +
  geom_alluvium(aes(fill = lobe), width = 1/6) +
  geom_stratum(width = 0.25, fill = "white", color = "grey") +
  geom_label(stat = "stratum", infer.label = TRUE, label.size = NA, alpha = 0, color = 'black', fontface = "bold") +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP"), expand = c(.05, .05)) +
  scale_fill_manual(values = pal.lobe) +
  theme(
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")
    ) 

# Treatment|Clusters|RFP / colour by lobe  -----------------------------------
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = factor(rfp, levels = c("Low", "High")))) +
  geom_alluvium(aes(fill = lobe), width = 1/6) +
  scale_fill_manual(values = pal.lobe) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")
    ) 

# Treatment|Clusters|RFP / colour by cluster  -----------------------------------
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = factor(rfp, levels = c("Low", "High")))) +
  geom_alluvium(aes(fill = in_silico_clusters), width = 1/6) +
  scale_fill_manual(values = pal.cl) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")
    ) 

# RFP -----------------------------------------------------
df.allu <- meta %>% 
  filter(in_silico_clusters != "Bas") %>% 
  tibble::rownames_to_column( "cell_id") %>%
  group_by(in_silico_clusters, treatment, lobe, rfp) %>%
  dplyr::summarize(counts = n()) %>%
  ungroup() %>%
  dplyr::arrange(desc(counts))

ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters)) +
  geom_alluvium(aes(fill = rfp), width = 1/6) +
  scale_fill_manual(values = pal.rfp) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")
    ) 

ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = rfp)) +
  geom_alluvium(aes(fill = rfp), width = 1/6) +
  scale_fill_manual(values = pal.rfp) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")
    ) 
```

## F. Scanpy UMAP

Read in coordinates
```{r}
cell_embedding <- 
  read_csv("/sc_adult_dataset/paga_all_epithelial/output_umap/paga/cell_embedding.csv",
           col_types = cols(.default = col_double(), Cell = col_character()))# %>% mutate(in_silico_clusters = colData(sce)$in_silico_clusters)
```

Add new UMAP layout to the seurat object
```{r}
new.UMAP <- cell_embedding %>% 
  tibble::column_to_rownames("Cell") %>% 
  dplyr::rename(UMAP1 = X, UMAP2 = Y) %>% 
  as.matrix()

sce[["umap.sc"]] <- CreateDimReducObject(embeddings = new.UMAP, key = "umap_", assay = "integrated")
DimPlot(sce, reduction = "umap.sc", cols = pal.cl)
```

### => Save/Load PE

```{r}
sce.sc.umap <- sce
saveRDS(sce.sc.umap, file = file.path("r_save/sce.sc.umap.rds"))
```

Load if need to start here
```{r, eval=FALSE}
sce <- readRDS(file.path("r_save/sce.sc.umap.rds"))
```

## G. Lum-D

We subset cells of the Lum_D cluster, and perform DEA between intact and regressed cells of this cluster.

### Filter

*Keep Lum-D cluster*
```{r}
Idents(sce) <- "in_silico_clusters"
lumd <- subset(sce, idents = c("Lum-D"), invert = FALSE)
DimPlot(lumd, reduction = "umap", cols = pal.cl, pt.size = 1)
```

### UMAP/PCA

```{r}
DefaultAssay(lumd) <- "integrated"
lumd <- RunPCA(lumd, npcs = 50, verbose = T, assay = "integrated")
lumd <- RunUMAP(lumd, reduction = "pca", dims = 1:50, seed.use = 1, assay = "integrated", min.dist = 0.3)
lumd <- FindNeighbors(lumd, reduction = "pca", dims = 1:50, assay = "integrated", k.param = 20L)
lumd <- FindClusters(lumd, resolution = 0.8)
DimPlot(lumd, reduction = "umap", cols = kel, label = TRUE, pt.size = 2)
```

Plot metadata
```{r}
pl1=DimPlot(lumd, group.by="population", pt.size=1, cols=pal.pop) + ggtitle("Sorted populations")
pl2=DimPlot(lumd, group.by="treatment", pt.size=1, cols=pal.trt) + ggtitle("Hormonal status")
pl3=DimPlot(lumd, group.by = "lobe", pt.size=1, cols=pal.lobe) + ggtitle("Lobes")
pl4=DimPlot(lumd, group.by = "rfp", pt.size=1, cols=pal.rfp) + ggtitle("RFP status")
(pl1+pl2)/(pl3+pl4)
```

### FindMarkers

DEA between intact and regressed cells.  
Use **MAST** algorithm.  

```{r, eval=FALSE}
Idents(lumd) <- "treatment"

lumd.cas.markers <- FindMarkers(
  lumd,
  ident.1 = "Intact", 
  ident.2 = "Regressed",
  test.use = "MAST",
  assay = "RNA",
  min.pct = 0.25, 
  logfc.threshold = 0.25,
  only.pos = FALSE)

# Top positive markers
top.cas.lumd <- 
  lumd.cas.markers %>% 
  tibble::rownames_to_column(var="gene") %>% 
  filter(p_val_adj<0.05)

# write csv
write.table(x = top.cas.lumd, file = paste0("r_export/", "lumD_diff.txt"), 
            sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE)
```

FindAllMarkers
```{r, eval=FALSE}
Idents(lumd) <- "seurat_clusters"

lumd.markers <- FindAllMarkers(
  lumd,
  test.use = "MAST",
  assay = "RNA",
  min.pct = 0.25, 
  logfc.threshold = 0.25,
  only.pos = FALSE)

#saveRDS(lumd.markers, file = file.path("r_save/lumd.markers.rds"))

# Top positive markers
top10.lumd <- 
  lumd.markers %>% 
  group_by(cluster) %>% 
  filter(avg_logFC>0) %>% 
  top_n(n=10, wt=avg_logFC)

```

Genes
```{r}
#Idents(lumd) <- "lobe"
feat <- c("Krt4", "Tacstd2", "Runx1", "Psca", "Mmp7", "Ccnd1")

VlnPlot(lumd, features = feat, 
        group.by = "lobe", 
        split.by = "treatment",
        assay = "RNA", 
        cols = pal.trt, 
        pt.size = 0.5,
        adjust = ,)
```

### Violin Plots

Function
```{r}
# Plot expression by cluster
plotViolin_colByCluster <- function(GENE="Runx1", meta="treatment"){
  
  # Variables
  g <- GENE
  meta.to.plot <- meta # "Clusters", "Population", "Treatment", "RFP", "Lobe"
  
  # Change palette depending on which grouping variable is used
  if (meta.to.plot == "in_silico_clusters") { 
    pal=pal.cl
  } else if (meta.to.plot == "population") {
    pal=pal.pop
  } else if (meta.to.plot == "treatment") {
    pal=pal.trt
  } else if (meta.to.plot == "rfp") {
    pal=pal.rfp
  } else if (meta.to.plot == "lobe") {
    pal=pal.lobe
  }
  
  # Extract gene expression
  g.exp <- lumd[["RNA"]]@data[g,]
  
  # Extract corresponding metadata
  df_plot <- lumd@meta.data %>% 
    tibble::rownames_to_column(var = "CellBarcode") %>% 
    dplyr::mutate(Gene = g.exp) %>% 
    dplyr::rename(in_silico_clusters = in_silico_clusters,
                  population = population,
                  treatment = treatment,
                  lobe = lobe,
                  rfp = rfp)
  
  df_plot <- df_plot %>% 
    dplyr::rename(GroupBy = !!as.name(meta.to.plot)) %>% 
    dplyr::select(GroupBy, Gene, lobe)
  
  pViolgene <-
  ggplot(df_plot, aes(x=lobe, y=Gene, colour=GroupBy)) +
    geom_violin(fill=NA, scale="width", position = position_dodge(width = 0.8)) +
    #geom_boxplot(position = position_dodge(width = 0.9), outlier.size = 0) +
    geom_quasirandom(size = 0.4, alpha = 0.8, dodge.width = 0.8, varwidth = TRUE) +
    scale_colour_manual(values=pal) +
    theme_bw() +
    theme(legend.position = "none",
          panel.grid.major = element_line(linetype = "blank"), 
          panel.grid.minor = element_line(linetype = "blank"),
          #axis.line=element_blank(),
          axis.text.x=element_blank(),
          #axis.text.y=element_blank(),
          #axis.ticks=element_blank(),
          axis.title.x=element_blank(),
          axis.title.y=element_blank()
          ) +
          #axis.text.x = element_text(angle = 0, hjust = 1)) +
    labs(x ="", y = "", title = "")
    
  print(pViolgene)
}
```

Selection
```{r}
plotViolin_colByCluster(GENE="Psca", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Psca.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)

plotViolin_colByCluster(GENE="Krt4", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Krt4.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)

plotViolin_colByCluster(GENE="Tacstd2", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Tacstd2.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)

plotViolin_colByCluster(GENE="Runx1", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Runx1.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)

plotViolin_colByCluster(GENE="Tspan1", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Tspan1.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)

plotViolin_colByCluster(GENE="Nupr1", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Nupr1.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)

plotViolin_colByCluster(GENE="Ar", meta="treatment")
ggsave(filename = file.path(fig.dir, "6_LumD/LumD_Violin_Ar.png"), device = "png", width = 2.5, height = 2.5, dpi = 300)
```

## X. Paper Figures

### UMAP

```{r}
#Original clusters
sce <- AddMetaData(object = sce, metadata = seu_clus, col.name = "orig_seu_clus")
DimPlot(sce, reduction = "umap.sc", cols = kel, pt.size = .8, group.by = "orig_seu_clus")
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_orig_seu_clusters.png"), device = "png", width = 6, height = 4, dpi = 300)
DimPlot(sce, reduction = "umap.sc", cols = kel, pt.size = .8, group.by = "orig_seu_clus") + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_orig_seu_clusters_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)

#Clusters
DimPlot(sce, reduction = "umap.sc", cols = pal.cl, pt.size = 1)
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_clusters.png"), device = "png", width = 6, height = 4, dpi = 300)
DimPlot(sce, reduction = "umap.sc", cols = pal.cl, pt.size = 1) + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_clusters_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)

#Hormones
DimPlot(sce, reduction = "umap.sc", group.by="treatment", pt.size=1, cols=pal.trt)
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_treatment.png"), device = "png", width = 6, height = 4, dpi = 300)
DimPlot(sce, reduction = "umap.sc", group.by="treatment", pt.size=1, cols=pal.trt) + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_treatment_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)

#Population
DimPlot(sce, reduction = "umap.sc", group.by="population", pt.size=.8, cols=pal.pop2)
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_population.png"), device = "png", width = 6, height = 4, dpi = 300)
DimPlot(sce, reduction = "umap.sc", group.by="population", pt.size=.8, cols=pal.pop2) + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_population_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)

#Lobes
DimPlot(sce, reduction = "umap.sc", group.by = "lobe", pt.size=1, cols=pal.lobe)
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_lobe.png"), device = "png", width = 6, height = 4, dpi = 300)
DimPlot(sce, reduction = "umap.sc", group.by = "lobe", pt.size=1, cols=pal.lobe) + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_lobe_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)

#RFP
DimPlot(sce, reduction = "umap.sc",  group.by = "rfp", pt.size=1, cols=pal.rfp)
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_rfp.png"), device = "png", width = 6, height = 5, dpi = 300)
DimPlot(sce, reduction = "umap.sc", group.by = "rfp", pt.size=1, cols=pal.rfp) + NoLegend() + NoAxes()
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_rfp_nolegend.png"), device = "png", width = 6, height = 6, dpi = 300)
```

### UMAP by population

by individual population
```{r}
Idents(sce) <- "population"
vec.pop <- levels(factor(as.vector(sce$population)))
t  <- theme(axis.line=element_blank(),
            axis.text.x=element_blank(),
            axis.text.y=element_blank(),
            axis.ticks=element_blank(),
            axis.title.x=element_blank(),
            axis.title.y=element_blank(),
            legend.position="none",
            panel.background=element_blank(),
            panel.border=element_blank(),
            panel.grid.major=element_line(color = "grey80", size = 0.5),
            panel.grid.minor=element_blank(),
            plot.background=element_blank())
umap.coord <- as.data.frame(sce@reductions[["umap.sc"]]@cell.embeddings)
xl <- c(min(umap.coord$umap_1), max(umap.coord$umap_1))
yl <- c(min(umap.coord$umap_2), max(umap.coord$umap_2))


for (i in seq_along(vec.pop)) {
  
  pop <- vec.pop[i]
  o1 <- subset(sce, idents = pop, invert = FALSE)
  DimPlot(o1, reduction = "umap.sc",group.by="population",pt.size=2,cols=pal.pop2[i])+NoLegend()+NoAxes()+t+xlim(xl)+ylim(yl)
  ggsave(filename = file.path(fig.dir, paste0("2_pe/PE_POP_", pop, ".png")), device = "png", width = 6, height = 6, dpi = 300)
  
}

# All + grid
DimPlot(sce, reduction = "umap.sc", group.by="population", pt.size=.8, cols=pal.pop2) + NoLegend() + NoAxes()+t+xlim(xl)+ylim(yl)
ggsave(filename = file.path(fig.dir, "2_pe/PE_umap.sc_population_grid.png"), device = "png", width = 6, height = 6, dpi = 300)

```


### Dotplot

Dotplot 1
```{r}
DefaultAssay(sce) <- "RNA"

epi.markers.to.plot <- 
  c(
    "Epcam", "Krt8", "Cd24a", "Dpp4",
    "Sbp", "Spink1", "Spink5",
    "Tgm4", "Nkx3-1", "Pbsn",
    "Msmb", "Wfdc3", "Itln1",
    "Psca", "Krt4", "Wfdc2",
    
    "Clu", "Basp1", "Sprr1a",
    "Crym", "Ang5", "Fabp5",
                         
    "Krt5", "Krt14", "Trp63"
    )

DotPlot(
  sce, 
  features = rev(epi.markers.to.plot), 
  group.by = "in_silico_clusters",
  cols = c("darkblue", "tomato"), 
  dot.scale = 5) + 
  RotatedAxis() + 
  labs(x="Genes", y="In Silico Clusters")

ggsave(filename = file.path(fig.dir, "2_pe/PE_dotplot1.pdf"), width = 10, height = 6, device = "pdf")


DotPlot(sce, 
        features = rev(epi.markers.to.plot), 
        group.by = "in_silico_clusters",
        cols = c("darkblue", "tomato"), dot.scale = 6) + RotatedAxis()

ggsave(filename = file.path(fig.dir, "2_pe/PE_dotplot1.png"), device = "png", width = 8, height = 4, dpi = 300)
```

Dotplot 2 (choice)
```{r}
DefaultAssay(sce) <- "RNA"

epi.markers.to.plot <- 
  c(
    "Epcam", "Krt8", "Cd24a", "Dpp4",
    "Sbp", "Spink1", "Spink5",
    "Tgm4", "Nkx3-1", "Pbsn",
    "Msmb", "Wfdc3", "Itln1",
    "Psca", "Krt4", "Wfdc2",
    
    "Clu", "Basp1", "Lpl",
    "Car2", "Crym", "Ang5", "Fabp5", 
                         
    "Krt5", "Krt14", "Trp63"
    )

DotPlot(
  sce, 
  features = rev(epi.markers.to.plot), 
  group.by = "in_silico_clusters",
  cols = c("darkblue", "tomato"), 
  dot.scale = 5) + 
  RotatedAxis() + 
  labs(x="Genes", y="In Silico Clusters")

ggsave(filename = file.path(fig.dir, "2_pe/PE_dotplot2.pdf"), width = 10, height = 6, device = "pdf")


DotPlot(sce, 
        features = rev(epi.markers.to.plot), 
        group.by = "in_silico_clusters",
        cols = c("darkblue", "tomato"), dot.scale = 6) + RotatedAxis()

ggsave(filename = file.path(fig.dir, "2_pe/PE_dotplot2.png"), device = "png", width = 9, height = 4, dpi = 300)
```

### Heatmap

Top 10 marker genes of each cluster
```{r}
library(viridis)

DoHeatmap(object = sce,
          features = top10.epi$gene, 
          group.colors = pal.cl, 
          assay = "RNA", 
          slot = "data", 
          size = 4, 
          draw.lines = TRUE, 
          lines.width = 10) + scale_fill_viridis(na.value="white", option="inferno", begin = 0.1)

ggsave(filename = file.path(fig.dir, "2_pe/PE_heatmap_top10.png"), device = "png", width = 6, height = 9, dpi = 300)
```

### Genes 

```{r}
source("r_scripts/functions.R")
# Extract UMAPs coordinates
sce.UMAP <- as.data.frame(Embeddings(sce[["umap.sc"]]))
sce.UMAP <- sce.UMAP %>% 
  dplyr::mutate(CellBarcode = colnames(sce),
                Clusters = sce$seurat_clusters,
                Population = sce$population,
                Treatment = sce$treatment,
                Lobe = sce$lobe,
                RFP = sce$rfp)

# Runx1
gene <- "Runx1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Nkx3-1
gene <- "Nkx3-1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Spink1
gene <- "Spink1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Msmb
gene <- "Msmb"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Tgm4
gene <- "Tgm4"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Pbsn
gene <- "Pbsn"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Ar
gene <- "Ar"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Tacstd2
gene <- "Tacstd2"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt4
gene <- "Krt4"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt8
gene <- "Krt8"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt18
gene <- "Krt18"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt5
gene <- "Krt5"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Trp63
gene <- "Trp63"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Epcam
gene <- "Epcam"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cd49f
gene <- "Itga6"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cd24a
gene <- "Cd24a"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt7
gene <- "Krt7"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt19
gene <- "Krt19"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Runx2
gene <- "Runx2"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)
# Runx3
gene <- "Runx3"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)
# Cbfb
gene <- "Cbfb"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Hoxb13
gene <- "Hoxb13"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Foxa1
gene <- "Foxa1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Ly6d
gene <- "Ly6d"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Ly6e
gene <- "Ly6e"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Ly6a
gene <- "Ly6a"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)


# Psca
gene <- "Psca"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt14
gene <- "Krt14"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Apoe
gene <- "Apoe"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Krt15
gene <- "Krt15"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Cdh1
gene <- "Cdh1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Dpp4
gene <- "Dpp4"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Dcn
gene <- "Dcn"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

### --- CAS

# Clu
gene <- "Clu"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Basp1
gene <- "Basp1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Sprr1a
gene <- "Sprr1a"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Crip1
gene <- "Crip1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Lpl
gene <- "Lpl"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Thbs1
gene <- "Thbs1"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Car2
gene <- "Car2"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)

# Crym
gene <- "Crym"
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "yes")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, ".png")), device = "png", width = 6, height = 6, dpi = 300)
plotUMAP_colByGene(sce, sce.UMAP, gene, ptsize = 1, alpha = 1, legend = "minimal")
ggsave(filename = file.path(fig.dir, paste0("3_pe_geneplots/PE_Gene_", gene, "_minimal",".png")), device = "png", width = 6, height = 6, dpi = 300)


```

### Metadata

```{r}
meta = sce@meta.data

### --- Number of epithelial cells per sorted population ------------------------------------

stat <- 
  meta %>% 
  group_by(population) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

ggplot(stat, aes(x=(population), y=count, fill=population)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(y=count,label = count, hjust=-0.2)) +
  scale_fill_manual(values = pal.pop2) +
  labs(x="Sorted population", y="Number of cells") +
  scale_y_continuous(limits=c(0, max(stat$count) + 0.1*max(stat$count))) +
   theme(
               
     axis.title.x=element_blank(),
     axis.title.y=element_blank(),
     axis.text.y=element_blank(),
     
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

ggsave(filename = file.path(fig.dir, "Meta_PE_n_population.png"), device = "png", width = 3, height = 3, dpi = 300)

### --- Number of epithelial cells per RFP ------------------------------------

stat <- 
  meta %>% 
  group_by(rfp) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

ggplot(stat, aes(x=rfp, y=count, fill=rfp)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(y=count,label = count, hjust=-0.2)) +
  scale_fill_manual(values = pal.rfp) +
  labs(x="Sorted population", y="Number of cells") +
  scale_y_continuous(limits=c(0, max(stat$count) + 0.2*max(stat$count))) +
   theme(
          
     axis.title.x=element_blank(),
     axis.title.y=element_blank(),
     axis.text.y=element_blank(),
     
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

ggsave(filename = file.path(fig.dir, "Meta_PE_n_rfp.png"), device = "png", width = 3, height = 1, dpi = 300)

### --- Number of epithelial cells per Treatment ------------------------------------

stat <- 
  meta %>% 
  group_by(treatment) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

ggplot(stat, aes(x=treatment, y=count, fill=treatment)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(y=count,label = count, hjust=-0.2)) +
  scale_fill_manual(values = pal.trt) +
  labs(x="Sorted population", y="Number of cells") +
  scale_y_continuous(limits=c(0, max(stat$count) + 0.2*max(stat$count))) +
   theme(
          
     axis.title.x=element_blank(),
     axis.title.y=element_blank(),
     axis.text.y=element_blank(),
     
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

ggsave(filename = file.path(fig.dir, "Meta_PE_n_treatment.png"), device = "png", width = 3, height = 1, dpi = 300)

### --- Number of epithelial cells per Lobe ------------------------------------

stat <- 
  meta %>% 
  group_by(lobe) %>% 
  summarise(count=n()) %>% 
  mutate(perc=(count/sum(count)*100))

ggplot(stat, aes(x=lobe, y=count, fill=lobe)) +
  geom_bar(stat = "identity", width = 0.6) +
  geom_text(aes(y=count,label = count, hjust=-0.2)) +
  scale_fill_manual(values = pal.lobe) +
  labs(x="Sorted population", y="Number of cells") +
  scale_y_continuous(limits=c(0, max(stat$count) + 0.2*max(stat$count))) +
   theme(
          
     axis.title.x=element_blank(),
     axis.title.y=element_blank(),
     axis.text.y=element_blank(),
     
     legend.position = "none",
     panel.grid.minor = element_blank(),
     panel.grid.major = element_blank(),
     panel.background = element_blank(),
     panel.border = element_rect(colour = "grey30", fill=NA, size=0.5),
     strip.background = element_rect(fill="white", colour = "white")
  ) + rotate()

ggsave(filename = file.path(fig.dir, "Meta_PE_n_lobe.png"), device = "png", width = 3, height = 1.4, dpi = 300)

```

Sankey
```{r}
df.allu <- meta %>% 
  filter(in_silico_clusters != "Bas") %>% 
  tibble::rownames_to_column( "cell_id") %>%
  group_by(in_silico_clusters, treatment, lobe, rfp) %>%
  dplyr::summarize(counts = n()) %>%
  ungroup() %>%
  dplyr::arrange(desc(counts))

# Treatment|Clusters|RFP / colour by lobe  -----------------------------------
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = factor(rfp, levels = c("Low", "High")))) +
  geom_alluvium(aes(fill = lobe), width = 1/6) +
  scale_fill_manual(values = pal.lobe) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")) 

ggsave(filename = file.path(fig.dir, "/sankey/Sankey_lobes.pdf"), device = "pdf", width = 10, height = 8, dpi = 300)


# Treatment|Clusters|RFP / colour by rfp  -----------------------------------
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = factor(rfp, levels = c("Low", "High")))) +
  geom_alluvium(aes(fill = rfp), width = 1/6) +
  scale_fill_manual(values = pal.rfp) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")) 

ggsave(filename = file.path(fig.dir, "/sankey/Sankey_rfp.pdf"), device = "pdf", width = 10, height = 8, dpi = 300)


# Treatment|Clusters|RFP / colour by treatment  -----------------------------------
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = factor(rfp, levels = c("Low", "High")))) +
  geom_alluvium(aes(fill = treatment), width = 1/6) +
  scale_fill_manual(values = pal.trt) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")) 

ggsave(filename = file.path(fig.dir, "/sankey/Sankey_treatment.pdf"), device = "pdf", width = 10, height = 8, dpi = 300)


# Treatment|Clusters|RFP / colour by clusters  -----------------------------------
ggplot(df.allu, aes(y = counts, axis1 = treatment, axis2 = in_silico_clusters, axis3 = factor(rfp, levels = c("Low", "High")))) +
  geom_alluvium(aes(fill = in_silico_clusters), width = 1/6) +
  scale_fill_manual(values = pal.cl) +
  geom_stratum(width = 0.25, fill = "white", color = "grey80", linetype = "solid") +
  geom_text(stat = "stratum", infer.label = TRUE) +
  scale_x_discrete(limits = c("Hormonal status", "Cluster", "RFP status"), expand = c(.05, .05)) +
  theme(
    legend.position = "none",
    axis.ticks = element_blank(),
    axis.title.y=element_blank(),
    axis.text.y=element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    strip.background = element_rect(fill="white", colour = "white")) 

ggsave(filename = file.path(fig.dir, "/sankey/Sankey_clusters.pdf"), device = "pdf", width = 10, height = 8, dpi = 300)
```

--------------------------------------------------------------------------------
## Session Information
```{r}
sessionInfo()
```