https://github.com/liuyifang/Drosophila-PDGF-VEGF-signaling-from-muscles-to-hepatocyte-like-cells-protects-against-obesity
Raw File
Tip revision: f1ad799015c901dad378f6e488dc38f4a19fd703 authored by Yifang Liu on 06 November 2020, 22:49:13 UTC
Add files via upload
Tip revision: f1ad799
2020-02-05_prepare_EGFP_matrix.Rmd
---
title: "prepare EGFP matrix"
author: "Yifang Liu"
date: "`r Sys.Date()`"
output:
  rmdformats::html_clean:
    code_folding: hide
    fig_width: 10
    fig_height: 10
    highlight: kate
    thumbnails: false
    lightbox: true
    gallery: true
---

```{r knitr_init, echo=FALSE, cache=FALSE}
library(knitr)
library(rmdformats)

options(max.print = 200)
opts_chunk$set(echo = TRUE,
               cache = FALSE,
               prompt = FALSE,
               tidy = TRUE,
               comment = NA,
               message = FALSE,
               warning = FALSE,
               dev = c('png', 'pdf'),
               fig.width = 10,
               fig.height = 10,
               fig.align = "center",
               fig.path = 'PDF_2020-02-05_prepare_EGFP_matrix/',
               dpi = 72)
opts_knit$set(width = 75)
```

```{r setup}
set.seed(123)

# Suppress loading messages
suppressPackageStartupMessages({
  library(Matrix)
  library(dplyr)
  library(tidyverse)
  library(Seurat)
  library(cowplot)
  library(Rcpp)
  library(harmony)
})

theme_set(theme_cowplot())
```

```{r}
# G0_matrix
G0_matrix <- readRDS("/Users/yifang/Projects/With/Arpan/Harmony/TSC1_EGFP_SoupX_fixed_0.45_191029/TSC1_EGFP_SoupX_fixed_0.45_191028/G0.Rds") # matrix after SoupX
colnames(G0_matrix) <- paste0("G0_", colnames(G0_matrix))

# G1_matrix
G1_matrix <- readRDS("/Users/yifang/Projects/With/Arpan/Harmony/TSC1_EGFP_SoupX_fixed_0.45_191029/TSC1_EGFP_SoupX_fixed_0.45_191028/G1.Rds")
colnames(G1_matrix) <- paste0("G1_", colnames(G1_matrix))

# # T0_matrix
# T0_matrix <- readRDS("/Users/yifang/Projects/With/Arpan/Harmony/TSC1_EGFP_SoupX_fixed_0.45_191029/TSC1_EGFP_SoupX_fixed_0.45_191028/T0.Rds")
# colnames(T0_matrix) <- paste0("T0_", colnames(T0_matrix))
#
# # T1_matrix
# T1_matrix <- readRDS("/Users/yifang/Projects/With/Arpan/Harmony/TSC1_EGFP_SoupX_fixed_0.45_191029/TSC1_EGFP_SoupX_fixed_0.45_191028/T1.Rds")
# colnames(T1_matrix) <- paste0("T1_", colnames(T1_matrix))

dir.create("Data")
```

```{r}
# G0
G0_metadata <- colnames(G0_matrix)
G0_metadata <- as.data.frame(G0_metadata)
colnames(G0_metadata) <- "barcode"
G0_metadata$nUMI <- colSums(G0_matrix)
G0_metadata$nRNA <- colSums(G0_matrix > 0)
mito <- grep(pattern = "^mt:", x = rownames(G0_matrix), value = TRUE)
G0_metadata$percent_mito <- colSums(G0_matrix[mito, ]) / colSums(G0_matrix)

G0_nRNA_lower_bound <- 10^(mean(log10(G0_metadata$nRNA)) -
            2*sd(log10(G0_metadata$nRNA)))
G0_nRNA_upper_bound <- 10^(mean(log10(G0_metadata$nRNA)) +
            2*sd(log10(G0_metadata$nRNA)))

G0_nUMI_lower_bound <- 10^(mean(log10(G0_metadata$nUMI)) -
            2*sd(log10(G0_metadata$nUMI)))
G0_nUMI_upper_bound <- 10^(mean(log10(G0_metadata$nUMI)) +
            2*sd(log10(G0_metadata$nUMI)))

G0_metadata_no0 <- subset(G0_metadata, percent_mito > 0)
G0_percent_mito_lower_bound <- 10^(mean(log10(G0_metadata_no0$percent_mito)) -
            2*sd(log10(G0_metadata_no0$percent_mito)))
G0_percent_mito_upper_bound <- 10^(mean(log10(G0_metadata_no0$percent_mito)) +
            2*sd(log10(G0_metadata_no0$percent_mito)))

# G1
G1_metadata <- colnames(G1_matrix)
G1_metadata <- as.data.frame(G1_metadata)
colnames(G1_metadata) <- "barcode"
G1_metadata$nUMI <- colSums(G1_matrix)
G1_metadata$nRNA <- colSums(G1_matrix > 0)
mito <- grep(pattern = "^mt:", x = rownames(G1_matrix), value = TRUE)
G1_metadata$percent_mito <- colSums(G1_matrix[mito, ]) / colSums(G1_matrix)

G1_nRNA_lower_bound <- 10^(mean(log10(G1_metadata$nRNA)) -
            2*sd(log10(G1_metadata$nRNA)))
G1_nRNA_upper_bound <- 10^(mean(log10(G1_metadata$nRNA)) +
            2*sd(log10(G1_metadata$nRNA)))

G1_nUMI_lower_bound <- 10^(mean(log10(G1_metadata$nUMI)) -
            2*sd(log10(G1_metadata$nUMI)))
G1_nUMI_upper_bound <- 10^(mean(log10(G1_metadata$nUMI)) +
            2*sd(log10(G1_metadata$nUMI)))

G1_metadata_no0 <- subset(G1_metadata, percent_mito > 0)
G1_percent_mito_lower_bound <- 10^(mean(log10(G1_metadata_no0$percent_mito)) -
            2*sd(log10(G1_metadata_no0$percent_mito)))
G1_percent_mito_upper_bound <- 10^(mean(log10(G1_metadata_no0$percent_mito)) +
            2*sd(log10(G1_metadata_no0$percent_mito)))

# # T0
# T0_metadata <- colnames(T0_matrix)
# T0_metadata <- as.data.frame(T0_metadata)
# colnames(T0_metadata) <- "barcode"
# T0_metadata$nUMI <- colSums(T0_matrix)
# T0_metadata$nRNA <- colSums(T0_matrix > 0)
# mito <- grep(pattern = "^mt:", x = rownames(T0_matrix), value = TRUE)
# T0_metadata$percent_mito <- colSums(T0_matrix[mito, ]) / colSums(T0_matrix)
#
# T0_nRNA_lower_bound <- 10^(mean(log10(T0_metadata$nRNA)) -
#             2*sd(log10(T0_metadata$nRNA)))
# T0_nRNA_upper_bound <- 10^(mean(log10(T0_metadata$nRNA)) +
#             2*sd(log10(T0_metadata$nRNA)))
#
# T0_nUMI_lower_bound <- 10^(mean(log10(T0_metadata$nUMI)) -
#             2*sd(log10(T0_metadata$nUMI)))
# T0_nUMI_upper_bound <- 10^(mean(log10(T0_metadata$nUMI)) +
#             2*sd(log10(T0_metadata$nUMI)))
#
# T0_metadata_no0 <- subset(T0_metadata, percent_mito > 0)
# T0_percent_mito_lower_bound <- 10^(mean(log10(T0_metadata_no0$percent_mito)) -
#             2*sd(log10(T0_metadata_no0$percent_mito)))
# T0_percent_mito_upper_bound <- 10^(mean(log10(T0_metadata_no0$percent_mito)) +
#             2*sd(log10(T0_metadata_no0$percent_mito)))
#
# # T1
# T1_metadata <- colnames(T1_matrix)
# T1_metadata <- as.data.frame(T1_metadata)
# colnames(T1_metadata) <- "barcode"
# T1_metadata$nUMI <- colSums(T1_matrix)
# T1_metadata$nRNA <- colSums(T1_matrix > 0)
# mito <- grep(pattern = "^mt:", x = rownames(T1_matrix), value = TRUE)
# T1_metadata$percent_mito <- colSums(T1_matrix[mito, ]) / colSums(T1_matrix)
#
# T1_nRNA_lower_bound <- 10^(mean(log10(T1_metadata$nRNA)) -
#             2*sd(log10(T1_metadata$nRNA)))
# T1_nRNA_upper_bound <- 10^(mean(log10(T1_metadata$nRNA)) +
#             2*sd(log10(T1_metadata$nRNA)))
#
# T1_nUMI_lower_bound <- 10^(mean(log10(T1_metadata$nUMI)) -
#             2*sd(log10(T1_metadata$nUMI)))
# T1_nUMI_upper_bound <- 10^(mean(log10(T1_metadata$nUMI)) +
#             2*sd(log10(T1_metadata$nUMI)))
#
# T1_metadata_no0 <- subset(T1_metadata, percent_mito > 0)
# T1_percent_mito_lower_bound <- 10^(mean(log10(T1_metadata_no0$percent_mito)) -
#             2*sd(log10(T1_metadata_no0$percent_mito)))
# T1_percent_mito_upper_bound <- 10^(mean(log10(T1_metadata_no0$percent_mito)) +
#             2*sd(log10(T1_metadata_no0$percent_mito)))
```

# G0 QC {.tabset}

## nUMI

```{r G0_QC_nUMI}
ggplot(data = G0_metadata, aes(x = nUMI)) + geom_histogram(binwidth = 10) + xlim(0, 8000) +
  geom_vline(xintercept = G0_nUMI_lower_bound) +
  geom_vline(xintercept = G0_nUMI_upper_bound)
```

## nRNA

```{r G0_QC_nRNA}
ggplot(data = G0_metadata, aes(x = nRNA)) + geom_histogram(binwidth = 10) + xlim(0, 1400) +
  geom_vline(xintercept = G0_nRNA_lower_bound) +
  geom_vline(xintercept = G0_nRNA_upper_bound)
```

## percent_mito

```{r G0_QC_percent_mito}
ggplot(data = G0_metadata, aes(x = percent_mito)) + geom_histogram(binwidth = 0.01) + xlim(0, 1) +
  geom_vline(xintercept = G0_percent_mito_upper_bound)
```

# G1 QC {.tabset}

## nUMI

```{r G1_QC_nUMI}
ggplot(data = G1_metadata, aes(x = nUMI)) + geom_histogram(binwidth = 10) + xlim(0, 8000) +
  geom_vline(xintercept = G1_nUMI_lower_bound) +
  geom_vline(xintercept = G1_nUMI_upper_bound)
```

## nRNA

```{r G1_QC_nRNA}
ggplot(data = G1_metadata, aes(x = nRNA)) + geom_histogram(binwidth = 10) + xlim(0, 1400) +
  geom_vline(xintercept = G1_nRNA_lower_bound) +
  geom_vline(xintercept = G1_nRNA_upper_bound)
```

## percent_mito

```{r G1_QC_percent_mito}
ggplot(data = G1_metadata, aes(x = percent_mito)) + geom_histogram(binwidth = 0.01) + xlim(0, 1) +
  geom_vline(xintercept = G1_percent_mito_upper_bound)
```

<!-- # T0 QC {.tabset} -->

<!-- ## nUMI -->

<!-- ```{r T0_QC_nUMI} -->
<!-- ggplot(data = T0_metadata, aes(x = nUMI)) + geom_histogram(binwidth = 10) + xlim(0, 8000) + -->
<!--   geom_vline(xintercept = T0_nUMI_lower_bound) + -->
<!--   geom_vline(xintercept = T0_nUMI_upper_bound) -->
<!-- ``` -->

<!-- ## nRNA -->

<!-- ```{r T0_QC_nRNA} -->
<!-- ggplot(data = T0_metadata, aes(x = nRNA)) + geom_histogram(binwidth = 10) + xlim(0, 1400) + -->
<!--   geom_vline(xintercept = T0_nRNA_lower_bound) + -->
<!--   geom_vline(xintercept = T0_nRNA_upper_bound) -->
<!-- ``` -->

<!-- ## percent_mito -->

<!-- ```{r T0_QC_percent_mito} -->
<!-- ggplot(data = T0_metadata, aes(x = percent_mito)) + geom_histogram(binwidth = 0.01) + xlim(0, 1) + -->
<!--   geom_vline(xintercept = T0_percent_mito_upper_bound) -->
<!-- ``` -->

<!-- # T1 QC {.tabset} -->

<!-- ## nUMI -->

<!-- ```{r T1_QC_nUMI} -->
<!-- ggplot(data = T1_metadata, aes(x = nUMI)) + geom_histogram(binwidth = 10) + xlim(0, 8000) + -->
<!--   geom_vline(xintercept = T1_nUMI_lower_bound) + -->
<!--   geom_vline(xintercept = T1_nUMI_upper_bound) -->
<!-- ``` -->

<!-- ## nRNA -->

<!-- ```{r T1_QC_nRNA} -->
<!-- ggplot(data = T1_metadata, aes(x = nRNA)) + geom_histogram(binwidth = 10) + xlim(0, 1400) + -->
<!--   geom_vline(xintercept = T1_nRNA_lower_bound) + -->
<!--   geom_vline(xintercept = T1_nRNA_upper_bound) -->
<!-- ``` -->

<!-- ## percent_mito -->

<!-- ```{r T1_QC_percent_mito} -->
<!-- ggplot(data = T1_metadata, aes(x = percent_mito)) + geom_histogram(binwidth = 0.01) + xlim(0, 1) + -->
<!--   geom_vline(xintercept = T1_percent_mito_upper_bound) -->
<!-- ``` -->

# Filter

```{r filter}
G0_metadata_after_filter <- subset(G0_metadata, (nRNA > G0_nRNA_lower_bound  & nRNA < G0_nRNA_upper_bound) & (nUMI > G0_nUMI_lower_bound & nUMI < G0_nUMI_upper_bound) & (percent_mito < G0_percent_mito_upper_bound))

G1_metadata_after_filter <- subset(G1_metadata, (nRNA > G1_nRNA_lower_bound  & nRNA < G1_nRNA_upper_bound) & (nUMI > G1_nUMI_lower_bound & nUMI < G1_nUMI_upper_bound) & (percent_mito < G1_percent_mito_upper_bound))

# T0_metadata_after_filter <- subset(T0_metadata, (nRNA > T0_nRNA_lower_bound  & nRNA < T0_nRNA_upper_bound) & (nUMI > T0_nUMI_lower_bound & nUMI < T0_nUMI_upper_bound) & (percent_mito < T0_percent_mito_upper_bound))
#
# T1_metadata_after_filter <- subset(T1_metadata, (nRNA > T1_nRNA_lower_bound  & nRNA < T1_nRNA_upper_bound) & (nUMI > T1_nUMI_lower_bound & nUMI < T1_nUMI_upper_bound) & (percent_mito < T1_percent_mito_upper_bound))

G0_matrix_after_filter <- G0_matrix[ , G0_metadata_after_filter$barcode]
G1_matrix_after_filter <- G1_matrix[ , G1_metadata_after_filter$barcode]
# T0_matrix_after_filter <- T0_matrix[ , T0_metadata_after_filter$barcode]
# T1_matrix_after_filter <- T1_matrix[ , T1_metadata_after_filter$barcode]

# all.equal(row.names(T0_matrix_after_filter), row.names(T1_matrix_after_filter))
# all.equal(row.names(T0_matrix_after_filter), row.names(G0_matrix_after_filter))
# all.equal(row.names(T0_matrix_after_filter), row.names(G1_matrix_after_filter))

# G0_G1_T0_T1_matrix_after_filter <- cbind(
#   G0_matrix_after_filter,
#   G1_matrix_after_filter,
#   T0_matrix_after_filter,
#   T1_matrix_after_filter)

G0_G1_matrix_after_filter <- cbind(
  G0_matrix_after_filter,
  G1_matrix_after_filter)

remove_genes <- c("EGFP", "Tsc1", "gig")
EGFP_matrix <- G0_G1_matrix_after_filter[!(rownames(G0_G1_matrix_after_filter) %in% remove_genes), ]

saveRDS(EGFP_matrix, file = "Data/2020-02-05_EGFP_matrix.Rds")
```

# QC table before filter

```{r}
G0_Num_Cells <- nrow(G0_metadata)
G1_Num_Cells <- nrow(G1_metadata)
# T0_Num_Cells <- nrow(T0_metadata)
# T1_Num_Cells <- nrow(T1_metadata)

G0_Median_Genes <- median(G0_metadata$nRNA)
G1_Median_Genes <- median(G1_metadata$nRNA)
# T0_Median_Genes <- median(T0_metadata$nRNA)
# T1_Median_Genes <- median(T1_metadata$nRNA)

G0_Median_UMIs <- median(G0_metadata$nUMI)
G1_Median_UMIs <- median(G1_metadata$nUMI)
# T0_Median_UMIs <- median(T0_metadata$nUMI)
# T1_Median_UMIs <- median(T1_metadata$nUMI)

# QC_table_before_filter <- c("G0", "G1", "T0", "T1") %>% as.data.frame()
QC_table_before_filter <- c("G0", "G1") %>% as.data.frame()
colnames(QC_table_before_filter) <- "Sample"

# QC_table_before_filter$Num_Cells <- c(G0_Num_Cells, G1_Num_Cells, T0_Num_Cells, T1_Num_Cells)
# QC_table_before_filter$Median_Genes <- c(G0_Median_Genes, G1_Median_Genes, T0_Median_Genes, T1_Median_Genes)
# QC_table_before_filter$Median_UMIs <- c(G0_Median_UMIs, G1_Median_UMIs, T0_Median_UMIs, T1_Median_UMIs)
QC_table_before_filter$Num_Cells <- c(G0_Num_Cells, G1_Num_Cells)
QC_table_before_filter$Median_Genes <- c(G0_Median_Genes, G1_Median_Genes)
QC_table_before_filter$Median_UMIs <- c(G0_Median_UMIs, G1_Median_UMIs)

write.table(QC_table_before_filter,
            file = "Data/QC_table_before_filter.txt",
            row.names = FALSE,
            sep = "\t")

DT::datatable(QC_table_before_filter)
```

# QC table after filter

```{r}
G0_Num_Cells <- nrow(G0_metadata_after_filter)
G1_Num_Cells <- nrow(G1_metadata_after_filter)
# T0_Num_Cells <- nrow(T0_metadata_after_filter)
# T1_Num_Cells <- nrow(T1_metadata_after_filter)

G0_Median_Genes <- median(G0_metadata_after_filter$nRNA)
G1_Median_Genes <- median(G1_metadata_after_filter$nRNA)
# T0_Median_Genes <- median(T0_metadata_after_filter$nRNA)
# T1_Median_Genes <- median(T1_metadata_after_filter$nRNA)

G0_Median_UMIs <- median(G0_metadata_after_filter$nUMI)
G1_Median_UMIs <- median(G1_metadata_after_filter$nUMI)
# T0_Median_UMIs <- median(T0_metadata_after_filter$nUMI)
# T1_Median_UMIs <- median(T1_metadata_after_filter$nUMI)

# QC_table_after_filter <- c("G0", "G1", "T0", "T1") %>% as.data.frame()
QC_table_after_filter <- c("G0", "G1") %>% as.data.frame()
colnames(QC_table_after_filter) <- "Sample"

# QC_table_after_filter$Num_Cells <- c(G0_Num_Cells, G1_Num_Cells, T0_Num_Cells, T1_Num_Cells)
# QC_table_after_filter$Median_Genes <- c(G0_Median_Genes, G1_Median_Genes, T0_Median_Genes, T1_Median_Genes)
# QC_table_after_filter$Median_UMIs <- c(G0_Median_UMIs, G1_Median_UMIs, T0_Median_UMIs, T1_Median_UMIs)
QC_table_after_filter$Num_Cells <- c(G0_Num_Cells, G1_Num_Cells)
QC_table_after_filter$Median_Genes <- c(G0_Median_Genes, G1_Median_Genes)
QC_table_after_filter$Median_UMIs <- c(G0_Median_UMIs, G1_Median_UMIs)

write.table(QC_table_after_filter,
            file = "Data/QC_table_after_filter.txt",
            row.names = FALSE,
            sep = "\t")

DT::datatable(QC_table_after_filter)
```

# Notes

2020-02-05:

  * generate Data/2020-02-05_EGFP_matrix.Rds.

Sat Nov 30, 2019:

  * G0 G1 T0 T1 QC.

# Session Info

```{r sessioninfo, message=TRUE}
sessionInfo()
```
back to top