Content - 346498ecd91ae636b0f69b4cfda539f4835e3beb - 71c7fc9/seq_analysis_git.Rmd

visit type:
https://github.com/s-meaden/landsberger

05 April 2024, 20:49:39 UTC
Tip revision: e2f20b1f9d3de7f802b704e5100c5371ad793b72 authored by s-meaden on 14 September 2018, 13:37:52 UTC
Add files via upload
Tip revision: e2f20b1
seq_analysis_git.Rmd
---
title: "deep_seq_analysis"
output: 
  html_document:
    code_folding: hide
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


```{r}
library(tidyverse)
library(RColorBrewer)
library(plotly)
library(htmlwidgets)
```

Set working directory to location of pileup files counting allele frequency data:
```{r}
setwd("/Users/seanmeaden/Dropbox/CRISPR Postdoc/Other_projects/Mariann_Tipping_Points/Data/sequence_data/pileups/")
list.files(path=".")
```

Put all together in a list
```{r}
# Make empty list
allcnts.list <- list()

# Pileup files for each sample
files <- list.files(path=".", pattern="*bam_pileup", recursive=FALSE)
files
# Read all into list
for (i in 1:length(files))
{
allcnts.list[[i]]<-read.table(files[i] ,sep="\t",col.names=c("chr","pos","ref","depth","refcnt","altcnt","ncnt","indelcnt"))
print(files[i])
}
# Name same as samples
names(allcnts.list)<-files

#Check
names(allcnts.list)
allcnts.list[1]
```

Convert to df to plot:
```{r}
all <- do.call("rbind", allcnts.list)
all$id <- rep(names(allcnts.list), sapply(allcnts.list, nrow))

#Tidy up names
all$id<-gsub("\\_\\.bam_pileup_out.txt", "", all$id)
all$id<-paste("sample", all$id, sep ="_")
row.names(all)<-rep(1:nrow(all))
head(all)
```

Get target sequences from each BIM and use in plot (from the mapped spacer reads):
```{r}
targs<-read.table("../target_seqs1.bed")
colnames(targs)<-c("spacer", "start", "stop")
head(targs)
# Add in PAM sequence at 5' end:
targs$stop<-targs$stop + 2
```

Sample information:
sample 1: lab DMS3vir
      2:   lab DMS3virAcrF1
      3:  lab DMS3virAcrF4
      4:  lab DMS3vir evolved on WT
      5:  lab DMS3virAcrF1 evolved on WT
      6:    lab DMS3virAcrF4 evolved on WT
      7:    lab DMS3virAcrF1 evolved on BIM2
      8:    lab DMS3virAcrF4 evolved on BIM2
      9:    lab DMS3virAcrF1 evolved on BIM5
      10:   lab DMS3virAcrF4 evolved on BIM5
```{r}

all <- all %>%
  mutate(strain = case_when(id == "sample_1" ~ "DMS3vir",
                            id == "sample_2" ~ "AcrF1",
                            id == "sample_3" ~ "AcrF4",
                            id == "sample_4" ~ "DMS3vir",
                            id == "sample_5" ~ "AcrF1",
                            id == "sample_6" ~ "AcrF4",
                            id == "sample_7" ~ "AcrF1",
                            id == "sample_8" ~ "AcrF4",
                            id == "sample_9" ~ "AcrF1",
                            id == "sample_10" ~ "AcrF4"), 
         treatment = case_when(id == "sample_1" ~ "ANC",
                            id == "sample_2" ~ "ANC",
                            id == "sample_3" ~ "ANC",
                            id == "sample_4" ~ "WT",
                            id == "sample_5" ~ "WT",
                            id == "sample_6" ~ "WT",
                            id == "sample_7" ~ "BIM2",
                            id == "sample_8" ~ "BIM2",
                            id == "sample_9" ~ "BIM5",
                            id == "sample_10" ~ "BIM5")) 
head(all)
```

#Coverage plot:
```{r}
all %>%
  ggplot(., aes(pos, depth))+
  geom_line( aes(group = id, color = strain), alpha = 0.4)+
  scale_color_brewer(type = "qual", palette = 6)+
  xlab(" Genome Positon (log10)")+
  ylab(" Coverage")+
  theme_classic()+
  theme(text = element_text(size = 20))

# Sanity check: Zoom in and label c-repressor that was removed (to make Vir)
all %>%
  filter(pos > 200 & pos < 2000) %>%
  ggplot(., aes(pos, depth))+
  geom_line( aes(group = id, color = strain), alpha = 0.4)+
  scale_color_brewer(type = "qual", palette = 6)+
  xlab(" Genome Positon (log10)")+
  ylab(" Coverage")+
  theme_classic()+
  theme(text = element_text(size = 20))+
  geom_vline(xintercept = c(500, 700), color = "black", linetype = "dashed")
```
 Looks pretty close to prediction. Maybe c-repressor truncation bigger than we though? 

 Zoom in and label Acr region recombined region (i.e. from JBD phages)
```{r}
all %>%
  filter(pos > 16000 & pos < 25000) %>%
  ggplot(., aes(pos, depth))+
  geom_line( aes(group = id, color = strain), alpha = 0.4)+
  scale_color_brewer(type = "qual", palette = 6)+
  xlab(" Genome Positon (log10)")+
  ylab(" Coverage")+
  theme_classic()+
  theme(text = element_text(size = 20))+
    geom_vline(xintercept = c(17700, 19357), color = "black", linetype = "dashed")
# Looks sensible?
```

Get ratio of ref/alt. Distinguish between high/low freq SNPs.
Need to define a biologically relevant threshold here vs. noise from sequencing / basecall
```{r}
# Hard filter sites with coverage low reads.
head(all)
#all<-subset(all, depth > 1000)

# Get SNP frequency

all$freq <- (all$depth - all$refcnt)/all$depth

head(all)
hist(all$freq, breaks = 100)
```

# Plot raw w/ no cutoffs:

```{r}
snps<-all
head(snps)
```


# Zoom in on protospacers:
Focus on PAM and seed region only
```{r}
targs

snps <- snps %>%
  mutate(spacer = case_when(
      pos >= 27872 & pos <= 27882 ~ "spacer1",
      pos >= 28031 & pos <= 28041 ~ "spacer2",
      pos >= 30696 & pos <= 30706 ~ "spacer3",
      pos >= 29990 & pos <= 30000 ~ "spacer4",
      pos >= 35792 & pos <= 35802 ~ "spacer5",
      TRUE ~ "other"
      )) #%>%
  #group_by(spacer) %>% summarise(my_cnt1 = n())
  #ggplot(., aes(pos, ratio))+
  #geom_point(aes(color = "spacer"))
head(snps)
```


# Find max SNP freq per spacer:
```{r}
spacer1<-subset(snps, spacer == "spacer1")
head(spacer1)
spacer1

spacers<-c("spacer1", "spacer2", "spacer3", "spacer4", "spacer5")

snps %>%
  group_by(spacer, strain, treatment) %>%
  filter(spacer %in% spacers) %>%
  # Just check WT first:
  summarise(max = max(freq)) %>%
  ggplot(., aes(treatment, max))+
  geom_boxplot(aes(color = strain))+
  scale_y_log10()+
  theme_classic()+
  xlab("Treatment")+
  ylab("Max SNP Frequency")+
  theme(text = element_text(size = 20))+
  facet_wrap(~ spacer)
```

# A couple of high freq SNPs in DMS3 on WT
```{r}
head(snps)

snps %>%
  filter(., spacer %in% spacers) %>%
  ggplot(., aes(pos, freq))+
  geom_jitter( aes(color = paste(strain, treatment, sep = "_")))+
  facet_wrap(~ spacer, ncol = 1, scales = "free_x")+
  theme_bw()
```
Note y scale. All SNPs really low freq apart from spacer 1. Zoom in on spacer 1:
```{r}
snps$sample<-paste(snps$strain, snps$treatment, sep = "_")
p <- snps %>%
  filter(., spacer == "spacer1") %>%
  ggplot(., aes(pos, freq))+
  geom_point( aes(color = sample), width = 2)+
  theme_bw()+
  xlab("Position")+
  ylab("Variant Frequency")+
  theme(text = element_text(size = 20))+
  labs(color = "Sample")
p

P<-ggplotly(p, width = 1500, tooltip = c('sample', 'pos', 'freq'))
P
```

# SNP frequency across all target sites:
```{r}
snps %>%
  filter(., spacer %in% spacers) %>%
  group_by(., spacer, treatment, strain) %>%
  summarise(mean_freq = mean(freq), sd = sd(freq), se = sd(freq) / sqrt(length(freq))) %>%
  ggplot(., aes(treatment, mean_freq, fill = strain))+
  geom_bar(stat = "identity", position = "dodge")+
  #geom_pointrange(aes(ymin = mean_freq - se, ymax = mean_freq + se, color = strain))+
  theme_classic()+
  scale_fill_brewer(type = "qual", palette = 6)+
  geom_errorbar( aes(ymin = mean_freq - se, ymax = mean_freq + se), , position = "dodge", alpha = 0.7, size = 0.5)+
  xlab("Treatment")+
  ylab("Mean SNP Frequency across\ntarget sites")+
  theme(text = element_text(size = 20))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  facet_wrap(~ spacer)+
  labs(fill = "Strain")
```

# Show w/ pointrange / boxplot:
```{r}
snps
snps %>%
  filter(., spacer %in% spacers) %>%
  group_by(., spacer, treatment, strain) %>%
  summarise(mean_freq = mean(freq), sd = sd(freq), se = sd(freq) / sqrt(length(freq))) %>%
  ggplot(., aes(treatment, mean_freq))+
  #geom_pointrange( aes(y = mean_freq, ymin = mean_freq - se, ymax = mean_freq + se, color = strain), position = position_dodge(width = 1), fill = "white", shape = 22)+
  geom_pointrange( aes(y = mean_freq, ymin = mean_freq - se, ymax = mean_freq + se, color = strain), position = position_dodge(width = 1))+
  theme_classic()+
  scale_color_brewer(type = "qual", palette = 6)+
  xlab("Treatment")+
  ylab("Mean SNP Frequency across\ntarget sites")+
  theme(text = element_text(size = 20))+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  facet_wrap(~ spacer)+
  labs(color = "Strain")
```