Content - c02d77a25f0105f5fcb02bc33f5dc40488fdad54

Permalink
##############################################################################################################
### Ribo-seq analysis and visualization

library(tidyverse)
library(RColorBrewer)
library(scales)
library(patchwork)

# set working directory
setwd("/Users/cwnelson88/scripts_NGS/SARS-CoV-2-ORF3d/")

##############################################################################################################
### MAPPED READS BY POSITION OF FIRST NUCLEOTIDE IN GENOME

##############################################################################################################
### INPUT
mapped_reads_by_readlength <- read_tsv("data/mapped_reads_by_readlength.tsv")
mapped_reads_by_readlength$position <- mapped_reads_by_readlength$position + 1
mapped_reads_by_readlength
# 2,610,414 x 6
# sample      read_length chromosome  position frame read_count

# input and construct SAMPLE METADATA translator
sample_metadata_joiner <- read_tsv("data/riboseq_sample_metadata.tsv")

# Join
mapped_reads_by_readlength <- left_join(x = mapped_reads_by_readlength, y = sample_metadata_joiner, by = "sample")
mapped_reads_by_readlength
# still 2,610,414 x 6? YES, it's 2,610,414 x 9

# Another condition option
mapped_reads_by_readlength$condition_combn <- paste0(mapped_reads_by_readlength$treatment, " / ", mapped_reads_by_readlength$time)
unique(mapped_reads_by_readlength$condition_combn)
mapped_reads_by_readlength$condition_combn <- factor(mapped_reads_by_readlength$condition_combn,
                                                     levels = c("CHX / 5 hpi", "CHX / 24 hpi",
                                                                "Harr / 5 hpi", "Harr / 24 hpi",
                                                                "LTM / 5 hpi", "LTM / 24 hpi",
                                                                "mRNA / 5 hpi", "mRNA / 24 hpi")) # each of these has two samples

# Factor
mapped_reads_by_readlength$frame <- factor(mapped_reads_by_readlength$frame,
                                           levels = c(0, 1, 2), # was 0-based when determined
                                           labels = c('Frame 1', 'Frame 2', 'Frame 3')) 




####################################################################################################
### codon position x read length while excluding OLGs

# Strip away superfluous columns
(mapped_reads_by_readlength_wCDS <- dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition_combn))

### Add CDS position, knowing position is 1-based ###
mapped_reads_by_readlength_wCDS$region_type <- "UTR"
mapped_reads_by_readlength_wCDS$gene <- NA
mapped_reads_by_readlength_wCDS$CDS <- NA
mapped_reads_by_readlength_wCDS$dist_from_end <- NA

#ORF1ab_1:266-13468
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 266 & mapped_reads_by_readlength_wCDS$position <= 13468, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 266 & mapped_reads_by_readlength_wCDS$position <= 13468, ]$gene <- "ORF1ab_1"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 266 & mapped_reads_by_readlength_wCDS$position <= 13468, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 266 & mapped_reads_by_readlength_wCDS$position <= 13468, ]$position - 266 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 266 & mapped_reads_by_readlength_wCDS$position <= 13468, ]$dist_from_end <- 
  13468 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 266 & mapped_reads_by_readlength_wCDS$position <= 13468, ]$position

#ORF1ab_2:13468-21555
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 13468 & mapped_reads_by_readlength_wCDS$position <= 21555, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 13468 & mapped_reads_by_readlength_wCDS$position <= 21555, ]$gene <- "ORF1ab_2"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 13468 & mapped_reads_by_readlength_wCDS$position <= 21555, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 13468 & mapped_reads_by_readlength_wCDS$position <= 21555, ]$position - 13468 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 13468 & mapped_reads_by_readlength_wCDS$position <= 21555, ]$dist_from_end <- 
  21555 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 13468 & mapped_reads_by_readlength_wCDS$position <= 21555, ]$position

#S:21563-25384
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 21563 & mapped_reads_by_readlength_wCDS$position <= 25384, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 21563 & mapped_reads_by_readlength_wCDS$position <= 25384, ]$gene <- "S"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 21563 & mapped_reads_by_readlength_wCDS$position <= 25384, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 21563 & mapped_reads_by_readlength_wCDS$position <= 25384, ]$position - 21563 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 21563 & mapped_reads_by_readlength_wCDS$position <= 25384, ]$dist_from_end <- 
  25384 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 21563 & mapped_reads_by_readlength_wCDS$position <= 25384, ]$position

#ORF3a:25393-26220
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25393 & mapped_reads_by_readlength_wCDS$position <= 26220, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25393 & mapped_reads_by_readlength_wCDS$position <= 26220, ]$gene <- "ORF3a"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25393 & mapped_reads_by_readlength_wCDS$position <= 26220, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25393 & mapped_reads_by_readlength_wCDS$position <= 26220, ]$position - 25393 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25393 & mapped_reads_by_readlength_wCDS$position <= 26220, ]$dist_from_end <- 
  26220 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25393 & mapped_reads_by_readlength_wCDS$position <= 26220, ]$position

#E:26245-26472
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26245 & mapped_reads_by_readlength_wCDS$position <= 26472, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26245 & mapped_reads_by_readlength_wCDS$position <= 26472, ]$gene <- "E"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26245 & mapped_reads_by_readlength_wCDS$position <= 26472, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26245 & mapped_reads_by_readlength_wCDS$position <= 26472, ]$position - 26245 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26245 & mapped_reads_by_readlength_wCDS$position <= 26472, ]$dist_from_end <- 
  26472 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26245 & mapped_reads_by_readlength_wCDS$position <= 26472, ]$position

#M:26523-27191
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26523 & mapped_reads_by_readlength_wCDS$position <= 27191, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26523 & mapped_reads_by_readlength_wCDS$position <= 27191, ]$gene <- "M"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26523 & mapped_reads_by_readlength_wCDS$position <= 27191, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26523 & mapped_reads_by_readlength_wCDS$position <= 27191, ]$position - 26523 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26523 & mapped_reads_by_readlength_wCDS$position <= 27191, ]$dist_from_end <- 
  27191 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 26523 & mapped_reads_by_readlength_wCDS$position <= 27191, ]$position

#ORF6:27202-27387
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27202 & mapped_reads_by_readlength_wCDS$position <= 27387, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27202 & mapped_reads_by_readlength_wCDS$position <= 27387, ]$gene <- "ORF6"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27202 & mapped_reads_by_readlength_wCDS$position <= 27387, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27202 & mapped_reads_by_readlength_wCDS$position <= 27387, ]$position - 27202 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27202 & mapped_reads_by_readlength_wCDS$position <= 27387, ]$dist_from_end <- 
  27387 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27202 & mapped_reads_by_readlength_wCDS$position <= 27387, ]$position

#ORF7a:27394-27759
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27394 & mapped_reads_by_readlength_wCDS$position <= 27759, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27394 & mapped_reads_by_readlength_wCDS$position <= 27759, ]$gene <- "ORF7a"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27394 & mapped_reads_by_readlength_wCDS$position <= 27759, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27394 & mapped_reads_by_readlength_wCDS$position <= 27759, ]$position - 27394 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27394 & mapped_reads_by_readlength_wCDS$position <= 27759, ]$dist_from_end <- 
  27759 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27394 & mapped_reads_by_readlength_wCDS$position <= 27759, ]$position

#ORF7b:27756-27887
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27756 & mapped_reads_by_readlength_wCDS$position <= 27887, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27756 & mapped_reads_by_readlength_wCDS$position <= 27887, ]$gene <- "ORF7b"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27756 & mapped_reads_by_readlength_wCDS$position <= 27887, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27756 & mapped_reads_by_readlength_wCDS$position <= 27887, ]$position - 27756 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27756 & mapped_reads_by_readlength_wCDS$position <= 27887, ]$dist_from_end <- 
  27887 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27756 & mapped_reads_by_readlength_wCDS$position <= 27887, ]$position

#ORF8:27894-28259
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27894 & mapped_reads_by_readlength_wCDS$position <= 28259, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27894 & mapped_reads_by_readlength_wCDS$position <= 28259, ]$gene <- "ORF8"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27894 & mapped_reads_by_readlength_wCDS$position <= 28259, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27894 & mapped_reads_by_readlength_wCDS$position <= 28259, ]$position - 27894 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27894 & mapped_reads_by_readlength_wCDS$position <= 28259, ]$dist_from_end <- 
  28259 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 27894 & mapped_reads_by_readlength_wCDS$position <= 28259, ]$position

#N:28274-29533
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28274 & mapped_reads_by_readlength_wCDS$position <= 29533, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28274 & mapped_reads_by_readlength_wCDS$position <= 29533, ]$gene <- "N"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28274 & mapped_reads_by_readlength_wCDS$position <= 29533, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28274 & mapped_reads_by_readlength_wCDS$position <= 29533, ]$position - 28274 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28274 & mapped_reads_by_readlength_wCDS$position <= 29533, ]$dist_from_end <- 
  29533 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28274 & mapped_reads_by_readlength_wCDS$position <= 29533, ]$position

#ORF10:29558-29674
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 29558 & mapped_reads_by_readlength_wCDS$position <= 29674, ]$region_type <- "gene"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 29558 & mapped_reads_by_readlength_wCDS$position <= 29674, ]$gene <- "ORF10"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 29558 & mapped_reads_by_readlength_wCDS$position <= 29674, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 29558 & mapped_reads_by_readlength_wCDS$position <= 29674, ]$position - 29558 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 29558 & mapped_reads_by_readlength_wCDS$position <= 29674, ]$dist_from_end <- 
  29674 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 29558 & mapped_reads_by_readlength_wCDS$position <= 29674, ]$position


### Exclude/categorize OLG sites ###
#pp1ab/pp1a (266..13468,13468..21555)/(266..13483)
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (13468-2) & mapped_reads_by_readlength_wCDS$position <= (13483+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (13468-2) & mapped_reads_by_readlength_wCDS$position <= (13483+2), ]$CDS <- NA
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (13468-2) & mapped_reads_by_readlength_wCDS$position <= (13483+2), ]$gene <- NA

#ORF3c:25457-25582
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (25457-2) & mapped_reads_by_readlength_wCDS$position <= (25582+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25457 & mapped_reads_by_readlength_wCDS$position <= 25582, ]$gene <- "ORF3c"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25457 & mapped_reads_by_readlength_wCDS$position <= 25582, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25457 & mapped_reads_by_readlength_wCDS$position <= 25582, ]$position - 25457 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25457 & mapped_reads_by_readlength_wCDS$position <= 25582, ]$dist_from_end <- 
  25582 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25457 & mapped_reads_by_readlength_wCDS$position <= 25582, ]$position

#ORF3d:25524-25697
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (25524-2) & mapped_reads_by_readlength_wCDS$position <= (25697+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25524 & mapped_reads_by_readlength_wCDS$position <= 25697, ]$gene <- "ORF3d"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25524 & mapped_reads_by_readlength_wCDS$position <= 25697, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25524 & mapped_reads_by_readlength_wCDS$position <= 25697, ]$position - 25524 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25524 & mapped_reads_by_readlength_wCDS$position <= 25697, ]$dist_from_end <- 
  25697 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25524 & mapped_reads_by_readlength_wCDS$position <= 25697, ]$position

#ORF3b:25814-25882
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (25814-2) & mapped_reads_by_readlength_wCDS$position <= (25882+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25814 & mapped_reads_by_readlength_wCDS$position <= 25882, ]$gene <- "ORF3b"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25814 & mapped_reads_by_readlength_wCDS$position <= 25882, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25814 & mapped_reads_by_readlength_wCDS$position <= 25882, ]$position - 25814 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25814 & mapped_reads_by_readlength_wCDS$position <= 25882, ]$dist_from_end <- 
  25882 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 25814 & mapped_reads_by_readlength_wCDS$position <= 25882, ]$position

#ORF7a/ORF7b (27394-27759)/(27756-27887)
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (27756-2) & mapped_reads_by_readlength_wCDS$position <= (27759+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (27756-2) & mapped_reads_by_readlength_wCDS$position <= (27759+2), ]$CDS <- NA
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (27756-2) & mapped_reads_by_readlength_wCDS$position <= (27759+2), ]$gene <- NA

#ORF9b:28284-28577
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (28284-2) & mapped_reads_by_readlength_wCDS$position <= (28577+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28284 & mapped_reads_by_readlength_wCDS$position <= 28577, ]$gene <- "ORF9b"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28284 & mapped_reads_by_readlength_wCDS$position <= 28577, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28284 & mapped_reads_by_readlength_wCDS$position <= 28577, ]$position - 28284 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28284 & mapped_reads_by_readlength_wCDS$position <= 28577, ]$dist_from_end <- 
  28577 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28284 & mapped_reads_by_readlength_wCDS$position <= 28577, ]$position

#ORF9c:28734-28955
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= (28734-2) & mapped_reads_by_readlength_wCDS$position <= (28955+2), ]$region_type <- "OLG"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28734 & mapped_reads_by_readlength_wCDS$position <= 28955, ]$gene <- "ORF9c"
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28734 & mapped_reads_by_readlength_wCDS$position <= 28955, ]$CDS <- 
  mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28734 & mapped_reads_by_readlength_wCDS$position <= 28955, ]$position - 28734 + 1
mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28734 & mapped_reads_by_readlength_wCDS$position <= 28955, ]$dist_from_end <- 
  28955 - mapped_reads_by_readlength_wCDS[mapped_reads_by_readlength_wCDS$position >= 28734 & mapped_reads_by_readlength_wCDS$position <= 28955, ]$position


## Add frame, 0-based (0, 1, 2)
mapped_reads_by_readlength_wCDS$frame <- NA
mapped_reads_by_readlength_wCDS[! is.na(mapped_reads_by_readlength_wCDS$CDS), ]$frame <- 
  (mapped_reads_by_readlength_wCDS[! is.na(mapped_reads_by_readlength_wCDS$CDS), ]$CDS - 1) %% 3
mapped_reads_by_readlength_wCDS$frame <- factor(mapped_reads_by_readlength_wCDS$frame,
                                                levels = c(0, 1, 2), # 0-based
                                                labels = c('Codon pos 1', 'Codon pos 2', 'Codon pos 3'))

##################################################
### PREPARE FOR PLOTTING, ALL OF SUBSET
### vvv CHANGE THIS vvv ###
genes_included <- c("ORF1ab_1", "ORF1ab_2", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF10") # all
#genes_included <- c("S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF10") # all except ORF1ab
#genes_included <- "ORF3a"
#genes_included <- "N"
regions_included <- c("gene")
#regions_included <- c("gene", "OLG")
min_read_length <- 26
max_read_length <- 32
min_CDS <- 1
min_dist_from_end <- 0 #50
### ^^^ CHANGE THIS ^^^ ###

# Summarize
(mapped_reads_by_readlength_wCDS_summary <- filter(mapped_reads_by_readlength_wCDS, region_type %in% regions_included, gene %in% genes_included,
                                                   ! is.na(CDS), CDS >= min_CDS,
                                                   ! is.na(dist_from_end), dist_from_end >= min_dist_from_end) %>%
    group_by(read_length, time, treatment, frame) %>%
    summarise(
      read_count = sum(read_count)
    )) # 360 x 5

# Read count sum for each combination (pool frames)
(mapped_reads_by_readlength_wCDS_sums <- filter(mapped_reads_by_readlength_wCDS, region_type %in% regions_included, gene %in% genes_included,
                                                ! is.na(CDS), CDS >= min_CDS,
                                                ! is.na(dist_from_end), dist_from_end >= min_dist_from_end) %>%
    group_by(read_length, time, treatment) %>%
    summarise(
      read_count_sum = sum(read_count) # 
    )) # 120 x 4

# Join
mapped_reads_by_readlength_wCDS_summary <- left_join(mapped_reads_by_readlength_wCDS_summary, mapped_reads_by_readlength_wCDS_sums, 
                                                     by = c("read_length", "time", "treatment"))

# Proportion in each frame
mapped_reads_by_readlength_wCDS_summary$prop <- mapped_reads_by_readlength_wCDS_summary$read_count / mapped_reads_by_readlength_wCDS_summary$read_count_sum

## Calculate maxes -- specific for the range of lengths we want to plot
read_count_sum_max_5hr <- max(mapped_reads_by_readlength_wCDS_summary[mapped_reads_by_readlength_wCDS_summary$time == "5 hpi" & mapped_reads_by_readlength_wCDS_summary$read_length >= 26 & mapped_reads_by_readlength_wCDS_summary$read_length <= 32, ]$read_count_sum)
prop_max_5hr <- max(mapped_reads_by_readlength_wCDS_summary[mapped_reads_by_readlength_wCDS_summary$time == "5 hpi" & mapped_reads_by_readlength_wCDS_summary$read_length >= 26 & mapped_reads_by_readlength_wCDS_summary$read_length <= 32, ]$prop)
read_count_sum_max_24hr <- max(mapped_reads_by_readlength_wCDS_summary[mapped_reads_by_readlength_wCDS_summary$time == "24 hpi" & mapped_reads_by_readlength_wCDS_summary$read_length >= 26 & mapped_reads_by_readlength_wCDS_summary$read_length <= 32, ]$read_count_sum)
prop_max_24hr <- max(mapped_reads_by_readlength_wCDS_summary[mapped_reads_by_readlength_wCDS_summary$time == "24 hpi" & mapped_reads_by_readlength_wCDS_summary$read_length >= 26 & mapped_reads_by_readlength_wCDS_summary$read_length <= 32, ]$prop)

mapped_reads_by_readlength_wCDS_summary$read_count_sum_max <- read_count_sum_max_5hr
mapped_reads_by_readlength_wCDS_summary[mapped_reads_by_readlength_wCDS_summary$time == "24 hpi", ]$read_count_sum_max <- read_count_sum_max_24hr
mapped_reads_by_readlength_wCDS_summary$prop_max <- prop_max_5hr
mapped_reads_by_readlength_wCDS_summary[mapped_reads_by_readlength_wCDS_summary$time == "24 hpi", ]$prop_max <- prop_max_24hr

### PLOT Figure 2—figure supplement 3
(mapped_reads_by_readlength_wCDS_summary_PLOT <- ggplot(data = filter(mapped_reads_by_readlength_wCDS_summary, read_length >= min_read_length, read_length <= max_read_length),
                                                        mapping = aes(x = read_length, y = 100*prop, color = frame)) +
    geom_bar(data = filter(mapped_reads_by_readlength_wCDS_summary, read_length >= min_read_length, read_length <= max_read_length, frame == "Codon pos 1"), # otherwise triplicate plotting
             mapping = aes(x = read_length, y = read_count_sum / read_count_sum_max * 100 * prop_max), fill = "#E0E0E0", stat = "identity", inherit.aes = FALSE) +
    geom_hline(yintercept = 100*1/3, linetype = "dashed", color = "lightgrey") + 
    geom_point(size = rel(0.8)) +
    geom_line(size = rel(0.25)) +
    facet_grid(time ~ treatment) +
    xlab("Read length") +
    ylab("Fraction reads mapping (%)") +
    theme_bw() +
    theme(panel.grid = element_blank(),
          axis.title = element_text(size = rel(0.75)),
          legend.title = element_blank(),
          legend.text = element_text(size = rel(0.7)), 
          axis.text = element_text(size = rel(0.7)), 
          strip.background = element_blank(),
          strip.text.y = element_text(angle = 0),
          axis.ticks.y.right = element_blank(),
          axis.text.y.right = element_blank()
    ) +
    scale_color_manual(values = c("#00D39B", "#D48EB4", "#D0C311")) +
    scale_y_continuous(expand = expand_scale(mult = c(0, 0.1))))



####################################################################################################
### SLIDING WINDOWS prop frame analysis, pooling the samples for a given condition

### USE PYTHON with desired parameter combination, e.g.,
#DO: Rscript riboseq_sliding_window.R riboseq_sample_metadata.tsv mapped_reads_by_readlength_ALL.tsv 30 30

############################################################################
######### vv CHANGE THIS: SELECTION WINDOW SIZE AND READ LENGTH vv #########
############################################################################
# SPECIFY THE WINDOW SIZE YOU USED (CRITICAL)
WIN_SIZE <- 30 # 19 29 30 # also defined with you run the bacth R script for sliding windows; MAKE SURE THEY MATCH

### WIN_SIZE=19 ###
#READ_LEN=29
#mapped_reads_by_readlength_sumSamples <- read_tsv("data/mapped_reads_by_readlength_sumSamples_win19read29.tsv")
#READ_LEN=30
#mapped_reads_by_readlength_sumSamples <- read_tsv("data/mapped_reads_by_readlength_sumSamples_win19read30.tsv")

### WIN_SIZE=29 ###
#READ_LEN=29
#mapped_reads_by_readlength_sumSamples <- read_tsv("data/mapped_reads_by_readlength_sumSamples_win29read29.tsv")
#READ_LEN=30
#mapped_reads_by_readlength_sumSamples <- read_tsv("data/mapped_reads_by_readlength_sumSamples_win29read30.tsv")

### WIN_SIZE=30 ###
#READ_LEN=29
#mapped_reads_by_readlength_sumSamples <- read_tsv("data/mapped_reads_by_readlength_sumSamples_win30read29.tsv")
#READ_LEN=30
mapped_reads_by_readlength_sumSamples <- read_tsv("data/mapped_reads_by_readlength_sumSamples_win30read30.tsv")

############################################################################
######### ^^ CHANGE THIS: SELECT WINDOW SIZE AND READ LENGTH ^^ #########
############################################################################


############################################################################
### Wrangle data
# Sum of all read counts mapping to window
mapped_reads_by_readlength_sumSamples$window_sum <- mapped_reads_by_readlength_sumSamples$window_sum_codon_pos_1 +
  mapped_reads_by_readlength_sumSamples$window_sum_codon_pos_2 +
  mapped_reads_by_readlength_sumSamples$window_sum_codon_pos_3

# Center of window, if important
mapped_reads_by_readlength_sumSamples$window_center <- ((2 * mapped_reads_by_readlength_sumSamples$position) + WIN_SIZE - 1) / 2

### MAKE LONG
(mapped_reads_by_readlength_sumSamples_LONG <- dplyr::select(mapped_reads_by_readlength_sumSamples, 
                                                             condition_combn, position, window_center, window_sum, 
                                                             window_sum_codon_pos_1, window_sum_codon_pos_2, window_sum_codon_pos_3) %>%
    pivot_longer(cols = c('window_sum_codon_pos_1', 'window_sum_codon_pos_2', 'window_sum_codon_pos_3'), names_to = "frame", values_to = "read_count"))

# Factor frame
mapped_reads_by_readlength_sumSamples_LONG$frame <- factor(mapped_reads_by_readlength_sumSamples_LONG$frame,
                                                           levels = c('window_sum_codon_pos_1', 'window_sum_codon_pos_2', 'window_sum_codon_pos_3'),
                                                           labels = c("Frame 1", "Frame 2", "Frame 3"))

# Factor condition
mapped_reads_by_readlength_sumSamples_LONG$condition_combn <- factor(mapped_reads_by_readlength_sumSamples_LONG$condition_combn,
                                                                     levels = c("CHX / 5 hpi", "CHX / 24 hpi", 
                                                                                "Harr / 5 hpi", "LTM / 5 hpi", 
                                                                                "Harr / 24 hpi", "LTM / 24 hpi", 
                                                                                "mRNA / 5 hpi", "mRNA / 24 hpi"))


############################################################################################################################
### PLOT

### GENE

#ORF1ab_1:266-13468
#ORF1ab_2:13468-21555
#S:21563-25384

#ORF3a:25393-26220
#ORF3c:25457-25582
#ORF3d:25524-25697
#ORF3d_short:25596-25697
#ORF3a_short:25765-26220
#ORF3b:25814-25882

#ORF3b_1:25814-25882
#ORF3b_2:25910-25984
#ORF3b_3:26072-26170
#ORF3b_4:26183-26281

#E:26245-26472
#M:26523-27191
#ORF6:27202-27387
#ORF7a:27394-27759
#ORF7b:27756-27887
#ORF8:27894-28259
#N:28274-29533
#ORF9b:28284-28577
#ORF9c:28734-28955
#ORF10:29558-29674

### vvv CHANGE THIS vvv ###
#curr_conditions <- c("CHX / 5 hpi", "Harr / 5 hpi", "LTM / 5 hpi", "mRNA / 5 hpi")
curr_conditions <- c("Harr / 5 hpi", "LTM / 5 hpi")
#curr_conditions <- c("CHX / 5 hpi") 
# "Harr / 5 hpi", "LTM / 5 hpi", "Harr / 24 hpi", "LTM / 24 hpi", "CHX / 5 hpi", "CHX / 24 hpi", "mRNA / 5 hpi", "mRNA / 24 hpi"
### ^^^ CHANGE THIS ^^^ ###


(mapped_reads_by_readlength_sumSamples_LONG_SUBSET <- dplyr::filter(mapped_reads_by_readlength_sumSamples_LONG, 
                                                                    condition_combn %in% curr_conditions, 
                                                                    ### CHANGE THIS ###
                                                                    #window_center >= 28274 - 200, # for 8-N-9b-9c-10: window_center >= 28274 - 200
                                                                    #window_center <= 29533 + 200)) # for 8-N-9b-9c-10: window_center <= 29533 + 200
                                                                    #window_center >= 25393 - 200, # for S - 3a - E: window_center >= 25393 - 200
                                                                    #window_center <= 26220 + 200)) # for S - 3a - E: window_center <= 26220 + 200))
                                                                    #window_center >= 13468 - 250, # for ORF1ab ribosomal slip: window_center >= 13468 - 250
                                                                    #window_center <= 13468 + 250)) # for ORF1ab ribosomal slip: window_center <= 13468 + 250
                                                                    window_center >= 25457-50, # for just 3a OLGs: 25457-50 # for just 3a OLGs, wider: 25400 
                                                                    window_center <= 25697+50)) # for just 3a OLGs, alt: 25697+50 # for just 3a OLGs, wider: 25800 

###########################
### vvv CHANGE THIS vvv ###
# IFF you want to POOL all selected treatments together, as for Fig 2
(mapped_reads_by_readlength_sumSamples_LONG_SUBSET <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET %>%
   group_by(position, window_center, frame) %>%
   summarise(
     window_sum = sum(window_sum),
     read_count = sum(read_count)
   )) 
## ^^^ CHANGE THIS ^^^ ###
###########################

### Add binomial confidence intervals
mapped_reads_by_readlength_sumSamples_LONG_SUBSET$prop_reads <- NA
mapped_reads_by_readlength_sumSamples_LONG_SUBSET$binom_CI_point <- NA
mapped_reads_by_readlength_sumSamples_LONG_SUBSET$binom_CI_low <- NA
mapped_reads_by_readlength_sumSamples_LONG_SUBSET$binom_CI_high <- NA

# loop for CI
for (i in 1:nrow(mapped_reads_by_readlength_sumSamples_LONG_SUBSET)) {
  #i <- 1
  mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$prop_reads <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$read_count / mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$window_sum
  
  binom_CI <- prop.test(x = mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$read_count, n = mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$window_sum, conf.level = 0.95)
  mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$binom_CI_point <- as.vector(binom_CI$estimate)
  mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$binom_CI_low <- as.vector(binom_CI$conf.int[1])
  mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$binom_CI_high <- as.vector(binom_CI$conf.int[2])
  mapped_reads_by_readlength_sumSamples_LONG_SUBSET[i, ]$binom_CI_point <- as.vector(binom_CI$estimate)
}

#View(mapped_reads_by_readlength_sumSamples_LONG_SUBSET)
(read_count_sum_max <- max(mapped_reads_by_readlength_sumSamples_LONG_SUBSET$window_sum))
(prop_max <- max(mapped_reads_by_readlength_sumSamples_LONG_SUBSET$prop_reads))


# REGIONAL CONTEXT <-- cannot easily be generalized ### CHANGE for region ###
(mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT <- dplyr::filter(mapped_reads_by_readlength_sumSamples_LONG, 
                                                                            condition_combn %in% curr_conditions,
                                                                            ### CHANGE THIS ###
                                                                            #(window_center >= 28274 & window_center < 28284) | (window_center > 28577 & window_center < 28734) | (window_center > 28955 & window_center <= 29533))) # N excluding OLGs
                                                                            (window_center >= 25393 & window_center < 25457) | (window_center > 25697 & window_center <= 26220))) # ORF3a excluding OLGs
#(window_center >= 266 & window_center < 13468) | (window_center > 13483 & window_center <= 21555))) # ORF1ab excluding OL region
##window_center >= 25393, # ORF3a start; not used because includes OLGs
##window_center <= 26220)) # ORF3a end; not used because includes OLGs


###########################
### OPTION 1: POOL TREATMENTS (main Fig. 2)
### vvv CHANGE THIS vvv ###
# IFF you want to POOL all selected treatments together, as for Fig 2B
(mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT %>%
   group_by(position, window_center, frame) %>%
   summarise(
     window_sum = sum(window_sum),
     read_count = sum(read_count)
   )) 

mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT$prop <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT$read_count / mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT$window_sum

(mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT_summary <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT %>%
    group_by(frame) %>%
    summarise(
      region_mean_prop = mean(prop)
    ))
### ^^^ CHANGE THIS ^^^ ###
###########################

###########################
### OPTION 2: DON'T POOL TREATMENTS (supplement)
### vvv CHANGE THIS vvv ###
#mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT$prop <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT$read_count / mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT$window_sum
#(mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT_summary <- mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT %>%
#  group_by(condition_combn, frame) %>%
#  summarise(
#    region_mean_prop = mean(prop)
#  ))
### ^^^ CHANGE THIS ^^^ ###
###########################


############################################################################################################
### PLOT Figure 2C: ZOOM-IN of ORF3a/ORF3c/ORF3d OLG region ###
(mapped_reads_by_readlength_sumSamples_PLOT <- ggplot(data = mapped_reads_by_readlength_sumSamples_LONG_SUBSET,
                                                      mapping = aes(x = window_center, y = 100 * prop_reads, color = frame)) +
   
   # LINES
   geom_vline(xintercept = 25697, linetype = "dotted", color = "grey", size = 0.25) +# linetype = "dashed", # ORF3d-2
   geom_vline(xintercept = 25596, linetype = "dotted", color = "grey", size = 0.25) +# linetype = "dashed", # ORF3d-2
   geom_vline(xintercept = 25697, linetype = "dotted", color = "grey", size = 0.25) +# linetype = "dashed", # ORF3d
   geom_vline(xintercept = 25524, linetype = "dotted", color = "grey", size = 0.25) +# linetype = "dashed", # ORF3d
   geom_vline(xintercept = 25582, linetype = "dotted", color = "grey", size = 0.25) +# linetype = "dashed", # ORF3c
   geom_vline(xintercept = 25457, linetype = "dotted", color = "grey", size = 0.25) + # linetype = "dashed", # ORF3c
   
   # ORF3d-2
   geom_rect(mapping = aes(xmin = 25596, xmax = 25697, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#F0E442", alpha = 0.1, inherit.aes = FALSE) + 
   geom_text(mapping = aes(x = 25596, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF3d-2") + 
   
   # ORF3d
   geom_rect(mapping = aes(xmin = 25524, xmax = 25697, ymin = 85, ymax = 92), color = 'black', size = rel(0.25), fill = "#F0E442", alpha = 0.1, inherit.aes = FALSE) + 
   geom_text(mapping = aes(x = 25524, y = (85 + 92) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF3d") + 
   
   # ORF3c
   geom_rect(mapping = aes(xmin = 25457, xmax = 25582, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
   geom_text(mapping = aes(x = 25457, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF3c") + 
   
   # ORF3a
   geom_rect(mapping = aes(xmin = -Inf, xmax = Inf, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "#00D39B", alpha = 0.1, inherit.aes = FALSE) + 
   geom_text(mapping = aes(x = 25457-50, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF3a") +
   
   geom_bar(data = filter(mapped_reads_by_readlength_sumSamples_LONG_SUBSET, frame == "Frame 1"), # redundant; otherwise triplicate plotting
            mapping = aes(x = window_center, y = window_sum / read_count_sum_max * 100 * prop_max), fill = "#E0E0E0", stat = "identity", inherit.aes = FALSE) + 
   
   # Region mean prop
   geom_hline(data = mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT_summary, 
              mapping = aes(yintercept = 100 * region_mean_prop, color = frame), linetype = "dashed", size = rel(0.25)) +
   
   # ERROR RIBBON
   geom_ribbon(mapping = aes(ymin = 100 * binom_CI_low, ymax = 100 * binom_CI_high, fill = frame), alpha = 0.25, linetype = 0) +
   geom_line(size = rel(0.25)) +
   
   # -->CHANGE THIS<-- FACET if more than one treatment; COMMENT OUT otherwise
   #facet_grid(condition_combn ~ .) +
   xlab("Genome position") +
   ylab("Fraction reads (%)") +
   theme_bw() +
   theme(panel.grid = element_blank(),
         axis.title = element_text(size = rel(0.75)),
         legend.title = element_blank(),
         legend.text = element_text(size = rel(0.7)),
         strip.background = element_blank(),
         axis.ticks.y.right = element_blank(),
         axis.text.y.right = element_blank()
   ) +
   scale_color_manual(values = c("#00D39B", "#D48EB4", "#BDB10F")) +
   scale_fill_manual(values = c("#00D39B", "#D48EB4", "#F0E442")) +
   scale_x_continuous(expand = expand_scale(mult = c(0, 0))) +
   scale_y_continuous(limits = c(0, 108), breaks = c(0, 25, 50, 75), expand = expand_scale(mult = c(0, 0))))


###########################
### PLOT Figure 2—figure supplement 8, for S - 3a - E ###
#(mapped_reads_by_readlength_sumSamples_PLOT <- ggplot(data = mapped_reads_by_readlength_sumSamples_LONG_SUBSET,
#                                                      mapping = aes(x = window_center, y = 100 * prop_reads, color = frame)) +
#   
#   # E
#   geom_vline(xintercept = 26245, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 26245, xmax = Inf, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "white", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 26245, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "E") + 
#   
#   # ORF3b-4
#   geom_vline(xintercept = 26183, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 26183, xmax = 26281, ymin = 85, ymax = 92), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 26183, y = (85 + 92) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3b-4") + 
#   
#   # ORF3b-3
#   geom_vline(xintercept = 26072, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 26072, xmax = 26170, ymin = 85, ymax = 92), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 26072, y = (85 + 92) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3b-3") + 
#   
#   # ORF3b-2
#   geom_vline(xintercept = 25910, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25910, xmax = 25984, ymin = 85, ymax = 92), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25910, y = (85 + 92) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3b-2") + 
#   
#   # ORF3b-1
#   geom_vline(xintercept = 25814, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25814, xmax = 25882, ymin = 85, ymax = 92), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25814, y = (85 + 92) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3b") + 
#   
#   # ORF3a-2
#   geom_vline(xintercept = 25765, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25765, xmax = 26220, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#00D39B", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25765, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3a-2") + 
#   
#   # ORF3d-2
#   geom_vline(xintercept = 25596, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25596, xmax = 25697, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#F0E442", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25596, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3d-2") + 
#   
#   # ORF3d
#   geom_vline(xintercept = 25524, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25524, xmax = 25697, ymin = 85, ymax = 92), color = 'black', size = rel(0.25), fill = "#F0E442", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25524, y = (85 + 92) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3d") + 
#   
#   # ORF3c
#   geom_vline(xintercept = 25457, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25457, xmax = 25582, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25457, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3c") + 
#   
#   # ORF3a
#   geom_vline(xintercept = 25393, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 25393, xmax = 26220, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "#00D39B", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 25393, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "3a") + 
#   
#   # S
#   geom_rect(mapping = aes(xmin = -Inf, xmax = 25384, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "white", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = -Inf, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "S") + 
#   
#   geom_bar(data = filter(mapped_reads_by_readlength_sumSamples_LONG_SUBSET, frame == "Frame 1"), # otherwise triplicate plotting
#            mapping = aes(x = window_center, y = window_sum / read_count_sum_max * 100 * prop_max), fill = "#E0E0E0", stat = "identity", inherit.aes = FALSE) + 
#   
#   # Region mean prop
#   geom_hline(data = mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT_summary, 
#              mapping = aes(yintercept = 100 * region_mean_prop, color = frame), linetype = "dashed", size = rel(0.25)) +
#   
#   # ERROR RIBBON
#   geom_ribbon(mapping = aes(ymin = 100 * binom_CI_low, ymax = 100 * binom_CI_high, fill = frame), alpha = 0.25, linetype = 0) +
#   
#   geom_line(size = rel(0.25)) +
#   facet_grid(condition_combn ~ .) +
#   xlab("Genome position") +
#   ylab("Fraction reads (%)") +
#   theme_bw() +
#   theme(panel.grid = element_blank(),
#         plot.title = element_text(hjust = 0.5, size = rel(0.9)),
#         axis.title = element_text(size = rel(0.75)),
#         legend.title = element_blank(),
#         legend.text = element_text(size = rel(0.7)), 
#         axis.text = element_text(size = rel(0.7)), 
#         strip.background = element_blank(),
#         strip.text.y = element_text(angle = 0), 
#         axis.ticks.y.right = element_blank(),
#         axis.text.y.right = element_blank()
#   ) +
#   scale_color_manual(values = c("#00D39B", "#D48EB4", "#BDB10F")) +
#   scale_fill_manual(values = c("#00D39B", "#D48EB4", "#F0E442")) +
#   scale_x_continuous(expand = expand_scale(mult = c(0, 0))) +
#   scale_y_continuous(limits = c(0, 108), breaks = c(0, 25, 50, 75), expand = expand_scale(mult = c(0, 0)))) 



###########################
### PLOT sliding window for ORF8 - N - 9b - 9c - 10 ###

#ORF8:27894-28259, so frame 27894 %% 3 = 0 (Codon pos 1) # white
#N:28274-29533, so frame 28274 %% 3 = 2 (Codon pos 3), # structural gene blue: #71BFED
#ORF9b:28284-28577, so frame 28284 %% 3 = 0 (Codon pos 1) # overlapping gene burgundy: #D48EB4
#ORF9c:28734-28955, so frame 28734 %% 3 = 0 (Codon pos 1) # overlapping gene burgundy: #D48EB4
#ORF10:29558-29674, so frame 29558 %% 3 = 2 (Codon pos 3) # white

#(mapped_reads_by_readlength_sumSamples_PLOT <- ggplot(data = mapped_reads_by_readlength_sumSamples_LONG_SUBSET,
#                                                      mapping = aes(x = window_center, y = 100 * prop_reads, color = frame)) +
#   # ORF10
#   geom_vline(xintercept = 29558, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 29558, xmax = 29674, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "white", alpha = 0.1, inherit.aes = FALSE) +
#   geom_text(mapping = aes(x = 29558, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF10") + 
#   
#   # ORF9c
#   geom_vline(xintercept = 28734, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 28734, xmax = 28955, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 28734, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF9c") + 
#   
#   # ORF9b
#   geom_vline(xintercept = 28284, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 28284, xmax = 28577, ymin = 93, ymax = 100), color = 'black', size = rel(0.25), fill = "#D48EB4", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 28284, y = (93 + 100) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF9b") + 
#   
#   # N
#   geom_vline(xintercept = 28274, linetype = "dotted", color = "grey", size = 0.25) +
#   geom_rect(mapping = aes(xmin = 28274, xmax = 29533, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "#71BFED", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = 28274, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "N") + 
#   
#   # ORF8
#   geom_rect(mapping = aes(xmin = -Inf, xmax = 28259, ymin = 101, ymax = 108), color = 'black', size = rel(0.25), fill = "white", alpha = 0.1, inherit.aes = FALSE) + 
#   geom_text(mapping = aes(x = -Inf, y = (101 + 108) / 2), color = 'black', hjust = -0.02, vjust = 0.5, size = rel(2.5), fontface = 'italic', label = "ORF8") + 
#   
#   geom_bar(data = filter(mapped_reads_by_readlength_sumSamples_LONG_SUBSET, frame == "Frame 1"), # otherwise triplicate plotting
#            mapping = aes(x = window_center, y = window_sum / read_count_sum_max * 100 * prop_max), fill = "#E0E0E0", stat = "identity", inherit.aes = FALSE) + 
#   
#   # Region mean prop
#   geom_hline(data = mapped_reads_by_readlength_sumSamples_LONG_SUBSET_CONTEXT_summary, 
#              mapping = aes(yintercept = 100 * region_mean_prop, color = frame), linetype = "dashed", size = rel(0.25)) +
#   
#   # ERROR RIBBON
#   geom_ribbon(mapping = aes(ymin = 100 * binom_CI_low, ymax = 100 * binom_CI_high, fill = frame), alpha = 0.25, linetype = 0) +
#   
#   geom_line(size = rel(0.25)) +
#   facet_grid(condition_combn ~ .) +
#   xlab("Genome position") +
#   ylab("Fraction reads (%)") +
#   theme_bw() +
#   theme(panel.grid = element_blank(),
#         plot.title = element_text(hjust = 0.5, size = rel(0.9)),
#         axis.title = element_text(size = rel(0.75)),
#         legend.title = element_blank(),
#         legend.text = element_text(size = rel(0.7)), 
#         axis.text = element_text(size = rel(0.7)), 
#         strip.background = element_blank(),
#         strip.text.y = element_text(angle = 0),
#         axis.ticks.y.right = element_blank(),
#         axis.text.y.right = element_blank()
#   ) +
#   scale_color_manual(values = c("#00D39B", "#71BFED", "#D48EB4")) +
#   scale_fill_manual(values = c("#00D39B", "#71BFED", "#D48EB4")) +
#   scale_x_continuous(expand = expand_scale(mult = c(0, 0))) +
#   scale_y_continuous(limits = c(0, 108), breaks = c(0, 25, 50, 75), expand = expand_scale(mult = c(0, 0)))) 



########################################################################################################################
########################################################################################################################
### GET GENE START SITES AND ANNOTATE
(Wuhan_Hu_1_genes <- read_tsv("data/Wuhan_Hu_1_genes.tsv"))
unique(Wuhan_Hu_1_genes$gene)

genes_to_include <- c("ORF1ab_1", "ORF1ab_2", "S", 
                      "ORF3a", "ORF3c", "ORF3d", "ORF3d_short", "ORF3a_short",
                      "ORF3b_1", "ORF3b_2", "ORF3b_3", "ORF3b_4", 
                      "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                      "N", "ORF9b", "ORF9c", "ORF10")

gene_position_joiner <- tibble(position = unique(mapped_reads_by_readlength$position)) # now 1-based
gene_position_joiner # 29,884 x 1

## DID THIS ONCE OVERNIGHT AND SAVED
## Actually LOOPING the 2-million record file would take about 5 hours per gene, because it's R, so just loop the joiner
## STILL takes a LONG TIME; do it once and SAVE for reloading
#for (this_gene in unique(Wuhan_Hu_1_genes$gene)) {
#  if(this_gene %in% genes_to_include) {
#    this_gene_start <- Wuhan_Hu_1_genes[Wuhan_Hu_1_genes$gene == this_gene, ]$start
#    this_gene_end <- Wuhan_Hu_1_genes[Wuhan_Hu_1_genes$gene == this_gene, ]$end
#    
#    # Initialize a new column by the name of this gene
#    gene_position_joiner[[this_gene]] <- NA
#    
#    # Populate this column with genome's position relative to the start site of this gene, or NA if after its end
#    for (this_site in sort(unique(gene_position_joiner$position))) {
#      #this_gene_start is 29558
#      #this_site <- 29558
#      #this_site <- 29557
#      #this_site <- 29559
#      
#      if(this_site <= this_gene_end) {
#        gene_position_joiner[gene_position_joiner$position == this_site, ][[this_gene]] <- this_site - this_gene_start
#      } # else it's NA
#    }
#  }
#}

### SAVE
#write_tsv(gene_position_joiner, "data/gene_position_joiner.tsv")

########################################
# RELOAD
gene_position_joiner <- read_tsv("data/gene_position_joiner.tsv")

### JOIN --> SUPER FAST
mapped_reads_by_readlength <- left_join(x = mapped_reads_by_readlength, y = gene_position_joiner, by = "position")
#2,610,414 x 32 = SUCCESS!


################################################################################
### POOL THE TWO SAMPLES FOR EACH TREATMENT; CUT DOWN SIZE; SPECIFY READ LENGTH RANGE; LIMIT TO ONE UPSTREAM

### GENE

#ORF1ab_1:266-13468
#ORF1ab_2:13468-21555
#S:21563-25384

#ORF3a:25393-26220
#ORF3c:25457-25582
#ORF3d:25524-25697
#ORF3d_short:25596-25697
#ORF3a_short:25765-26220
#ORF3b:25814-25882

#E:26245-26472
#M:26523-27191
#ORF6:27202-27387
#ORF7a:27394-27759
#ORF7b:27756-27887
#ORF8:27894-28259
#N:28274-29533
#ORF9b:28284-28577
#ORF9c:28734-28955
#ORF10:29558-29674


### CHANGE THIS ### replace with name and start site of desired gene
this_gene_name <- "ORF3b" # ORF3a ORF3c ORF3d ORF3d-2 ORF3a-2 ORF3b
this_gene_start <- 25814 # 25393 25457 25524 25596
curr_conditions <- c("Harr / 5 hpi", "LTM / 5 hpi")
### CHANGE THIS ###

# Adds the two samples together
(riboseq_sumSamplesUPSTREAM <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition), 
                                      read_length >= 27, read_length <= 32,
                                      
                                      # ALL PURPOSE
                                      #position >= this_gene_start - 25, position <= this_gene_start + 25 # centered
                                      position >= this_gene_start - 25, position <= this_gene_start # upstream only
) %>% 
    group_by(condition_combn, read_length, position) %>%
    summarise(
      sample_count = n(),
      read_count_sum = sum(read_count)
    )) 

# sum ALL READS IN THE START UPSTREAM for each condition
(riboseq_readSumPerCombnUPSTREAM <- riboseq_sumSamplesUPSTREAM %>%
    group_by(condition_combn) %>% 
    summarise(
      read_sum_UPSTREAMxCONDITION = sum(read_count_sum)
    ))

# LEFT JOIN TOTAL READS FOR EACH COMBN; calculate reads per million mapped reads
riboseq_sumSamplesUPSTREAM <- left_join(x = riboseq_sumSamplesUPSTREAM, y = riboseq_readSumPerCombnUPSTREAM, by = c("condition_combn")) 
riboseq_sumSamplesUPSTREAM

# prop of reads in UPSTREAMxCONDITION
riboseq_sumSamplesUPSTREAM$prop_reads_UPSTREAMxCONDITION <- riboseq_sumSamplesUPSTREAM$read_count_sum / riboseq_sumSamplesUPSTREAM$read_sum_UPSTREAMxCONDITION
riboseq_sumSamplesUPSTREAM

### PREP BARS DUMMY DATAFRAME
(riboseq_sumSamplesUPSTREAM_summary <- riboseq_sumSamplesUPSTREAM %>%
    group_by(condition_combn, position) %>%
    summarise(
      position_total_depth = sum(read_count_sum)
    ))

(position_total_depth_max <- max(filter(riboseq_sumSamplesUPSTREAM_summary, condition_combn %in% curr_conditions)$position_total_depth))
(prop_reads_UPSTREAMxCONDITION_max <- max(filter(riboseq_sumSamplesUPSTREAM, condition_combn %in% curr_conditions)$prop_reads_UPSTREAMxCONDITION))

### PLOT Figure 2—figure supplement 4
### CHANGE THIS ###
(riboseq_sumSamplesUPSTREAM_PLOT <- 
    ggplot(data = filter(riboseq_sumSamplesUPSTREAM, condition_combn %in% curr_conditions), 
           mapping = aes(x = position, y = 100 * prop_reads_UPSTREAMxCONDITION, color = factor(read_length))) + 
    
    # P SITE OFFSET at -13, -12, -11 upstream start sites
    geom_rect(mapping = aes(xmin = this_gene_start - 13.5, xmax = this_gene_start - 10.5, ymin = -Inf, ymax = Inf), fill = "#F7FFD6", inherit.aes = FALSE) + # FBFFEB F7FFD6 F3FFC2
    
    # ORF9b ONLY: P SITE OFFSET of N at -23, -22, -21 upstream start sites <-- CHANGE THIS
    #geom_rect(mapping = aes(xmin = this_gene_start - 23.5, xmax = this_gene_start - 20.5, ymin = -Inf, ymax = Inf), fill = "#F7FFD6", inherit.aes = FALSE) + # FBFFEB F7FFD6 F3FFC2
    
    # GENE START
    geom_vline(xintercept = this_gene_start, linetype = "dashed", color = "grey", size = 0.3) +
    
    # BAR version -- worry about bars later  
    geom_bar(data = filter(riboseq_sumSamplesUPSTREAM_summary, condition_combn %in% curr_conditions),
             mapping = aes(x = position, y = position_total_depth / position_total_depth_max * 100 * prop_reads_UPSTREAMxCONDITION_max), fill = "#E0E0E0", stat = "identity", inherit.aes = FALSE) + 
    
    # POINT/LINE version
    geom_point(size = rel(0.8)) +
    geom_line(size = rel(0.25)) +
    
    ggtitle(this_gene_name) + # "ORF3c" "ORF3d" "ORF3d-2" ORF3a etc.
    
    facet_grid(condition_combn ~ ., scales = "free_y") + 
    theme_bw() +
    theme(panel.grid = element_blank(),
          plot.title = element_text(hjust = 0.5, face = "italic", size = rel(1)),
          legend.text = element_text(size = rel(1.15)),
          axis.text.x = element_text(size = rel(1.15)), 
          axis.title.x = element_blank(),
          axis.text.y = element_text(size = rel(1.15)),
          axis.title.y = element_blank(),
          panel.border = element_rect(),
          strip.background = element_blank(),
          strip.text.x = element_text(size = rel(1.15)),
          strip.text.y = element_text(size = rel(1.15), angle = 0)) +
    scale_color_manual(values = rev(hue_pal()(6)), name = "Read length (bp)") +
    scale_x_continuous(expand = expand_scale(mult = c(0, 0))) +
    scale_y_continuous(breaks = pretty_breaks(2), expand = expand_scale(mult = c(0, 0.1)))
)



####################
### POOL THE TWO SAMPLES FOR EACH TREATMENT; CUT DOWN SIZE
(riboseq_sumSamples <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition)) %>% # , read_length >=26, read_length <= 32
   group_by(condition_combn, read_length, position) %>%
   summarise(
     sample_count = n(),
     read_count_sum = sum(read_count)
   )) # should cut in half (two samples per condition)

(riboseq_readSumPerCombn <- riboseq_sumSamples %>%
    group_by(condition_combn, read_length) %>%
    summarise(
      read_sum_genome = sum(read_count_sum)
    ))

# LEFT JOIN TOTAL READS FOR EACH COMBN AND READ LENGTH; calculate reads per million mapped reads
riboseq_sumSamples <- left_join(x = riboseq_sumSamples, y = riboseq_readSumPerCombn, by = c("condition_combn", "read_length"))
riboseq_sumSamples$reads_per_million <- riboseq_sumSamples$read_count_sum / riboseq_sumSamples$read_sum_genome * 1e6
riboseq_sumSamples

# LEFT JOIN RELATIVE POSITIONS OF GENES
riboseq_sumSamples <- left_join(x = riboseq_sumSamples, y = gene_position_joiner, by = "position")
riboseq_sumSamples # 1,725,599 x 29

# make long
riboseq_sumSamples_long <- riboseq_sumSamples %>%
  pivot_longer(cols = genes_to_include, names_to = "gene", values_to = "relative_position")
riboseq_sumSamples_long

# filter to within 40bp of SOME start site, so it's not too big
(riboseq_sumSamples_long <- filter(riboseq_sumSamples_long, relative_position >= -40, relative_position <= 40))
# 159,391 x 9

# Factor genes 5' to 3'
genes_to_include_labels <- c("1a", "1b", "S", 
                             "3a", "3c", "3d", "3d-2", "3a-2",
                             "3b-1", "3b-2", "3b-3", "3b-4", 
                             "E", "M", "6", "7a", "7b", "8",
                             "N", "9b", "9c", "10")
riboseq_sumSamples_long$gene <- factor(riboseq_sumSamples_long$gene, levels = genes_to_include, labels = genes_to_include_labels)

genes_to_include_labels_subset <- c("1a", "1b", "S", 
                                    "3a", "3c", "3d", "3d-2", "3a-2",
                                    "3b-1", #"3b-2", "3b-3", "3b-4", 
                                    "E", "M", "6", "7a", "7b", "8",
                                    "N", "9b", "9c", "10")

####################
# PLOT Harr at 5 and 24 hpi, or whatever combination is of interest

# CHX / 5 hpi -- CHX / 24 hpi -- Harr / 5 hpi -- Harr / 24 hpi -- LTM / 5 hpi -- LTM / 24 hpi -- mRNA / 5 hpi -- mRNA / 24 hpi
(riboseq_sumSamples_long_PLOT <- ggplot(data = filter(riboseq_sumSamples_long, 
                                                      gene %in% genes_to_include_labels_subset,
                                                      relative_position >= -20, relative_position <= 20,
                                                      read_length == 30,
                                                      condition_combn %in% c("Harr / 5 hpi", "Harr / 24 hpi")), 
                                        mapping = aes(x = relative_position, y = reads_per_million)) + # read_count_sum
   
   # 12 bp upstream
   geom_rect(mapping = aes(xmin = -12.5, xmax = -11.5, ymin = -Inf, ymax = Inf), fill = "#F3FFC2", alpha = 0.1, inherit.aes = FALSE) +
   geom_vline(xintercept = 0, linetype = "dashed", color = "grey", size = 0.3) +
   
   # BAR version
   geom_bar(stat = "identity") +
   
   ggtitle("Read length 30 bp - free y axis") + # "Harr / 5 hpi"
   
   facet_grid(gene ~ condition_combn, scales = "free_y") + 
   xlab("Distance from start (nt)") +
   ylab("Reads per million mapped") +
   theme_bw() +
   theme(panel.grid = element_blank(),
         plot.title = element_text(hjust = 0.5),
         strip.text = element_text(size = 10),
         axis.text.x = element_text(size = 9.5), 
         axis.title.x = element_text(size = 10),
         axis.text.y = element_text(size = 9.5),
         axis.title.y = element_text(size = 10),
         panel.border = element_rect(),
         strip.background = element_blank(),
         strip.text.y = element_text(size = rel(0.9), angle = 0)) +
   scale_y_continuous(breaks = pretty_breaks(2), expand = expand_scale(mult = c(0, 0.05))) 
)



####################
# plot ALL conditions for 29-31bp pooled lengths
(riboseq_sumSamples2 <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition), read_length >= 29, read_length <= 31) %>%
   group_by(condition_combn, position) %>%
   summarise(
     record_count = n(),
     read_count_sum = sum(read_count)
   )) 

(riboseq_readSumPerCombn2 <- riboseq_sumSamples2 %>%
    group_by(condition_combn) %>%
    summarise(
      read_sum_genome = sum(read_count_sum)
    ))

# LEFT JOIN TOTAL READS FOR EACH COMBN AND READ LENGTH; calculate reads per million mapped reads
riboseq_sumSamples2 <- left_join(x = riboseq_sumSamples2, y = riboseq_readSumPerCombn2, by = c("condition_combn"))
riboseq_sumSamples2$reads_per_million <- riboseq_sumSamples2$read_count_sum / riboseq_sumSamples2$read_sum_genome * 1e6
riboseq_sumSamples2

# LEFT JOIN RELATIVE POSITIONS OF GENES
riboseq_sumSamples2 <- left_join(x = riboseq_sumSamples2, y = gene_position_joiner, by = "position")
riboseq_sumSamples2 

# make long
riboseq_sumSamples2_long <- riboseq_sumSamples2 %>%
  pivot_longer(cols = genes_to_include, names_to = "gene", values_to = "relative_position")
riboseq_sumSamples2_long

# filter to within 40bp of SOME start site, so it's not too big
(riboseq_sumSamples2_long <- filter(riboseq_sumSamples2_long, relative_position >= -40, relative_position <= 40))

# Factor genes 5' to 3'
genes_to_include_labels <- c("1ab", "1b", "S", 
                             "3a", "3c", "3d", "3d-2", "3a-2",
                             "3b-1", "3b-2", "3b-3", "3b-4", 
                             "E", "M", "6", "7a", "7b", "8",
                             "N", "9b", "9c", "10")
riboseq_sumSamples2_long$gene <- factor(riboseq_sumSamples2_long$gene, levels = genes_to_include, labels = genes_to_include_labels)

genes_to_include_labels_subset <- c("1ab", "1b", "S", 
                                    "3a", "3c", "3d", "3d-2", "3a-2",
                                    "3b-1", #"3b-2", "3b-3", "3b-4", 
                                    "E", "M", "6", "7a", "7b", "8",
                                    "N", "9b", "9c", "10")

####################
# plot ALL condition combinations for SPECIFIED READ LENGTH RANGES

# CHX / 5 hpi -- CHX / 24 hpi -- Harr / 5 hpi -- Harr / 24 hpi -- LTM / 5 hpi -- LTM / 24 hpi -- mRNA / 5 hpi -- mRNA / 24 hpi
(riboseq_sumSamples2_long_PLOT <- ggplot(data = filter(riboseq_sumSamples2_long, 
                                                       gene %in% genes_to_include_labels_subset,
                                                       gene != "1b",
                                                       relative_position >= -20, relative_position <= 20,
                                                       condition_combn %in% c("Harr / 5 hpi", "LTM / 5 hpi")),
                                         mapping = aes(x = relative_position, y = reads_per_million)) +
   
   # 12 bp upstream
   geom_rect(mapping = aes(xmin = -13.5, xmax = -10.5, ymin = -Inf, ymax = Inf), fill = "#F3FFC2", alpha = 0.1, inherit.aes = FALSE) +
   geom_vline(xintercept = 0, linetype = "dashed", color = "grey", size = 0.3) +
   geom_bar(stat = "identity") + # , position = position_dodge()
   
   facet_grid(gene ~ condition_combn, scales = "free_y") +
   xlab("Distance from start (nt)") +
   ylab("Reads per million mapped") + 
   theme_bw() +
   theme(panel.grid = element_blank(),
         plot.title = element_text(hjust = 0.5),
         strip.text = element_text(size = 10),
         axis.text.x = element_text(size = 9.5), 
         axis.title.x = element_text(size = 10),
         axis.text.y = element_text(size = 9.5),
         axis.title.y = element_text(size = 10),
         panel.border = element_rect(),
         strip.background = element_blank(),
         strip.text.y = element_text(size = rel(0.9), angle = 0)) +
   scale_y_continuous(breaks = pretty_breaks(2), expand = expand_scale(mult = c(0, 0.05))) 
)




########################################
### BOOK-KEEPING: WHAT ARE THE SAMPLES FOR EACH TREATMENT?
########################################
mapped_reads_by_readlength %>%
  group_by(condition_combn, sample) %>%
  summarise(
    count = n()
  )

#condition_combn sample       count
#1 CHX / 5 hpi     SRR11713356  76388
#2 CHX / 5 hpi     SRR11713357  78242
#3 CHX / 24 hpi    SRR11713364  70230
#4 CHX / 24 hpi    SRR11713365 327831
#5 Harr / 5 hpi    SRR11713360  57931
#6 Harr / 5 hpi    SRR11713361  50150
#7 Harr / 24 hpi   SRR11713368 313066
#8 Harr / 24 hpi   SRR11713369 258834
#9 LTM / 5 hpi     SRR11713358  64767
#10 LTM / 5 hpi     SRR11713359  75772
#11 LTM / 24 hpi    SRR11713366 160926
#12 LTM / 24 hpi    SRR11713367 229330
#13 mRNA / 5 hpi    SRR11713354 122305
#14 mRNA / 5 hpi    SRR11713355 124381
#15 mRNA / 24 hpi   SRR11713362 292810
#16 mRNA / 24 hpi   SRR11713363 307451


########################################
### FOUR-SAMPLE VERSION
########################################

####################
# plot ALL for 29-31bp pooled lengths, "Harr / 5 hpi" and "LTM / 5 hpi" only
(riboseq_sumSamples4 <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition), read_length >= 29, read_length <= 31,
                               condition_combn %in% c("Harr / 5 hpi", "LTM / 5 hpi")) %>%
   group_by(sample, position) %>%
   summarise(
     record_count = n(),
     read_count_sum = sum(read_count)
   )) 

(riboseq_readSumPerCombn4 <- riboseq_sumSamples4 %>%
    group_by(sample) %>%
    summarise(
      read_sum_genome = sum(read_count_sum)
    ))
#sample      read_sum_genome
#SRR11713358          344461
#SRR11713359          569576
#SRR11713360          223651
#SRR11713361          122341

# LEFT JOIN TOTAL READS FOR EACH COMBN AND READ LENGTH; calculate reads per million mapped reads
riboseq_sumSamples4 <- left_join(x = riboseq_sumSamples4, y = riboseq_readSumPerCombn4, by = c("sample"))
riboseq_sumSamples4$reads_per_million <- riboseq_sumSamples4$read_count_sum / riboseq_sumSamples4$read_sum_genome * 1e6
riboseq_sumSamples4

# LEFT JOIN RELATIVE POSITIONS OF GENES
riboseq_sumSamples4 <- left_join(x = riboseq_sumSamples4, y = gene_position_joiner, by = "position")
riboseq_sumSamples4 # 47,129 x 28

# make long
riboseq_sumSamples4_long <- riboseq_sumSamples4 %>%
  pivot_longer(cols = genes_to_include, names_to = "gene", values_to = "relative_position")
riboseq_sumSamples4_long

# filter to within 40bp of SOME start site, so it's not too big
(riboseq_sumSamples4_long <- filter(riboseq_sumSamples4_long, relative_position >= -20, relative_position <= 20))

# FACTOR genes 5' to 3'
genes_to_include_labels2 <- c("ORF1ab", "ORF1b", "S", 
                              "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                              "ORF3b", "ORF3b-2", "ORF3b-3", "ORF3b-4", 
                              "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                              "N", "ORF9b", "ORF9c", "ORF10")
riboseq_sumSamples4_long$gene <- factor(riboseq_sumSamples4_long$gene, levels = genes_to_include, labels = genes_to_include_labels2)

genes_to_include_labels2_subset <- c("ORF1ab", "ORF1b", "S", 
                                     "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                                     "ORF3b", #"ORF3b-2", "ORF3b-3", "ORF3b-4", 
                                     "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                                     "N", "ORF9b", "ORF9c", "ORF10")

# FIND MAX BAR (red in Figure 2A, Figure 2—figure supplement 1)
riboseq_sumSamples4_long$peak <- "Non-Peak"
for (this_gene in unique(riboseq_sumSamples4_long$gene)) {
  #this_gene <- '1b'
  this_gene_data <- filter(riboseq_sumSamples4_long, gene == this_gene)
  
  for (this_sample in unique(this_gene_data$sample)) {
    #this_sample <- 'SRR11713358'
    this_gene_sample_data <- filter(this_gene_data, sample == this_sample)
    this_gene_max_depth <- max(this_gene_sample_data$reads_per_million)
    this_gene_max_depth_position <- this_gene_sample_data[this_gene_sample_data$reads_per_million == this_gene_max_depth, ]$relative_position
    
    # Cat
    cat(this_gene, this_sample, this_gene_max_depth_position, "\n")
    
    # Add peak position height
    riboseq_sumSamples4_long[riboseq_sumSamples4_long$gene == this_gene & 
                               riboseq_sumSamples4_long$sample == this_sample &
                               riboseq_sumSamples4_long$relative_position == this_gene_max_depth_position, ]$peak = "Peak" 
  }
}

riboseq_sumSamples4_long$peak <- factor(riboseq_sumSamples4_long$peak, levels = c("Peak", "Non-Peak"))

# Factor sample order Harr first
riboseq_sumSamples4_long$sample <- factor(riboseq_sumSamples4_long$sample, levels = c("SRR11713360", "SRR11713361", "SRR11713358", "SRR11713359"))


####################
# PLOT Figure 2—figure supplement 1, ALL FOUR SAMPLES with the SPECIFIED READ LENGTH RANGES

(riboseq_sumSamples4_long_PLOT <- ggplot(data = filter(riboseq_sumSamples4_long, 
                                                       gene %in% genes_to_include_labels2_subset,
                                                       gene != "1b", gene != "ORF1b",
                                                       relative_position >= -20, relative_position <= 20),
                                         mapping = aes(x = relative_position, y = reads_per_million, fill = peak)) + 
   
   geom_vline(xintercept = 0, linetype = "dotted", color = "darkgrey", size = 0.5) + 
   geom_bar(stat = "identity") + 
   facet_grid(gene ~ sample, scales = "free_y") + 
   xlab("Distance from start (nt)") +
   ylab("Reads per million mapped") + 
   theme_bw() +
   theme(panel.grid = element_blank(),
         plot.title = element_text(hjust = 0.5),
         legend.position = 'none',
         strip.text = element_text(size = 10),
         panel.border = element_rect(),
         strip.background = element_blank(),
         strip.text.y = element_text(size = rel(0.9), angle = 0, face = "italic")) +
   scale_y_continuous(breaks = pretty_breaks(2), expand = expand_scale(mult = c(0, 0.05))) + 
   scale_fill_manual(values = c("red", "grey35"))
)



########################################
### FOUR-SAMPLE VERSION, **24 HPI** to see how it compares
########################################

####################
# plot ALL for 29-31bp pooled lengths, "Harr / 5 hpi" and "LTM / 5 hpi" only
(riboseq_sumSamples24hpi <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition), read_length >= 29, read_length <= 31,
                                   condition_combn %in% c("Harr / 24 hpi", "LTM / 24 hpi")) %>%
   group_by(sample, position) %>%
   summarise(
     record_count = n(),
     read_count_sum = sum(read_count)
   )) 

(riboseq_readSumPerCombn24hpi <- riboseq_sumSamples24hpi %>%
    group_by(sample) %>%
    summarise(
      read_sum_genome = sum(read_count_sum)
    ))
#sample      read_sum_genome
#SRR11713366          833006
#SRR11713367         2463553
#SRR11713368        10245313
#SRR11713369         3828880

# LEFT JOIN TOTAL READS FOR EACH COMBN AND READ LENGTH; calculate reads per million mapped reads
riboseq_sumSamples24hpi <- left_join(x = riboseq_sumSamples24hpi, y = riboseq_readSumPerCombn24hpi, by = c("sample"))
riboseq_sumSamples24hpi$reads_per_million <- riboseq_sumSamples24hpi$read_count_sum / riboseq_sumSamples24hpi$read_sum_genome * 1e6
riboseq_sumSamples24hpi

# LEFT JOIN RELATIVE POSITIONS OF GENES
riboseq_sumSamples24hpi <- left_join(x = riboseq_sumSamples24hpi, y = gene_position_joiner, by = "position")
riboseq_sumSamples24hpi # 94,780 x 28

# make long
riboseq_sumSamples24hpi_long <- riboseq_sumSamples24hpi %>%
  pivot_longer(cols = genes_to_include, names_to = "gene", values_to = "relative_position")
riboseq_sumSamples24hpi_long
# 2,085,160 x 8

# filter to within 40bp of SOME start site, so it's not too big
(riboseq_sumSamples24hpi_long <- filter(riboseq_sumSamples24hpi_long, relative_position >= -20, relative_position <= 20))
# 3,415 x 8

# FACTOR genes 5' to 3'
genes_to_include_labels2 <- c("ORF1ab", "ORF1b", "S", 
                              "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                              "ORF3b", "ORF3b-2", "ORF3b-3", "ORF3b-4", 
                              "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                              "N", "ORF9b", "ORF9c", "ORF10")
riboseq_sumSamples24hpi_long$gene <- factor(riboseq_sumSamples24hpi_long$gene, levels = genes_to_include, labels = genes_to_include_labels2)

genes_to_include_labels2_subset <- c("ORF1ab", "ORF1b", "S", 
                                     "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                                     "ORF3b", #"ORF3b-2", "ORF3b-3", "ORF3b-4", 
                                     "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                                     "N", "ORF9b", "ORF9c", "ORF10")

# FIND MAX BAR
riboseq_sumSamples24hpi_long$peak <- "Non-Peak"
for (this_gene in unique(riboseq_sumSamples24hpi_long$gene)) {
  #this_gene <- '1b'
  this_gene_data <- filter(riboseq_sumSamples24hpi_long, gene == this_gene)
  
  for (this_sample in unique(this_gene_data$sample)) {
    #this_sample <- 'SRR11713358'
    this_gene_sample_data <- filter(this_gene_data, sample == this_sample)
    this_gene_max_depth <- max(this_gene_sample_data$reads_per_million)
    this_gene_max_depth_position <- this_gene_sample_data[this_gene_sample_data$reads_per_million == this_gene_max_depth, ]$relative_position
    
    # Cat
    cat(this_gene, this_sample, this_gene_max_depth_position, "\n")
    
    # Add peak position height
    riboseq_sumSamples24hpi_long[riboseq_sumSamples24hpi_long$gene == this_gene & 
                                   riboseq_sumSamples24hpi_long$sample == this_sample &
                                   riboseq_sumSamples24hpi_long$relative_position == this_gene_max_depth_position, ]$peak = "Peak" 
  }
}

riboseq_sumSamples24hpi_long$peak <- factor(riboseq_sumSamples24hpi_long$peak, levels = c("Peak", "Non-Peak"))

# Factor sample order Harr first
riboseq_sumSamples24hpi_long$sample <- factor(riboseq_sumSamples24hpi_long$sample, levels = c("SRR11713368", "SRR11713369", "SRR11713366", "SRR11713367"))


####################
# PLOT a version of Figure 2—figure supplement 1 at 24 hpi

(riboseq_sumSamples24hpi_long_PLOT <- ggplot(data = filter(riboseq_sumSamples24hpi_long, 
                                                           gene %in% genes_to_include_labels2_subset,
                                                           gene != "1b", gene != "ORF1b",
                                                           relative_position >= -20, relative_position <= 20),
                                             mapping = aes(x = relative_position, y = reads_per_million, fill = peak)) + 
   
   geom_vline(xintercept = 0, linetype = "dotted", color = "darkgrey", size = 0.5) + 
   geom_bar(stat = "identity") + 
   facet_grid(gene ~ sample, scales = "free_y") +
   xlab("Distance from start (nt)") +
   ylab("Reads per million mapped") + 
   theme_bw() +
   theme(panel.grid = element_blank(),
         plot.title = element_text(hjust = 0.5),
         legend.position = 'none',
         strip.text = element_text(size = 10),
         panel.border = element_rect(),
         strip.background = element_blank(),
         strip.text.y = element_text(size = rel(0.9), angle = 0, face = "italic")) +
   scale_y_continuous(breaks = pretty_breaks(2), expand = expand_scale(mult = c(0, 0.05))) +
   scale_fill_manual(values = c("red", "grey35"))
)



########################################
### Plot ALL for 29-31bp pooled lengths, BUT NOW POOL Harr/5hpi and LTM/5hpi for a summary Figure 2

### POOL HARR SAMPLES
(riboseq_sumSamples3_Harr <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition), read_length >= 29, read_length <= 31,
                                    condition_combn == "Harr / 5 hpi") %>%
   group_by(position) %>%
   summarise(
     record_count = n(),
     read_count_sum = sum(read_count)
   )) 

# calculate total number of mapped reads across genome
read_sum_genome_POOLED_Harr <- sum(riboseq_sumSamples3_Harr$read_count_sum)

# calculate reads per million mapped reads
riboseq_sumSamples3_Harr$reads_per_million <- riboseq_sumSamples3_Harr$read_count_sum / read_sum_genome_POOLED_Harr * 1e6
riboseq_sumSamples3_Harr$condition <- "Harr_5hpi"

### POOL LTM SAMPLES
(riboseq_sumSamples3_LTM <- filter(dplyr::select(mapped_reads_by_readlength, -chromosome, -frame, -condition), read_length >= 29, read_length <= 31,
                                   condition_combn == "LTM / 5 hpi") %>%
    group_by(position) %>%
    summarise(
      record_count = n(),
      read_count_sum = sum(read_count)
    )) 

# calculate total number of mapped reads across genome
read_sum_genome_POOLED_LTM <- sum(riboseq_sumSamples3_LTM$read_count_sum)

# calculate reads per million mapped reads
riboseq_sumSamples3_LTM$reads_per_million <- riboseq_sumSamples3_LTM$read_count_sum / read_sum_genome_POOLED_LTM * 1e6
riboseq_sumSamples3_LTM$condition <- "LTM_5hpi"

### MEAN of Harr & LTM
riboseq_sumSamples3 <- rbind(riboseq_sumSamples3_Harr, riboseq_sumSamples3_LTM) 
(riboseq_sumSamples3 <- riboseq_sumSamples3 %>%
    group_by(position) %>%
    summarise(
      record_count = sum(record_count),
      read_count_sum = sum(read_count_sum),
      reads_per_million = mean(reads_per_million)
    )) # 18,960 x 4

# LEFT JOIN RELATIVE POSITIONS OF GENES
riboseq_sumSamples3 <- left_join(x = riboseq_sumSamples3, y = gene_position_joiner, by = "position")
riboseq_sumSamples3

# make long
riboseq_sumSamples3_long <- riboseq_sumSamples3 %>%
  pivot_longer(cols = genes_to_include, names_to = "gene", values_to = "relative_position")
riboseq_sumSamples3_long

# filter to EXACT start site region
(riboseq_sumSamples3_long <- filter(riboseq_sumSamples3_long, relative_position >= -20, relative_position <= 20))

# Factor genes 5' to 3'
genes_to_include_groupByOLG <- c("ORF1ab_1", "ORF1ab_2", "S", 
                                 "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                                 "ORF10",
                                 "ORF3a", "ORF3c", "ORF3d", "ORF3d_short", "ORF3a_short",
                                 "ORF3b_1", "ORF3b_2", "ORF3b_3", "ORF3b_4", 
                                 "N", "ORF9b", "ORF9c")
genes_to_include_groupByOLG_labels <- c("1ab", "1b", "S", 
                                        "E", "M", "6", "7a", "7b", "8",
                                        "10",
                                        "3a", "3c", "3d", "3d-2", "3a-2",
                                        "3b", "3b-2", "3b-3", "3b-4", 
                                        "N", "9b", "9c")
genes_to_include_groupByOLG_labels2 <- c("ORF1ab", "ORF1b", "S", 
                                         "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                                         "ORF10",
                                         "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                                         "ORF3b", "ORF3b-2", "ORF3b-3", "ORF3b-4", 
                                         "N", "ORF9b", "ORF9c")
riboseq_sumSamples3_long$gene <- factor(riboseq_sumSamples3_long$gene, levels = genes_to_include_groupByOLG, labels = genes_to_include_groupByOLG_labels2)

genes_to_include_labels_subset2 <- c("ORF1ab", "ORF1b", "S", 
                                     "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                                     "ORF10",
                                     "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                                     "ORF3b", #"ORF3b-2", "ORF3b-3", "ORF3b-4", 
                                     "N", "ORF9b", "ORF9c")

# Factor for gridding
genes_to_include_labels_OLG2 <- c("ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3a-2",
                                  "ORF3b", "ORF3b-2", "ORF3b-3", "ORF3b-4", 
                                  "N", "ORF9b", "ORF9c")
genes_to_include_labels_NonOLG2 <- c("ORF1ab", "ORF1b", "S", 
                                     "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8",
                                     "ORF10")

riboseq_sumSamples3_long$region_type <- NA
riboseq_sumSamples3_long[riboseq_sumSamples3_long$gene %in% genes_to_include_labels_OLG2, ]$region_type <- "OLG loci"
riboseq_sumSamples3_long[riboseq_sumSamples3_long$gene %in% genes_to_include_labels_NonOLG2, ]$region_type <- "Non-OLG loci"
riboseq_sumSamples3_long$region_type <- factor(riboseq_sumSamples3_long$region_type, levels = c("Non-OLG loci", "OLG loci"))

# DUMMY DATA FRAME to highlight the site in this region with the max depth
riboseq_sumSamples3_long$peak <- "Non-Peak"
for (this_gene in unique(riboseq_sumSamples3_long$gene)) {
  #this_gene <- '1b'
  this_gene_data <- filter(riboseq_sumSamples3_long, gene == this_gene)
  this_gene_max_depth <- max(this_gene_data$reads_per_million)
  this_gene_max_depth_position <- this_gene_data[this_gene_data$reads_per_million == this_gene_max_depth, ]$relative_position
  
  # Cat
  cat(this_gene, this_gene_max_depth_position, "\n")
  
  # Add peak position height
  riboseq_sumSamples3_long[riboseq_sumSamples3_long$gene == this_gene & riboseq_sumSamples3_long$relative_position == this_gene_max_depth_position, ]$peak = "Peak"
}

riboseq_sumSamples3_long$peak <- factor(riboseq_sumSamples3_long$peak, levels = c("Peak", "Non-Peak"))


####################
# PLOT Figure 2C, POOLED: Harr / 5 hpi AND LTM / 5 hpi
(riboseq_sumSamples3_long_PLOT <- ggplot(data = filter(riboseq_sumSamples3_long, 
                                                       gene %in% genes_to_include_labels_subset2,
                                                       gene != "1b", gene != "ORF1b"),
                                         mapping = aes(x = relative_position, y = reads_per_million, fill = peak)) +
   
   # gene start site
   geom_vline(xintercept = 0, linetype = "dotted", color = "darkgrey", size = 0.5) +
   geom_bar(stat = "identity") + 
   
   facet_wrap(. ~ gene, ncol = 2, scales = "free_y", dir = "v", drop = TRUE) + 
   xlab("Distance from start (nt)") + 
   ylab("Reads per million mapped") + 
   theme_bw() +
   theme(panel.grid = element_blank(),
         legend.position = 'none',
         strip.background = element_blank(),
         strip.text = element_text(size = rel(0.9), margin = margin(0, 0, 1, 0, "pt"),  face = 'italic'), # size = 10, 
         axis.title = element_text(size = rel(0.9)),
         axis.text = element_text(size = rel(0.9)),
         axis.ticks = element_line(size = rel(0.9)),
         panel.border = element_rect(size = rel(0.9))
   ) +
   scale_y_continuous(breaks = pretty_breaks(2), expand = expand_scale(mult = c(0, 0.1))) +
   scale_fill_manual(values = c("red", "grey35"))
)