Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
Raw File
epitope_MHCI.R
### EPITOPE ANALYSIS of NetMHCpan results

library(tidyverse)
library(boot)
library(scales)
library(RColorBrewer)

# set working directory
setwd("/Users/cwnelson88/scripts_NGS/SARS-CoV-2-ORF3d/")


##############################
### MHC class I epitopes -- summarized all alleles by hand and added codon position in 20200413_NetMHCpan_WuhanHu1_ORFs_nt_aa.xlsx
(epitope_summary <- read_tsv("data/NetMHCpan_Wuhan-Hu-1.tsv"))
#29,075 x 9

##"Ave" is mean `1-log50k`. Looks.
## strong binder = SB = % Rank 0.5
## weak binder = WB = % Rank 2

# inventory genes
unique(epitope_summary$ID)

# RENAME after figuring out which ID is which gene, because names were truncated by NetMHCpan
epitope_summary[epitope_summary$ID == 'N_NOLa', ]$ID <- 'N_NOL2'
epitope_summary[epitope_summary$ID == 'N_NOLb', ]$ID <- 'N_NOL3'
epitope_summary[epitope_summary$ID == 'N_OL_a', ]$ID <- 'N_OL_ORF9b'
epitope_summary[epitope_summary$ID == 'N_OL_b', ]$ID <- 'N_OL_ORF9c'
epitope_summary[epitope_summary$ID == 'ORF1a_a', ]$ID <- 'ORF1a'
epitope_summary[epitope_summary$ID == 'ORF1a_b', ]$ID <- 'ORF1ab_1'
epitope_summary[epitope_summary$ID == 'ORF1a_c', ]$ID <- 'ORF1ab_1_NOL'
epitope_summary[epitope_summary$ID == 'ORF1a_d', ]$ID <- 'ORF1ab_2'
epitope_summary[epitope_summary$ID == 'ORF1a_e', ]$ID <- 'ORF1ab_2_NOL'
epitope_summary[epitope_summary$ID == 'ORF3a_a', ]$ID <- 'ORF3a'
epitope_summary[epitope_summary$ID == 'ORF3a_b', ]$ID <- 'ORF3a_NOL1'
epitope_summary[epitope_summary$ID == 'ORF3a_c', ]$ID <- 'ORF3a_NOL2'
epitope_summary[epitope_summary$ID == 'ORF3a_d', ]$ID <- 'ORF3a_OL_ORF3bp'
epitope_summary[epitope_summary$ID == 'ORF7a_a', ]$ID <- 'ORF7a'
epitope_summary[epitope_summary$ID == 'ORF7a_b', ]$ID <- 'ORF7a_NOL'
epitope_summary[epitope_summary$ID == 'ORF7b_a', ]$ID <- 'ORF7b'
epitope_summary[epitope_summary$ID == 'ORF7b_b', ]$ID <- 'ORF7b_NOL'
epitope_summary[epitope_summary$ID == 'ORF3b', ]$ID <- 'ORF3d' # previously used ORF3b as the name for ORF3d

# BOOKKEEPING: Because names were truncated, ambiguous
(epitope_summary_codon_counts <- epitope_summary %>%
    group_by(ID) %>%
    summarise(
      residues = n()
    ))


###############################
### ADD IN THE NEW CANDIDATE PEPTIDES
(epitope_data_newPeptides <- read_tsv("data/NetMHCpan_Wuhan-Hu-1_additional.tsv"))
# 1572 x 12

# Summarize 
(epitope_summary_newPeptides <- epitope_data_newPeptides %>%
    group_by(Pos, Peptide, ID, core) %>%
    summarise(
      Ave = mean(`1-log50k`),
      NB = mean(NB),
      SB = mean(SB),
      WB = mean(WB)
    )) # 131 x 10

# RENAME after figuring out which ID is which gene, because names were truncated by NetMHCpan
unique(epitope_summary_newPeptides$ID)

epitope_summary_newPeptides[epitope_summary_newPeptides$ID == "3d_2_25596-2569", ]$ID <- 'ORF3d2'
epitope_summary_newPeptides[epitope_summary_newPeptides$ID == "S_iO1_21744-218", ]$ID <- 'S-iORF1'
epitope_summary_newPeptides[epitope_summary_newPeptides$ID == "3c_25457-25582", ]$ID <- 'ORF3c'
epitope_summary_newPeptides[epitope_summary_newPeptides$ID == "3b_25814-25882", ]$ID <- 'ORF3b' # this is the first short ORF3b in SARS-CoV-2
epitope_summary_newPeptides[epitope_summary_newPeptides$ID == "S_iO2_21768-218", ]$ID <- 'S-iORF2'

# Examine
(epitope_summary_newPeptides_codon_counts <- epitope_summary_newPeptides %>%
    group_by(ID) %>%
    summarise(
      residues = n()
    ))


##############################
### JOIN THE OLD AND NEW PEPTIDE ANALYSES
epitope_summary <- dplyr::select(epitope_summary, Pos, Peptide, ID, Ave, NB, SB, WB)

# Reorder or remove superfluous columns
epitope_summary_newPeptides <- dplyr::select(epitope_summary_newPeptides, Pos, Peptide, ID, Ave, NB, SB, WB)

# add together
(epitope_summary <- rbind(as_tibble(epitope_summary), as.tibble(epitope_summary_newPeptides)))
#29,206 x 7

# number codons, convert to 1-based
epitope_summary$codon_start = epitope_summary$Pos + 1
epitope_summary$codon_end = epitope_summary$codon_start + 9 - 1

# reorder columns
epitope_summary <- dplyr::select(epitope_summary, ID, Pos, codon_start, codon_end, Peptide, Ave, NB, SB, WB)
epitope_summary # 29,206 x 9

### SAVE AND RELOAD
# SAVE for epitope tallying: ID/NB/product/codon_start/codon_end
epitope_summary_5col <- epitope_summary
epitope_summary_5col$product <- epitope_summary_5col$ID
epitope_summary_5col <- dplyr::select(epitope_summary_5col, ID, NB, product, codon_start, codon_end)
write_tsv(epitope_summary_5col, "data/MHCI_epitope_summary.tsv") # INCLUDING NEW PEPTIDES


##################################################
### SLIDING WINDOW in Python to calculate epitope COVERAGE of each site
##################################################

# DO: tally_epitope_coverage.py MHCI_epitope_summary.tsv 9 > MHCI_epitope_summary_tally.tsv 


####################
### RELOAD AFTER RUNNING PYTHON
(epitope_results <- read_tsv("data/MHCI_epitope_summary_tally.tsv", 
                             col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))) 
# 29,206 x 5

# rename column
names(epitope_results)[names(epitope_results) == "ID"] <- 'product'
unique(epitope_results$product)

# FILTER and RENAME desired products
epitope_gene_IDs <- c("S", "S-iORF1", "S-iORF2", 
                      "ORF3a", "ORF3c", "ORF3d", "ORF3d2", "ORF3b",
                      "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8p", "N", "ORF9b", "ORF9c", "ORF10") # ORF8p at first
epitope_gene_names <- c("S", "S-i1", "S-i2", 
                        "3a", "3c", "3d", "3d-2", "3b",
                        "E", "M", "6", "7a", "7b", "8", "N", "9b", "9c", "10")
(epitope_results <- filter(epitope_results, product %in% c(epitope_gene_IDs)))
# 2902 x 5

# FACTOR gene
epitope_results$product <- factor(epitope_results$product,
                                  levels = epitope_gene_IDs,
                                  labels = epitope_gene_names)

### SUMMARIZE COVERAGE FOR EACH GENE
(epitope_summary <- epitope_results %>%
    group_by(product) %>%
    summarise(
      nonamer_count = n(),
      mean_NB = mean(NB_coverage),
      sd_NB = sd(NB_coverage),
      SE_NB = sd_NB/ sqrt(nonamer_count),
      sum_NB = sum(NB_coverage)
    ))

(epitope_summary_hits <- filter(epitope_results, NB_coverage > 0) %>%
    group_by(product) %>%
    summarise(
      nonamer_epitope_count = n()
    ))

# join number of nonamers for each gene
(epitope_summary <- left_join(epitope_summary, epitope_summary_hits, by = "product"))
epitope_summary[is.na(epitope_summary$nonamer_epitope_count), ]$nonamer_epitope_count <- 0 # superlfuous, THIS time

# calculate proportion of the product hit
epitope_summary$prop_product_hit <- epitope_summary$nonamer_epitope_count / epitope_summary$nonamer_count
epitope_summary$NB_per_codon <- epitope_summary$sum_NB / epitope_summary$nonamer_count
epitope_summary

# PLOT PROPORTION BY GENE
ggplot(epitope_summary, mapping = aes(x = product, y = prop_product_hit)) +
  geom_bar(stat = 'identity') +
  xlab("") +
  ylab("Predicted epitope coverage") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        legend.position = 'none',
        legend.title = element_blank(),
        axis.text.x = element_text(size = 9), 
        axis.text.y = element_text(size = 9),
        axis.title.y = element_text(size = 10),
        panel.border = element_rect(),
        strip.background = element_blank()) +
  scale_y_continuous(limits = c(0, 1), labels = percent_format(accuracy = 1))
# 3d-2 and N have the lowest proportion of residues covered by an eptiope


##################################################
### ADD TWO CLASSES OF NEGATIVE CONTROLS! 
##################################################

####################
# (1) NONFUNCTIONAL ORFS. result expected for real ORFs that have been evolving in the genome without functional constraint
#(nonfunctional_ORFs <- read_tsv("/Users/cwnelson88/Desktop/SARS-CoV-2/NetMHCpan/NetMHCpan_Epitope_Negative_Controls/SARSCOV2-ORFs-90-ATG-longest_unannotated_noStop_netMHCpan_parsed_cut.txt"))
#nonfunctional_ORFs <- filter(nonfunctional_ORFs, Allele == "HLA-A01:01") # because they contain redundant information
#nonfunctional_ORFs$codon_start <- nonfunctional_ORFs$Pos + 1
#nonfunctional_ORFs$codon_end <- nonfunctional_ORFs$Pos + 9
#nonfunctional_ORFs$NB_coverage <- 0

# did the below with Python instead later (tally_*.py)
# uncomment to repeat here, but LENGTHY <-- CHANGE THIS
#for (this_ID in unique(nonfunctional_ORFs$ID)) { 
#  #this_ID <- "s26342-26181"
#  cat(this_ID, "\n")
#  
#  for (this_codon_start in sort(unique(nonfunctional_ORFs[nonfunctional_ORFs$ID == this_ID, ]$codon_start))) {
#    cat('.')
#    this_codon_end <- nonfunctional_ORFs[nonfunctional_ORFs$ID == this_ID & nonfunctional_ORFs$codon_start == this_codon_start, ]$codon_end
#    this_NB <- nonfunctional_ORFs[nonfunctional_ORFs$ID == this_ID & nonfunctional_ORFs$codon_start == this_codon_start, ]$NB
#    
#    for (i in seq(this_codon_start, this_codon_end, 1)) {
#      nonfunctional_ORFs[nonfunctional_ORFs$ID == this_ID & nonfunctional_ORFs$codon_start == i, ]$NB_coverage <- 
#        nonfunctional_ORFs[nonfunctional_ORFs$ID == this_ID & nonfunctional_ORFs$codon_start == i, ]$NB_coverage + this_NB
#    }
#  }
#  cat("\n")
#}

## SAVE AND RELOAD
#write_tsv(nonfunctional_ORFs, "data/MHCI_short_unannot_ORFs.tsv")

## RELOAD IMPORTANT
nonfunctional_ORFs <- read_tsv("data/MHCI_short_unannot_ORFs.tsv")
nonfunctional_ORFs
names(nonfunctional_ORFs)[names(nonfunctional_ORFs) == "ID"] <- 'product'

### NB
(nonfunctional_ORFs_summary <- nonfunctional_ORFs %>%
    group_by(product) %>%
    summarise(
      nonamer_count = n(),
      mean_NB = mean(NB_coverage),
      sd_NB = sd(NB_coverage),
      SE_NB = sd_NB/ sqrt(nonamer_count),
      sum_NB = sum(NB_coverage)
    ))

(nonfunctional_ORFs_summary_hits <- filter(nonfunctional_ORFs, NB_coverage > 0) %>%
    group_by(product) %>%
    summarise(
      nonamer_nonfunctional_ORFs_count = n()
    ))

# join nonamers hit and nonamer counts
(nonfunctional_ORFs_summary <- left_join(nonfunctional_ORFs_summary, nonfunctional_ORFs_summary_hits, by = "product"))
nonfunctional_ORFs_summary[is.na(nonfunctional_ORFs_summary$nonamer_nonfunctional_ORFs_count), ]$nonamer_nonfunctional_ORFs_count <- 0 # superfluous THIS time

# proportions
nonfunctional_ORFs_summary$prop_product_hit <- nonfunctional_ORFs_summary$nonamer_nonfunctional_ORFs_count / nonfunctional_ORFs_summary$nonamer_count
nonfunctional_ORFs_summary$NB_per_codon <- nonfunctional_ORFs_summary$sum_NB / nonfunctional_ORFs_summary$nonamer_count
nonfunctional_ORFs_summary

# Define confidence bounds -- 95% confidence interval
(nonfunctional_ORFs_epitopes_per9mer_mean <- mean(nonfunctional_ORFs_summary$NB_per_codon))
(nonfunctional_ORFs_epitopes_per9mer_SD <- sd(nonfunctional_ORFs_summary$NB_per_codon))
(nonfunctional_ORFs_epitopes_per9mer_SE <- sd(nonfunctional_ORFs_summary$NB_per_codon) / sqrt(nrow(nonfunctional_ORFs_summary)))
(nonfunctional_ORFs_epitopes_per9mer_p025 <- quantile(nonfunctional_ORFs_summary$NB_per_codon, 0.025))
(nonfunctional_ORFs_epitopes_per9mer_p975 <- quantile(nonfunctional_ORFs_summary$NB_per_codon, 0.975))


####################
## (2) RANDOM ORFs. result expected for entirely random ORFs that have the same amino acid content of each protein

# Because they are very large, we cut them down to size, e.g.,
# DO: cat NC_045512.2_S_aa_random_netMHCpan_parsed.txt | gsed -E 's/_rand[a-zA-Z0-9_]+//' | cut -f1,3,10 > S_random.tsv
# DO: convert to 1-based positions by adding 1
# DO: cut down to only one allele (the 12 are redundant)
# DO: save as, e.g., S_random.tsv


################################################################################
### TALLY each site's overlap using the Python script because R can't handle the damned loop and I'm too stupid to vectorize. For example:
# DO, for each protein:
#tally_epitope_coverage.py S_random.tsv 9 > S_random_tally.tsv

# LOAD AND PROCESS
S_random_tally <- read_tsv("data/NetMHCpan_S_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
S_random_tally$product <- "S"
ORF3a_random_tally <- read_tsv("data/NetMHCpan_ORF3a_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF3a_random_tally$product <- "ORF3a"
ORF3c_random_tally <- read_tsv("data/NetMHCpan_ORF3c_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF3c_random_tally$product <- "ORF3c"
ORF3d_random_tally <- read_tsv("data/NetMHCpan_ORF3d_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage")) # used to be called ORF3c
ORF3d_random_tally$product <- "ORF3d"
ORF3d2_random_tally <- read_tsv("data/NetMHCpan_ORF3d2_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF3d2_random_tally$product <- "ORF3d-2"
ORF3b_random_tally <- read_tsv("data/NetMHCpan_ORF3b_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF3b_random_tally$product <- "ORF3b"
E_random_tally <- read_tsv("data/NetMHCpan_E_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
E_random_tally$product <- "E"
M_random_tally <- read_tsv("data/NetMHCpan_M_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
M_random_tally$product <- "M"
ORF6_random_tally <- read_tsv("data/NetMHCpan_ORF6_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF6_random_tally$product <- "ORF6"
ORF7a_random_tally <- read_tsv("data/NetMHCpan_ORF7a_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF7a_random_tally$product <- "ORF7a"
ORF7b_random_tally <- read_tsv("data/NetMHCpan_ORF7b_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF7b_random_tally$product <- "ORF7b"
ORF8_random_tally <- read_tsv("data/NetMHCpan_ORF8_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF8_random_tally$product <- "ORF8"
N_random_tally <- read_tsv("data/NetMHCpan_N_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
N_random_tally$product <- "N"
ORF9b_random_tally <- read_tsv("data/NetMHCpan_ORF9b_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF9b_random_tally$product <- "ORF9b"
ORF9c_random_tally <- read_tsv("data/NetMHCpan_ORF9c_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF9c_random_tally$product <- "ORF9c"
ORF10_random_tally <- read_tsv("data/NetMHCpan_ORF10_random_tally.tsv", col_names = c("ID", "codon_start", "codon_end", "NB", "NB_coverage"))
ORF10_random_tally$product <- "ORF10"


### Combine
ALL_ORFs_random <- rbind(S_random_tally, ORF3a_random_tally, ORF3c_random_tally, ORF3d_random_tally, ORF3d2_random_tally, ORF3b_random_tally,
                         E_random_tally, M_random_tally, ORF6_random_tally, ORF7a_random_tally, ORF7b_random_tally, ORF8_random_tally,
                         N_random_tally, ORF9b_random_tally, ORF9c_random_tally, ORF10_random_tally)
ALL_ORFs_random # 2,625,000 x 6
unique(ALL_ORFs_random$ID) #"s1"    "s2"    "s3"    "s4"    "s5"    "s6"    "s7"    "s8"    "s9" ...
unique(ALL_ORFs_random$product)


### SUMMARIZE BY REPLICATE (ID) and GENE
(ALL_ORFs_random_summary <- ALL_ORFs_random %>%
    group_by(product, ID) %>%
    summarise(
      nonamer_count = n(),
      mean_NB = mean(NB_coverage),
      sd_NB = sd(NB_coverage),
      SE_NB = sd_NB/ sqrt(nonamer_count),
      sum_NB = sum(NB_coverage)
    ))
#16,000 x 7

(ALL_ORFs_random_summary_hits <- filter(ALL_ORFs_random, NB_coverage > 0) %>%
    group_by(product, ID) %>%
    summarise(
      nonamer_ALL_ORFs_random_count = n()
    ))
# 15,993 x 3

# join nonamer hits and count
(ALL_ORFs_random_summary <- left_join(ALL_ORFs_random_summary, ALL_ORFs_random_summary_hits, by = c("product", "ID")))
ALL_ORFs_random_summary[is.na(ALL_ORFs_random_summary$nonamer_ALL_ORFs_random_count), ]$nonamer_ALL_ORFs_random_count <- 0 

ALL_ORFs_random_summary$prop_product_hit <- ALL_ORFs_random_summary$nonamer_ALL_ORFs_random_count / ALL_ORFs_random_summary$nonamer_count
ALL_ORFs_random_summary$NB_per_codon <- ALL_ORFs_random_summary$sum_NB / ALL_ORFs_random_summary$nonamer_count 
ALL_ORFs_random_summary 

# Summarize by gene
(ALL_ORFs_random_summary_byGene <- ALL_ORFs_random_summary %>%
    group_by(product) %>%
    summarise(
      random_replicate_count = n(),
      random_NB_per_codon_mean = mean(NB_per_codon),
      random_NB_per_codon_sd = sd(NB_per_codon),
      random_NB_per_codon_SE = random_NB_per_codon_sd / sqrt(random_replicate_count),
      random_NB_per_codon_Q1 = quantile(NB_per_codon, 0.025),
      random_NB_per_codon_Q3 = quantile(NB_per_codon, 0.975)
    ))


# FACTOR genes to short form to match names
random_gene_IDs <- c("ORF1ab",
                     "S", "ORF3a", "ORF3c", "ORF3d", "ORF3d-2", "ORF3b",
                     "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b", "ORF9c", "ORF10")
random_gene_names <- c("1ab", 
                       "S", "3a", "3c", "3d", "3d-2", "3b",
                       "E", "M", "6", "7a", "7b", "8", "N", "9b", "9c", "10")

# Recode product names
ALL_ORFs_random_summary_byGene$product <- factor(ALL_ORFs_random_summary_byGene$product,
                                                 levels = random_gene_IDs,
                                                 labels = random_gene_names)
ALL_ORFs_random_summary_byGene



#######################################################
### JOIN negative control to test data!
epitope_summary <- left_join(epitope_summary, ALL_ORFs_random_summary_byGene, by = "product")
epitope_summary # 18 x 15


#######################################################
###  Pivot to LONG and transfer values to columns
(epitope_summary_LONG <- epitope_summary %>%
   pivot_longer(cols = c('NB_per_codon', 'random_NB_per_codon_mean'), names_to = "sample_type", values_to = "NB_per_codon")) 

# Define 95% CI for observed ORFs
epitope_summary_LONG$NB_per_codon_LOW <- epitope_summary_LONG$NB_per_codon - 1.96 * epitope_summary_LONG$SE_NB
epitope_summary_LONG$NB_per_codon_HIGH <- epitope_summary_LONG$NB_per_codon + 1.96 * epitope_summary_LONG$SE_NB

# Define 95% CI for randomized ORFs
epitope_summary_LONG[epitope_summary_LONG$sample_type == "random_NB_per_codon_mean", ]$NB_per_codon_LOW <- 
  epitope_summary_LONG[epitope_summary_LONG$sample_type == "random_NB_per_codon_mean", ]$random_NB_per_codon_Q1 # not Q1, actually 2.5%ile
epitope_summary_LONG[epitope_summary_LONG$sample_type == "random_NB_per_codon_mean", ]$NB_per_codon_HIGH <- 
  epitope_summary_LONG[epitope_summary_LONG$sample_type == "random_NB_per_codon_mean", ]$random_NB_per_codon_Q3 # not Q3, actually 97.5%ile

# FACTOR sample type
epitope_summary_LONG$sample_type <- factor(epitope_summary_LONG$sample_type,
                                           levels = c("NB_per_codon", "random_NB_per_codon_mean"),
                                           labels = c("Observed", "Expected (randomized peptide)"))
#View(epitope_summary_LONG)

# FACTOR gene
length(unique(epitope_summary_LONG$product)) # 18
epitope_summary_LONG$product <- factor(epitope_summary_LONG$product,
                                       levels = random_gene_names,
                                       labels = random_gene_names)

# subset for use
epitope_gene_names_subset <- setdiff(epitope_gene_names, c("3d-2"))

# short unannotated ORFs
(nonfunctional_ORFs_epitopes_per9mer_p025 <- quantile(nonfunctional_ORFs_summary$NB_per_codon, 0.025))
(nonfunctional_ORFs_epitopes_per9mer_p975 <- quantile(nonfunctional_ORFs_summary$NB_per_codon, 0.975))


### ADD TO PLOT
(epitopes_per9mer_byProduct_MHCI_PLOT <- ggplot(filter(epitope_summary_LONG, product %in% epitope_gene_names_subset),
                                                mapping = aes(x = product, y = NB_per_codon, fill = sample_type)) + 
    
    # nonfunctional ORF control
    geom_rect(xmin = -Inf, xmax = Inf,
              ymin = nonfunctional_ORFs_epitopes_per9mer_p025, ymax = nonfunctional_ORFs_epitopes_per9mer_p975,
              fill = "#E9E9E9", linetype = 0, inherit.aes = FALSE) +
    geom_hline(yintercept = nonfunctional_ORFs_epitopes_per9mer_mean, color = "grey", linetype = "dashed", size = 0.25) + # grey
    
    geom_bar(stat = "identity", position = position_dodge(0.8), width = 0.8) +
    
    geom_errorbar(mapping = aes(ymin = NB_per_codon_LOW, ymax = NB_per_codon_HIGH), position = position_dodge(0.8), width = 0.2, size = 0.175) +
    xlab("Protein") +
    ylab("Epitopes per residue") +
    theme_bw() +
    theme(panel.grid = element_blank(),
          legend.position = 'none',
          axis.text.x = element_text(size = rel(0.85)), 
          axis.text.y = element_text(size = rel(0.85)),
          axis.title.x = element_text(size = rel(0.85)),
          axis.title.y = element_text(size = rel(0.85)),
          panel.border = element_rect(),
          strip.background = element_blank()) +
    scale_y_continuous(expand = expand_scale(mult = c(0, 0.05))) +
    scale_fill_manual(values = c("#0072B2", "gray"))
)


## Quantify which are significant
NUM_REPLICATES <- 1000
epitope_summary_test <- epitope_summary
epitope_summary_test <- dplyr::select(epitope_summary_test, product, NB_per_codon)

# Recode randomized product names
ALL_ORFs_random_summary$product <- factor(ALL_ORFs_random_summary$product,
                                          levels = random_gene_IDs,
                                          labels = random_gene_names)
ALL_ORFs_random_summary

epitope_summary_test$random_NB_gt <- 0
epitope_summary_test$random_NB_eq <- 0
epitope_summary_test$random_NB_lt <- 0

for (this_product in unique(epitope_summary_test$product)) {
  NB_per_codon_observed <- epitope_summary_test[epitope_summary_test$product == this_product, ]$NB_per_codon
  epitope_summary_test[epitope_summary_test$product == this_product, ]$random_NB_gt <- sum(ALL_ORFs_random_summary[ALL_ORFs_random_summary$product == this_product, ]$NB_per_codon > NB_per_codon_observed)
  epitope_summary_test[epitope_summary_test$product == this_product, ]$random_NB_eq <- sum(ALL_ORFs_random_summary[ALL_ORFs_random_summary$product == this_product, ]$NB_per_codon == NB_per_codon_observed)
  epitope_summary_test[epitope_summary_test$product == this_product, ]$random_NB_lt <- sum(ALL_ORFs_random_summary[ALL_ORFs_random_summary$product == this_product, ]$NB_per_codon < NB_per_codon_observed)
}

# one-sided ASL P-values
epitope_summary_test$ASL_Obs_lt_Exp_P <- 1
epitope_summary_test$ASL_Obs_gt_Exp_P <- 1
epitope_summary_test$ASL_Obs_lt_Exp_P <- epitope_summary_test$random_NB_lt / (epitope_summary_test$random_NB_lt + epitope_summary_test$random_NB_eq + epitope_summary_test$random_NB_gt)
epitope_summary_test$ASL_Obs_gt_Exp_P <- epitope_summary_test$random_NB_gt / (epitope_summary_test$random_NB_lt + epitope_summary_test$random_NB_eq + epitope_summary_test$random_NB_gt)


# two-sided ASL P-values
epitope_summary_test$ASL_Obs_ne_Exp_P <- 1
epitope_summary_test[epitope_summary_test$ASL_Obs_lt_Exp_P < epitope_summary_test$ASL_Obs_gt_Exp_P, ]$ASL_Obs_ne_Exp_P <- 
  2 * epitope_summary_test[epitope_summary_test$ASL_Obs_lt_Exp_P < epitope_summary_test$ASL_Obs_gt_Exp_P, ]$ASL_Obs_lt_Exp_P
epitope_summary_test[epitope_summary_test$ASL_Obs_lt_Exp_P > epitope_summary_test$ASL_Obs_gt_Exp_P, ]$ASL_Obs_ne_Exp_P <- 
  2 * epitope_summary_test[epitope_summary_test$ASL_Obs_lt_Exp_P > epitope_summary_test$ASL_Obs_gt_Exp_P, ]$ASL_Obs_gt_Exp_P
epitope_summary_test[epitope_summary_test$ASL_Obs_ne_Exp_P == 0, ]$ASL_Obs_ne_Exp_P <- 1 / NUM_REPLICATES


### Finally, test versus nonfunctionals
epitope_summary_test$nonfunctional_NB_gt <- 0
epitope_summary_test$nonfunctional_NB_eq <- 0
epitope_summary_test$nonfunctional_NB_lt <- 0

for (this_product in unique(epitope_summary_test$product)) {
  NB_per_codon_observed <- epitope_summary_test[epitope_summary_test$product == this_product, ]$NB_per_codon
  epitope_summary_test[epitope_summary_test$product == this_product, ]$nonfunctional_NB_gt <- sum(nonfunctional_ORFs_summary$NB_per_codon > NB_per_codon_observed)
  epitope_summary_test[epitope_summary_test$product == this_product, ]$nonfunctional_NB_eq <- sum(nonfunctional_ORFs_summary$NB_per_codon == NB_per_codon_observed)
  epitope_summary_test[epitope_summary_test$product == this_product, ]$nonfunctional_NB_lt <- sum(nonfunctional_ORFs_summary$NB_per_codon < NB_per_codon_observed)
}


# one-sided ASL_nonfunctional P-values
epitope_summary_test$ASL_nonfunctional_Obs_lt_Exp_P <- 1
epitope_summary_test$ASL_nonfunctional_Obs_gt_Exp_P <- 1
epitope_summary_test$ASL_nonfunctional_Obs_lt_Exp_P <- epitope_summary_test$nonfunctional_NB_lt / (epitope_summary_test$nonfunctional_NB_lt + epitope_summary_test$nonfunctional_NB_eq + epitope_summary_test$nonfunctional_NB_gt)
epitope_summary_test$ASL_nonfunctional_Obs_gt_Exp_P <- epitope_summary_test$nonfunctional_NB_gt / (epitope_summary_test$nonfunctional_NB_lt + epitope_summary_test$nonfunctional_NB_eq + epitope_summary_test$nonfunctional_NB_gt)


# two-sided ASL_nonfunctional P-values
epitope_summary_test$ASL_nonfunctional_Obs_ne_Exp_P <- 1
epitope_summary_test[epitope_summary_test$ASL_nonfunctional_Obs_lt_Exp_P < epitope_summary_test$ASL_nonfunctional_Obs_gt_Exp_P, ]$ASL_nonfunctional_Obs_ne_Exp_P <- 
  2 * epitope_summary_test[epitope_summary_test$ASL_nonfunctional_Obs_lt_Exp_P < epitope_summary_test$ASL_nonfunctional_Obs_gt_Exp_P, ]$ASL_nonfunctional_Obs_lt_Exp_P
epitope_summary_test[epitope_summary_test$ASL_nonfunctional_Obs_lt_Exp_P > epitope_summary_test$ASL_nonfunctional_Obs_gt_Exp_P, ]$ASL_nonfunctional_Obs_ne_Exp_P <- 
  2 * epitope_summary_test[epitope_summary_test$ASL_nonfunctional_Obs_lt_Exp_P > epitope_summary_test$ASL_nonfunctional_Obs_gt_Exp_P, ]$ASL_nonfunctional_Obs_gt_Exp_P
epitope_summary_test[epitope_summary_test$ASL_nonfunctional_Obs_ne_Exp_P == 0, ]$ASL_nonfunctional_Obs_ne_Exp_P <- 1 / NUM_REPLICATES


### SAVE
write_tsv(epitope_summary_test, "data/MHCI_epitope_summary_test.tsv")


###  MHC class II in ANOTHER SCRIPT: epitope_MHCII.R




back to top