Content - 5ded4592806811202e545ef579e7202b08242910 - b28faf8/simulation_study_v3.R

visit type:
https://github.com/FrancoisBlanquart/selection_variant

27 November 2023, 18:37:41 UTC
Tip revision: a242a18349393e2e98d353a879ef9e66e99f21c1 authored by François Blanquart on 13 December 2021, 07:50:23 UTC
Update README.md
Tip revision: a242a18
simulation_study_v3.R
rm(list = ls())

library(MASS)
library(plyr)
library(VGAM)
library(plyr)
library(Rcpp)

# SIMULATION EXPERIENCE

source("~/ownCloud/coronavirus/variant_France/clean_code/functions_v3.R")

source("~/ownCloud/coronavirus/variant_France/clean_code/get_clean_data.R")

# INFERENCE FUNCTIONS
source("~/ownCloud/coronavirus/variant_France/clean_code/inference_functions.R")

source("~/ownCloud/coronavirus/variant_France/clean_code/conceptual_figures_v2.R")

# need to load cases_per_French_region_v5.R

sourceCpp("~/ownCloud/coronavirus/variant_France/clean_code/simulate_v2.cpp")  # simulations in Rcpp

make_smooth <- function(vec){
  x <- 1:length(vec)
  y <- loess(vec ~ x, span = 0.2)
  yfit <- predict(y, newdata=  x)
  yfit
}

pop_idf <- 12278210 # pop IDF
immunity_idf <- immunity[which(immunity$date=="2021-01-01" & immunity$region == "IDF"), "infected.mean"]/100

# reset sd_advantage
sd_advantage <- 0

################################################## RUN DYNAMIC SIMULATIONS AND PLOT (RH, RV) ##################################################

# BM MOTION OF THE WT with mean 1.3 and sd 

R0H <- 1.5
flag <- T
cond_on_R0 <- function(myR0vec) mean(myR0vec[1:20]) > 1.1 & mean(myR0vec[(max_time-20):max_time]) < 0.9 & all(myR0vec > 0.5) & all(myR0vec < 5)
while(flag){
  R0vecH <- mvrnorm(n = 1, mu = rep(R0H, max_time), Sigma = create_BMautocov_mat(temporal_autocor, max_time = max_time))
  if(cond_on_R0(R0vecH)) flag <- F # select a realisation of BM where it is below 1 at the end
}
R0vecH <- c(2.1307553, 2.0105587, 2.0023232, 1.9215775, 1.9642554, 1.9238395, 2.1029669, 2.0780443, 2.1581173, 2.0758108, 2.1253340, 1.9948015, 1.8292173, 1.7309693, 1.7447241, 1.7712299, 1.7653672, 1.7129285, 1.8179133, 1.7841625, 1.9068882, 1.9366379, 2.0484530, 2.1590289, 2.1527725, 2.2736368, 2.2387613, 2.2257805, 2.1303752, 2.1895513, 2.2356056, 2.2355079, 2.3258214, 2.2258386, 2.1631581, 2.2186670, 2.1714491, 2.2069247, 2.1436307, 2.1366790, 2.0831706, 1.9912173, 1.8703819, 1.6309556, 1.4634919, 1.3115412, 1.1725386, 1.0649104, 1.0250046, 0.9133384, 1.0018016, 0.9228869, 0.9090948, 0.7077709, 0.5578058, 0.6371114, 0.7721366, 0.7062939, 0.6210061, 0.4607542, 0.5308621, 0.3945342, 0.4594047, 0.6524794, 0.6660125, 0.5297658, 0.5369410, 0.5645049, 0.6640699, 0.6662376, 0.5892468, 0.5752508, 0.7023259, 0.6168691, 0.4811847, 0.3900102, 0.4270968, 0.2734790, 0.3581384, 0.3733780)
# as of 30/08 move to smooth linear function:
Rinit <- 2; Rfin <- 0.5
R0vecH <- c(rep(Rinit, 10), seq(Rinit, Rfin, by = -(Rinit-Rfin)/(70-1)))
plot(R0vecH, type = "l"); abline(h = 1, lty = 2)

# PARAMETERS OF THE TRANSMISSION FOR VARIANTS:
burnin <- 10


beta_advantage <- 0.
sc1 <- list(s1 = beta_advantage, s2 = -0.4, s3 = 0)
sc2 <- list(s1 = beta_advantage, s2 = 0.4, s3 = 0.)
sc3 <- list(s1 = beta_advantage, s2 = 0., s3 = 0.4)
sc4 <- list(s1 = beta_advantage, s2 = 0., s3 = -0.4)
sc5 <- list(s1 = beta_advantage, s2 = 0., s3 = 0)
cols <- c(RColorBrewer::brewer.pal(n = 8, name = "Paired")[1:4], mygray)

pdf(paste0("~/ownCloud/coronavirus/variant_France/clean_code/simulation_growth_WT_M_witherror_all", ".pdf"), width = 4*3*0.8, height = 4*0.8)

par(mfrow = c(1,3), mar = c(4,4,1,1))

for(Nsamplesize in c(30000, 10000, 3000, 30000)){

  
  plot(NULL, xlim = c(-0.25, 0.25), ylim = c(-0.25, 0.25), xlab = "Growth rate historical strains", ylab = "Growth rate emerging variant", las = 1, main = paste0("Daily sample size, N = ", Nsamplesize))
  
  # test a series of (R, duration) combinations that give the same difference in r betwen WT and M (eqaul to 0.07)
  mean_rW <- new_Rtor(param = c(MUW^2/SDW^2, SDW^2/MUW), R = mean(R0vecH)) # mean r for WT
  #target_rM <- mean_rW + 0.07 # target for the delta of growth rates between mutant and WT
  ta_seq <- c()
  
  for(k in 4:1){
    
      myscenario <- get(paste0("sc", k))
      transmission_advantage <- myscenario$s1
      duration_advantage <- myscenario$s2
      sd_advantage <- myscenario$s3
        
      # previously: new_rtoR(param = c(MUM^2/SDM^2, SDM^2/MUM), r = target_rM) / mean(R0vecH) - 1 # transmission advantage needed to reach target rM
      ta_seq <- c(ta_seq, transmission_advantage)
      #plot(siW[1:20], type = "o", pch = 20, ylab = "Distribution serial interval")
      #points(siM[1:20], type = "o", pch = 20, col = "red") 
      
      # need to re-source parameters
      # define distribution of generation time for the mutant by reloading the sourcing file (which updates variant distribution with new values of duration advantage)
      source("~/ownCloud/coronavirus/variant_France/clean_code/source_simulation_parameters.R")
      
      # simulate:
      sim <- simulate_Rcpp(
        max_age = 60, # maximum age of infection to consider
        initial_IW = 4000, # initial number of infected spread on each age (around 8000 infected per day in IDF if half are detected)
        initial_IM = 400,
        tmax = max_time, # maximum time (3 months)
        simtlockdown = 50, # date of the lockdown (only when step function)
        ifr = 0.0, # ifr
        total_pop = pop_idf * (1 - immunity_idf), # for population immmunity - size of the population left to infect
        par = c(
          R0vecH,
          0.5, # pdetection min
          0.5, # pdetection max
          1,   # k time coefficient
          100, # time mid-point for detection
          transmission_advantage # transmission advantage
        ),
        betafun = 3, # mode for beta function (smooth sigmoid or step function)
        siW = siW, # generation time ("serial interval" distribution)
        siM = siM, # generation time ("serial interval" distribution)
        tid = tid, # time from infection to death
        tit = tit # time from infection to test
      )
      
      
      # cases per day:
      # plot(sim$all_detected_incidenceW, col = "blue", type = "o", pch = 20, lwd = 4, las = 1, ylim = c(0, 10000), ylab = "Detected cases", xlab = "Time (days)", main = paste("transmission advantage = ", transmission_advantage, ", relative duration = ", duration_advantage))
      # points(sim$all_detected_incidenceM, col = "red", type = "o", pch = 20, lwd = 4)
      # points(sim$all_detected_incidenceW+sim$all_detected_incidenceM, col = "gray", type = "o", pch = 20, lwd = 4)
      
      # create a simulated dataset: (note we remove the first 10 days)
      sim_sub_b <- generate_sim_data(mysim = sim, freq_sample_size = Nsamplesize, weekly = F)
      
      points(sim_sub_b$rH_error, sim_sub_b$rV_error, pch = 20, col = cols[k], cex = 0.7)
      abline(0,1)
      
      ll <- loess(sim_sub_b$rV_error ~ sim_sub_b$rH_error); yfit <- predict(ll, newdata = seq(-0.2, 0.1, 0.01))
      # ll <- lm(rV_error ~ rH_error, data = sim_sub_b); yfit <- predict(ll, newdata = data.frame(rH_error = seq(-0.2, 0.1, 0.01)))
      
      lines(seq(-0.2, 0.1, 0.01), yfit, col=cols[k],lty = 1, lwd=2)
      #abline(lm0$coefficients[1], lm0$coefficients[2], col = cols[col_idx])

      
  }
  
  legend("bottomright", col = cols, pch = 20, legend = c("Shorter mgt", "Longer mgt", "Larger var", "Shorter var"), bty = "n")
  abline(h = 0, lty = 2)
  abline(v = 0, lty = 2)
  if(Nsamplesize==3000) dev.off()
}




# check and visualise the correlation between true R and predicted R DONE

# note the difference in absolute values can be explained mostly by the  the impact of detected individuals and secondarily by the build-up of population immunity
pdf("~/ownCloud/coronavirus/variant_France/clean_code/R0_dynamics_correlation_example.pdf", width = 6, height = 4*2)

# re-simulate with random parameter variation
R0vecH <- c(2.1307553, 2.0105587, 2.0023232, 1.9215775, 1.9642554, 1.9238395, 2.1029669, 2.0780443, 2.1581173, 2.0758108, 2.1253340, 1.9948015, 1.8292173, 1.7309693, 1.7447241, 1.7712299, 1.7653672, 1.7129285, 1.8179133, 1.7841625, 1.9068882, 1.9366379, 2.0484530, 2.1590289, 2.1527725, 2.2736368, 2.2387613, 2.2257805, 2.1303752, 2.1895513, 2.2356056, 2.2355079, 2.3258214, 2.2258386, 2.1631581, 2.2186670, 2.1714491, 2.2069247, 2.1436307, 2.1366790, 2.0831706, 1.9912173, 1.8703819, 1.6309556, 1.4634919, 1.3115412, 1.1725386, 1.0649104, 1.0250046, 0.9133384, 1.0018016, 0.9228869, 0.9090948, 0.7077709, 0.5578058, 0.6371114, 0.7721366, 0.7062939, 0.6210061, 0.4607542, 0.5308621, 0.3945342, 0.4594047, 0.6524794, 0.6660125, 0.5297658, 0.5369410, 0.5645049, 0.6640699, 0.6662376, 0.5892468, 0.5752508, 0.7023259, 0.6168691, 0.4811847, 0.3900102, 0.4270968, 0.2734790, 0.3581384, 0.3733780)
transmission_advantage <- 0.1
duration_advantage <- 0
sd_advantage <- 0
# need to re-source parameters
# define distribution of generation time for the mutant by reloading the sourcing file (which updates variant distribution with new values of duration advantage)
source("~/ownCloud/coronavirus/variant_France/clean_code/source_simulation_parameters.R")

# simulate:
sim <- simulate_Rcpp(
  max_age = 60, # maximum age of infection to consider
  initial_IW = 4000, # initial number of infected spread on each age (around 8000 infected per day in IDF if half are detected)
  initial_IM = 400,
  tmax = max_time, # maximum time (3 months)
  simtlockdown = 50, # date of the lockdown (only when step function)
  ifr = 0.0, # ifr
  total_pop = pop_idf * (1 - immunity_idf), # for population immmunity - size of the population left to infect
  par = c(
    R0vecH,
    0.5, # pdetection min
    0.5, # pdetection max
    1,   # k time coefficient
    100, # time mid-point for detection
    transmission_advantage # transmission advantage
  ),
  betafun = 3, # mode for beta function (smooth sigmoid or step function)
  siW = siW, # generation time ("serial interval" distribution)
  siM = siM, # generation time ("serial interval" distribution)
  tid = tid, # time from infection to death
  tit = tit # time from infection to test
)
sim_sub_b <- generate_sim_data(mysim = sim, freq_sample_size = Nsamplesize, weekly = F)
# re-simulate with random R0 variation
par(mfrow = c(2,1), mar = c(4,4,1,1))

# cases per day:
max_incidence <- max(sim$all_detected_incidenceW+sim$all_detected_incidenceM)
plot(sim$all_detected_incidenceW, col = "black", type = "l", pch = 20, lwd = 1, lty = 1, las = 1, ylim = c(0, 1.1*max_incidence), ylab = "Detected cases", xlab = "Time (days)", las = 1)
     #,main = paste("transmission advantage = ", signif(transmission_advantage, 3), ", relative duration = ", signif(duration_advantage, 3)))
points(sim$all_detected_incidenceM, col = cols[k], type = "l", lwd = 1, lty = 1)
#points(sim$all_detected_incidenceW+sim$all_detected_incidenceM, col = "gray", type = "l", lwd = 1)
legend("topleft", col = c("black", cols[k]), legend = c("Historical strains cases", "Emerging variant cases"), lty = 2, lwd = 1, bty = "n")

plot(rtoR(r = sim_sub_b$rH, mu = MUW, SD = SDW), type = "l", ylim = c(0., 3.2), xlab = "Time (days)", ylab = expression(paste("Reproduction number ", R)) , lwd = 1, col = "black", las = 1)
points(rtoR(r = sim_sub_b$rV, mu = MUM, SD = SDM), type = "l", lwd = 1, col = cols[k])
points(R0vecH[11:max_time], type = "l", col = "black", lwd = 3)
points(R0vecH[11:max_time] * (1 + transmission_advantage), type = "l", col = cols[k], lwd = 3, lty = 1)
abline(h = 1, lty = 2)
legend("topright", col = c("black", cols[k], "black", cols[k]), lwd = c(3,3,1,1), lty =1 , legend = c(expression(paste("True ", R[0], " historical strains")), expression(paste("True ", R[0], " emerging variant")), expression(paste(R, " realised in H cases")), expression(paste(R, " realised in E cases"))), bty = "n")

# covariance between the two
if(F){
  cc <- ccf(x = R0vecH[11:max_time], y = rtoR(r = sim_sub_b$rH_error, mu = MUW, SD = SDW), plot = F)
  plot(cc$lag, cc$acf, type = "l", ylim = c(0,1), ylab = "Autocorrelation between true and realised Re", xlab = "Lag")
  abline(v = cc$lag[which.max(cc$acf)], lty = 2)
}

dev.off()


################################################## CHECK ANALYTICAL FORMULA FOR COVARIANCE ##################################################


samp_error <- list()
nsamp_total <- 10000
for(i in 1:nsamp_total) samp_error[[i]] <- data.frame(generate_sim_data(mysim = sim, freq_sample_size = Nsamplesize, weekly = F))
idx <- 15 # date index

# useful quantities
I1 <- samp_error[[1]][idx, "Psmoothed"]
I0 <- samp_error[[1]][idx - 1, "Psmoothed"]
p1 <- samp_error[[1]][idx, "fVOC"]
p0 <- samp_error[[1]][idx - 1, "fVOC"]
N1 <- Nsamplesize
N0 <- Nsamplesize

# convergence towards true mean

xseq <- seq(1, nsamp_total, 100)
plot(xseq, sapply(xseq, function(nsamp) mean(unlist(lapply(samp_error[1:nsamp], function(tab) tab[idx, "rH_error"])))), type = "l", ylab = "rH")
abline(h = samp_error[[1]][idx, "rH"], lwd =3, col = "darkgreen")

plot(xseq, sapply(xseq, function(nsamp) mean(unlist(lapply(samp_error[1:nsamp], function(tab) tab[idx, "rV_error"])))), type = "l", ylab = "rV")
abline(h = samp_error[[1]][idx, "rV"], lwd =3, col = "darkgreen")

# variance of the rH, covariance rH rV same time
mean_rH0 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx-1, "rH_error"])))
mean_rV0 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx-1, "rV_error"])))
mean_rH1 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx, "rH_error"])))
mean_rV1 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx, "rV_error"])))
mean_rH_squared <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx, "rH_error"]^2)))
mean_rV_squared <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx, "rV_error"]^2)))
mean_rH_rV <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx, "rH_error"] * tab[idx, "rV_error"])))
mean_rH0_rH1 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx-1, "rH_error"] * tab[idx, "rH_error"])))
mean_rV0_rV1 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx-1, "rV_error"] * tab[idx, "rV_error"])))
mean_rH0_rV1 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx-1, "rH_error"] * tab[idx, "rV_error"])))
mean_rH1_rV0 <- mean(unlist(lapply(samp_error[1:nsamp_total], function(tab) tab[idx-1, "rV_error"] * tab[idx, "rH_error"])))

var_rH <- mean_rH_squared - mean_rH1^2
var_rV <- mean_rV_squared - mean_rV1^2
cov_rH_rV <- mean_rH_rV - mean_rH1*mean_rV1
cov_rH0_rH1 <- mean_rH0_rH1 - mean_rH0*mean_rH1
cov_rV0_rV1 <- mean_rV0_rV1 - mean_rV0*mean_rV1
cov_rH0_rV1 <- mean_rH0_rV1 - mean_rH0 * mean_rV1
cov_rH1_rV0 <- mean_rH1_rV0 - mean_rH1 * mean_rV0

pred <- c(
  1/I1 + 1/I0 + p0 / (N0 * (1-p0)) + p1 / (N1 * (1-p1)),  # theoretical prediction var(rH)
  1/I1 + 1/I0 + (1-p0) / (N0 * p0) + (1-p1) / (N1 * p1),  # theoretical prediction var(rV)
  1/I1 + 1/I0 - 1/N0 - 1/N1,                              # theoretical prediction cov(rH, rV)
  -1/I0 - p0/(N0 * (1-p0)),                               # theoretical prediction cov(rH0, rH1)
  -1/I0 - (1-p0)/(N0 * p0),                               # theoretical prediction cov(rV0, rV1)
  -1/I0 + 1/N0,                                           # theoretical prediction cov(rH0, rV1) and  cov(rV0, rH1)
  -1/I0 + 1/N0
)

pdf("~/ownCloud/coronavirus/variant_France/clean_code/check_covariances.pdf", width = 4, height = 4)
par(mar = c(4,4,1,1))
plot(c(var_rH, var_rV, cov_rH_rV, cov_rH0_rH1, cov_rV0_rV1, cov_rH0_rV1, cov_rH1_rV0), pred, pch = 20, xlab = "Simulated variances and covariances", ylab = "Predicted variances and covariances", las = 1)
abline(0, 1, lty = 2)
dev.off()

######################################################################################################################################################
##################################################            INFER FROM SIMULATED DATA             ##################################################
######################################################################################################################################################

initial_da_vec <- seq(-0.4, 0.4, 0.2) 
n_iter_optim <- 20

burnin <- 10
init_IM <- 80
baseline_transmissibility <- 0.3; baseline_transmissibility2 <- 0.45
nrepR0 <- 5

get_initRW <- function(sim_tab, varcov_mat, initial_ta = NULL, initial_da = 0, weekly){
  
  # A function to generate a good starting point for RW
  # Create a good initial RW vector
  # Based on rH and rV and a priori idea of transmission advantage and duration advantage
  if(!weekly){
    weekly_factor <- 1 
    nrows <- max_time
  } else {
    weekly_factor <- 7
    nrows <- nweeks + burnin # the burnin period was already included in nweeks
  }
  initRW1 <- rtoR(sim_tab$rH_error[2:(nrows-burnin)], MUW/weekly_factor, SDW/weekly_factor) # initial values for RW (using data)
  initRW2 <-  rtoR(sim_tab$rV_error[2:(nrows-burnin)], MUW * (1 + initial_da)/weekly_factor, SDW/weekly_factor)
  
  initRW1[is.na(initRW1)] <- 0.5 # too small values of r lead to NA
  initRW1[initRW1 > 5] <- 5
  initRW2[is.na(initRW2)] <- 0.5
  initRW2[initRW2 > 5] <- 5
  
  # estimate transmission advantage if no value was provided
  if(is.null(initial_ta)) initial_ta <- median(1 - initRW1/initRW2, na.rm = T)
  if(initial_ta < 0.1) initial_ta <- 0.1
  initRW2 <- initRW2 / (1+initial_ta)
  
  # weights:
  varH <- diag(varcov_mat)[seq(1, 2*(nrows-burnin)-2, 2)]; NH <- 1/varH
  varV <- diag(varcov_mat)[seq(2, 2*(nrows-burnin)-2, 2)]; NV <- 1/varV
  stopifnot(length(NH) == length(initRW1))
  stopifnot(length(NH) == length(initRW2))
  initRW <- (NH * initRW1 + NV * initRW2) / (NH + NV)
  
  return(list(initRW = initRW, initial_ta = initial_ta, initial_da = initial_da))
  
}

# npar1 <- 3 + ndays - 1
par(mfrow = c(3,1))
for(sample_size in c(1000, 10000)){

  res <- matrix(NA, nrow = 10*nrepR0*10, ncol = 6 + max_time)
  idx <- 1

  for(k in 1:10){
    
    if(k == 1){ transmission_advantage <- baseline_transmissibility ; duration_advantage <- -0.4 }
    if(k == 2){ transmission_advantage <- baseline_transmissibility ; duration_advantage <- -0.2 }
    if(k == 3){ transmission_advantage <- baseline_transmissibility ; duration_advantage <-  0 }
    if(k == 4){ transmission_advantage <- baseline_transmissibility2 ; duration_advantage <-  0.2 }
    if(k == 5){ transmission_advantage <- baseline_transmissibility2 ; duration_advantage <-  0.4 }
    if(k == 6){ transmission_advantage <- 0.1 ; duration_advantage <- 0 }
    if(k == 7){ transmission_advantage <- 0.2 ; duration_advantage <- 0 }
    if(k == 8){ transmission_advantage <- 0.3 ; duration_advantage <- 0 }
    if(k == 9){ transmission_advantage <- 0.4 ; duration_advantage <- 0 }
    if(k == 10){ transmission_advantage <- 0.5 ; duration_advantage <-0 }
    
    for(repR0 in 1:nrepR0){ # R0 trajectories with different variability (linear decline)
      
      # draw random R0H trajectory:
      #flag <- T
      #cond_on_R0 <- function(myR0vec) mean(myR0vec[1:20]) > 1.1 & mean(myR0vec[(max_time-20):max_time]) < 0.9 & all(myR0vec > 0.5) & all(myR0vec < 5)
      #while(flag){
      #  R0vecH <- mvrnorm(n = 1, mu = rep(R0H, max_time), Sigma = create_BMautocov_mat(temporal_autocor, max_time = max_time))
      #  if(cond_on_R0(R0vecH)) flag <- F # select a realisation of BM where it is below 1 at the end
      #}
      initial_R0 <- 1 + repR0 / 10
      final_R0 <- 1 - repR0 / 10
      R0vecH <- c(rep(initial_R0, 10), seq(initial_R0, final_R0, by = -(initial_R0 - final_R0)/(70-1)))
      
      cat(transmission_advantage, duration_advantage, "\n")
      #print(R0vecH)
      # define distribution of generation time for the mutant by reloading the sourcing file (which updates variant distribution with new values of duration advantage)
      source("~/ownCloud/coronavirus/variant_France/clean_code/source_simulation_parameters.R")
      
      # simulate:
      sim <- simulate_Rcpp(
        max_age = 60, # maximum age of infection to consider
        initial_IW = 4000, # initial number of infected spread on each age (around 8000 infected per day in IDF if half are detected)
        initial_IM = init_IM,
        tmax = max_time, # maximum time (3 months)
        simtlockdown = 50, # date of the lockdown (only when step function)
        ifr = 0.0, # ifr
        total_pop = pop_idf * (1 - immunity_idf), # for population immmunity - size of the population left to infect
        par = c(
          R0vecH,
          0.5, # pdetection min
          0.5, # pdetection max
          1,   # k time coefficient
          100, # time mid-point for detection
          transmission_advantage # transmission advantage
        ),
        betafun = 3, # mode for beta function (smooth sigmoid or step function)
        siW = siW, # generation time ("serial interval" distribution)
        siM = siM, # generation time ("serial interval" distribution)
        tid = tid, # time from infection to death
        tit = tit # time from infection to test
      )
      # note: incidence infections at time i (all_incidenceW and all_incidenceM) reflect transmission that occurred between i-1 and i
      # the first number of all_detected_incidenceM is initial_IM *  pdetection because of the assumption that initially all ages of infection are equally representd
      # cases are shifted compared to infections
      
      for(m in 1:10){ # replicates
        
        # create the simulated dataset from the simulations (with error)
        sim_sub_b <- generate_sim_data(mysim = sim, freq_sample_size = sample_size, weekly = F)
        names(sim_sub_b)[names(sim_sub_b) == "N"] <- "Ntrue"
        if(any(is.infinite(sim_sub_b$rH_error)) | any(is.infinite(sim_sub_b$rV_error))) next # go to next one if infinite rH or rV
        max_frequency_reached <- max(sim_sub_b$fVOC, na.rm = T)
        
        # Var-covar matrix of the multivariate normal distribution
        varcov <- create_cov_mat(tab = sim_sub_b)
        stopifnot(isSymmetric.matrix(varcov)) #  check matrix is symmetric:
        
        # data
        mydata <- c(rbind(sim_sub_b$rH_error[2:(max_time-burnin)], sim_sub_b$rV_error[2:(max_time-burnin)])) # estimated r; what we are trying to fit
        
        # the true_RW is:
        true_RW1 <- rtoR(sim_sub_b$rH[2:(max_time-burnin)], MUW, SDW) 
        true_RW2 <- rtoR(sim_sub_b$rV[2:(max_time-burnin)], MUM, SDM) / (1 + transmission_advantage)
        true_RW <- 0.5 * (true_RW1 + true_RW2)
        
        
        opt2_list <- list()
        RW_error <- c()
        idx_list <- 1
        
        for(my_initial_da in initial_da_vec){
          
          # generate a good initRW:
          tmp <- get_initRW(sim_tab = sim_sub_b, varcov_mat = varcov, initial_ta = NULL, initial_da = my_initial_da, weekly = FALSE)
          initRW <- (tmp$initRW)
          initial_ta <- tmp$initial_ta
          initial_da <- tmp$initial_da
          if(any(is.na(initRW))) stop("Some elements of initRW are NA")
        
          lower2 <- c(0, -0.5)
          upper2 <- c(1, 1)
          npar2 <- 2
          
          initfun2 <- function() c(initial_ta, initial_da)
          opt2 <- optim.fun.repeated(n.repeats = 1, lik.fun = myneglik2, init.fun = initfun2, lower = lower2, upper = upper2, mydata = mydata, myinitRW = initRW, vars = varcov, fixed_s3 = 0, weekly = F)
          opt2_list[[idx_list]] <- opt2; idx_list <- idx_list + 1
          RW_error <- c(RW_error, mean((initRW - true_RW)^2))
          #2nd round:
          
          for(l in 1:n_iter_optim){
            tmp <- get_initRW(sim_tab = sim_sub_b, varcov_mat = varcov, initial_ta = opt2$pars[1], initial_da = opt2$pars[2], weekly = FALSE)
            initRW <- (tmp$initRW)
            initial_ta <- tmp$initial_ta
            initial_da <- tmp$initial_da
            if(initial_da >= upper2[2]) initial_da <- upper2[2] * 0.9
            if(initial_ta >= upper2[1]) initial_ta <- upper2[1] * 0.9
            initfun2 <- function() c(initial_ta, initial_da)
            opt2 <- optim.fun.repeated(n.repeats = 1, lik.fun = myneglik2, init.fun = initfun2, lower = lower2, upper = upper2, mydata = mydata, myinitRW = initRW, vars = varcov, fixed_s3 = 0, weekly = F)
            opt2_list[[idx_list]] <- opt2; idx_list <- idx_list+1
            RW_error <- c(RW_error, mean((initRW - true_RW)^2))
          }
        }
        
        all_lik <- unlist(lapply(opt2_list, function(x) x$lik))
        opt2 <- opt2_list[[which.min(all_lik)]]
        
        # plot a few things
        plot(all_lik, type = "o", pch = 20,  main = paste("max freq", signif(max_frequency_reached, 3), " - best par:", paste(signif(opt2$pars, 3), collapse = ",  ")))
        plot(initRW, type = "l", lwd = 3, pch= 20, col= "blue", ylim = c(0,2))
        points(true_RW, type = "l", col = "gray", lwd = 3)
        plot(all_lik, RW_error, pch = 20)
        points(all_lik[which.min(all_lik)], RW_error[which.min(all_lik)], pch = 20, cex = 4, col = "red")
        # save results
        res[idx, 1:6] <- c(transmission_advantage, duration_advantage, opt2$pars, opt2$lik, max_frequency_reached)
        res[idx, 7:(7+max_time-1)] <- R0vecH
        idx <- idx + 1
        
      } # end of loop on replicates with error
    } # end of loop on R0 trajectory more or less variable
  } # end of loop on parameters

  res <- data.frame(res)
  names(res) <- c("ta", "da", "inf_ta", "inf_da", "max_lik", "max_fVOC", paste0("R0", 1:max_time))
  # plot(res$ta, res$inf_ta); abline(0,1)
  # plot(res$da, res$inf_da); abline(0,1)
  
  assign(x = paste0("res_", sample_size), res)

}



######################################################################################################################################################
######################################            INFER FROM WEEKLY SIMULATED DATA             #######################################################
######################################################################################################################################################

burnin <- 1
sample_size <- 1000

res <- matrix(NA, nrow = 10*nrepR0*10, ncol = 6 + max_time)
idx <- 1

for(k in 1:10){
  
  if(k == 1){ transmission_advantage <- baseline_transmissibility ; duration_advantage <- -0.4 }
  if(k == 2){ transmission_advantage <- baseline_transmissibility ; duration_advantage <- -0.2 }
  if(k == 3){ transmission_advantage <- baseline_transmissibility ; duration_advantage <-  0 }
  if(k == 4){ transmission_advantage <- baseline_transmissibility2 ; duration_advantage <-  0.2 }
  if(k == 5){ transmission_advantage <- baseline_transmissibility2 ; duration_advantage <-  0.4 }
  if(k == 6){ transmission_advantage <- 0.1 ; duration_advantage <- 0 }
  if(k == 7){ transmission_advantage <- 0.2 ; duration_advantage <- 0 }
  if(k == 8){ transmission_advantage <- 0.3 ; duration_advantage <- 0 }
  if(k == 9){ transmission_advantage <- 0.4 ; duration_advantage <- 0 }
  if(k == 10){ transmission_advantage <- 0.5 ; duration_advantage <-0 }
  
  for(repR0 in 1:nrepR0){ # R0 trajectories with different variability (linear decline)
    
    # draw random R0H trajectory:
    #flag <- T
    #while(flag){
    #  R0vecH <- mvrnorm(n = 1, mu = rep(R0H, max_time), Sigma = create_BMautocov_mat(temporal_autocor, max_time = max_time))
    #  if(cond_on_R0(R0vecH)) flag <- F # select a realisation of BM where it is below 1 at the end
    #}
    initial_R0 <- 1 + repR0 / 10
    final_R0 <- 1 - repR0 / 10
    R0vecH <- c(rep(initial_R0, 10), seq(initial_R0, final_R0, by = -(initial_R0 - final_R0)/(70-1)))
    cat(transmission_advantage, duration_advantage, "\n")
    #print(R0vecH)
    
    # define distribution of generation time for the mutant by reloading the sourcing file (which updates variant distribution with new values of duration advantage)
    source("~/ownCloud/coronavirus/variant_France/clean_code/source_simulation_parameters.R")
    
    # simulate:
    sim <- simulate_Rcpp(
      max_age = 60, # maximum age of infection to consider
      initial_IW = 4000, # initial number of infected spread on each age (around 8000 infected per day in IDF if half are detected)
      initial_IM = init_IM,
      tmax = max_time, # maximum time (3 months)
      simtlockdown = 50, # date of the lockdown (only when step function)
      ifr = 0.0, # ifr
      total_pop = pop_idf * (1 - immunity_idf), # for population immmunity - size of the population left to infect
      par = c(
        R0vecH,
        0.5, # pdetection min
        0.5, # pdetection max
        1,   # k time coefficient
        100, # time mid-point for detection
        transmission_advantage # transmission advantage
      ),
      betafun = 3, # mode for beta function (smooth sigmoid or step function)
      siW = siW, # generation time ("serial interval" distribution)
      siM = siM, # generation time ("serial interval" distribution)
      tid = tid, # time from infection to death
      tit = tit # time from infection to test
    )
    
    for(m in 1:10){ # replicates on data
      
      # create the simulated dataset from the simulations (with error)
      sim_sub_b <- generate_sim_data(mysim = sim, freq_sample_size = sample_size, weekly = T)
      names(sim_sub_b)[names(sim_sub_b) == "N"] <- "Ntrue"
      if(any(is.infinite(sim_sub_b$rH_error)) | any(is.infinite(sim_sub_b$rV_error))) next # go to next one if infinite rH or rV
      nweeks <- nrow(sim_sub_b)
      max_frequency_reached <- max(sim_sub_b$fVOC, na.rm = T)
      
      # Var-covar matrix of the multivariate normal distribution
      varcov_weekly <- create_cov_mat(tab = sim_sub_b)
      stopifnot(isSymmetric.matrix(varcov_weekly))
      
      # data
      mydata <- c(rbind(sim_sub_b$rH_error[2:nweeks], sim_sub_b$rV_error[2:nweeks])) # estimated r; what we are trying to fit
      
      opt2_list <- list()
      RW_error <- c()
      idx_list <- 1
      
      for(my_initial_da in initial_da_vec){
        
        # generate a good initRW:
        tmp <- get_initRW(sim_tab = sim_sub_b, varcov_mat = varcov_weekly, initial_da = my_initial_da, weekly = TRUE)
        initRW <- tmp$initRW
        initial_ta <- tmp$initial_ta
        initial_da <- tmp$initial_da
        
        if(any(is.na(initRW))) stop("Some elements of initRW are NA")
        
        # the true_RW is:
        true_RW1 <- rtoR(sim_sub_b$rH[2:nweeks], MUW/7, SDW/7) 
        true_RW2 <- rtoR(sim_sub_b$rV[2:nweeks], MUM/7, SDM/7) / (1 + transmission_advantage)
        true_RW2[is.na(true_RW2)] <- true_RW1[is.na(true_RW2)]
        true_RW <- 0.5 * (true_RW1 + true_RW2)
        
        if(any(is.na(initRW))) stop("Some elements of initRW are NA")
        
        #plot(initRW, type = "l", lwd = 3, pch= 20, col= "blue", ylim = c(0,5))
        #points(true_RW, type = "l", col = "gray", lwd = 3)
      
        initfun2 <- function() c(initial_ta, initial_da)
        opt2 <- optim.fun.repeated(n.repeats = 1, lik.fun = myneglik2, init.fun = initfun2, lower = lower2, upper = upper2, mydata = mydata, myinitRW = initRW, vars = varcov_weekly, fixed_s3= 0, weekly = T)
        
        # 2nd round
        opt2_list[[idx_list]] <- opt2; idx_list <- idx_list + 1
        RW_error <- c(RW_error, mean((initRW - true_RW)^2))
        
        for(l in 1:n_iter_optim){
          tmp <- get_initRW(sim_tab = sim_sub_b, varcov_mat = varcov_weekly, initial_ta = opt2$pars[1], initial_da = opt2$pars[2], weekly = TRUE)
          initRW <- tmp$initRW
          initial_ta <- tmp$initial_ta
          initial_da <- tmp$initial_da
          initfun2 <- function() c(initial_ta, initial_da)
          opt2 <- optim.fun.repeated(n.repeats = 1, lik.fun = myneglik2, init.fun = initfun2, lower = lower2, upper = upper2, mydata = mydata, myinitRW = initRW, vars = varcov_weekly,  fixed_s3= 0, weekly = T)
          opt2_list[[idx_list]] <- opt2; idx_list <- idx_list + 1
          RW_error <- c(RW_error, mean((initRW - true_RW)^2))
        }
      }
      all_lik <- unlist(lapply(opt2_list, function(x) x$lik))
      opt2 <- opt2_list[[which.min(all_lik)]]
      res[idx, 1:6] <- c(transmission_advantage, duration_advantage, opt2$pars, opt2$lik, max_frequency_reached)
      res[idx, 7:(7+max_time-1)] <- R0vecH
      idx <- idx + 1
      
      # ad plot a few things
      plot(all_lik, type = "o", pch = 20,  main = paste("max freq", signif(max_frequency_reached, 3), " - best par:", paste(signif(opt2$pars, 3), collapse = ",  ")))
      plot(initRW, type = "l", lwd = 3, pch= 20, col= "blue", ylim = c(0,2))
      points(true_RW, type = "l", col = "gray", lwd = 3)
      plot(all_lik, RW_error, pch = 20)
      points(all_lik[which.min(all_lik)], RW_error[which.min(all_lik)], pch = 20, cex = 4, col = "red")
      
    } # end of loop on replicates with error
  } # end of loop on R0 trajectories
} # end of loop on parameters s1, s2

res <- data.frame(res)
names(res) <- c("ta", "da", "inf_ta", "inf_da", "max_lik", "max_fVOC", paste0("R0", 1:max_time))
assign(x = paste0("res_", sample_size, "_weekly"), res)

save.image(file = "~/ownCloud/coronavirus/variant_France/clean_code/simulations_30August2021.RData")




##################################################                    PLOT THE OVERALL FIGURE                  ##################################################

# load("~/ownCloud/coronavirus/variant_France/clean_code/simulations_30August2021.RData")

cols2 <- RColorBrewer::brewer.pal(n = 3, name = "Set1")

pdf(paste0("~/ownCloud/coronavirus/variant_France/clean_code/simulation_study_inference.pdf"), width = 4*3*0.6, height = 3*4.2*0.6)

par(mfrow =c(3,2), mar = c(5,4,2,1))
for(initial_R0 in c(1.5, 1.3, 1.1)){
  ylims <- c(0, 0.6)
  plot(NULL, xlim = ylims, ylim  = ylims,
       xlab = "True transmission advantage", ylab = "Inferred transmission advantage", las = 1, bty = "n")
  abline(0, 1, col = "gray")
  idx <- 1
  
  for(tab in c("res_1000", "res_10000", "res_1000_weekly")){
    truc <- get(tab)
    ylims <- c(0, 0.6)
    sub <- which(truc$R01==initial_R0 & truc$da == 0)
    points(truc$ta[sub] + (idx-1)/100., truc$inf_ta[sub], pch = 20, col = cols2[idx])
    idx <- idx + 1
  }
  legend("topleft", legend = c("1000 daily", "10000 daily", "7000 weekly"), col = cols2, pch = 20, bty = "n")
  
  ylims <- c(-0.6, 1)
  plot(NULL, xlim = ylims, ylim  = ylims,
       xlab = "True relative mgt", ylab = "Inferred relative mgt", las = 1, bty = "n")
  abline(0, 1, col = "gray")
  idx <- 1
  for(tab in c("res_1000", "res_10000", "res_1000_weekly")){
    truc <- get(tab)
    sub <- which(truc$R01==initial_R0 & (truc$ta == baseline_transmissibility | truc$ta == baseline_transmissibility2))
    points(truc$da[sub] + 2*(idx-1)/100., truc$inf_da[sub], pch = 20, col = cols2[idx])
    idx <- idx + 1
  }
  legend("topleft", legend = c("1000 daily", "10000 daily", "7000 weekly"), col = cols2, pch = 20, bty = "n")
}

dev.off()