Raw File
1-main_code_sccs.R
######################
### Assessing the impact of preventive mass vaccination campaigns on yellow fever outbreaks in Africa : a population-level self-controlled case-series study
### Jean et al., 2020 : https://doi.org/10.1101/2020.07.09.20147355 
### contact: Kévin JEAN, kevin.jean@lecnam.net

library(lubridate)
library(survival)
library(gnm)
library(geepack)

rm(list=ls(all=TRUE)) 


source("functions_PMVC_impact.R")





# time period considered
yy0 = 2004 # 2004 means 2005 is the 1st year considered
yy1 = 2019 # 2019 means 2018 is the last year considered

#############
## Load data sets
############
dn1 = readRDS("formatted_data/dn1.RDS"); length(dn1) # vector of identifiers for provinces considered in analyses

### vaccination coverage - retrieved from https://shiny.dide.imperial.ac.uk/polici/ (Hamlet et al, Vaccine 2019)
vc = read.csv("formatted_data/vc.pop.level_2d.csv", h=T, stringsAsFactors = F)
rownames(vc) = dn1
dim(vc)
vc = vc[,paste0("X", yy0:yy1)];dim(vc)
rownames(vc) = dn1


### environmental data - retrieved from Hamlet et al, PLOS NTDs 2018: https://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0006284
env = read.csv("formatted_data/YF_seasonality_env.csv", h=T, stringsAsFactors = F);dim(env)
colnames(env) = c("adm1", "report", "logpop", "survqual", "temp.suit",
                  "rainfall", "interaction", "EVI")
env$adm0_adm1 = paste0(substr(env$adm1, 1,3),"_",substr(env$adm1, 4,6))
env = env[env$adm0_adm1 %in% dn1,];dim(env)


### outbreak data
out = read.csv("formatted_data/outbreaks_1980s-2018.csv", stringsAsFactors = F); dim(out)
colnames(out)
out = out[out$year>yy0,];dim(out)
table(out$year)


### PMVC data
pmvc = read.csv("formatted_data/camp_cleaned.csv", stringsAsFactors = F); dim(pmvc)
colnames(pmvc)
pmvc = pmvc[pmvc$year>yy0,];dim(pmvc)



#############
## formate data set for analyses
############
first.out.only = T
missing.out.month = 1
out.day = 2
PMVC.month = 12
PMVC.day = 30
camp = pmvc

dat = load_out_and_pmvc_data(first.out.only, missing.out.month,out.day,PMVC.month, PMVC.day,
                             out, camp = pmvc, env = env,
                             dn1 = dn1)
head(dat)

cas = dat[dat$nb_outbreak>0,]  # select only provinces with outbreaks
cas_exp = dat[dat$nb_outbreak>0 & dat$PMVC01 ==1,]; dim(cas_exp)  #select only provinces with outbreaks and PMVC, ie SCCS sample



#############################################
#############################################
#############################################
#############################################


########
# descriptive analysis 

table(dat$nb_outbreak)
table(dat$nb_PMVC )
table(dat$nb_outbreak>0, dat$PMVC01)

sum(cas$nb_outbreak)
sum(cas_exp$nb_outbreak) # 43 for all outbreaks, 33 for first outbreak only


table(cas_exp$nb_outbreak) # 33 outbreaks considered
str(cas_exp)
dmy(cas_exp$outbreak1_date)
table(dmy(cas_exp$outbreak1_date) < dmy(cas_exp$PMVC_date1 ))


#############
## create pseudo-observation data sets
############

# SCCS
pseudo.cas = pseudo.for.sccs(cas_exp) # build up the pseudo-observations data
table(pseudo.cas$even)
sum(pseudo.cas$even)

table(pseudo.cas$expo)
table(pseudo.cas$expo, pseudo.cas$even)
100*prop.table(table(pseudo.cas$expo, pseudo.cas$even),2)
mod = gnm(even ~ expo  + offset(loginterval), eliminate = indiv, family = "poisson",
          data = pseudo.cas)
SummaryGNM(mod)

irr_vec = c(coef = summary(mod)$coefficients[,1], se =summary(mod)$coefficients[,2])



# if (first.out.only == T){
#   saveRDS(irr_vec, file = "IRR_value_1st_out_only.RDS")
# } else {saveRDS(irr_vec, file ="IRR_value.RDS") }



# SCCS with pre-expo
pseudo.cas.preexpo = pseudo.for.sccs.preexp(cas_exp, Nyears = 3)
table(pseudo.cas.preexpo$indiv)
table(pseudo.cas.preexpo$even)
sum(pseudo.cas.preexpo$even)

table(pseudo.cas.preexpo$expo)
table(pseudo.cas.preexpo$expo,pseudo.cas.preexpo$even)

mod_preexpo = gnm(even ~ expo  + offset(loginterval), eliminate = indiv, family = "poisson",
          data = pseudo.cas.preexpo)
SummaryGNM(mod_preexpo)





# SCCS with vac cov
pseudo.cas.vc = pseudo.for.sccs.vc(cas_exp); dim(pseudo.cas.vc)
table(pseudo.cas.vc$even)
hist(pseudo.cas.vc$vc)
pseudo.cas.vc$vc10 = pseudo.cas.vc$vc * 10 # for a 10% increase in coverage
mod <- clogit(even ~ vc10  + strata(indiv) + offset(loginterval), data = pseudo.cas.vc)
Summaryclogit(mod)


cut_vc = seq(0,1, by = .2)
pseudo.cas.vc$vc_cat = findInterval(pseudo.cas.vc$vc, cut_vc)
pseudo.cas.vc$vc_cat = as.factor(pseudo.cas.vc$vc_cat )
table(pseudo.cas.vc$vc_cat,pseudo.cas.vc$even)
sum(pseudo.cas.vc$even)


pseudo.cas.vc$vc_cat = relevel(pseudo.cas.vc$vc_cat, ref = 3)
mod_vc_cat <- clogit(even ~ vc_cat  + strata(indiv) + offset(loginterval), data = pseudo.cas.vc)
Summaryclogit(mod_vc_cat)




#############
## re-sample to assess the effect of spatial autocorrelation
## re-sample only 1 province per country
############
dat = load_out_and_pmvc_data(first.out.only= T, missing.out.month= 7,PMVC.month=12, PMVC.day=30,
                             out, camp = pmvc, env = env,
                             dn1 = dn1)
head(dat);dim(dat)
cas = dat[dat$nb_outbreak>0,];dim(cas)
cas_exp = dat[dat$nb_outbreak>0 & dat$PMVC01 == 1,];dim(cas_exp)


alpha =.05
zq <- qnorm(1-alpha/2)

resampled_sccs = function(dat){
  # dat is a data set generated by the load_out_and_pmvc_data function
  cas = dat[dat$nb_outbreak>0,];dim(cas)
  cou = unique(substr(cas$adm1,0,3)); length(cou)
  re_cas = NULL
  for (i in 1:length(cou)){
    pro = cas$adm1[grep(cou[i], cas$adm1)]
    line = cas[cas$adm1 == sample(pro, 1),]
    re_cas = rbind(re_cas, line)
  }
  re_pseudo.cas = pseudo.for.sccs(re_cas)
  re_mod = gnm(even ~ expo  + offset(loginterval), eliminate = indiv, family = "poisson",
               data = re_pseudo.cas)
 # print(SummaryGNM(re_mod))
  return(re_mod)
}




N = 100
res_sample = NULL
for (i in 1:N){
  sa = resampled_sccs(dat)
  coe = coef(summary(sa))[,"Estimate"]
  ste = coef(summary(sa))[,"Std. Error"]
  irr = exp(coe)
  low_b = exp(coe - zq*ste)
  up_b = exp(coe + zq*ste)
  line = c(coe, ste, irr, low_b, up_b)
  res_sample = rbind(res_sample,line)
}
res_sample = data.frame(res_sample)
colnames(res_sample)= c("coef", "se", "IRR", "IRR_lower", "IRR_upper")
dim(res_sample)
res_sample =res_sample[!is.infinite(res_sample$IRR_upper),];dim(res_sample) # retrieve resampling with random zero

theta_b = mean(res_sample$se); theta_b
pooled_IRR = exp(mean(res_sample[,1])) 
mean_low_b = exp(mean(res_sample[,1])- zq*theta_b )
mean_upper_b = exp(mean(res_sample[,1])+ zq*theta_b )
print( paste0(pooled_IRR, "[", mean_low_b, " - ", mean_upper_b, "]"))

back to top