https://github.com/kjean/YF_outbreak_PMVC/
Tip revision: 14703d7c5c7f63df6de04b81d5a48751604a906a authored by kjean on 07 January 2021, 14:42:35 UTC
Update README.md
Update README.md
Tip revision: 14703d7
functions_PMVC_impact.R
######################
### Assessing the impact of preventive mass vaccination campaigns on yellow fever outbreaks in Africa : a population-level self-controlled case-series study
### Jean et al., 2020 : https://doi.org/10.1101/2020.07.09.20147355
### contact: Kévin JEAN, kevin.jean@lecnam.net
load_out_and_pmvc_data = function(first.out.only = F, missing.out.month = 7,out.day = 5 ,PMVC.month = 12, PMVC.day=30,
out = out, camp = pmvc, env = env,
dn1 = dn1){
# first.out.only : if T, then censore after the first outbreak
# missing.out.month: value for outrbeak month when missing
# PMVC.month = assumed month of PMVC implementation
# out is outbreak data
# camp = vaccination campaigns data
# env = environmental dataset
# dn1 = vector of provices considered
out$month[is.na(out$month)] = missing.out.month
out = out[order(out$adm0_adm1),]
out$year_month = paste0(out$year, out$month)
out = out[order(out$year_month),]
out = out[order(out$adm0_adm1),]
# retrive duplicated outbreaks
uniq_out = paste(out$adm0_adm1, out$year_month, sep="_")
out = out[!duplicated(uniq_out),]
# if required: retrieve duplicates = keep only the 1st outbreak
if(first.out.only){
cbind(out$adm0_adm1, out$year, out$year_month, duplicated(out$adm0_adm1))
out = out[!duplicated(out$adm0_adm1),]; dim(out)
}
# retrive duplicated campaigns
camp$year_place = paste0(camp$adm0_adm1, "_", camp$year)
camp = camp[!duplicated(camp$year_place),]; dim(camp)
camp = camp[order(camp$year),]
camp = camp[order(camp$adm0_adm1),]
# table(table(camp$adm0_adm1)) # max 3 PMVC per province
# camp = camp[!duplicated(camp$adm0_adm1),]; dim(camp)
#### create the merged dataset
nr = length(dn1)
data = data.frame(adm1 = dn1, nb_outbreak = rep(0, nr),
outbreak1_date = rep(NA, nr),
outbreak2_date = rep(NA, nr),
outbreak3_date = rep(NA, nr),stringsAsFactors = F)
data = data[order(data$adm1),]
for (i in 1:nrow(data)){
if(data$adm1[i] %in% out$adm0_adm1){
data$nb_outbreak[i] = table(out$adm0_adm1)[data$adm1[i]]
adm = data$adm1[i]
data$outbreak1_date[i] = paste(out.day,out$month[out$adm0_adm1 ==adm][1],
out$year[out$adm0_adm1 ==adm][1], sep ="/") # we put the outbreak at the date of 5th to avoid issue based on bisextile years...
if(data$nb_outbreak[i] >1){
data$outbreak2_date[i] = paste(out.day,out$month[out$adm0_adm1 ==adm][2],
out$year[out$adm0_adm1 ==adm][2], sep ="/")
}
if(data$nb_outbreak[i] >2){ #max 3 outbreaks per province
data$outbreak3_date[i] = paste(out.day,out$month[out$adm0_adm1 ==adm][3],
out$year[out$adm0_adm1 ==adm][3], sep ="/")
}
}
}
## add date of PMVC
data$PMVC01 = NA
data$PMVC_date3 = data$PMVC_date2 = data$PMVC_date1 = NA
data$nb_PMVC= 0
for (i in 1:nrow(data)){
date_month1 = date_month2 =date_month3 =NA
if(data$adm1[i] %in% camp$adm0_adm1){
data$nb_PMVC[i] = table(camp$adm0_adm1)[data$adm1[i]]
data$PMVC01[i] = 1
date_month1 = paste(PMVC.day,PMVC.month , camp$year[camp$adm0_adm1 == data$adm1[i]][1], sep= "/")
data$PMVC_date1[i] = date_month1
if(data$nb_PMVC[i] >1){
date_month2 = paste(PMVC.day,PMVC.month , camp$year[camp$adm0_adm1 == data$adm1[i]][2], sep= "/")
data$PMVC_date2[i] = date_month2
}
if(data$nb_PMVC[i] >2){
date_month3 = paste(PMVC.day,PMVC.month , camp$year[camp$adm0_adm1 == data$adm1[i]][3], sep= "/")
data$PMVC_date3[i] = date_month3
}
}
}
data$PMVC01[is.na(data$PMVC01)] =0
# merge with environmental dataset
data = merge(data, env, by.x = "adm1", by.y= "adm0_adm1"); dim(data)
return(data)
}
#########
## formate for SCCS analysis, pseudo obs
#########
pseudo.for.sccs = function(dat){
# keep only cases
cas = dat[dat$nb_outbreak>0,]
# add start and stop dates
t0 = "31/12/2004"
t1 = "01/01/2019"
cas$pre_start = 0
cas$end = interval(dmy(t0),dmy(t1))/days(1)
cas$PMVC01[is.na(cas$PMVC01)] =0
cas$PMVC_date1[cas$PMVC01 == 0 ] = "01/01/2019" # for no PMVC, put PMVC date to date end
table(cas$PMVC01)
cas$out1_day = interval(dmy(t0),dmy(cas$outbreak1_date))/days(1)
cas$out2_day = interval(dmy(t0),dmy(cas$outbreak2_date))/days(1) # if first.out.only = T, NA for all
cas$out3_day = interval(dmy(t0),dmy(cas$outbreak3_date))/days(1) # if first.out.only = T, NA for all
##
nevents <- nrow(cas)
ncuts = 3 # start, stop, start_expo
ind = rep(cas$adm1, times = ncuts)
#create an ordered list of individual events and
#cut points for start, end, exposure and age groups
expo = interval(dmy(t0),dmy(cas$PMVC_date1))/days(1)
cutp <- c(as.matrix(cas$pre_start), as.matrix(cas$end), expo)
o <- order(ind, cutp)
ind = as.factor(ind[o])
cutp = cutp[o]
#calculate interval lengths, set to 0 if before start or after end
interval <- c(0, cutp[2:length(ind)]-cutp[1:length(ind)-1])
interval <- ifelse(cutp<=cas$pre_start[ind], 0, interval)
interval <- ifelse(cutp>cas$end[ind], 0, interval)
#event = 1 if event occurred in interval, otherwise 0
event1 <- ifelse(cas$out1_day[ind]>cutp-interval, 1, 0)
event1 <- ifelse(cas$out1_day[ind]<=cutp, event1, 0)
event2 <- ifelse(!is.na(cas$out2_day[ind]) & cas$out2_day[ind]>cutp-interval,1, 0)
event2 <- ifelse(!is.na(cas$out2_day[ind]) & cas$out2_day[ind]<=cutp, event2, 0)
event3 <- ifelse(!is.na(cas$out3_day[ind]) & cas$out3_day[ind]>cutp-interval,1,0)
event3 <- ifelse(!is.na(cas$out3_day[ind]) & cas$out3_day[ind]<=cutp, event3, 0)
tot_even = event1 + event2 + event3
#exposure groups
exgr <- rep(0, nevents*ncuts)
exgr <- ifelse(cutp > expo[ind], 1, exgr)
exgr <- as.factor(exgr)
#put all data in a data frame, take out data with 0 interval lengths
pseudo <- data.frame(indiv = ind[interval!=0], even = tot_even[interval!=0], interval = interval[interval!=0], expo = exgr[interval!=0], loginterval = log(interval[interval!=0]))
return(pseudo)
}
## consider exposure as continuous and changing each year
pseudo.for.sccs.vc = function(cas){
#same function but add vac cov for each year
cas$pre_start = 0
t0 = "31/12/2004"
t1 = "01/01/2019"
cas$end = interval(dmy(t0),dmy(t1))/days(1)
cas$PMVC_date1[cas$PMVC01 == 0 ] = "01/01/2019" # for no PMVC, put PMVC date to date end
cas$out1_day = interval(dmy(t0),dmy(cas$outbreak1_date))/days(1)
cas$out2_day = interval(dmy(t0),dmy(cas$outbreak2_date))/days(1) # if first.out.only = T, NA for all
cas$out3_day = interval(dmy(t0),dmy(cas$outbreak3_date))/days(1) # if first.out.only = T, NA for all
t_year = paste0("01/01/", 2005:2018)# add the day of each 1st january
year_change = interval(dmy(t0),dmy(t_year))/days(1)
ncuts = 4 + ncol(vc)# start, stop, start_expo + nb_years
pseudo = NULL
for (adm in cas$adm1){
r = cas[cas$adm1==adm,] # r stands for row
pre_start = 0
end = interval(dmy(t0),dmy(t1))/days(1)
expo = interval(dmy(t0),dmy(r$PMVC_date1))/days(1)
cutp = c(pre_start, end, expo, year_change)
names(cutp) = c("pre_start", "end", "expo", paste0("X", 2005:2018))
o = order(c(pre_start, end, expo, year_change))
cutp = cutp[o]
year_cutp = year(dmy(t0)+days(cutp)) # keep track of the year of each event to match with vc
interval = c(0,365,cutp[3:(length(cutp))] - cutp[2:(length(cutp)-1)]) #interval is the lenght since last event
names(interval)[2] = "X2005"
#event = 1 if event occurred in interval, otherwise 0
event1 = event2 = event3 =rep(0, length(cutp))
names(event1)=names(event2)=names(event3)=names(cutp)
index1 = max(which(r$out1_day>cutp))
event1[index1] = 1
index2 = ifelse( !is.na(r$out2_day), max(which(r$out2_day>cutp)), NA)
event2[index2] = 1
index3 = ifelse( !is.na(r$out3_day), max(which(r$out3_day>cutp)), NA)
event3[index3] = 1
tot_even = event1 + event2 + event3
# cbind(interval, event1, event2, event3)
exp = ifelse(cutp>expo,1,0)
ind = rep(adm, times = ncuts)
datr = data.frame(indiv = rep(adm, length(cutp)), even=tot_even, expo =exp, interval=interval,
years = year_cutp)
datr$vc = t(vc[rownames(vc) == adm, match( paste0("X", datr$years),colnames(vc) ) ] )
datr = datr[datr$interval>1,]
pseudo = rbind(pseudo, datr)
}
pseudo$loginterval = log(pseudo$interval)
return(pseudo)
}
###### include a pre-exposure period of N years
pseudo.for.sccs.preexp = function(cas, Nyears){
# keep only cases
#cas = dat[dat$nb_outbreak>0,]
# add start and stop dates
t0 = "31/12/2004"
t1 = "01/01/2019"
cas$pre_start = 0
cas$end = interval(dmy(t0),dmy(t1))/days(1)
cas$PMVC01[is.na(cas$PMVC01)] =0
cas$PMVC_date1[cas$PMVC01 == 0 ] = "01/01/2019" # for no PMVC, put PMVC date to date end
table(cas$PMVC01)
# create the date of pre_expo start
cas$PMVC_datepreexp = interval(dmy(t0),dmy(cas$PMVC_date1) - years(Nyears))/days(1)
cas$PMVC_datepreexp[cas$PMVC01 == 0 ] = interval(dmy(t0),dmy(t1))/days(1) # for no PMVC, put PMVC pre-expo date to date end
cas$out1_day = interval(dmy(t0),dmy(cas$outbreak1_date))/days(1)
cas$out2_day = interval(dmy(t0),dmy(cas$outbreak2_date))/days(1) # if first.out.only = T, NA for all
cas$out3_day = interval(dmy(t0),dmy(cas$outbreak3_date))/days(1) # if first.out.only = T, NA for all
##
nevents <- nrow(cas)
ncuts = 4 # start, stop, start_pre_expo, start_expo
ind = rep(cas$adm1, times = ncuts)
#create an ordered list of individual events and
#cut points for start, end, exposure and age groups
expo = interval(dmy(t0),dmy(cas$PMVC_date1))/days(1)
cutp <- c(as.matrix(cas$pre_start), as.matrix(cas$end), as.matrix(cas$PMVC_datepreexp), expo)
o <- order(ind, cutp)
ind = as.factor(ind[o])
cutp = cutp[o]
#calculate interval lengths, set to 0 if before start or after end
interval <- c(0, cutp[2:length(ind)]-cutp[1:length(ind)-1])
interval <- ifelse(cutp<=cas$pre_start[ind], 0, interval)
interval <- ifelse(cutp>cas$end[ind], 0, interval)
#event = 1 if event occurred in interval, otherwise 0
event1 <- ifelse(cas$out1_day[ind]>cutp-interval, 1, 0)
event1 <- ifelse(cas$out1_day[ind]<=cutp, event1, 0)
event2 <- ifelse(!is.na(cas$out2_day[ind]) & cas$out2_day[ind]>cutp-interval,1, 0)
event2 <- ifelse(!is.na(cas$out2_day[ind]) & cas$out2_day[ind]<=cutp, event2, 0)
event3 <- ifelse(!is.na(cas$out3_day[ind]) & cas$out3_day[ind]>cutp-interval,1,0)
event3 <- ifelse(!is.na(cas$out3_day[ind]) & cas$out3_day[ind]<=cutp, event3, 0)
tot_even = event1 + event2 + event3
#exposure groups
exgr <- rep(0, nevents*ncuts)
exgr <- ifelse(cutp > cas$PMVC_datepreexp[ind], 1, exgr)
exgr <- ifelse(cutp > expo[ind], 2, exgr)
exgr <- as.factor(exgr)
#put all data in a data frame, take out data with 0 interval lengths
pseudo <- data.frame(indiv = ind[interval!=0], even = tot_even[interval!=0], interval = interval[interval!=0], expo = exgr[interval!=0], loginterval = log(interval[interval!=0]))
return(pseudo)
}
#########
## formate for cohort analysis, pseudo obs
#########
#########
## formate for cohort analysis, pseudo obs
#########
pseudo.for.cohort = function(dat){
t0 = "31/12/2004"
t1 = "01/01/2019"
dat$pre_start = 0
dat$end = interval(dmy(t0),dmy(t1))/days(1)
dat$PMVC_date1[dat$PMVC01 == 0 ] = "01/01/2019" # for no PMVC, put PMVC date to date end
dat$outbreak1_date[is.na(dat$outbreak1_date) ] = "02/01/2019" # for no outbreak, put outreak date to date end
dat$out_day = interval(dmy(t0),dmy(dat$outbreak1_date))/days(1)
ncuts = 4 # start, stop, start_expo, event
pseudo = NULL
for (adm in dat$adm1){
r = dat[dat$adm1==adm,] # r stands for row
pre_start = 0
end = interval(dmy(t0),dmy(t1))/days(1)
ev = r$out_day
expo = interval(dmy(t0),dmy(r$PMVC_date1))/days(1)
cutp = c(pre_start, ev, end, expo)
o = order(c(pre_start, ev, end, expo))
cutp = cutp[o]
interval = c(0,cutp[2:length(cutp)] - cutp[1:(length(cutp)-1)])
event = ifelse(ev==cutp,1,0)
exp = ifelse(cutp>expo,1,0)
to_censor = ifelse(cutp>ev,1,0) # indicative variable flagging lines to be censored, ie after outbreaks
ind = rep(adm, times = ncuts)
datr = data.frame(indiv = ind, event=event, expo =exp, interval=interval)
# need to sensor and retrieve interval = 0
datr = datr[to_censor==0 & interval>1,]
pseudo = rbind(pseudo, datr)
}
pseudo$loginterval = log(pseudo$interval)
pseudo = merge(pseudo, env, by.x="indiv", by.y = "adm0_adm1")
return(pseudo)
}
#### same for all outbreaks
format_cohort_calc_pf = function(dat){
t0 = "31/12/2004"
t1 = "01/01/2019"
dat$pre_start = 0
dat$end = interval(dmy(t0),dmy(t1))/days(1)
dat$PMVC_date1[dat$PMVC01 == 0 ] = "01/01/2019" # for no PMVC, put PMVC date to date end
dat$outbreak1_date[is.na(dat$outbreak1_date) ] = "02/01/2019" # for no outbreak, put outreak date to date end
dat$outbreak2_date[is.na(dat$outbreak2_date) ] = "02/01/2019"
dat$outbreak3_date[is.na(dat$outbreak3_date) ] = "02/01/2019"
dat$out1_day = interval(dmy(t0),dmy(dat$outbreak1_date))/days(1)
dat$out2_day = interval(dmy(t0),dmy(dat$outbreak2_date))/days(1) # if first.out.only = T, NA for all
dat$out3_day = interval(dmy(t0),dmy(dat$outbreak3_date))/days(1) # if first.out.only = T, NA for all
di = interval(dmy(t0),dmy(dat$PMVC_date1))/days(1)
Ti = interval(dmy(t0),dmy(t1))/days(1)
Ni = rep(0, nrow(dat))
Ni = ifelse(dat$out1_day< di, Ni+1, Ni)
Ni = ifelse(dat$out2_day< di, Ni+1, Ni)
Ni = ifelse(dat$out3_day< di, Ni+1, Ni)
Mi = rep(0, nrow(dat))
Mi = ifelse(dat$out1_day>= di & dat$out1_day<dat$end, Mi+1, Mi)
Mi = ifelse(dat$out2_day>= di & dat$out2_day<dat$end, Mi+1, Mi)
Mi = ifelse(dat$out3_day>= di & dat$out3_day<dat$end, Mi+1, Mi)
vec = data.frame(adm1 = dat$adm1, di, Ti, Ni, Mi)
return(vec)
}
#### same with vaccination coverage as continuous
pseudo.for.cohort.vc = function(dat){
t0 = "31/12/2004"
t1 = "01/01/2019"
dat$pre_start = 0
dat$end = interval(dmy(t0),dmy(t1))/days(1)
dat$PMVC_date1[dat$PMVC01 == 0 ] = "01/01/2019" # for no PMVC, put PMVC date to date end
dat$outbreak1_date[is.na(dat$outbreak1_date) ] = "02/01/2019" # for no outbreak, put outreak date to date end
dat$out_day = interval(dmy(t0),dmy(dat$outbreak1_date))/days(1)
t_year = paste0("01/01/", 2005:2018)# add the day of each 1st january
year_change = interval(dmy(t0),dmy(t_year))/days(1)
ncuts = 4 + length(year_change)# start, stop, start_expo + nb_years
pseudo = NULL
for (adm in dat$adm1){
r = dat[dat$adm1==adm,] # r stands for row
pre_start = 0
end = interval(dmy(t0),dmy(t1))/days(1)
ev = r$out_day
expo = interval(dmy(t0),dmy(r$PMVC_date1))/days(1)
cutp = c(pre_start, ev, end, expo, year_change)
names(cutp) = c("pre_start", "ev", "end", "expo", paste0("X", 2005:2018))
o = order(c(pre_start, ev, end, expo, year_change))
cutp = cutp[o]
year_cutp = year(dmy(t0)+days(cutp)) # keep track of the year of each event to match with vc
interval = c(0,cutp[2:length(cutp)] - cutp[1:(length(cutp)-1)])
event = ifelse(ev==cutp,1,0)
exp = ifelse(cutp>expo,1,0)
to_censor = ifelse(cutp>ev,1,0) # indicative variable flagging lines to be censored, ie after outbreaks
ind = rep(adm, times = ncuts)
datr = data.frame(indiv = ind, event=event, expo =exp, interval=interval, years = year_cutp)
datr$vc = t(vc[rownames(vc) == adm, match( paste0("X", datr$years),colnames(vc) ) ] )
datr = datr[to_censor==0 & interval>1,]
pseudo = rbind(pseudo, datr)
}
pseudo$loginterval = log(pseudo$interval)
pseudo = merge(pseudo, env, by.x="indiv", by.y = "adm0_adm1")
return(pseudo)
}
##################################
##################################
##################################
SummaryGEEGLM <- function(fit, alpha=.05, dig=2, p.dig=4){
# output a summary data frame of GEE results
#fit is a fitted geese() object. dig is number of digits to report.
zq <- qnorm(1-alpha/2)
estimate <- fit$coefficients
lower <- fit$coefficients - zq*coef(summary(fit))[,"Std.err"]
upper <- fit$coefficients + zq*coef(summary(fit))[,"Std.err"]
robust.se <- round(coef(summary(fit))[,"Std.err"], dig)
p <- coef(summary(fit))[,"Pr(>|W|)"]
p <- round(p, digits=p.dig)
Sig <- ifelse(p<0.05, '*', ifelse(p<0.01,'**', ifelse(p<0.001, '***', '')))
RR <- round(exp(estimate), dig) #incidence rate ratio
RR.lower <- round(exp(lower), dig)
RR.upper <- round(exp(upper), dig)
estimate <- round(estimate, dig)
return(data.frame(cbind(estimate,robust.se,RR,RR.lower,RR.upper,p, Sig)))
}
SummaryGEEGLM_v2 <- function(fit, alpha=.05, dig=2, p.dig=4){
# output a summary data frame of GEE results
#fit is a fitted geese() object. dig is number of digits to report.
zq <- qnorm(1-alpha/2)
estimate <- fit$coefficients[-1]
lower <- fit$coefficients[-1] - zq*coef(summary(fit))[,"Std.err"][-1]
upper <- fit$coefficients[-1] + zq*coef(summary(fit))[,"Std.err"][-1]
p <- coef(summary(fit))[,"Pr(>|W|)"][-1]
p <- round(p, digits=p.dig)
#Sig <- ifelse(p<0.05, '*', ifelse(p<0.01,'**', ifelse(p<0.001, '***', '')))
RR <- round(exp(estimate), dig) #incidence rate ratio
RR.lower <- round(exp(lower), dig)
RR.upper <- round(exp(upper), dig)
return(data.frame(cbind(RR, paste0(RR.lower, "-", RR.upper) , p)))
}
SummaryGNM <- function(fit, alpha=.05, dig=2, p.dig=4){
# output a summary data frame of GNM results
#f it is a fitted gnm() object. dig is number of digits to report.
zq <- qnorm(1-alpha/2)
estimate <- summary(fit)$coefficients[,1]
se <- summary(fit)$coefficients[,2]
lower <- estimate -zq*se
upper <- estimate +zq*se
RR <- round(exp(estimate), dig) #incidence rate ratio
RR.lower <- round(exp(lower), dig)
RR.upper <- round(exp(upper), dig)
return(data.frame(cbind(RR, paste0(RR.lower, "-", RR.upper) )))
}
Summaryclogit <- function(fit, alpha=.05, dig=2, p.dig=4){
# output a summary data frame of GNM results
#f it is a fitted gnm() object. dig is number of digits to report.
zq <- qnorm(1-alpha/2)
estimate <- summary(fit)$coefficients[,1]
se <- summary(fit)$coefficients[,3]
lower <- estimate -zq*se
upper <- estimate +zq*se
RR <- round(exp(estimate), dig) #incidence rate ratio
RR.lower <- round(exp(lower), dig)
RR.upper <- round(exp(upper), dig)
return(data.frame(cbind(RR, paste0(RR.lower, "-", RR.upper) )))
}
##################################
##################################
##################################