Content - 9df42f1a1bbaac98958684ebd1a19ab7c282f61b - b28faf8/spatial_correlation.R

spatial_correlation.R
# data retrieved from the csv file that can be downloaded here:
# https://www.ecdc.europa.eu/en/publications-data/data-virus-variants-covid-19-eueea
# the adress of the csv file is here:
# https://opendata.ecdc.europa.eu/covid19/virusvariant/csv/data.csv
# on 25th August 2021

a <- read.csv("~/Downloads/data-5.csv") # The downloaded file
head(a)
a <- a[
  with(a, order(country, year_week)),
]
View(a[which(a$country == "Italy" & a$variant == "B.1.617.2" & a$source == "TESSy"),])

voc <-  "B.1.617.2"
#voc <- "B.1.1.7"
source <- "TESSy" #; source <- "GISAID"
threshold <- 50


# selected country based on inspection of the curve on TESSy (looking sigmoidally shaped)
list_cc <- c("Austria", "Belgium", "Denmark", "France", "Germany", "Greece", "Ireland", "Italy", "Netherlands", "Norway", "Sweden")
a2 <- c()
a3 <- c()

for(cc in list_cc){
  
  idx1 <- which(a$country==cc & a$variant == voc & a$source == source)
  #idx2 <- which(a$country==cc & a$variant == voc & a$source == source)
  
  if(length(idx1) > 3){
    plot(a$percent_variant[idx1], type = "o", pch = 20, ylim = c(0, 100), main = cc)
    idx1 <- idx1[1:(length(idx1)-3)]
  
    first_over_50 <- idx1[which(a$percent_variant[idx1] > 50)[1]]
    last_below_50 <- idx1[which(a$percent_variant[idx1] < 50)]; last_below_50 <- last_below_50[length(last_below_50)]
    
    date1 <- a$year_week[last_below_50]
    date2 <- a$year_week[first_over_50]
    print(c(cc, date1, date2))
    
    a2 <- rbind(a2, a[c(last_below_50, first_over_50), ])
    a3 <- rbind(a3, a[idx1, ]) # keep all data still
  }
}
a3$week <- as.numeric(sapply(a3$year_week, function(yw) strsplit(yw, split = "-")[[1]][2]))
a3$year <- as.numeric(sapply(a3$year_week, function(yw) strsplit(yw, split = "-")[[1]][1]))
a3 <- a3[a3$year == 2021, ] # restrict to 2021
write.csv(x = a3, file = "~/ownCloud/coronavirus/variant_France/delta_frequencies_EU.csv", row.names = F)


rs <- data.frame(country = list_cc, Psmoothed0 = NA, Psmoothed1 = NA, NVOC0 = NA, Ntrue0 = NA, NVOC1 = NA, Ntrue1 = NA, rH = NA, rV = NA)
for(cc in list_cc){
  idx <- which(a2$country == cc)
  
  if(length(idx) == 2){
    
    i <- which(rs$country == cc)
    
    NW <- (1 - a2$percent_variant[idx]/100) * a2$new_cases[idx]
    NM <- a2$percent_variant[idx]/100 * a2$new_cases[idx]
    
    # fill in the r table:
    rs[i, c("Psmoothed0", "Psmoothed1")] <- a2$new_cases[idx]
    rs[i, c("Ntrue0", "Ntrue1")] <- a2$number_sequenced[idx]
    rs[i, c("NVOC0", "NVOC1")] <- a2$number_detections_variant[idx]
    rs[i, "rH"] <- (1/1) * log(NW[2]/NW[1])
    rs[i, "rV"] <- (1/1) * log(NM[2]/NM[1])
    
    
  } else {
    stop("country without two entries")
  }
}

plot(rs$rH, rs$rV, pch = 20)
abline(0,1)
lm0 <- lm(rs$rV ~ rs$rH)
text(rs$rH, rs$rV + 0.05, rs$country)
abline(h = 0, lty = 2)
abline(v = 0, lty = 2)
abline(lm0$coefficients[1], lm0$coefficients[2], col = "gray", lwd = 3)

summary(lm0)$r.s
summary(lm0)$coefficients
write.csv(x = rs, file = "~/ownCloud/coronavirus/variant_France/spatial_variation_EU.csv", row.names = F)