%% Raw File
%% paper.tex
%% Copyright (C) 2020 Boud Roukema - SARS-CoV-2 infection rates paper
%% Copyright (C) 2018-2020 Mohammad Akhlaghi <mohammad@akhlaghi.org>
%% See the end of the file for license conditions.
%\documentclass[10pt, twocolumn]{article}

%\documentclass[smallextended]{svjour3}
\documentclass[twocolumn]{svjour3}

\newcommand\projectname{subpoisson}

%% (OPTIONAL) CONVENIENCE VARIABLE: Only relevant when you use Maneage's
%% '\includetikz' macro to build plots/figures within LaTeX using TikZ or
%% PGFPlots. If so, when the Figure files (PDFs) are already built, you can
%% avoid TikZ or PGFPlots completely by commenting/removing the definition
%% of '\makepdf' below. This is useful when you don't want to slow-down a
%% LaTeX-only build of the project (for example this happens when you run
%% './project make dist'). See the definition of '\includetikz' in
%% `tex/src/preamble-pgfplots.tex' for more.
\newcommand{\makepdf}{}

%% (OPTIONAL) CONVENIENCE VARIABLE: Only relevant when
%% 'tex/src/preamble-necessary.tex' is included (in particular the small
%% patch relating to '\highlightchanges'). In there, Maneage defines two
%% macros: `\tonote' and `\new'. When '\highlightchanges' is defined (value
%% is irrelevant), the text in those two macros becomes colored (in the
%% former, the text becomes dark red, in the latter it becomes dark
%% green). When not defined, text in the former isn't printed in the output
%% at all, and text in the latter becomes the same color as the rest of the
%% text. This is useful in cases that you need to distribute drafts and you
%% want to highlight the new parts and add notes in the middle of the text
%% only for discussion, and build a clean PDF without any such highlights
%% without modifying the text.
\newcommand{\highlightchanges}{}

%% VALUES FROM ANALYSIS (NUMBERS AND STRINGS): these are automatically
%% generated by the analysis phase of the project. The files loaded by
%% 'project.tex' only contain macro definitions (with '\newcommand') and
%% nothing else. So they won't interfere with any LaTeX style and can be
%% safely used in any pre-defined style.
\input{tex/build/macros/project.tex}

%% CUSTOM PREAMBLES FOR DEMO: You can remove them if you are using a
%% specific journal style, or don't need features like BibLaTeX (advanced
%% bibliography management) or PGFPlots (for drawing plots within LaTeX
%% directly from tables of data). If you don't need them, you can also
%% delete their files from your branch to keep the 'tex/src' directory on
%% your branch clean.
\input{tex/src/preamble-style.tex}
\input{tex/src/preamble-eje.tex}
%\input{tex/src/preamble-header.tex}
%\input{tex/src/preamble-biblatex.tex}
%\input{tex/src/preamble-pgfplots.tex}
\input{tex/src/preamble-necessary.tex}
\input{tex/src/preamble-journals-astro.tex}
\input{tex/src/preamble-shortcuts-astro.tex}

\newcommand\NThreshStart{n_1}
\newcommand\NThreshStop{n_2}

%% Start creating the paper.
\begin{document}


%% PROJECT TITLE: The project title should also be printed as metadata in
%% all output files. To avoid inconsistency caused by manually typing it,
%% the title is defined with other core project metadata in
%% 'reproduce/analysis/config/metadata.conf'. That value is then written in
%% the '\projecttitle' LaTeX macro by
%% 'reproduce/analysis/make/initialize.mk' and is directly used here. So
%% please set your project's title in 'metadata.conf' (ideally with other
%% basic project information) and re-run the project to have your new
%% title. If you later use a different LaTeX style, please use the same
%% '\projecttitle' in it (after importing 'tex/build/macros/project.tex'
%% like above), don't type it by hand.
\title{\uppercase{\projecttitle}}
%Poisson-like assessment of the SARS-CoV-2 daily infection counts
\titlerunning{\uppercase{\projecttitle}}
%\titlerunning{Poisson-like analysis of SARS-CoV-2 counts}

%% AUTHOR INFORMATION: For a more fine-grained control of the headers
%% including author name, or paper info, see
%% `tex/src/preamble-header.tex'. Note that if you plan to use a journal's
%% LaTeX style file, you will probably set the authors in a different way,
%% feel free to change them here, this part is not related to the analysis.
\newcommand\boudroukemaorcid{\href{https://orcid.org/0000-0002-3772-0250}{{\mathsf{\tiny ORCID}}}}
\author{Boudewijn F. Roukema$^{1,2,\boudroukemaorcid}$}

\authorrunning{Boudewijn F. Roukema}
%\author[2]{Coauthor one}
%\author[1,3]{Coauthor two}
  %\affil[1]

\institute{${}^1${Institute of Astronomy, Faculty of Physics,
           Astronomy and Informatics, Nicolaus Copernicus
           University, Grudziadzka 5, 87-100 Toru\'n, Poland}
      \\
%      \affil[2]
            ${}^2${Univ Lyon, Ens de Lyon, Univ Lyon1, CNRS, Centre de
           Recherche Astrophysique de Lyon UMR5574, F--69007, Lyon,
           France}\\
      Email: boud\,@\,astro.umk.pl
}

%\affil[3]{And generally as many affiliations as you like.
%\par \emph{Received YYYY MM DD; accepted YYYY MM DD; published YYYY MM DD}}
%\date{}
\date{Received: \ldots / Accepted: \ldots}

\journalname{To be submitted}

\def\abstractcontent{
  %% Project abstract and keywords. 150-250 words for EJE
  \begin{abstract}
    %context
    The noise in daily infection counts of an epidemic should be super-Poissonian due to intrinsic epidemiological and administrative clustering.
    %aim
    Here, we use this clustering to classify the official national SARS-CoV-2 daily infection counts and check for infection counts that are unusual by being anti-clustered.
    %method
    We adopt a one-parameter model of $\phi_i'$ infections per cluster, dividing any daily count $n_i$ into $n_i/\phi_i'$ \enquote*{clusters}, for \enquote*{country} $i$.
    We assume that $n_i/\phi_i'$ on a given day $j$ is drawn from a Poisson distribution whose mean is robustly estimated from the $\poissNMedianModelvalue$ neighbouring days, and calculate the inferred Poisson probability $P_{ij}'$ of the observation.
    The $P_{ij}'$ values should be uniformly distributed.
    We find the value $\phi_i$ that minimises the Kolmogorov--Smirnov distance from a uniform distribution.
    We investigate the $(\phi_i, N_i)$ distribution, for total infection count $N_i$.
    %results
    We consider consecutive count sequences above a threshold of {\poissNThresholdvalue} daily infections.
    We find that most of the daily infection count sequences are inconsistent with a Poissonian model.
    All are consistent with the $\phi_i$ model.
    Clustering increases with total infection count for the full sequences: $\phi_i \sim\sqrt{N_i}$.
    The 28-, 14- and 7-day least noisy sequences for several countries are best modelled as sub-Poissonian, suggesting a distinct epidemiological family.
    The 28-day sequences of {\poissNLowestPhiBelowHalfCountriesvalue} have strongly sub-Poissonian preferred models, with $\phi_i^{28} <0.5$; and {\poissNLowestPhiBelowThreeCountriesvalue} have $\phi_i^{28} <3.0$.
    %conclusion
    Independent verification may be warranted for those countries with unusually low clustering.

    \keywords{COVID-19 \and Epidemic curve \and Poisson point process}
    %\textbf{TODO: PACS code and MSC subclass codes}
  \end{abstract}
}

\maketitle

%\protect\resizebox{\textwidth}{!}{



%\end{multicols}
%% To add the first page's headers.
%\thispagestyle{firststyle}


%% Start of main body.
\spacedsection{Introduction}

The daily counts of new, laboratory-confirmed infections with severe acute respiratory syndrome coronavirus~2 (SARS-CoV-2) constitute one of the key statistics followed by citizens and health agencies around the world in the ongoing 2019--2020 coronavirus disease 2019 (COVID-19) pandemic \citep{Huang2020covid19,Li2020underdocSARSCoV2}.
Can these counts be classified in a way that makes as few epidemiological assumptions as possible, as motivation for deeper analysis to either validate or invalidate the counts?
While full epidemiological modelling and prediction is a vital component of COVID-19 research \citep[e.g.][]{Chowdhury2020,KimLLP2020overdisp,MolinaCuevas2020covid19,JiangZhaoShao2020covid19,Afshordi2020covid19}, these cannot be accurately used to study the pandemic as a whole -- a global phenomenon by definition -- if the data at the global level is itself inaccurate.
Knowledge of the global state of the current pandemic is weakened if any of the national-level SARS-CoV-2 infection data have been artificially interfered with by the health agencies providing that data or by other actors involved in the chain of data lineage \citep{Pasquier2017}.
Since personal medical data are private information, only a limited number of individuals at health agencies are expected to be able to check the validity of these counts based on original records.
Nevertheless, artificial interventions in the counts could potentially reveal themselves in statistical properties of the counts.
Unusual statistical properties in a wide variety of quantitative data sometimes appear, for example, as anomalies related to Benford's law \citep{Newcomb1881,NigriniMiller09secorder}, as in the 2009 first round of the Iranian presidential election \citep{RouJASIrPresElec,RouBenfordBook14,Mebane2010}.


Here, we check the compatibility of noise in the official national SARS-CoV-2 daily infection counts, $N_i(t)$, for country\footnote{No position is taken in this paper regarding jurisdiction over territories; the term \enquote*{country} is intended here as a neutral term without supporting or opposing the formal notion of state. Apart from minor changes for technical reasons, the \enquote*{countries} are defined by the data sources.} $i$ on date $t$, with expectations based on the Poisson distribution \citep{Poisson1837}.
It is unlikely that any real count data will quite match the theoretical Poisson distribution, both due to the complexity of the logical tree of time-dependent intrinsic epidemiological infection as well as administrative effects in the SARS-CoV-2 testing procedures, and the sub-national and national level procedures for collecting and validating data to produce a national health agency's official report.
In particular, clusters of infections on a scale of $\phi_i'$ infections per cluster, either intrinsic or in the testing and administrative pipeline, would tend to cause relative noise to increase from a fraction of $1/\sqrt{N_i}$ for pure Poisson noise up to $\sqrt{\phi_i'/N_i}$, greater by a factor of $\sqrt{\phi_i'}$.
This overdispersion has been found, for example, for COVID-19 death rate counts in the United States \citep{KimLLP2020overdisp}.

In contrast, it is difficult to see how anti-Poissonian smoothing effects could occur, unless they were imposed administratively.
For example, an administrative office might impose (or have imposed on it by political authorities) a constraint to validate a fixed or slowly and smoothly varying number of SARS-CoV-2 test result files per day, independently of the number received or queued; this would constitute an example of an artificial intervention in the counts that would weaken the epidemiological usefulness of the data.

A one-parameter model to allow for the clustering is proposed in this paper, and used to classify the counts.
We allow the parameter to take on an effective anti-clustering value, in order to allow the data to freely determine its optimal value.
For more in-depth models of clustering, called \enquote{burstiness} in stochastic models of discrete event counts, power-law models have also been proposed \citep{Barbasi05,KwangIlBarbasi06}.

\sloppy
The method is presented in \ejeref{s-method}.
The section \ejeref{s-input-data} describes the choice of data set and the definition, for any given country, of a consecutive time sequence that has high enough daily infection counts for Poisson distribution analysis to be reasonable.
The method of analysis is given in \ejeref{s-analysis}.
Results are presented in \ejeref{s-results}.
Qualitative discussion of the results is given in \ejeref{s-disc} and conclusions are summarised in \ejeref{s-conclu}.
This work is intended to be fully reproducible by independent researchers using the {\sc Maneage} framework: see commit {\projectversion} of the {\sc git} repository {\projectgitrepository} and the archive {\projectzenodohref}.


\spacedsection{Method} \label{s-method}

\spacedsubsection{SARS-CoV-2 infection data} \label{s-input-data}

Two obvious choices of a dataset for national daily SARS-CoV-2 counts would be those provided by the World Health Organization (WHO)\footnote{\url{https://covid19.who.int/WHO-COVID-19-global-data.csv}; \href{\WHOarchiveurl}{(archive)}} or those curated by the Wikipedia {\em WikiProject COVID-19 Case Count Task Force}\footnote{\url{https://en.wikipedia.org/w/index.php?title=Wikipedia:WikiProject_COVID-19/Case_Count_Task_Force&oldid=967874960}} in {\em medical cases chart} templates (hereafter, {\WPCCTF}).
While WHO has published a wide variety of documents related to the COVID-19 pandemic, it does not appear to have published details of how national reports are communicated to it and collated.
Given that most government agencies and systems of government procedures tend to lack transparency, despite significant moves towards forms of open government \citep[e.g.][]{YuRobinson2012} in many countries, data lineage tracing from national governments to WHO is likely to be difficult in many cases.
In contrast, the curation of official government SARS-CoV-2 daily counts by the Wikipedia {\em WikiProject COVID-19 Case Count Task Force} follows a well-established technology of tracking data lineage.

\begin{figure}
  \includegraphics[width=\columnwidth]{WHO_vs_WP}
  \caption{Number $N_{\mathrm{jump}}$ of sudden jumps or drops in counts on adjacent days in WHO and  Wikipedia {\em WikiProject COVID-19 Case Count Task Force} {\em medical cases chart} national daily SARS-CoV-2 infection counts for countries present in both data sets.
    A line illustrates equal quality of the two data sets.
    The {\WPCCTF} version of the data is clearly less affected by sudden jumps than the WHO data.
    Plain text table: \mbox{\href{\projectzenodofilesbase/WHO_vs_WP_jumps.dat}{\projectzenodoid/WHO\_vs\_WP\_jumps.dat}}.
    \label{f-WHO-vs-WP}}
\end{figure}

Unfortunately, it is clear that in the WHO data, there are several cases where two days' worth of detected infections appear to be listed by WHO as a sequence of two days $j$ and $j+1$ on which all the infections are allocated to the second of the two days, with zero infections on the first of the pair.
There are also some sequences in the WHO data where the day listed with zero infections is separated by several days from a nearby day with double the usual amount of infections.
This is very likely an effect of difficulties in correctly managing world time zones, or time zone and sleep schedule effects, in any of several levels of the chains of communication between health agencies and WHO.
In other words, there are several cases where a temporary sharp jump or drop in the counts appears in the data but is most likely a timing artefact.
Whatever the reason for the effect, this effect will tend to confuse the epidemiological question of interest here: the aim is to globally characterise the noise and to highlight countries where unusual smoothing may have taken place.

We quantify this jump/drop problem as follows.
We consider a pair of days $j$, $j+1$ for a given country to be a jump if the absolute difference in counts, $|n_i(j+1)-n_i(j)|$, is greater than the mean, $(n_i(j+1)+n_i(j))/2$.
In the case of a pair in which one value is zero, the ratio of the absolute difference to the mean is two, and the condition is satisfied.
We evaluate the number of jumps $N_{\mathrm{jump}}$ for both the WHO data and the {\WPCCTF} {\em medical cases chart} data, starting, for any given country, from the first day with at least $\poissNThresholdvalue$ infections.
Figure~\ref{f-WHO-vs-WP} shows $N_{\mathrm{jump}}$ for the {\poissNCommonCountriesvalue} countries in common to the two data sets; there are {\poissNWHOCountriesvalue} countries in the WHO data set and {\poissNWPCCTFCountriesvalue} in the {\WPCCTF} data.
It is clear that most countries have fewer jumps or drops in the Wikipedia data set than in the WHO data set.

Thus, at least for the purposes of understanding intrinsic and administrative clustering, the {\WPCCTF} {\em medical cases chart} data appear to be the better curated version of the national daily SARS-CoV-2 infection counts as reported by official agencies.
The detailed download and extraction script of national daily SARS-CoV-2 infection data from these templates and the resulting data file \mbox{\href{\projectzenodofilesbase/WP_C19CCTF_SARSCoV2.dat}{\projectzenodoid/WP\_C19CCTF\_SARSCoV2.dat}} are available in the reproducibility package associated with this paper (\S\hyperlink{s-code-avail}{{\sffamily Code availability}}).
Dates without data are omitted; this should have an insignificant effect on the analysis if these are due to low infection counts.

The full set of data includes many days, especially for countries or territories (as defined by the data source) of low populations, with low values, including zero and one.
The standard deviation of a Poisson distribution of expectation value $N$ is $\sqrt{N}$ \citep{Poisson1837}, giving a fractional error of $1/\sqrt{N}$.
Even taking into account clustering or anticlustering of data, inclusion of these periods of close to zero infection counts would contribute noise that would overwhelm the signal from the periods of higher infection rates for the same or other countries.
In the time sequences of SARS-CoV-2 infection counts, chaos in the administrative reactions to the initial stages of the pandemic will tend to create extra noise, so it is reasonable to choose a moderately high threshold at which the start and end of a consecutive sequence of days should be defined for analysis.
Here, we set the threshold for a sequence to start at a minimum of $\poissNThresholdvalue$ infections in a single day.
The sequence is continued for at least $\poissMinDaysvalue$ days (if available in the data), and stops when the counts drop below the same threshold for $\poissStopDaysvalue$ consecutive days.
The cutoff criterion of $\poissStopDaysvalue$ consecutive days avoids letting the analysable sequence be too sensitive to individual days on which the count fluctuates below the threshold.
If the resulting sequence includes fewer than $\poissMinDaysvalue$ days, the sequence is rejected as having insufficient signal to be analysed.

\spacedsubsection{Analysis} \label{s-analysis}

\spacedsubsubsection{Poissonian and $\phi_i'$ models: full sequences} \label{s-method-full-seq}

We first consider the full count sequence $\{n_i(j), 1 \le j \le T_i\}$ for each country $i$, with $T_i$ valid days of analysis as defined in \ejeref{s-input-data}.
Our one-parameter model assumes that the counts are predominantly grouped in clusters, each with $\phi_i'$ infections per cluster.
Thus, the daily count $n_i(j)$ is assumed to consist of $n_i(j)/\phi_i'$ infection events.
We assume that $n_i(j)/\phi_i'$ on a given day is drawn from a Poisson distribution of mean $\mumodel_i(j)/\phi_i'$.
We set $\mumodel_i(j)$ to the median of the $\poissNMedianModelvalue$ neighbouring days, excluding day $j$ and centred on it.
\FPeval{\initfinalseqlength}{clip(\poissNMedianModelvalue/2)}
\FPeval{\initfinalseqlengthplusone}{clip(\poissNMedianModelvalue/2+1)}
For the initial sequence of {\initfinalseqlength} days, $\mumodel_i(j)$ is set to $\mumodel_i(\initfinalseqlengthplusone)$, and $\mumodel_i(j)$ for the final {\initfinalseqlength} days is set to $\mumodel_i(T_i-\initfinalseqlength)$.
By modelling $\mumodel_i$ as a median of a small number of neighbouring days, our model is almost identical to the data itself and statistically robust, with only mild dependence on the choices of parameters.
This definition of a model is more likely to bias the resulting analysis towards underestimating the noise on scales of several days rather than overestimating it; this method will not detect oscillations on the time scale of a few days to a fortnight that are related to the SARS-CoV-2 incubation time \citep{HuangZhang2020covid19incubation}.
For any given value $\phi_i'$, we calculate the cumulative probability $P_{ij}'$ that $n_i(j)/\phi_i'$ is drawn from a Poisson distribution of mean $\mumodel_i(j)/\phi_i'$.
For country $i$, the values $P_{ij}'$ should be drawn from a uniform distribution if the model is a fair approximation.
In particular, for $\phi_i'$ set to unity, $P_{ij}'$ should be drawn from a uniform distribution if the intrinsic data distribution is Poissonian.
Individual values of $P_{ij}'$ (close to zero or one) could, in principle, be used to identify individual days that are unusual, but here we do not consider these further.

We allow a wide logarithmic range in values of $\phi_i'$, allowing the unrealistic domain of $\phi_i' < 1$, and find the value $\phi_i$ that minimises the Kolmogorov--Smirnov (KS) distance \citep{Kolmogorov1933,Smirnov1948} from a uniform distribution, i.e.\ that maximises the KS probability that the data are consistent with a uniform distribution, when varying $\phi_i'$.
The one-sample KS test is a non-parametric test that compares a data sample with a chosen theoretical probability distribution, yielding the probability that the sample is drawn randomly from the theoretical distribution.
We label the corresponding KS probability as $\PKSi$.
We write $\Ppoissi := \PKSi(\phi_i' = 1)$ to check if any country's daily infection rate sequence is consistent with Poissonian, although this is likely to be rare, as stated above: super-Poissonian behaviour seems reasonable.
Of particular interest are countries with low values of $\phi_i$.
Allowing for a possibly fractal or other power-law nature of the clustering of SARS-CoV-2 infection counts, we consider the possibility that the optimal values $\phi_i$ may be dependent on the total infection count $N_i$.
We investigate the $(\phi_i, N_i)$ distribution and see whether a scaling type relation exists, allowing for a corrected statistic $\psi_i$ to be defined in order to highlight the noise structure of the counts independent of the overall scale $N_i$ of the counts.

Standard errors in $\phi_i$ for a given country $i$ are estimated once $\phi_i$ has been obtained by assuming that $\mumodel_i(j)$ and $\phi_i$ are correct and generating $\poissNStderrClustervalue$ Poisson random simulations of the full sequence for that country.
Since the scales of interest vary logarithmically, the standard deviation of the best estimates of $\log_{10} \phi_i$ for these numerical simulations is used as an estimate of $\sigma(\log_{10}\phi_i)$, the logarithmic standard error in $\phi_i$.

\spacedsubsubsection{Subsequences} \label{s-method-sub-seq}

\sloppy
Since artificial interference in daily SARS-CoV-2 infection counts for a given country might be restricted to shorter periods than the full data sequence, we also analyse 28-, 14- and 7-day subsequences.
These analyses are performed using the same methods as above (\ejeref{s-method-full-seq}), except that the 28-, 14- or 7-day subsequence that minimises $\phi_i$ is found.
The search over all possible subsequences would require calculation of a \v{S}id\'ak--Bonferroni correction factor \citep{Abdi07} to judge how anomalous they are.
The KS probabilities that we calculate need to be interpreted keeping this in mind.
Since the subsequences for a given country overlap, they are clearly not independent from one another.
Instead, the {\em a posteriori} interpretation of the results of the subsequence searches found here should at best be considered indicative of periods that should be considered interesting for further verification.

\begin{figure}
  \includegraphics[width=\columnwidth]{probability_vs_N_full}
  \caption{Probability of the noise in the country-level daily SARS-CoV-2 counts being consistent with a Poisson point process, $\Ppoissi$, shown as red circles; and probability $\PKSi(\phi_i)$ for the $\phi_i$ clustering model proposed here (\protect\ejeref{s-method-full-seq}), shown as green {\tt X} symbols, versus $N_i$, the total number of officially recorded infections for that country.
    The horizontal axis is logarithmic.
    As discussed in the text (\protect\ejeref{s-res-full-seq}), the Poisson point process is unrealistic for most of these data, while the $\phi_i$ clustering model is consistent with the data for all countries.
    Plain text table: \mbox{\href{\projectzenodofilesbase/phi_N_full.dat}{\projectzenodoid/phi\_N\_full.dat}}.
    \label{f-probability-N}}
\end{figure}

\begin{figure}
  \includegraphics[width=\columnwidth]{phi_N_full}
  \caption{Noisiness in daily SARS-CoV-2 counts, showing the clustering parameter $\phi_i$ (\protect\ejeref{s-method-full-seq}) that best models the noise, versus the total number of counts for that country $N_i$.
    The error bars show standard errors derived from numerical (bootstrap) simulations based on the model.
    The axes are logarithmic, as indicated.
    Values of the clustering parameter $\phi_i$ below unity indicate sub-Poissonian behaviour -- the counts in these cases are less noisy than expected for Poisson statistics.
    A robust (Theil--Sen \protect\citep{Theil50,Sen68}) linear fit of $\log_{10}\phi_i$ against $\log_{10}N_i$ is shown as a thick green line (\protect\ejeref{s-res-full-seq}).
    Plain text table: \mbox{\href{\projectzenodofilesbase/phi_N_full.dat}{\projectzenodoid/phi\_N\_full.dat}}.
    \label{f-phi-N}}
\end{figure}

\begin{figure}
  \includegraphics[width=\columnwidth]{psi_N_full}
  \caption{Normalised noisiness $\psi_i$ (Eq.~\protect\eqref{e-psi-law}) for daily SARS-CoV-2 counts versus total counts $N_i$.
    The error bars are as in Fig.~\ref{f-phi-N}, assuming no additional error source contributed by $N_i$.
    The axes are logarithmic.
    A few low $\psi_i$ values appear to be outliers of the $\psi_i$ distribution.
    \label{f-psi-N}}
\end{figure}


\spacedsection{Results} \label{s-results}

\spacedsubsection{Data} \label{s-results-data}

The {\poissNCountriesAllvalue} countries and territories in the {\WPCCTF} counts data have {\poissNNegvalue} negative values out of the total of {\poissNCountsAllvalue} values.
These can reasonably be interpreted as corrections for earlier overcounts, and we reset these values to zero with a negligible reduction in the amount of data.
Consecutive day sequences satisfying the criteria listed in \ejeref{s-input-data} were found for {\poissNCountriesOKvalue} countries.

\spacedsubsection{Clustering of SARS-CoV-2 counts} \label{s-res-analysis}

\begin{table}
  \caption{Clustering parameters for the countries with the $\poissNLowestPhivalue$ lowest $\phi_i$ and $\poissNLowestPsivalue$ lowest $\psi_i$ values (least noise);
    extended version of table: \mbox{\href{\projectzenodofilesbase/phi_N_full.dat}{\projectzenodoid/phi\_N\_full.dat}}.
    \label{t-least-phi-psi}}
  \begin{tabular}{crcccc}
    \hline
    Country & $N_i$ & $\Ppoissi$ & $\PKSi$ & $\phi_i$ & $\psi_i$
    \rule[-0.3ex]{0ex}{2.7ex} % strut
    \\
    \hline
    \input{low_phi_table_full}
    \hline
  \end{tabular}
\end{table}


\spacedsubsubsection{Full infection count sequences} \label{s-res-full-seq}

Figure~\ref{f-probability-N} shows, unsurprisingly, that only a small handful of the countries' daily SARS-CoV-2 counts sequences have noise whose statistical distribution is consistent with the Poisson distribution, in the sense modelled here: $\Ppoissi$ (red circles) is close to zero in most cases.
On the contrary, the introduction of the $\phi_i'$ parameter, optimised to $\phi_i$ for country $i$, provides a sufficient fit in all cases; none of the probabilities ($\PKSi(\phi_i)$, green {\tt X} symbols) in Fig.~\ref{f-probability-N} is low enough to be considered a significant rejection.

The consistency of the $\phi_i$ model with the data justifies continuing to Figure~\ref{f-phi-N}, which clearly shows a scaling relation: countries with greater overall numbers $N_i$ of infections also tend to have greater noise in the daily counts $n_i(j)$.
A Theil--Sen linear fit \citep{Theil50,Sen68} to the relation between $\log_{10}\phi_i$ and $\log_{10}N_i$ has a zeropoint of $\poissPhiNFullZeropointvalue \pm \poissPhiNFullSigZeropointvalue$ and a slope of $\poissPhiNFullSlopevalue \pm \poissPhiNFullSigSlopevalue$, where the standard errors (68\% confidence intervals if the distribution is Gaussian) are conservatively generated for both slope and zeropoint by 100 bootstraps.
By using a robust estimator, the low $\phi_i$ cases, which appear to be outliers, have little influence on the fit.
The fit is shown as a thick green line in Fig.~\ref{f-phi-N}.

This $\phi_i$--$N_i$ relation is consistent with $\phi_i \propto \sqrt{N_i}$.
To adjust the $\phi_i$ clustering value to take into account the dependence on $N_i$, and given that the slope is consistent with this simple relation, we propose the empirical definition of a normalised clustering parameter
\begin{equation}
  \psi_i := \phi_i/\sqrt{N_i}\,, \label{e-psi-law}
\end{equation}
so that $\psi_i$ should, by construction, be approximately constant.
While the estimated slope of the relation could be used rather than this half-integer power relation, the fixed relation in Eq.~\eqref{e-psi-law} offers the benefit of simplicity.

This relation should not be confused with the usual Poisson error.
By the divisibility of the Poisson distribution, the relation $\phi_i \propto \sqrt{N_i}$ found here can be used to show that
\begin{align}
  \sigma[\mumodel_i(j)/\phi_i] &\sim \sqrt{\mumodel_i(j)/\phi_i}
  \nonumber \\
  \Rightarrow
  \sigma[\mumodel_i(j)] &\sim \phi_i \sqrt{\mumodel_i(j)/\phi_i}
  \propto N_i^{1/4} \mumodel_i(j)^{1/2}  \,,
  %\Rightarrow
  %\sigma[\mumodel_i(j)] &\sim \sqrt{\phi_i\,\mumodel_i(j)}
  %\nonumber \\
  %  & \propto \sqrt{ \sqrt{N_i} \,\mumodel_i(j) } \\
  %&
\end{align}
where $\sigma[x]$ is the standard deviation of random variable $x$.
If we accept $\mumodel_i(j)$ as a fair model for $n_i(j)$ and that $n_i(j)$ is proportional to $N_i$, then we obtain
\begin{equation}
  \sigma[n_i(j)] \propto n_i(j)^{3/4}\,.
  \label{e-epidemic-curve-3-4-law}
\end{equation}

\begin{figure}
  \includegraphics[width=\columnwidth]{phi_N_a}
  \caption{Clustering parameter $\phi_i$ for $\poissNSubseqLengthAvalue$-day sequence with lowest $\phi_i$, as in Fig.~\protect\ref{f-phi-N}.
    The vertical axis range is expanded from that in Fig.~\protect\ref{f-phi-N}, to accommodate lower values.
    A robust (Theil--Sen \protect\citep{Theil50,Sen68}) linear fit of $\log_{10}\phi_i^{\poissNSubseqLengthAvalue}$ against $\log_{10}N_i$ is shown as a thick green line (\protect\ejeref{s-res-full-seq}).
    Plain text table: \mbox{\href{\projectzenodofilesbase/phi_N_28days.dat}{\projectzenodoid/phi\_N\_28days.dat}}.
    \label{f-phi-N-28}}
\end{figure}

\setlength{\tabcolsep}{3pt}
\begin{table}
  \caption{Least noisy ${\poissNSubseqLengthAvalue}$-day sequences -- clustering parameters for the countries with the $\poissNLowestPhivalue$ lowest $\phi_i^{\poissNSubseqLengthAvalue}$ values;
    extended table: \mbox{\href{\projectzenodofilesbase/phi_N_28days.dat}{\projectzenodoid/phi\_N\_28days.dat}}.
    \label{t-least-phi-psi-28}}
  \begin{tabular}{crrcccc}
    \hline
    country & $N_i$ & $\scalaverage{n_i^{\poissNSubseqLengthAvalue}}$ & $\Ppoissi$ & $\PKSi$ & $\phi_i^{\poissNSubseqLengthAvalue}$ & starting
    \rule{0ex}{2.7ex} \\ % strut
    &&&&&& date
    \rule[-0.3ex]{0ex}{2ex} % strut
    \\
    \hline
    \input{low_phi_table_a}
    \hline
  \end{tabular}
\end{table}

Figure~\ref{f-psi-N} shows visually that $\psi_i$ appears to be scale-independent, in the sense that the dependence on $N_i$ has been cancelled, by construction.
The countries with the $\poissNLowestPsivalue$ lowest values of $\psi_i$ are those with ISO 3166-1 alpha-2 codes {\poissNLowestPsiCountriesvalue}.
Detailed SARS-CoV-2 daily count noise characteristics for the countries with lowest $\phi_i$ and $\psi_i$ are listed in Table~\ref{t-least-phi-psi}, including the Kolmogorov--Smirnov probability that the data are drawn from a Poisson distribution, $\Ppoissi$; the probability of the optimal $\phi_i$ model, $\PKSi$; and $\phi_i$ and $\psi_i$.

The approximate proportionality of $\phi_i$ to $\sqrt{N_i}$ for the full sequences is strong and helps separate low-noise SARS-CoV-2 count countries from those following the main trend.
However, the results for subsequences shown below in \ejeref{s-res-sub-seq} suggest that this $N_i$ dependence may be an effect of the typically longer durations of the pandemic in countries where the overall count is higher.

\spacedsubsubsection{Subsequences of infection counts} \label{s-res-sub-seq}

\begin{figure}
  \includegraphics[width=\columnwidth]{phi_N_b}
  \caption{Clustering parameter $\phi_i$ for $\poissNSubseqLengthBvalue$-day sequence with lowest $\phi_i$, as in Fig.~\protect\ref{f-phi-N-28}.
    Plain text table: \mbox{\href{\projectzenodofilesbase/phi_N_14days.dat}{\projectzenodoid/phi\_N\_14days.dat}}.
    \label{f-phi-N-14}}
\end{figure}

\begin{table}
  \caption{Least noisy $\poissNSubseqLengthBvalue$-day sequences -- clustering parameters for the countries with the $\poissNLowestPhivalue$ lowest $\phi_i^{\poissNSubseqLengthBvalue}$ values;
    extended version of table: \mbox{\href{\projectzenodofilesbase/phi_N_14days.dat}{\projectzenodoid/phi\_N\_14days.dat}}.
    \label{t-least-phi-psi-14}}
  \begin{tabular}{crrcccc}
    \hline
    country & $N_i$ & $\scalaverage{n_i^{\poissNSubseqLengthBvalue}}$ & $\Ppoissi$ & $\PKSi$ & $\phi_i^{\poissNSubseqLengthBvalue}$ & starting
    \rule{0ex}{2.7ex} \\ % strut
    &&&&&& date
    \rule[-0.3ex]{0ex}{2ex} % strut
    \\
    \hline
    \input{low_phi_table_b}
    \hline
  \end{tabular}
\end{table}

\begin{figure}
  \includegraphics[width=\columnwidth]{phi_N_c}
  \caption{Clustering parameter $\phi_i$ for $\poissNSubseqLengthCvalue$-day sequence with lowest $\phi_i^{\poissNSubseqLengthCvalue}$, as in Fig.~\protect\ref{f-phi-N-28}.
    There is clearly a wider overall scatter and bigger error bars compared to Figs~\protect\ref{f-phi-N-28} and \protect\ref{f-phi-N-14}; a low $\phi_i^{\poissNSubseqLengthCvalue}$ is a weaker indicator than $\phi_i^{\poissNSubseqLengthAvalue}$ and $\phi_i^{\poissNSubseqLengthBvalue}$.
    Plain text table: \mbox{\href{\projectzenodofilesbase/phi_N_07days.dat}{\projectzenodoid/phi\_N\_07days.dat}}.
    \label{f-phi-N-7}}
\end{figure}

\begin{table}
  \caption{Least noisy $\poissNSubseqLengthCvalue$-day sequences -- clustering parameters for the countries with the $\poissNLowestPhivalue$ lowest $\phi_i^{\poissNSubseqLengthCvalue}$ values;
    extended table: \mbox{\href{\projectzenodofilesbase/phi_N_07days.dat}{\projectzenodoid/phi\_N\_07days.dat}}.
    \label{t-least-phi-psi-7}}
  \begin{tabular}{crrcccc}
    \hline
    country & $N_i$ & $\scalaverage{n_i^{\poissNSubseqLengthCvalue}}$ & $\Ppoissi$ & $\PKSi$ & $\phi_i^{\poissNSubseqLengthCvalue}$ & starting
    \rule{0ex}{2.7ex} \\ % strut
    &&&&&& date
    \rule[-0.3ex]{0ex}{2ex} % strut
    \\
    \hline
    \input{low_phi_table_c}
    \hline
  \end{tabular}
\end{table}

Figures~\ref{f-phi-N-28}--\ref{f-phi-N-7} show the equivalent of Fig.~\ref{f-phi-N} for sequences of lengths $\poissNSubseqLengthAvalue$, $\poissNSubseqLengthBvalue$ and $\poissNSubseqLengthCvalue$ days, respectively.
The Theil--Sen robust fits to the logarithmic $(\phi_i^{\poissNSubseqLengthAvalue}, N_i)$; $(\phi_i^{\poissNSubseqLengthBvalue}, N_i)$; and $(\phi_i^{\poissNSubseqLengthCvalue}, N_i)$ relations have zeropoints and slopes of $\poissPhiNAZeropointvalue \pm \poissPhiNASigZeropointvalue$ and $\poissPhiNASlopevalue \pm \poissPhiNASigSlopevalue$; $\poissPhiNBZeropointvalue \pm \poissPhiNBSigZeropointvalue$ and $\poissPhiNBSlopevalue \pm \poissPhiNBSigSlopevalue$; and $\poissPhiNCZeropointvalue \pm \poissPhiNCSigZeropointvalue$ and $\poissPhiNCSlopevalue \pm \poissPhiNCSigSlopevalue$, respectively.
There is clearly no significant dependence of $\phi_i^d$ on $N_i$ for any of these fixed length subsequences, in contrast to the case of the $\phi_i$ dependence on $N_i$ for the full count sequences.
Thus, the empirical motivation for using $\psi$ (Eq.~\eqref{e-psi-law}) to discriminate between the countries' full sequences of SARS-CoV-2 data is not justified for the subsequences.
Tables~\ref{t-least-phi-psi-28}--\ref{t-least-phi-psi-7} show the countries with the least noisy sequences as determined by $\phiiA, \phiiB$ and $\phiiC$, respectively.


\begin{figure*}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowAAvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowBAvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowCAvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowDAvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowEAvalue}
  \hfill
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowFAvalue}
  \caption{Least noisy $\poissNSubseqLengthAvalue$-day official SARS-CoV-2 national daily counts for countries with total counts $N_i > \poissNPlotNMinLowestPhivalue$ (see Fig.~\protect\ref{f-phi-N-28} and Table~\protect\ref{t-least-phi-psi-28}), shown as dots in comparison to the $\mumodel_i(j)$ model (median of the $\poissNMedianModelvalue$ neighbouring days) and 68\% error band for the Poisson point process.
    The ranges in daily counts (vertical axis) are chosen automatically and in most cases do not start at zero.
    About nine (32\%) of the points should be outside of the shaded band unless the counts have an anti-clustering effect that weakens Poisson noise.
    A faint shaded band shows the $\phiiA$ model for the one country here with $\phi_i$ (slightly) greater than one (RU), but is almost indistinguishable from the Poissonian band.
    The dates indicate the start date of each sequence.
    \label{f-daily-counts-lowest-phi-28}}
\end{figure*}

\begin{figure*}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowACvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowBCvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowCCvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowDCvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowECvalue}
  \hfill
  \includegraphics[width=\columnwidth]{\poissCountriesDailyLowFCvalue}
  \caption{Least noisy $\poissNSubseqLengthCvalue$-day daily counts for countries with total counts $N_i > \poissNPlotNMinLowestPhivalue$, as in Fig.~\protect\ref{f-daily-counts-lowest-phi-28}.
    Concentration of points close to the model indicates an anti-clustering effect; about 68\% (five) of the points should scatter up and down throughout the shaded band if the counts are Poissonian.
    In several cases, the data points appear to be mostly stuck to the model, with almost no scatter.
    \label{f-daily-counts-lowest-phi-7}}
\end{figure*}

\begin{figure*}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyMedAAvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyMedBAvalue}
  \includegraphics[width=\columnwidth]{\poissCountriesDailyMedACvalue}
  \hfill
  \includegraphics[width=\columnwidth]{\poissCountriesDailyMedBCvalue}
  \caption{Typical (median) $\poissNSubseqLengthAvalue$-day (above) and $\poissNSubseqLengthCvalue$-day (below) daily counts, as in Figs~\protect\ref{f-daily-counts-lowest-phi-28} and \protect\ref{f-daily-counts-lowest-phi-7}.
    The dark shaded band again shows a Poissonian noise model, which underestimates the noise.
    A faint shaded band shows the $\phi_i$ models for these countries' SARS-CoV-2 daily counts, and should contain about 68\% of the infection count points.
    \label{f-daily-counts-median-phi}}
\end{figure*}

Tables~\ref{t-least-phi-psi-28} and \ref{t-least-phi-psi-14} show that the lists of countries with the strongest anti-clustering are similar.
Thus, Fig.~\ref{f-daily-counts-lowest-phi-28} shows the SARS-CoV-2 counts curves for countries with the lowest $\phiiA$, and Fig.~\ref{f-daily-counts-lowest-phi-7} the curves for those with the lowest $\phiiC$.
Both figures exclude countries with total counts $N_i \le \poissNPlotNMinLowestPhivalue$, in which low total counts tend to give low clustering.
It is clear in these figures that several countries have subsequences that are strongly sub-Poissonian -- with some form of anti-clustering, whether natural or artificial.

Countries in the median of the $\phiiA$ and $\phiiC$ distributions have their curves shown in Fig.~\ref{f-daily-counts-median-phi} for comparison.
It is visually clear in the figure that the counts are dispersed widely beyond the Poissonian band, and that the $\phiiA$ and $\phiiC$ models are reasonable as a model for representing about 68\% of the counts within one standard deviation of the model values.

\spacedsection{Discussion} \label{s-disc}

\sloppy
Figures~\ref{f-phi-N} and \ref{f-psi-N} clearly show that some groups of countries are unusual in terms of the characteristics of their location in the $(N_i,\psi_i)$ plane.

\spacedsubsection{High total infection count}
Brazil (BR) and the United States (US) are separated from the majority of other countries by their high total infection count.
They have correspondingly higher clustering values $\phi_i$, although their normalised clustering values $\psi_i$ are in the range of about $0.4 < \psi_i < 10$ covered by the majority of countries in Fig.~\ref{f-psi-N}.

It does not seem realistic that these two countries' $\phi_i$ values greater than 300 are purely an effect of intrinsic infection events -- \enquote*{superspreader} events in crowded places or nursing homes.
While individual big clusters may occur given the high overall scale of infections, it seems more likely that this is administrative clustering.
Both countries are federations, and have numerous geographic administrative subdivisions with a diversity of political and administrative methods.
A plausible explanation for the dominant effect yielding $\phi_i > 300$ in these two countries is that on any individual day, the arrival and full processing of reports depends on a number of sub-national administrative regions, each reporting a few hundred new infections.

For example, if there are 10 reporting regions, each typically reporting 300 infections, then typically (on about 68\% of days) there will be about 7 to 13 reports per day.
This would give a range varying from about 2100 to 3900 cases per day, rather than 2945 to 3055, which would be the case for unclustered, Poissonian counts (since $\sqrt{3000} \approx 55$).
Lacking a system that obliges sub-national divisions -- and laboratories -- to report their test results in time-continuous fashion and that validates and collates those reports on a time scale much shorter than 24 hours, this type of clustering seems natural in the sociological sense.

\spacedsubsection{Low normalised clustering $\psi_i$}

In Fig.~\ref{f-psi-N}, there appears to be a group of eight countries that are also separated from the main group of countries, but by having low normalised noise $\psi_i$ rather than just having a high total count $N_i$.

\spacedsubsubsection{Low $\psi_i$, low $N_i$, high $\Ppoissi$}
Classifying the countries by $\psi_i$ alone (Table~\ref{t-least-phi-psi}) would add Finland (FI) to this group, but in Fig.~\ref{f-psi-N}, Finland appears better grouped with the main body of countries in the $(\psi_i,N_i)$ plane.
This could be interpreted as Eq.~\eqref{e-psi-law} providing insufficient correction for the $\phi_i$--$N_i$ relation.
Alternatively, looking at Finland's entry in Table~\ref{t-least-phi-psi-28} for 28-day sequences, we see that Finland is among the three with the lowest total (or mean) daily infection counts in the table, and has the highest consistency with a Poisson distribution ($\Ppoissi$).
Having a low total infection count, it seems credible that Finland lacks the intrinsic, testing and administrative clustering of countries with higher infection counts.

\spacedsubsubsection{Low $\psi_i$, high $N_i$}
India (IN) and Russia (RU) have total infection counts nearly as high (logarithmically) as Brazil and the US, but have managed to keep their daily infection rates much less noisy -- by about a factor of 10 to 100 -- than would be expected from the general pattern displayed in the diagram.
Despite having of the order of a million total official SARS-CoV-2 infections each, these two countries have, as of the download date of the data, \WPCnineteenCCTFdownloadeddate, avoided having the clustering effects present in Brazil and the US.

The most divergent case in the high-$N_i$ part of this group (see Fig.~\ref{f-psi-N} and Table~\ref{t-least-phi-psi}) is Russia, which has only a very modest value of $\phi_i = \PhiRUSeqfullThreeSigFig \times 10^{\pm\PhiRUSeqfullSigLogTen}$  for its total infection count of over a million.
This would require that both intrinsic clustering of infection events and administrative procedures work much more smoothly in Russia than in the United States, Brazil and, to a lesser degree, India.
Tables~\ref{t-least-phi-psi-28} and \ref{t-least-phi-psi-14} and Fig.~\ref{f-daily-counts-lowest-phi-28} show that the Russian official SARS-CoV-2 counts indeed show very little noise compared to more typical cases (Fig.~\ref{f-daily-counts-median-phi}).
At the intrinsic epidemiological level, this means that if the Russian counts are to be considered accurate, then very few clusters -- in nursing homes, religious gatherings, bars, restaurants, schools, shops -- can have occurred.
Moreover, laboratory testing and transmission of data through the administrative chain from local levels to the national (federal) health agency must have occurred without the clustering effects present in the United States and Brazil and in countries with more typical clustering values $\phi_i$, characterising their daily infection counts.
International media interest in Russian COVID-19 data has mostly focussed on controversy related to COVID-19 death counts \citep{Newsweek20200514RUCOVID19}, with apparently no attention given so far to the modestly super-Poissonian nature of the daily counts, in contrast to the strongly super-Poissonian counts of other countries with high total infection counts.

India's overall position in the $(\psi_i,N_i)$ plane (Fig.~\ref{f-psi-N} and Table~\ref{t-least-phi-psi}) is less extreme than that of Russia, with an unnormalised clustering parameter $\phi_i = \PhiINSeqfull \times 10^{\pm\PhiINSeqfullSigLogTen}$.
However, Table~\ref{t-least-phi-psi-14} shows that despite its large overall infection count, India achieved a 14-day sequence with a preferred $\phi_i$ value close to unity.
Moreover, it has a very low-ranked $\phiiC$ value, as given in Table~\ref{t-least-phi-psi-7} and illustrated in Fig.~\ref{f-daily-counts-lowest-phi-7}.
Five values appear almost exactly on the model curve rather than scattering above and below.
Moreover, these values all lie just below 10,000.
Epidemiologically, it is not credible to believe that 10,000 officially reported cases per day should be an attractor resulting from the pattern of infections and system of reporting.
Given that the value of 10,000 is a round number in the decimal-based system, a reasonable speculation would be that the daily counts for India were artificially held at just below 10,000 for several days.
The crossing of the 10,000 psychological threshold of daily infections was noted in the media \citep{TheHindu20200612IN10k}, but the lack of noise in the counts during the week preceding the crossing of the threshold appears to have gone unnoticed.
After crossing the 10,000 threshold, the daily infections in India continued increasing, as can be seen in the full counts (\mbox{\href{\projectzenodofilesbase/WP_C19CCTF_SARSCoV2.dat}{\projectzenodoid/WP\_C19CCTF\_SARSCoV2.dat}}).

\spacedsubsubsection{Low $\psi_i$, low $\phi_i$, medium $N_i$}
\fussy
Among the group of eight low $\psi_i$ countries, Table~\ref{t-least-phi-psi} shows that only one country has its full data set (as defined here) best modelled by the ordinary Poisson point process.
Algeria (DZ) appears to have completely avoided clustering effects, with $\phi_i$ close to unity.
Figure~\ref{f-daily-counts-lowest-phi-28} shows the least noisy 28-day sequence for Algeria.
Only one day of SARS-CoV-2 recorded infections appears to have diverged beyond the Poissonian 68\% band, rather than about nine, the expected number for a Poissonian distribution.
Most of the points appear to stick very closely to the model.
It is difficult to imagine a natural process for obtaining this sub-Poissonian noise (as preferred by the $\phi_i$ model), especially in the context where most countries have super-Poissonian daily counts.
In a frequentist interpretation, the least noisy Algerian 28-day count sequence would be considered only mildly, not significantly, unusual, since it is consistent with a Poisson distribution, with only a weak rejection (Tables~\ref{t-least-phi-psi-28}--\ref{t-least-phi-psi-7}).
However, as a member of the general class of countries' SARS-CoV-2 daily infection count curves, use of the $\phi_i$ model would appear to be justified.
It is in this sense that the sequence can be considered sub-Poissonian.
Moreover, a full Bayesian analysis would need to consider independent credibility criteria.

In line with the counts for India that appeared to be smooth just below a round-number boundary of 10,000 infections per day, the least noisy 7-day sequence for Algeria, shown in Fig.~\ref{f-daily-counts-lowest-phi-7}, might appear to have been affected by a similar psychological boundary of 200 infections per day.
Medical specialists interviewed by the media interpreted the 200 daily infections period as representing stability and resulting from partial lockdown measures, without providing an explanation for why Poisson noise was nearly absent \citep{ElMoudjahid20200530DZ200}.
While lockdown measures should reduce intrinsic epidemiological clustering down towards the Poissonian level, it is difficult to see how they could reduce testing and administrative pipeline clustering.
A coincidence that occurred during this least-noisy 7-day period, on 24 May 2020, was that a full COVID-19 lockdown was implemented in Algeria \citep{TSA20200720DZfulllockdown}.

The Belarus (BY) case is present in all four tables (Tables~\ref{t-least-phi-psi}--\ref{t-least-phi-psi-7}).
The least noisy Belarusian counts curve appears in Fig.~\ref{f-daily-counts-lowest-phi-28}.
As with the other panels in the daily counts figures, the vertical axis is set by the data instead of starting at zero, in order to best display the information on the noise in the counts.
With the vertical axis starting at zero, the Belarus daily counts would look nearly flat in this figure.
They appear to be bounded above by the round number of 1000 SARS-CoV-2 infections per day, which, again, appears to be a psychologically preferred barrier.
Media have expressed scepticism of Belarusian COVID-19 related data \citep{NYTKramer20200425BYnoviruses,AFN20200512BYunderestimates}.

\sloppy
One remaining case of a coincidence is that the lowest noise 7-day sequence listed for Poland (PL, Table~\ref{t-least-phi-psi-7}) is for the 7-day period starting 20 June 2020, with $\phi_i^7 = \PhiPLSeqc \times 10^{\pm\PhiPLSeqcSigLogTen}$.
This is a factor of about 100 (or at least 10 at about 95\% confidence) below Poland's clustering value for the full sequence of its SARS-CoV-2 daily infection counts, $\phi_i = \PhiPLSeqfull \times 10^{\pm\PhiPLSeqfullSigLogTen}$, which Fig.~\ref{f-phi-N} shows is typical for a country with an intermediate total infection count.
On 28 June 2020, there was a {\em de facto} (of disputed constitutional validity \citep{Wyrzykowski2020,Letowska2020}) first-round presidential election in Poland.
Figure~\ref{f-daily-counts-lowest-phi-7} shows that the counts for Poland during the final pre-first-round-election week did not scatter widely throughout the Poissonian band.
A decimal-system round number also appears in this figure: the daily infection rate is initially slightly above 300 infections per day and drops to slightly below that.
For an unknown reason that does not previously appear to have been studied, the intrinsic clustering of SARS-CoV-2 infections in Poland together with testing and administrative clustering of the confirmed cases appears to have temporarily disappeared just prior to the election date, yielding what is best modelled as sub-Poissonian counts.



\spacedsection{Conclusion} \label{s-conclu}

\fussy
Given the overdispersed, one-parameter Poissonian $\phi_i$ model proposed, the noise characteristics of the daily SARS-CoV-2 infection data suggest that most of the countries' data form a single family in the $(\phi_i,N_i)$ plane.
The clustering -- whether epidemiological in origin, or caused by testing or administrative pipelines -- tends to be greater for greater numbers of total infections.
Several countries appear, however, to show unusually anti-clustered (low-noise) daily infection counts.

\fussy
Since these daily infection counts data constitute data of high epidemiological interest, the statistical characteristics presented here and the general method could be used as the basis for further investigation into the data of countries showing exceptional characteristics.
The relations between the most anti-clustered counts and the psychologically significant decimal system round numbers (IN: 10,000 daily, DZ: 200 daily, BY: 1000 daily, PL: 300 daily), and in relation to a {\em de facto} national presidential election, raise the question of whether or not these are just coincidences.

\sloppy
It should be straightforward for any reader to extend the analysis in this paper by first checking its reproducibility with the free-licensed source package provided using the {\sc Maneage} framework \citep{Akhlaghi2020maneage}, and then extending, updating or modifying it in other appropriate ways; see \mbox{\S\hyperlink{s-code-avail}{{\sffamily Code availability}}} below.
Reuse of the data should be straightforward using the files archived at \mbox{\projectzenodohref}.


%% End of main body.

%\section*{%Appendices}
\section*{{}}%Notes

\begin{acknowledgements}
  Thank you to Marius Peper and an anonymous colleague for several useful comments and to the Maneage developers for the Maneage framework in general and for several specific comments on this work.
  This project has been supported by the Pozna\'n Supercomputing and Networking Center (PSNC) computational grant 314.
  %% Mention all used software in an appendix.
  %\section{Software acknowledgement}
  \input{tex/build/macros/dependencies.tex}
  \sloppy
\end{acknowledgements}

\begin{authorcontrib}
  The design, execution and writing up of this paper were carried out by the author alone.
   This research was partly done using the reproducible paper template
  \projectname-\projectversion.
\end{authorcontrib}

\begin{funding}
  No funding has been received for this project.
\end{funding}


\begin{dataavailability} \label{s-data-avail}
  As described above in \ejeref{s-input-data}, the source of curated SARS-CoV-2 infection count data used for the main analysis in this paper is the {\WPCCTF} data, downloaded using the script {\tt \WPchartscript} and stored in the file {\tt \WPchartfile} in the reproducibility package available at \mbox{\projectzenodohref}.
  The data file is archived at \mbox{\href{\projectzenodofilesbase/WP_C19CCTF_SARSCoV2.dat}{\projectzenodoid/WP\_C19CCTF\_SARSCoV2.dat}}.
  The WHO data that were compared with the {\WPCCTF} data via a jump analysis (Fig.~\ref{f-WHO-vs-WP}) were downloaded from {\url{\WHOsrcurl}} and were \mbox{\href{\WHOarchiveurl}{archived on \WHOdownloadeddate}}.\sloppy

  % The blank line (paragraph break) before closing this section is needed
  % so that \sloppy is active, allowing the paragraph to be formatted nicely.
\end{dataavailability}

\begin{codeavailability}
  \hypertarget{s-code-avail}{}In addition to the SARS-CoV-2 infection count data for this paper, the full calculations, production of figures, tables and values quoted in the text of the pdf version of the paper are intended to be fully reproducible on any POSIX-compatible system using free-licensed software, which, by definition, the user may modify, redistribute and redistribute in modified form.
  The reproducibility framework is technically a {\sc git} branch of the {\sc  Maneage} package \citep{Akhlaghi2020maneage}\footnote{\url{https://maneage.org}}, earlier used to produce reproducible papers such as \citet{Infantesainz20}.
  The {\sc git} repository commit ID of this version of this paper is \projectname-\projectversion{}.
  The primary {\sc git} repository is {\projectgitrepository}.
  Bug reports and discussion are welcome at {\projectgitrepositoryissues}.
\end{codeavailability}

\begin{conflictofinterest}
  The author of this paper has participated as a volunteer in the Wikipedia {\em WikiProject COVID-19 Case Count Task Force} in helping to collate a small fraction of COVID-19 pandemic related data.
  He is aware of no other conflicts of interest or competing interests.
\end{conflictofinterest}








%% Tell BibTeX to put the bibliography list here.
{\small
  \bibliographystyle{\bibtexbstfile}
  \bibliography{\bibtexbibfiletmp}
}

%% Start appendix.
%\appendix


%% Finish LaTeX
\end{document}

%% This file is part of Maneage (https://maneage.org).
%
%% This file is free software: you can redistribute it and/or modify it
%% under the terms of the GNU General Public License as published by the
%% Free Software Foundation, either version 3 of the License, or (at your
%% option) any later version.
%
%% This file is distributed in the hope that it will be useful, but WITHOUT
%% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
%% for more details.
%
%% You should have received a copy of the GNU General Public License along
%% with this file.  If not, see <http://www.gnu.org/licenses/>.
back to top