https://github.com/cran/simPopulation
Tip revision: dfd80c8f3ed641c2a85359d83174d5bfd886147d authored by Andreas Alfons on 19 October 2010, 00:00:00 UTC
version 0.2.1
version 0.2.1
Tip revision: dfd80c8
simContinuous.Rd
\name{simContinuous}
\Rdversion{1.1}
\alias{simContinuous}
\title{
Simulate continuous variables of population data
}
\description{
Simulate continuous variables of population data using multinomial log-linear
models combined with random draws from the resulting categories or (two-step)
regression models combined with random error terms. The household structure
of the population data and any other categorical predictors need to be
simulated beforehand.
}
\usage{
simContinuous(dataS, dataP, w = "rb050", strata = "db040",
basic = c("age", "rb090", "hsize"),
additional = "netIncome",
method = c("multinom", "lm"), zeros = TRUE,
breaks = NULL, lower = NULL, upper = NULL,
equidist = TRUE, gpd = TRUE, threshold = NULL,
est = "moments", censor = NULL, log = TRUE,
const = NULL, alpha = 0.01, residuals = TRUE,
keep = TRUE, maxit = 500, MaxNWts = 1500,
tol = .Machine$double.eps^0.5, seed)
}
\arguments{
\item{dataS}{a \code{data.frame} containing household survey data.}
\item{dataP}{a \code{data.frame} containing the simulated population data.
Household structure and any other categorical predictors need to be
simulated beforehand.}
\item{w}{a character string specifying the column of \code{dataS} that
contains the (personal) sample weights.}
\item{strata}{a character string specifying the columns of \code{dataS} and
\code{dataP}, respectively, that define strata. The regression models are
computed for each stratum separately. Note that this is currently a
required argument and only one stratification variable is supported.}
\item{basic}{a character vector specifying the columns of \code{dataS} and
\code{dataP}, respectively, that define the household structure and any
other categorical predictors, such as age, gender and household size.}
\item{additional}{a character string specifying the additional continuous
variable of \code{dataS} that should be simulated for the population data.
Currently, only one additional variable can be simulated at a time.}
\item{method}{a character string specifying the method to be used for
simulating the continuous variable. Accepted values are \code{"multinom"},
for using multinomial log-linear models combined with random draws from the
resulting categories, and \code{"lm"}, for using (two-step) regression
models combined with random error terms.}
\item{zeros}{a logical indicating whether the variable specified by
\code{additional} is semi-continuous, i.e., contains a considerable number
of zeros. If \code{TRUE} and \code{method} is \code{"multinom"}, a
separate factor level for zeros in the response is used. If \code{TRUE}
and \code{method} is \code{"lm"}, a two-step model is applied. The first
step thereby uses a log-linear or multinomial log-linear model (see
\dQuote{Details}).}
\item{breaks}{an optional numeric vector; if multinomial models are computed,
this can be used to supply two or more breakpoints for categorizing the
variable specified by \code{additional}. If \code{NULL}, breakpoints are
computed using weighted quantiles.}
\item{lower, upper}{optional numeric values; if multinomial models are
computed and \code{breaks} is \code{NULL}, these can be used to specify
lower and upper bounds other than minimum and maximum, respectively. Note
that if \code{method} is \code{"multinom"} and \code{gpd} is \code{TRUE}
(see below), \code{upper} defaults to \code{Inf}.}
\item{equidist}{logical; if \code{method} is \code{"multinom"} and
\code{breaks} is \code{NULL}, this indicates whether the (positive) default
breakpoints should be equidistant or whether there should be refinements in
the lower and upper tail (see \code{\link{getBreaks}}).}
\item{gpd}{logical; if \code{method} is \code{"multinom"}, this indicates
whether the upper tail of the variable specified by \code{additional}
should be simulated by random draws from a (truncated) generalized Pareto
distribution rather than a uniform distribution.}
\item{threshold}{a numeric value; if \code{method} is \code{"multinom"},
values for categories above \code{threshold} are drawn from a (truncated)
generalized Pareto distribution.}
\item{est}{a character string; if \code{method} is \code{"multinom"}, the
estimator to be used to fit the generalized Pareto distribution (see
\code{\link[POT]{fitgpd}}).}
\item{censor}{an optional named list of \code{data.frame}s; if multinomial
models are computed, this can be used to account for structural zeros. The
names of the list components specify the categories that should be
censored. For each of these categories, a \code{data.frame} containing
levels of the predictor variables can be supplied. The probability of the
specified categories is set to 0 for the respective predictor levels.
Currently, this is only implemented for more than two categories in the
response.}
\item{log}{logical; if \code{method} is \code{"lm"}, this indicates whether
the linear model should be fitted to the logarithms of the variable
specified by \code{additional}. The predicted values are then
back-transformed with the exponential function. See \dQuote{Details} for
more information.}
\item{const}{numeric; if \code{method} is \code{"lm"} and \code{log} is
\code{TRUE}, this gives a constant to be added before log transformation.}
\item{alpha}{numeric; if \code{method} is \code{"lm"}, this gives trimming
parameters for the sample data. Trimming is thereby done with respect to
the variable specified by \code{additional}. If a numeric vector of length
two is supplied, the first element gives the trimming proportion for the
lower part and the second element the trimming proportion for the upper
part. If a single numeric is supplied, it is used for both. With
\code{NULL}, trimming is suppressed.}
\item{residuals}{logical; if \code{method} is \code{"lm"}, this indicates
whether the random error terms should be obtained by draws from the
residuals. If \code{FALSE}, they are drawn from a normal distribution
(median and MAD of the residuals are used as parameters).}
\item{keep}{logical; if multinomial models are computed, this indicates
whether the simulated categories should be stored as a variable in the
resulting population data. If \code{TRUE}, the corresponding column name
is given by \code{additional} with postfix \code{"Cat"}.}
\item{maxit, MaxNWts}{control parameters to be passed to
\code{\link[nnet]{multinom}} and \code{\link[nnet]{nnet}}. See the help
file for \code{\link[nnet]{nnet}}.}
\item{tol}{if \code{method} is \code{"lm"}, a small positive numeric value or
\code{NULL}. When fitting a log-linear model within a stratum, factor
levels may not exist in the sample but are likely to exist in the
population. However, the coefficient for such factor levels will be 0.
Therefore, coefficients smaller than \code{tol} in absolute value are
replaced by coefficients from an auxiliary model that is fit to the whole
sample. If \code{NULL}, no auxiliary log-linear model is computed and no
coefficients are replaced.}
\item{seed}{optional; an integer value to be used as the seed of the random
number generator, or an integer vector containing the state of the random
number generator to be restored.}
}
\details{
If \code{method} is \code{"lm"}, the behavior for two-step models is
described in the following.
If \code{zeros} is \code{TRUE} and either \code{log} is not \code{TRUE} or the
variable specified by \code{additional} does not contain negative values, a
log-linear model is used to predict whether an observation is zero or not.
Then a linear model is used to predict the non-zero values.
If \code{zeros} is \code{TRUE}, \code{log} is \code{TRUE} and \code{const} is
specified, again a log-linear model is used to predict whether an observation
is zero or not. In the linear model to predict the non-zero values,
\code{const} is added to the variable specified by \code{additional} before
the logarithms are taken.
If \code{zeros} is \code{TRUE}, \code{log} is \code{TRUE}, \code{const} is
\code{NULL} and there are negative values, a multinomial log-linear model is
used to predict negative, zero and positive observations. Categories for the
negative values are thereby defined by \code{breaks}. In the second step, a
linear model is used to predict the positive values and negative values are
drawn from uniform distributions in the respective classes.
If \code{zeros} is \code{FALSE}, \code{log} is \code{TRUE} and \code{const}
is \code{NULL}, a two-step model is used if there are non-positive values in
the variable specified by \code{additional}. Whether a log-linear or a
multinomial log-linear model is used depends on the number of categories to
be used for the non-positive values, as defined by \code{breaks}. Again,
positive values are then predicted with a linear model and non-positive
values are drawn from uniform distributions.
}
\value{
A \code{data.frame} containing the simulated population data including the
continuous variable specified by \code{additional}.
}
\author{
Original code by Stefan Kraft, redesign and generalizations by Andreas Alfons.
}
\note{
The basic household structure and any other categorical predictors need to
be simulated beforehand with the functions \code{\link{simStructure}} and
\code{\link{simCategorical}}, respectively.
}
\seealso{
\code{\link{simStructure}}, \code{\link{simCategorical}},
\code{\link{simComponents}}, \code{\link{simEUSILC}}
}
\examples{
\dontrun{
## these take some time and are not run automatically
## copy & paste to the R command line
set.seed(1234) # for reproducibility
data(eusilcS) # load sample data
eusilcP <- simStructure(eusilcS)
eusilcP <- simCategorical(eusilcS, eusilcP)
basic <- c("age", "rb090", "hsize", "pl030", "pb220a")
# multinomial model with random draws
eusilcM <- simContinuous(eusilcS, eusilcP,
basic = basic, upper = 200000, equidist = FALSE)
summary(eusilcM)
# two-step regression
eusilcT <- simContinuous(eusilcS, eusilcP,
basic = basic, method = "lm")
summary(eusilcT)
}
}
\keyword{datagen}