% Content - 7355008d9ecb3b5568c82d7a7bdde997e568c3a2 - 7faddaa/Ecdf.Rd

% Ecdf.Rd
\name{Ecdf}
\alias{Ecdf}
\alias{Ecdf.default}
\alias{Ecdf.data.frame}
\alias{Ecdf.formula}
\alias{panel.Ecdf}
\alias{prepanel.Ecdf}
\title{Empirical Cumulative Distribution Plot}
\description{
Computes coordinates of cumulative distribution function of \code{x}, and by default
plots it as a step function.  A grouping variable may be specified so that
stratified estimates are computed and (by default) plotted.  If there is
more than one group, the \code{labcurve} function is used (by default) to label
the multiple step functions or to draw a legend defining line types, colors,
or symbols by linking them with group labels.  A \code{weights} vector may
be specified to get weighted estimates.  Specify \code{normwt} to make
\code{weights} sum to the length of \code{x} (after removing NAs).  Otherwise
the total sample size is taken to be the sum of the weights.

\code{Ecdf} is actually a method, and \code{Ecdf.default} is what's
called for a vector argument.  \code{Ecdf.data.frame} is called when the
first argument is a data frame.  This function can automatically set up
a matrix of ECDFs and wait for a mouse click if the matrix requires more
than one page.  Categorical variables, character variables, and
variables having fewer than a set number of unique values are ignored.
If \code{par(mfrow=..)} is not set up before \code{Ecdf.data.frame} is
called, the function will try to figure the best layout depending on the
number of variables in the data frame.  Upon return the original
\code{mfrow} is left intact.

When the first argument to \code{Ecdf} is a formula, a Trellis/Lattice function
\code{Ecdf.formula} is called.  This allows for multi-panel
conditioning, superposition using a \code{groups} variable, and other
Trellis features, along with the ability to easily plot transformed
ECDFs using the \code{fun} argument.  For example, if \code{fun=qnorm},
the inverse normal transformation will be used for the y-axis.  If the
transformed curves are linear this indicates normality.  Like the
\code{xYplot} function, \code{Ecdf} will create a function \code{Key} if
the \code{groups} variable is used.  This function can be invoked by the
user to define the keys for the groups.
}

\usage{
Ecdf(x, \dots)

\method{Ecdf}{default}(x, what=c('F','1-F','f','1-f'),
     weights=rep(1, length(x)), normwt=FALSE,
     xlab, ylab, q, pl=TRUE, add=FALSE, lty=1, 
     col=1, group=rep(1,length(x)), label.curves=TRUE, xlim, 
     subtitles=TRUE, datadensity=c('none','rug','hist','density'),
     side=1, 
     frac=switch(datadensity,none=NA,rug=.03,hist=.1,density=.1),
     dens.opts=NULL, lwd=1, log='', \dots)


\method{Ecdf}{data.frame}(x, group=rep(1,nrows),
     weights=rep(1, nrows), normwt=FALSE,
     label.curves=TRUE, n.unique=10, na.big=FALSE, subtitles=TRUE, 
     vnames=c('labels','names'),\dots)

\method{Ecdf}{formula}(x, data=sys.frame(sys.parent()), groups=NULL,
     prepanel=prepanel.Ecdf, panel=panel.Ecdf, \dots, xlab,
     ylab, fun=function(x)x, what=c('F','1-F','f','1-f'), subset=TRUE)
}
\arguments{
\item{x}{a numeric vector, data frame, or Trellis/Lattice formula}
\item{what}{
The default is \code{"F"} which results in plotting the fraction of values
<= x.  Set to \code{"1-F"} to plot the fraction > x or \code{"f"} to plot the
cumulative frequency of values <= x.  Use \code{"1-f"} to plot the
cumulative frequency of values >= x.
}
\item{weights}{
numeric vector of weights.  Omit or specify a zero-length vector or
NULL to get unweighted estimates.
}
\item{normwt}{see above}
\item{xlab}{
x-axis label.  Default is \code{label(x)} or name of calling argument.  For
\code{Ecdf.formula}, \code{xlab} defaults to the \code{label} attribute
of the x-axis variable.
}
\item{ylab}{
y-axis label.  Default is \code{"Proportion <= x"}, \code{"Proportion > x"}, 
or \code{"Frequency <= x"} depending on value of \code{what}.
}
\item{q}{
a vector for quantiles for which to draw reference lines on the plot.
Default is not to draw any.
}
\item{pl}{set to \code{FALSE} to omit the plot, to just return estimates}
\item{add}{
set to \code{TRUE} to add the cdf to an existing plot.  Does not apply if using
     lattice graphics (i.e., if a formula is given as the first argument).
}
\item{lty}{
integer line type for plot.  If \code{group} is specified, this can be a vector.
}
\item{lwd}{
  line width for plot.  Can be a vector corresponding to \code{group}s.
}
\item{log}{
	see \code{\link{plot}}.  Set \code{log='x'} to use log scale for
  \code{x}-axis.
	}
\item{col}{
color for step function.  Can be a vector.
}
\item{group}{
a numeric, character, or \code{factor} categorical variable used for stratifying
estimates.  If \code{group} is present, as many ECDFs are drawn as there are
non-missing group levels.
}
\item{label.curves}{
applies if more than one \code{group} exists.
Default is \code{TRUE} to use \code{labcurve} to label curves where they are farthest
apart.  Set \code{label.curves} to a \code{list} to specify options to
\code{labcurve}, e.g., \code{label.curves=list(method="arrow", cex=.8)}.
These option names may be abbreviated in the usual way arguments
are abbreviated.  Use for example \code{label.curves=list(keys=1:5)}
to draw symbols periodically (as in \code{pch=1:5} - see \code{points})
on the curves and automatically position a legend
in the most empty part of the plot.  Set \code{label.curves=FALSE} to
suppress drawing curve labels.  The \code{col}, \code{lty}, and \code{type}
parameters are automatically passed to \code{labcurve}, although you
can override them here.  You can set \code{label.curves=list(keys="lines")} to
have different line types defined in an automatically positioned key.
}
\item{xlim}{
x-axis limits.  Default is entire range of \code{x}.
}
\item{subtitles}{
set to \code{FALSE} to suppress putting a subtitle at the bottom left of each
plot.  The subtitle indicates the numbers of
non-missing and missing observations, which are labeled \code{n}, \code{m}.
}
\item{datadensity}{
If \code{datadensity} is not \code{"none"}, either \code{scat1d} or \code{histSpike} is called to
add a rug plot (\code{datadensity="rug"}), spike histogram
(\code{datadensity="hist"}), or smooth density estimate (\code{"density"}) to
the bottom or top of the ECDF.
}
\item{side}{
If \code{datadensity} is not \code{"none"}, the default is to place the additional
information on top of the x-axis (\code{side=1}).  Use \code{side=3} to place at
the top of the graph.
}
\item{frac}{
passed to \code{histSpike}
}
\item{dens.opts}{
a list of optional arguments for \code{histSpike}
}
\item{...}{
other parameters passed to \code{plot} if \code{add=FALSE}.  For data frames, other
parameters to pass to \code{Ecdf.default}.
For \code{Ecdf.formula}, if \code{groups} is not used, you can also add
data density information to each panel's ECDF by specifying the
\code{datadensity} and optional \code{frac}, \code{side},
\code{dens.opts} arguments. 
}
\item{n.unique}{
minimum number of unique values before an ECDF is drawn for a variable
in a data frame.  Default is 10.
}
\item{na.big}{
set to \code{TRUE} to draw the number of NAs in larger letters in the middle of
the plot for \code{Ecdf.data.frame}
}
\item{vnames}{
By default, variable labels are used to label x-axes.  Set \code{vnames="names"}
to instead use variable names.
}
\item{method}{
method for computing the empirical cumulative distribution.  See
\code{wtd.Ecdf}.  The default is to use the standard \code{"i/n"} method as is
used by the non-Trellis versions of \code{Ecdf}.
}
\item{fun}{
a function to transform the cumulative proportions, for the
Trellis-type usage of \code{Ecdf}
}
\item{data, groups, subset, prepanel, panel}{the usual Trellis/Lattice parameters, with \code{groups}
  causing \code{Ecdf.formula} to overlay multiple ECDFs on one panel.}
}
\value{
for \code{Ecdf.default} an invisible list with elements \code{x} and \code{y} giving the
coordinates of the cdf.  If there is more than one \code{group}, a list of
such lists is returned.  An attribute, \code{N}, is in the returned
object.  It contains the elements \code{n} and \code{m}, the number of
non-missing and missing observations, respectively.
}
\author{
Frank Harrell
\cr
Department of Biostatistics, Vanderbilt University
\cr
\email{fh@fharrell.com}
}
\section{Side Effects}{
plots
}
\seealso{
\code{\link{wtd.Ecdf}}, \code{\link{label}}, \code{\link{table}}, \code{\link{cumsum}}, \code{\link{labcurve}}, \code{\link{xYplot}}, \code{\link{histSpike}}
}
\examples{
# Fix the random seed so the simulated data are reproducible
set.seed(1)

# Basic ECDF of a single numeric vector
ch <- rnorm(1000, 200, 40)
Ecdf(ch, xlab="Serum Cholesterol")
scat1d(ch)                       # add rug plot
histSpike(ch, add=TRUE, frac=.15)   # add spike histogram
# Better: add a data density display automatically:
Ecdf(ch, datadensity='density')


# With a label attribute set, xlab can be omitted
label(ch) <- "Serum Cholesterol"
Ecdf(ch)
# Overlay a second ECDF on the existing plot using a different line type
other.ch <- rnorm(500, 220, 20)
Ecdf(other.ch,add=TRUE,lty=2)


# One grouping value per element of ch
sex <- factor(sample(c('female','male'), 1000, TRUE))
Ecdf(ch, q=c(.25,.5,.75))  # show quartiles
Ecdf(ch, group=sex,
     label.curves=list(method='arrow'))


# Example showing how to draw multiple ECDFs from paired data
pre.test <- rnorm(100,50,10)
post.test <- rnorm(100,55,10)
x <- c(pre.test, post.test)
g <- c(rep('Pre',length(pre.test)),rep('Post',length(post.test)))
Ecdf(x, group=g, xlab='Test Results', label.curves=list(keys=1:2))
# keys=1:2 causes symbols to be drawn periodically on top of curves


# Draw a matrix of ECDFs for a data frame
m <- data.frame(pre.test, post.test, 
                sex=sample(c('male','female'),100,TRUE))
Ecdf(m, group=m$sex, datadensity='rug')


# Integer frequency weights, one per element of ch
freqs <- sample(1:10, 1000, TRUE)
Ecdf(ch, weights=freqs)  # weighted estimates


# Trellis/Lattice examples:


# Conditioning variables must have one value per element of ch (1000),
# so region is sampled with the same length as ch, year, and sex
region <- factor(sample(c('Europe','USA','Australia'),1000,TRUE))
year <- factor(sample(2001:2002,1000,TRUE))
Ecdf(~ch | region*year, groups=sex)
Key()           # draw a key for sex at the default location
# Key(locator(1)) # user-specified positioning of key
age <- rnorm(1000, 50, 10)
# equal.count is called through the lattice namespace so that the example
# does not depend on lattice being attached to the search path
Ecdf(~ch | lattice::equal.count(age), groups=sex)  # use overlapping shingles
Ecdf(~ch | sex, datadensity='hist', side=3)  # add spike histogram at top
}
\keyword{nonparametric}
\keyword{hplot}
\keyword{methods}
\keyword{distribution}
\concept{trellis}
\concept{lattice}