swh:1:snp:dcd7ac99330b9dc1737f0a46f0985095e58d6e65
Tip revision: 19e463061fd494f6572b5c7085feee655fde67ad authored by Luis M Rodriguez-R on 19 May 2015, 17:55:56 UTC
Old binaries removed
Old binaries removed
Tip revision: 19e4630
Nonpareil.R
# INTERNAL
Nonpareil.__init_globals <- function(
### Internal Nonpareil function.
soft=TRUE){
if(soft && exists('Nonpareil.LastFactor')) return();
Nonpareil.LastFactor <<- NULL;
Nonpareil.LastXmax <<- NULL;
Nonpareil.OpenColors <<- c();
Nonpareil.OpenNames <<- c();
}
# MODEL FUNCTIONS
#Nonpareil.f <- function(x,a,b){ return( ( 1 - exp(-x/(2^a)) )^b ) }
#Nonpareil.antif <- function(y,a,b){ return( -(2^a)*log(1 - y^(1/b)) ) }
Nonpareil.f <- function(
### Function of the projected model.
x,
### Values of sequencing effort (typically in bp).
a,
### Parameter alpha of the gamma CDF.
b
### Parameter beta of the Gamma CDF.
){ return(pgamma(log(x+1),a, b))
### Predicted values of abundance-weighted average coverage.
}
Nonpareil.antif <- function(
### Complement function of Nonpareil.f.
y,
### Values of abundance-weighted average coverage
a,
### Parameter alpha of the gamma CDF.
b
### Parameter beta of the gamma CDF.
){ return(exp(qgamma(y,a,b))-1)
### Estimated sequencing effort.
}
#Nonpareil.f <- function(x,a,b){ lx<-log(max(1, x)); return(exp((lx^a)/(b + lx^a))) }
#Nonpareil.antif <- function(y,a,b){ ly<-log(max(1e-10, y)); return(exp((b*ly/(1-ly))^(1/a))) }
# NONPAREIL CURVES
Nonpareil.curve.batch <- function(
### Generates a single plot (and a single table of estimates) with multiple Nonpareil
### curves, from different .npo files.
files,
### Vector of characters with the paths to the .npo files.
overlap,
### Value of the '-L' parameter (in Nonpareil, the default is 50). It can be a number (if all the curves
### were generated with the same value) or a vector (in the same order of 'files'). See the 'overlap' value
### of 'Nonpareil.curve()'. Use only with Nonpareil < v2.0.
r=NA,
### Values of 'r' in 'Nonpareil.curve()' (red value).
g=NA,
### Values of 'g' in 'Nonpareil.curve()' (green value).
b=NA,
### Values of 'b' in 'Nonpareil.curve()' (blue value).
libnames=NA,
### Vector of names of the libraries (corresponding to 'libname' in 'Nonpareil.curve()'). It must be characters,
### not factors.
read.lengths=NA,
### A vector of numbers indicating the length of the reads (corresponding to 'read.length' in 'Nonpareil.curve()').
### Use only with Nonpareil < v2.0.
col=NA,
### Values of 'col' in 'Nonpareil.curve()'.
...
### Any other parameter accepted by 'Nonpareil.curve()' is supported.
){
if(!is.vector(files)) files = as.vector(files);
if(missing(overlap)) overlap=rep(NULL, length(files));
new=TRUE;
for(i in 1:length(files)){
o = Nonpareil.curve(as.character(files[i]), overlap, r=r[i], g=g[i], b=b[i], col=col[i],
libname=ifelse(is.na(libnames[1]), NA, as.character(libnames[i])), new=new,
read.length=ifelse(is.na(read.lengths[1]), NA, read.lengths[i]), ...);
if(new){
out.m = matrix(NA, ncol=length(o), nrow=length(files));
colnames(out.m) <- rownames(as.matrix(o));
if(!is.na(libnames[1])) rownames(out.m) <- libnames;
}
out.m[i, ] <- as.numeric(o);
new = FALSE;
}
return(out.m);
### A dataframe containing the values generated by 'Nonpareil.curve()'.
}
Nonpareil.curve <- function(
### Generates a Nonpareil curve from a .npo file.
file,
### Path to the .npo file, containing the read redundancy.
overlap=NULL,
### Value of the '-L' parameter (in Nonpareil, the default is 50). If not set, it tries to find the value in the
### .npo file (supported in Nonpareil >= 2.0), or fails with an error message. Use only with Nonpareil < v2.0.
factor=1,
### Multiplier of the sequencing effort. This can be used to express the sequencing effort in units other than
### base pairs (bp). For example, to express sequencing effort as Gbp, use factor=1e-9. This can also affect
### the fit of the model, and it's considered EXPERIMENTAL.
plotDispersion=NA,
### Indicates if (and how) dispersion of the replicates should be plotted. It requires modelOnly=FALSE to take
### effect. It can be NA, in which case no dispersion is plotted, or any of the following strings: 'sd' (one
### standard deviation around the mean), 'ci95' (95% confidence interval), 'ci90' (90% confidence interval),
### 'ci50' (50% confidence interval), 'iq' (inter-quartile range).
returnModelValues=FALSE,
### If TRUE, returns the coordinates of the model as model.x and model.y.
returnModelParameters=FALSE,
### If TRUE, returns the model itself as model.
xmax=10e12,
### Maximum sequencing effort to plot.
xmin=1e3,
### Minimum sequencing effort to plot
ymax=1,
### Maximum coverage to plot.
ymin=1e-6,
### Minimum coverage to plot.
xlab=NULL,
### Label of the X-axis. If NULL, it's set to sequencing effort and the units (see factor).
ylab=NULL,
### Label of the Y-axis. If NULL, it's set to Estimated average coverage.
col=NA,
### The color of the curve. If passed, it overrides `r`, `g`, and `b`.
r=NA,
### Red component of the curve's color. If NA, it's randomly set. If <=1, it's assumed to be in the range [0,1]; if
### >1, it's assumed to be in the range [0,256].
g=NA,
### Green component of the curve's color. Same as `r`.
b=NA,
### Blue component of the curve's color. Same as `r`.
new=TRUE,
### If FALSE, it attempts to use a previous (active) canvas to plot the curve.
plot=TRUE,
### Determines if the plot should be produced. If FALSE, it still computes the coverage and the model.
libname=NA,
### Name of the library. If NA, it's determined by the file name. This is useful if you plan to call Nonpareil.legend().
modelOnly=FALSE,
### If TRUE, the rarefied data is not presented, only the fitted model.
plotModel=TRUE,
### If FALSE, the model is not plotted (but it's still computed).
plotDiversity=FALSE,
### If TRUE, the diversity estimate is plotted as a small arrow below the Nonpareil curve.
curve.lwd=2,
### Line width of the Nonpareil curve.
curve.alpha=0.4,
### Alpha value (from 0 to 1) of the Nonpareil curve.
model.lwd=1,
### Line width of the model.
model.alpha=1,
### Alpha value (from 0 to 1) of the model.
log='x',
### Axis to plot in logarithmic scale. It can be 'x' (sequencing effort, default), 'y' (coverage), 'xy' (both logarithmic),
### or '' (both linear).
data.consistency=TRUE,
### If TRUE, it checks the consistency of the data before plotting.
useValue='mean',
### Controls how the replicates are to be summarized at each point of sequencing effort. It can be any of: 'mean' (average of
### the replicates), 'median' (median), 'ub' (upper boundary of the 95% confidence interval), 'lb' (lower boundary of the 95%
### confidence interval), 'q1' (quartile 1), 'q3' (quartile 3). Note that the quartile 2 is also 'median'.
star=95,
### Objective coverage (in percentage). By default: 95, which means the sequencing effort required to reach 95% average
### coverage is to be estimated.
read.length=NA,
### Length of the reads. Use only with Nonpareil < v2.0.
weights.exp=NA,
### Vector of values to be tested (in order) as exponent of the weights distribution. If the model fails to converge, sometimes
### manual modifications in this parameter may help. By default (NA), five different values are tested in the following order:
### For linear sampling, -1.1, -1.2, -0.9, -1.3, -1. For logarithmic sampling (-d option in Nonpareil), 0, 1, -1, 1.3, -1.1,
### 1.5, -1.5.
...
### Any other parameters accepted by plot().
){
# Create environment
Nonpareil.__init_globals(!new);
# Examine consistency
if(is.null(file)) stop('The file argument is mandatory');
if(!new && (is.null(Nonpareil.LastFactor) || is.null(Nonpareil.LastXmax)))
stop('No previous plot found, please use new=TRUE if this is the first plot.');
# Read metadata (.npo header)
log_sampling <- 0;
meta_data <- gsub('^# @', '', grep("^# @", readLines(file), value=TRUE));
keys <- gsub(': .*', '', meta_data);
vals <- gsub('.*: ', '', meta_data);
if(is.null(overlap) & 'overlap' %in% keys) overlap = as.numeric(vals[keys=='overlap']);
if(is.null(overlap)) stop('The overlap argument is mandatory for Nonpareil versions before 2.0.');
if(is.na(read.length) & 'L' %in% keys) read.length = as.numeric(vals[keys=='L']);
if(is.na(read.length)) read.length=101;
num_reads = NULL;
if('R' %in% keys) num_reads = as.numeric(vals[keys=='R']);
if('divide' %in% keys) log_sampling = as.numeric(vals[keys=='divide']);
if('logsampling' %in% keys) log_sampling = as.numeric(vals[keys=='logsampling']);
# Read input
out <- list(kappa=0, C=0, LRstar=0, LR=0, modelR=0, diversity=0);
a <- read.table(file, sep="\t", h=F);
out$kappa <- a$V2[nrow(a)];
if(is.null(num_reads)) num_reads = max(a$V1);
LR <- exp(log(num_reads) + log(read.length));
out$LR <- LR;
for(i in 2:6) a[, i] <- a[, i]^Nonpareil.coverageFactor(overlap);
if(useValue=='median'){
values = a[, 5]
}else if(useValue=='ub'){
values = pmin(1, a[, 2] + a[, 3]*1.9)
}else if(useValue=='lb'){
values = pmax(0, a[, 2] - a[, 3]*1.9)
}else if(useValue=='q1'){
values = a[, 4]
}else if(useValue=='q3'){
values = a[, 6]
}else{
values = a[, 2]
}
a$V1 = exp(max(log(a$V1)) + max(values^0.27)*(log(a$V1) - max(log(a$V1))));
a$V1 = exp(log(a$V1)*0.61 + 10);
a$V1 = a$V1 * read.length / 101;
horiz.diff = LR / max(a$V1);
a$V1 = a$V1 * horiz.diff;##
horiz.diff = 1;
if(is.na(libname)) {
libname <- basename(file);
if(substr(libname, nchar(libname)-3, nchar(libname))==".npo")
libname <- substr(libname, 0, nchar(libname)-4);
}
# Check data
out$C <- max(values);
if(data.consistency){
twenty.pc = which.max(a$V1[a$V1<=0.2*max(a$V1)]);
if(a[twenty.pc, 5]==0){
warning('Median of the curve is zero at 20% of the reads, check parameters and re-run (e.g., decrease value of -L in nonpareil).');
return(out);
}
if(a[nrow(a), 2]<=1e-5){
warning('Curve too low, check parameters and re-run (e.g., decrease value of -L in nonpareil).');
return(out);
}
if(a[2, 2]>=1-1e-5){
warning('Curve too steep, check parameters and re-run (e.g., decrease value of -i in nonpareil or use -d).');
return(out);
}
#if(min(a[a[,2]>0, 2])<0.01){
# warning('Curve undefined around S* (kappa=05%), check parameters and re-run (e.g., decrease the value of -i in nonpareil).');
# return(0);
#}
}
# For future calls (of this or other Nonpareil.* functions)
Nonpareil.LastFactor <<- factor;
# Draw it
if(plot){
if(new){
if(!is.null(xlab)){ xlab <- as.character(xlab) }
else if(factor==1){ xlab <- 'Sequencing effort (bp)' }
else if(factor==1e-1){ xlab <- 'Sequencing effort (Dbp)' }
else if(factor==1e-2){ xlab <- 'Sequencing effort (Hbp)' }
else if(factor==1e-3){ xlab <- 'Sequencing effort (Kbp)' }
else if(factor==1e-6){ xlab <- 'Sequencing effort (Mbp)' }
else if(factor==1e-9){ xlab <- 'Sequencing effort (Gbp)' }
else if(factor==1e-12){ xlab <- 'Sequencing effort (Tbp)' }
else if(factor==1e-15){ xlab <- 'Sequencing effort (Pbp)' }
else if(factor==1e-18){ xlab <- 'Sequencing effort (Ebp)' }
else if(factor==1e-21){ xlab <- 'Sequencing effort (Zbp)' }
else{ xlab <- paste('Sequencing effort (by ', factor, 'bp)', sep='') }
if(is.null(ylab)) ylab <- 'Estimated average coverage';
plot(1, t='n', xlim=c(xmin, xmax), ylim=c(ymin, ymax), xlab=xlab, ylab=ylab, log=log, ...);
abline(h=c(1,star/100), lty=2, col='red');
abline(v=10^seq(0,15,by=3), lty=2, col='gray80')
}
if(!is.na(col)){
r <- col2rgb(col)['red',1]/256;
g <- col2rgb(col)['green',1]/256;
b <- col2rgb(col)['blue',1]/256;
}
if(is.na(r)) r <- sample(200,1);
if(is.na(g)) g <- sample(200,1);
if(is.na(b)) b <- sample(200,1);
if(r>1) r <- r/256;
if(g>1) g <- g/256;
if(b>1) b <- b/256;
if(!modelOnly){
if(!is.na(plotDispersion)){
if(plotDispersion == 'sd'){
err.y <- c(values+a$V3, rev(values-a$V3));
}else if(plotDispersion == 'ci95'){
err.y <- c(values+a$V3*1.9, rev(values-a$V3*1.9));
}else if(plotDispersion == 'ci90'){
err.y <- c(values+a$V3*1.64, rev(values-a$V3*1.64));
}else if(plotDispersion == 'ci50'){
err.y <- c(values+a$V3*.67, rev(values-a$V3*.67));
}else if(plotDispersion == 'iq'){
err.y <- c(a$V4, rev(a$V6));
}
polygon(horiz.diff*c(a$V1, rev(a$V1))*factor, ifelse(err.y<=ymin*0.1, ymin*0.1, err.y), col=rgb(r,g,b,.2), border=NA);
}
lines(horiz.diff*a$V1*factor, values, col=rgb(r,g,b,curve.alpha), lwd=curve.lwd);
}
# Save some info
Nonpareil.LastXmax <<- xmax;
Nonpareil.OpenColors <<- c(Nonpareil.OpenColors, rgb(r,g,b));
Nonpareil.OpenNames <<- c(Nonpareil.OpenNames, libname);
}
# Model it
sel <- values>0 & values<0.9;
x <- a$V1[sel];
if((length(x)>10) | ! data.consistency){
y <- values[sel];
data <- list(x=x, y=y)
estModel <- TRUE;
if(is.na(weights.exp[1])){
if(log_sampling==0){ weights.exp <- c(-1.1,-1.2,-0.9,-1.3,-1) }
else{ weights.exp <- c(0, 1, -1, 1.3, -1.1, 1.5, -1.5) }
}
weights.i <- 0;
while(estModel){
weights.i <- weights.i+1;
model <- nls(y ~ Nonpareil.f(x, a, b), data=data, weights=(a$V3[sel]^weights.exp[weights.i]), start=list(a=1, b=0.1), lower=c(a=0, b=0), algorithm='port',
control=nls.control(minFactor=1e-25000, tol=1e-15, maxiter=1024, warnOnly=T));
if(!is.na(weights.exp[weights.i+1]) | summary(model)$convInfo$isConv) estModel <- FALSE;
}
if(summary(model)$convInfo$isConv){
model.lty=2;
if(modelOnly) model.lty=1;
if(plot & plotModel){
model.x <- exp(seq(log(xmin), log(xmax), length.out=1e3));
model.y <- predict(model, list(x=model.x));
model.x <- model.x*horiz.diff*factor;
lines(model.x, model.y, col=rgb(r,g,b,model.alpha), lty=model.lty, lwd=model.lwd);
if(modelOnly) points(max(a$V1), predict(model, list(x=max(a$V1))), col=rgb(r,g,b), pch=21, bg='white');
#if(modelOnly) points(min(a$V1[a$V1>0]), predict(model, list(x=min(a$V1[a$V1>0]))), col=rgb(r,g,b), pch=8, bg='white');
}
if(returnModelValues) { out$model.x <- model.x ; out$model.y <- model.y; }
if(returnModelParameters) { out$model=model }
pa <- coef(model)[1];
pb <- coef(model)[2];
if(pa > 1) out$diversity <- (pa-1)/pb;
if(plot & plotDiversity & out$diversity>0)
arrows(x0=exp(out$diversity),
y0=ifelse(log=='y' | log=='xy' | log=='yx', ymin*0.1, ymin-1),
y1=ymin,
length=0.1, col=rgb(r,g,b,model.alpha));
out$LRstar <- Nonpareil.antif(star/100, pa, pb);
out$modelR <- cor(y, predict(model, list(x=x)));
}else{
warning('Model didn\'t converge.');
}
}else{
warning('Insufficient resolution below 90% coverage, skipping model');
}
return(out);
### A list with the following entries:
###
### kappa: "Redundancy" value of the entire dataset.
###
### C: Average coverage of the entire dataset.
###
### LRstar: Estimated sequencing effort required to reach the objective average coverage (star, 95% by default).
###
### LR: Actual sequencing effort of the dataset.
###
### modelR: Pearson's R coefficient betweeen the rarefied data and the projected model.
###
### diversity: Nonpareil-based diversity index. This value's units are the natural logarithm of the units of
### sequencing effort (by default, log-bp), and indicates the inflection point of the fitted model for the
### Nonpareil curve. If the fit doesn't converge, or the model is not estimated, the value is zero (0).
###
### model.x (if retturnModelValues=TRUE): Values of sequencing effort in the projected model.
###
### model.y (if retturnModelValues=TRUE): Values of average coverage in the projected model.
###
### model (if returnModelParameters=TRUE): Fitted model.
###
}
Nonpareil.legend <- function(
### Generates a legend for Nonpareil plots, after calling Nonpareil.curve() or Nonpareil.curve.batch().
x=NULL,
### X coordinate, or any character string accepted by legend (e.g., 'bottomright').
y=.3,
### Y coordinate.
...
### Any other parameters supported by legend().
){
if(is.null(Nonpareil.LastFactor) || is.null(Nonpareil.LastXmax))
stop('There must be at least one active plot to draw a legend. Use Nonpareil.curve().');
if(is.null(x)) x <- 0.75*Nonpareil.LastXmax;
legend(x=x, y=y, legend=Nonpareil.OpenNames, fill=Nonpareil.OpenColors, ...);
}
## PREDICTIONS
Nonpareil.coverageFactor <- function(
### Factor to transform redundancy into coverage (internal function).
overlap
### Value of overlap (-L).
){
if(overlap==25){
return(.845);
}else if(overlap==50){
return(.757);
}else if(overlap==75){
return(.633);
}else if(overlap==100){
return(.137);
}else{
stop('Unsupported overlap. Supported values are: 100, 75, 50, and 25.');
}
### A numeric scalar.
}