Content - 7a1b424b35bc11b5818f9927bf88df712c34a1f5 - ec05f50/utils/Nonpareil.R

swh:1:snp:dcd7ac99330b9dc1737f0a46f0985095e58d6e65

Tip revision: 19e463061fd494f6572b5c7085feee655fde67ad authored by Luis M Rodriguez-R on 19 May 2015, 17:55:56 UTC
Old binaries removed
Tip revision: 19e4630
Nonpareil.R
# INTERNAL
Nonpareil.__init_globals <- function(
	### Internal Nonpareil function.
	soft=TRUE){
	if(soft && exists('Nonpareil.LastFactor')) return();
	Nonpareil.LastFactor <<- NULL;
	Nonpareil.LastXmax <<- NULL;
	Nonpareil.OpenColors <<- c();
	Nonpareil.OpenNames <<- c();
}

# MODEL FUNCTIONS
#Nonpareil.f <- function(x,a,b){ return( ( 1 - exp(-x/(2^a)) )^b ) }
#Nonpareil.antif <- function(y,a,b){ return( -(2^a)*log(1 - y^(1/b)) ) }
Nonpareil.f <- function(
	### Function of the projected model.
	x,
	### Values of sequencing effort (typically in bp).
	a,
	### Parameter alpha of the gamma CDF.
	b
	### Parameter beta of the Gamma CDF.
	){ return(pgamma(log(x+1),a, b))
	### Predicted values of abundance-weighted average coverage.
}
Nonpareil.antif <- function(
	### Complement function of Nonpareil.f.
	y,
	### Values of abundance-weighted average coverage
	a,
	### Parameter alpha of the gamma CDF.
	b
	### Parameter beta of the gamma CDF.
	){ return(exp(qgamma(y,a,b))-1)
	### Estimated sequencing effort.
}
#Nonpareil.f <- function(x,a,b){ lx<-log(max(1, x)); return(exp((lx^a)/(b + lx^a))) }
#Nonpareil.antif <- function(y,a,b){ ly<-log(max(1e-10, y)); return(exp((b*ly/(1-ly))^(1/a))) }

# NONPAREIL CURVES
Nonpareil.curve.batch <- function(
	### Generates a single plot (and a single table of estimates) with multiple Nonpareil
	### curves, from different .npo files.
	files,
	### Vector of characters with the paths to the .npo files.
	overlap,
	### Value of the '-L' parameter (in Nonpareil, the default is 50). It can be a number (if all the curves
	### were generated with the same value) or a vector (in the same order of 'files'). See the 'overlap' value
	### of 'Nonpareil.curve()'. Use only with Nonpareil < v2.0.
	r=NA,
	### Values of 'r' in 'Nonpareil.curve()' (red value).
	g=NA,
	### Values of 'g' in 'Nonpareil.curve()' (green value).
	b=NA,
	### Values of 'b' in 'Nonpareil.curve()' (blue value).
	libnames=NA,
	### Vector of names of the libraries (corresponding to 'libname' in 'Nonpareil.curve()'). It must be characters,
	### not factors.
	read.lengths=NA,
	### A vector of numbers indicating the length of the reads (corresponding to 'read.length' in 'Nonpareil.curve()').
	### Use only with Nonpareil < v2.0.
	col=NA,
	### Values of 'col' in 'Nonpareil.curve()'.
	...
	### Any other parameter accepted by 'Nonpareil.curve()' is supported.
	){
   if(!is.vector(files)) files = as.vector(files);
   if(missing(overlap)) overlap=rep(NULL, length(files));
   new=TRUE;
   for(i in 1:length(files)){
      o = Nonpareil.curve(as.character(files[i]), overlap, r=r[i], g=g[i], b=b[i], col=col[i],
      		libname=ifelse(is.na(libnames[1]), NA, as.character(libnames[i])), new=new,
		read.length=ifelse(is.na(read.lengths[1]), NA, read.lengths[i]), ...);
      if(new){
	 out.m = matrix(NA, ncol=length(o), nrow=length(files));
         colnames(out.m) <- rownames(as.matrix(o));
	 if(!is.na(libnames[1])) rownames(out.m) <- libnames;
      }
      out.m[i, ] <- as.numeric(o);
      new = FALSE;
   }
   return(out.m);
   ### A dataframe containing the values generated by 'Nonpareil.curve()'.
}

Nonpareil.curve <- function(
	### Generates a Nonpareil curve from a .npo file.
	file,
	### Path to the .npo file, containing the read redundancy.
	overlap=NULL,
	### Value of the '-L' parameter (in Nonpareil, the default is 50). If not set, it tries to find the value in the
	### .npo file (supported in Nonpareil >= 2.0), or fails with an error message. Use only with Nonpareil < v2.0.
	factor=1,
	### Multiplier of the sequencing effort. This can be used to express the sequencing effort in units other than
	### base pairs (bp). For example, to express sequencing effort as Gbp, use factor=1e-9. This can also affect
	### the fit of the model, and it's considered EXPERIMENTAL.
	plotDispersion=NA,
	### Indicates if (and how) dispersion of the replicates should be plotted. It requires modelOnly=FALSE to take
	### effect. It can be NA, in which case no dispersion is plotted, or any of the following strings: 'sd' (one
	### standard deviation around the mean), 'ci95' (95% confidence interval), 'ci90' (90% confidence interval),
	### 'ci50' (50% confidence interval), 'iq' (inter-quartile range).
	returnModelValues=FALSE,
	### If TRUE, returns the coordinates of the model as model.x and model.y.
	returnModelParameters=FALSE,
	### If TRUE, returns the model itself as model.
	xmax=10e12,
	### Maximum sequencing effort to plot.
	xmin=1e3,
	### Minimum sequencing effort to plot
	ymax=1,
	### Maximum coverage to plot.
	ymin=1e-6,
	### Minimum coverage to plot.
	xlab=NULL,
	### Label of the X-axis. If NULL, it's set to sequencing effort and the units (see factor).
	ylab=NULL,
	### Label of the Y-axis. If NULL, it's set to Estimated average coverage.
	col=NA,
	### The color of the curve. If passed, it overrides `r`, `g`, and `b`.
	r=NA,
	### Red component of the curve's color. If NA, it's randomly set. If <=1, it's assumed to be in the range [0,1]; if
	### >1, it's assumed to be in the range [0,256].
	g=NA,
	### Green component of the curve's color. Same as `r`.
	b=NA,
	### Blue component of the curve's color. Same as `r`.
	new=TRUE,
	### If FALSE, it attempts to use a previous (active) canvas to plot the curve.
	plot=TRUE,
	### Determines if the plot should be produced. If FALSE, it still computes the coverage and the model.
	libname=NA,
	### Name of the library. If NA, it's determined by the file name. This is useful if you plan to call Nonpareil.legend().
	modelOnly=FALSE,
	### If TRUE, the rarefied data is not presented, only the fitted model.
	plotModel=TRUE,
	### If FALSE, the model is not plotted (but it's still computed).
	plotDiversity=FALSE,
	### If TRUE, the diversity estimate is plotted as a small arrow below the Nonpareil curve.
	curve.lwd=2,
	### Line width of the Nonpareil curve.
	curve.alpha=0.4,
	### Alpha value (from 0 to 1) of the Nonpareil curve.
	model.lwd=1,
	### Line width of the model.
	model.alpha=1,
	### Alpha value (from 0 to 1) of the model.
	log='x',
	### Axis to plot in logarithmic scale. It can be 'x' (sequencing effort, default), 'y' (coverage), 'xy' (both logarithmic),
	### or '' (both linear).
	data.consistency=TRUE,
	### If TRUE, it checks the consistency of the data before plotting.
	useValue='mean',
	### Controls how the replicates are to be summarized at each point of sequencing effort. It can be any of: 'mean' (average of
	### the replicates), 'median' (median), 'ub' (upper boundary of the 95% confidence interval), 'lb' (lower boundary of the 95%
	### confidence interval), 'q1' (quartile 1), 'q3' (quartile 3). Note that the quartile 2 is also 'median'.
	star=95,
	### Objective coverage (in percentage). By default: 95, which means the sequencing effort required to reach 95% average
	### coverage is to be estimated.
	read.length=NA,
	### Length of the reads. Use only with Nonpareil < v2.0.
	weights.exp=NA,
	### Vector of values to be tested (in order) as exponent of the weights distribution. If the model fails to converge, sometimes
	### manual modifications in this parameter may help. By default (NA), five different values are tested in the following order:
	### For linear sampling, -1.1, -1.2, -0.9, -1.3, -1. For logarithmic sampling (-d option in Nonpareil), 0, 1, -1, 1.3, -1.1,
	### 1.5, -1.5.
	...
	### Any other parameters accepted by plot().
	){
	
	# Create environment
	Nonpareil.__init_globals(!new);
	
	# Examine consistency
	if(is.null(file)) stop('The file argument is mandatory');
	if(!new && (is.null(Nonpareil.LastFactor) || is.null(Nonpareil.LastXmax)))
		stop('No previous plot found, please use new=TRUE if this is the first plot.');
	
	# Read metadata (.npo header)
	log_sampling <- 0;
	meta_data <- gsub('^# @', '', grep("^# @", readLines(file), value=TRUE));
	keys <- gsub(': .*', '', meta_data);
	vals <- gsub('.*: ', '', meta_data);
	if(is.null(overlap) & 'overlap' %in% keys) overlap = as.numeric(vals[keys=='overlap']);
	if(is.null(overlap)) stop('The overlap argument is mandatory for Nonpareil versions before 2.0.');
	if(is.na(read.length) & 'L' %in% keys) read.length = as.numeric(vals[keys=='L']);
	if(is.na(read.length)) read.length=101;
	num_reads = NULL;
	if('R' %in% keys) num_reads = as.numeric(vals[keys=='R']);
	if('divide' %in% keys) log_sampling = as.numeric(vals[keys=='divide']);
	if('logsampling' %in% keys) log_sampling = as.numeric(vals[keys=='logsampling']);
	
	# Read input
	out <- list(kappa=0, C=0, LRstar=0, LR=0, modelR=0, diversity=0);
	a <- read.table(file, sep="\t", h=F);
	out$kappa <- a$V2[nrow(a)];
	if(is.null(num_reads)) num_reads = max(a$V1);
	LR <- exp(log(num_reads) + log(read.length));
	out$LR <- LR;
	for(i in 2:6) a[, i] <- a[, i]^Nonpareil.coverageFactor(overlap);
	if(useValue=='median'){
	   values = a[, 5]
	}else if(useValue=='ub'){
	   values = pmin(1, a[, 2] + a[, 3]*1.9)
	}else if(useValue=='lb'){
	   values = pmax(0, a[, 2] - a[, 3]*1.9)
	}else if(useValue=='q1'){
	   values = a[, 4]
	}else if(useValue=='q3'){
	   values = a[, 6]
	}else{
	   values = a[, 2]
	}
	a$V1 = exp(max(log(a$V1)) + max(values^0.27)*(log(a$V1) - max(log(a$V1))));
	a$V1 = exp(log(a$V1)*0.61 + 10);
	a$V1 = a$V1 * read.length / 101;
	horiz.diff = LR / max(a$V1);
	a$V1 = a$V1 * horiz.diff;##
	horiz.diff = 1;
	if(is.na(libname)) {
	   libname <- basename(file);
	   if(substr(libname, nchar(libname)-3, nchar(libname))==".npo")
	      libname <- substr(libname, 0, nchar(libname)-4);
	}

	# Check data
	out$C <- max(values);
	if(data.consistency){
	   twenty.pc = which.max(a$V1[a$V1<=0.2*max(a$V1)]);
	   if(a[twenty.pc, 5]==0){
	      warning('Median of the curve is zero at 20% of the reads, check parameters and re-run (e.g., decrease value of -L in nonpareil).');
	      return(out);
	   }
	   if(a[nrow(a), 2]<=1e-5){
	      warning('Curve too low, check parameters and re-run (e.g., decrease value of -L in nonpareil).');
	      return(out);
	   }
	   if(a[2, 2]>=1-1e-5){
	      warning('Curve too steep, check parameters and re-run (e.g., decrease value of -i in nonpareil or use -d).');
	      return(out);
	   }
	   #if(min(a[a[,2]>0, 2])<0.01){
	   #   warning('Curve undefined around S* (kappa=05%), check parameters and re-run (e.g., decrease the value of -i in nonpareil).');
	   #   return(0);
	   #}
	}

	
	# For future calls (of this or other Nonpareil.* functions)
	Nonpareil.LastFactor <<- factor;
	
	# Draw it
	if(plot){
		if(new){
			if(!is.null(xlab)){	xlab <- as.character(xlab) }
			else if(factor==1){	xlab <- 'Sequencing effort (bp)' }
			else if(factor==1e-1){	xlab <- 'Sequencing effort (Dbp)' }
			else if(factor==1e-2){	xlab <- 'Sequencing effort (Hbp)' }
			else if(factor==1e-3){	xlab <- 'Sequencing effort (Kbp)' }
			else if(factor==1e-6){	xlab <- 'Sequencing effort (Mbp)' }
			else if(factor==1e-9){	xlab <- 'Sequencing effort (Gbp)' }
			else if(factor==1e-12){	xlab <- 'Sequencing effort (Tbp)' }
			else if(factor==1e-15){	xlab <- 'Sequencing effort (Pbp)' }
			else if(factor==1e-18){	xlab <- 'Sequencing effort (Ebp)' }
			else if(factor==1e-21){	xlab <- 'Sequencing effort (Zbp)' }
			else{			xlab <- paste('Sequencing effort (by ', factor, 'bp)', sep='') }
			if(is.null(ylab)) ylab <- 'Estimated average coverage';

			plot(1, t='n', xlim=c(xmin, xmax), ylim=c(ymin, ymax), xlab=xlab, ylab=ylab, log=log, ...);
			abline(h=c(1,star/100), lty=2, col='red');
			abline(v=10^seq(0,15,by=3), lty=2, col='gray80')
		}

		if(!is.na(col)){
			r <- col2rgb(col)['red',1]/256;
			g <- col2rgb(col)['green',1]/256;
			b <- col2rgb(col)['blue',1]/256;
		}
		if(is.na(r)) r <- sample(200,1);
		if(is.na(g)) g <- sample(200,1);
		if(is.na(b)) b <- sample(200,1);
		if(r>1) r <- r/256;
		if(g>1) g <- g/256;
		if(b>1) b <- b/256;
		
		if(!modelOnly){
			if(!is.na(plotDispersion)){
				if(plotDispersion == 'sd'){
				   err.y <- c(values+a$V3, rev(values-a$V3));
				}else if(plotDispersion == 'ci95'){
				   err.y <- c(values+a$V3*1.9, rev(values-a$V3*1.9));
				}else if(plotDispersion == 'ci90'){
				   err.y <- c(values+a$V3*1.64, rev(values-a$V3*1.64));
				}else if(plotDispersion == 'ci50'){
				   err.y <- c(values+a$V3*.67, rev(values-a$V3*.67));
				}else if(plotDispersion == 'iq'){
				   err.y <- c(a$V4, rev(a$V6));
				}
				polygon(horiz.diff*c(a$V1, rev(a$V1))*factor, ifelse(err.y<=ymin*0.1, ymin*0.1, err.y), col=rgb(r,g,b,.2), border=NA);
			}
			lines(horiz.diff*a$V1*factor, values, col=rgb(r,g,b,curve.alpha), lwd=curve.lwd);
		}
		
		# Save some info
		Nonpareil.LastXmax   <<- xmax;
		Nonpareil.OpenColors <<- c(Nonpareil.OpenColors, rgb(r,g,b));
		Nonpareil.OpenNames  <<- c(Nonpareil.OpenNames, libname);
	}
	
	# Model it
	sel  <- values>0 & values<0.9;
	x <- a$V1[sel];
	if((length(x)>10) | ! data.consistency){
	   y <- values[sel];
	   data <- list(x=x, y=y)
	   estModel <- TRUE;
	   if(is.na(weights.exp[1])){
	      if(log_sampling==0){ weights.exp <- c(-1.1,-1.2,-0.9,-1.3,-1) }
	      else{ weights.exp <- c(0, 1, -1, 1.3, -1.1, 1.5, -1.5) }
	   }
	   weights.i <- 0;
	   while(estModel){
	      weights.i <- weights.i+1;
	      model <- nls(y ~ Nonpareil.f(x, a, b), data=data, weights=(a$V3[sel]^weights.exp[weights.i]), start=list(a=1, b=0.1), lower=c(a=0, b=0), algorithm='port', 
			   control=nls.control(minFactor=1e-25000, tol=1e-15, maxiter=1024, warnOnly=T));
	      if(!is.na(weights.exp[weights.i+1]) | summary(model)$convInfo$isConv) estModel <- FALSE;
	   }
	   if(summary(model)$convInfo$isConv){
	      model.lty=2;
	      if(modelOnly) model.lty=1;
	      if(plot & plotModel){
		 model.x <- exp(seq(log(xmin), log(xmax), length.out=1e3));
		 model.y <- predict(model, list(x=model.x));
		 model.x <- model.x*horiz.diff*factor;
		 lines(model.x, model.y, col=rgb(r,g,b,model.alpha), lty=model.lty, lwd=model.lwd);
		 if(modelOnly) points(max(a$V1), predict(model, list(x=max(a$V1))), col=rgb(r,g,b), pch=21, bg='white');
		 #if(modelOnly) points(min(a$V1[a$V1>0]), predict(model, list(x=min(a$V1[a$V1>0]))), col=rgb(r,g,b), pch=8, bg='white');
	      }
	      if(returnModelValues) { out$model.x <- model.x ; out$model.y <- model.y; }
	      if(returnModelParameters) { out$model=model }
	      pa <- coef(model)[1];
	      pb <- coef(model)[2];
	      if(pa > 1) out$diversity <- (pa-1)/pb;
	      if(plot & plotDiversity & out$diversity>0)
	      	arrows(x0=exp(out$diversity),
			y0=ifelse(log=='y' | log=='xy' | log=='yx', ymin*0.1, ymin-1),
			y1=ymin,
			length=0.1, col=rgb(r,g,b,model.alpha));
	      out$LRstar <- Nonpareil.antif(star/100, pa, pb);
	      out$modelR <- cor(y, predict(model, list(x=x)));
	   }else{
	      warning('Model didn\'t converge.');
	   }
	}else{
	   warning('Insufficient resolution below 90% coverage, skipping model');
	}
	return(out);
	### A list with the following entries:
	### 
	### kappa: "Redundancy" value of the entire dataset.
	### 
	### C: Average coverage of the entire dataset.
	### 
	### LRstar: Estimated sequencing effort required to reach the objective average coverage (star, 95% by default).
	### 
	### LR: Actual sequencing effort of the dataset.
	### 
	### modelR: Pearson's R coefficient betweeen the rarefied data and the projected model.
	###
	### diversity: Nonpareil-based diversity index. This value's units are the natural logarithm of the units of
	###    sequencing effort (by default, log-bp), and indicates the inflection point of the fitted model for the
	###    Nonpareil curve. If the fit doesn't converge, or the model is not estimated, the value is zero (0).
	### 
	### model.x (if retturnModelValues=TRUE): Values of sequencing effort in the projected model.
	### 
	### model.y (if retturnModelValues=TRUE): Values of average coverage in the projected model.
	### 
	### model (if returnModelParameters=TRUE): Fitted model.
	### 
}

Nonpareil.legend <- function(
	### Generates a legend for Nonpareil plots, after calling Nonpareil.curve() or Nonpareil.curve.batch().
	x=NULL,
	### X coordinate, or any character string accepted by legend (e.g., 'bottomright').
	y=.3,
	### Y coordinate.
	...
	### Any other parameters supported by legend().
	){
   if(is.null(Nonpareil.LastFactor) || is.null(Nonpareil.LastXmax))
      stop('There must be at least one active plot to draw a legend.  Use Nonpareil.curve().');
   if(is.null(x)) x <- 0.75*Nonpareil.LastXmax;
   legend(x=x, y=y, legend=Nonpareil.OpenNames, fill=Nonpareil.OpenColors, ...);
}

## PREDICTIONS
Nonpareil.coverageFactor <- function(
	### Factor to transform redundancy into coverage (internal function).
	overlap
	### Value of overlap (-L).
	){
   if(overlap==25){
      return(.845);
   }else if(overlap==50){
      return(.757);
   }else if(overlap==75){
      return(.633);
   }else if(overlap==100){
      return(.137);
   }else{
      stop('Unsupported overlap.  Supported values are: 100, 75, 50, and 25.');
   }
   ### A numeric scalar.
}