Content - 2f357606ddb435d01db1d7d12866c73929f21b4f - faed4b1/utils/Nonpareil.R

swh:1:snp:dcd7ac99330b9dc1737f0a46f0985095e58d6e65

Tip revision: cfaa9e32e259561b2669f68f103dabd381a5d4ec authored by Luis M Rodriguez-R on 18 September 2017, 04:12:26 UTC
v3.1
Tip revision: cfaa9e3
Nonpareil.R
# INTERNAL
Nonpareil.__init_globals <- function(
    ### Internal Nonpareil function.
    soft=TRUE){
  if(soft && exists('Nonpareil.LastFactor')) return();
  Nonpareil.LastFactor <<- NULL;
  Nonpareil.LastXmax <<- NULL;
  Nonpareil.OpenColors <<- c();
  Nonpareil.OpenNames <<- c();
}

# MODEL FUNCTIONS
#Nonpareil.f <- function(x,a,b){ return( ( 1 - exp(-x/(2^a)) )^b ) }
#Nonpareil.antif <- function(y,a,b){ return( -(2^a)*log(1 - y^(1/b)) ) }
Nonpareil.f <- function(
    ### Function of the projected model.
    x,
    ### Values of sequencing effort (typically in bp).
    a,
    ### Parameter alpha of the gamma CDF.
    b
    ### Parameter beta of the Gamma CDF.
    ){
  return(pgamma(log(x+1),a, b))
  ### Predicted values of abundance-weighted average coverage.
}
Nonpareil.antif <- function(
    ### Complement function of Nonpareil.f.
    y,
    ### Values of abundance-weighted average coverage
    a,
    ### Parameter alpha of the gamma CDF.
    b
    ### Parameter beta of the gamma CDF.
    ){
  return(exp(qgamma(y,a,b))-1)
  ### Estimated sequencing effort.
}

# NONPAREIL CURVES
Nonpareil.curve.batch <- function(
    ### Generates a single plot (and a single table of estimates) with multiple
    ### Nonpareil curves, from different .npo files.
    files,
    ### Vector of characters with the paths to the .npo files.
    overlap,
    ### Value of the '-L' parameter (in Nonpareil, the default is 50). It can be
    ### a number (if all the curves were generated with the same value) or a
    ### vector (in the same order of 'files'). See the 'overlap' value of
    ### 'Nonpareil.curve()'. Use only with Nonpareil < v2.0.
    r=NA,
    ### Values of 'r' in 'Nonpareil.curve()' (red value).
    g=NA,
    ### Values of 'g' in 'Nonpareil.curve()' (green value).
    b=NA,
    ### Values of 'b' in 'Nonpareil.curve()' (blue value).
    libnames=NA,
    ### Vector of names of the libraries (corresponding to 'libname' in
    ### 'Nonpareil.curve()'). It must be characters, not factors.
    read.lengths=NA,
    ### A vector of numbers indicating the length of the reads (corresponding to
    ### 'read.length' in 'Nonpareil.curve()'). Use only with Nonpareil < v2.0.
    col=NA,
    ### Values of 'col' in 'Nonpareil.curve()'.
    ...
    ### Any other parameter accepted by 'Nonpareil.curve()' is supported.
    ){
  if(!is.vector(files)) files = as.vector(files);
  if(missing(overlap)) overlap=rep(NULL, length(files));
  new=TRUE;
  for(i in 1:length(files)){
    o = Nonpareil.curve(as.character(files[i]), overlap, r=r[i], g=g[i], b=b[i],
            col=col[i], new=new,
            libname=ifelse(is.na(libnames[1]), NA, as.character(libnames[i])),
            read.length=ifelse(is.na(read.lengths[1]), NA, read.lengths[i]),
            ...);
    if(new){
      out.m = matrix(NA, ncol=length(o), nrow=length(files));
      colnames(out.m) <- rownames(as.matrix(o));
      if(!is.na(libnames[1])) rownames(out.m) <- libnames;
    }
    out.m[i, ] <- as.numeric(o);
    if(!is.null(Nonpareil.LastXmax)) new <- FALSE;
  }
  return(out.m);
  ### A dataframe containing the values generated by 'Nonpareil.curve()'.
}

Nonpareil.curve <- function(
    ### Generates a Nonpareil curve from a .npo file.
    file,
    ### Path to the .npo file, containing the read redundancy.
    ksize=NULL,
    ### K-mer size (if -T kmer). By default this value is extracted
    ### from the .npo file headers.
    overlap=NULL,
    ### Value of the '-L' parameter (in Nonpareil, the default is 50). If not
    ### set, it tries to find the value in the .npo file (supported in Nonpareil
    ### >= 2.0), or fails with an error message. Use only with Nonpareil < v2.0.
    factor=1,
    ### Multiplier of the sequencing effort. This can be used to express the
    ### sequencing effort in units other than base pairs (bp). For example, to
    ### express sequencing effort as Gbp, use factor=1e-9. This can also affect
    ### the fit of the model, and it's considered EXPERIMENTAL.
    plotDispersion=NA,
    ### Indicates if (and how) dispersion of the replicates should be plotted.
    ### It requires modelOnly=FALSE to take effect. It can be NA, in which case
    ### no dispersion is plotted, or any of the following strings: 'sd' (one
    ### standard deviation around the mean), 'ci95' (95% confidence interval),
    ### 'ci90' (90% confidence interval), 'ci50' (50% confidence interval), 'iq'
    ### (inter-quartile range).
    returnModelValues=FALSE,
    ### If TRUE, returns the coordinates of the model as model.x and model.y.
    returnModelParameters=FALSE,
    ### If TRUE, returns the model itself as model.
    xmax=10e12,
    ### Maximum sequencing effort to plot.
    xmin=1e3,
    ### Minimum sequencing effort to plot
    ymax=1,
    ### Maximum coverage to plot.
    ymin=1e-6,
    ### Minimum coverage to plot.
    xlab=NULL,
    ### Label of the X-axis. If NULL, it's set to sequencing effort and the
    ### units (see factor).
    ylab=NULL,
    ### Label of the Y-axis. If NULL, it's set to Estimated average coverage.
    col=NA,
    ### The color of the curve. If passed, it overrides `r`, `g`, and `b`.
    r=NA,
    ### Red component of the curve's color. If NA, it's randomly set. If <=1,
    ### it's assumed to be in the range [0,1]; if >1, it's assumed to be in the
    ### range [0,256].
    g=NA,
    ### Green component of the curve's color. Same as `r`.
    b=NA,
    ### Blue component of the curve's color. Same as `r`.
    new=TRUE,
    ### If FALSE, it attempts to use a previous (active) canvas to plot the
    ### curve.
    plot=TRUE,
    ### Determines if the plot should be produced. If FALSE, it still computes
    ### the coverage and the model.
    libname=NA,
    ### Name of the library. If NA, it's determined by the file name. This is
    ### useful if you plan to call Nonpareil.legend().
    modelOnly=FALSE,
    ### If TRUE, the rarefied data is not presented, only the fitted model.
    plotModel=TRUE,
    ### If FALSE, the model is not plotted (but it's still computed).
    plotDiversity=FALSE,
    ### If TRUE, the diversity estimate is plotted as a small arrow below the
    ### Nonpareil curve.
    curve.lwd=2,
    ### Line width of the Nonpareil curve.
    curve.alpha=0.4,
    ### Alpha value (from 0 to 1) of the Nonpareil curve.
    model.lwd=1,
    ### Line width of the model.
    model.alpha=1,
    ### Alpha value (from 0 to 1) of the model.
    log='x',
    ### Axis to plot in logarithmic scale. It can be 'x' (sequencing effort,
    ### default), 'y' (coverage), 'xy' (both logarithmic), or '' (both linear).
    data.consistency=TRUE,
    ### If TRUE, it checks the consistency of the data before plotting.
    useValue='mean',
    ### Controls how the replicates are to be summarized at each point of
    ### sequencing effort. It can be any of: 'mean' (average of the replicates),
    ### 'median' (median), 'ub' (upper boundary of the 95% confidence interval),
    ### 'lb' (lower boundary of the 95% confidence interval), 'q1' (quartile 1),
    ### 'q3' (quartile 3). Note that the quartile 2 is also 'median'.
    star=95,
    ### Objective coverage (in percentage). By default: 95, which means the
    ### sequencing effort required to reach 95% average coverage is to be
    ### estimated.
    read.length=NA,
    ### Length of the reads. Use only with Nonpareil < v2.0.
    weights.exp=NA,
    ### Vector of values to be tested (in order) as exponent of the weights
    ### distribution. If the model fails to converge, sometimes manual
    ### modifications in this parameter may help. By default (NA), five
    ### different values are tested in the following order: For linear sampling,
    ### -1.1, -1.2, -0.9, -1.3, -1. For logarithmic sampling (-d option in
    ### Nonpareil), 0, 1, -1, 1.3, -1.1, 1.5, -1.5.
    correction.factor=TRUE,
    ### Should the overlap-dependent correction factor be applied? If FALSE,
    ### redundancy is assumed to equal coverage.
    ...
    ### Any other parameters accepted by plot().
    ){
  # Create environment
  Nonpareil.__init_globals(!new);

  # Examine consistency
  if(is.null(file)) stop("The file argument is mandatory");
  if(!new && (is.null(Nonpareil.LastFactor) || is.null(Nonpareil.LastXmax)))
    stop("No previous plot found, use new=TRUE if this is the first plot.");

  # Read metadata (.npo header)
  log_sampling <- 0;
  meta_data <- gsub('^# @', "", grep("^# @", readLines(file), value=TRUE));
  keys <- gsub(': .*', "", meta_data);
  vals <- gsub('.*: ', "", meta_data);
  type <- "alignment";
  if("ksize" %in% keys) type <- "kmer";
  if(type=="kmer"){
    if(is.null(ksize) & "ksize" %in% keys)
      ksize = as.numeric(vals[keys=="ksize"]);
    if(is.na(read.length) & "AL" %in% keys)
      read.length = as.numeric(vals[keys=="AL"]);
    overlap <- 0;
  }else{
    if(is.null(overlap) & "overlap" %in% keys)
      overlap = as.numeric(vals[keys=="overlap"]);
    if(is.null(overlap))
      stop("The overlap argument is mandatory for Nonpareil v < 2.0.");
    if(is.na(read.length) & "L" %in% keys)
      read.length = as.numeric(vals[keys=="L"]);
    ksize <- 1;
  }
  if(is.na(read.length)) read.length=101-ksize+1;
  num_reads = NULL;
  if("R" %in% keys) num_reads = as.numeric(vals[keys=="R"]);
  if("divide" %in% keys) log_sampling = as.numeric(vals[keys=="divide"]);
  if("logsampling" %in% keys)
    log_sampling = as.numeric(vals[keys=="logsampling"]);
  
  # Read input
  out <- list(kappa=0, C=0, LRstar=0, LR=0, modelR=0, diversity=0);
  a <- read.table(file, sep="\t", h=F);
  out$kappa <- a$V2[nrow(a)];
  if(is.null(num_reads)) num_reads = max(a$V1);
  LR <- exp(log(num_reads) + log(read.length));
  out$LR <- LR;
  cor.f <- 1.0;
  if(correction.factor) cor.f <- Nonpareil.coverageFactor(overlap)
  for(i in 2:6) a[, i] <- a[, i]^cor.f;
  if(useValue=="median"){
    values = a[, 5]
  }else if(useValue=="ub"){
    values = pmin(1, a[, 2] + a[, 3]*1.9)
  }else if(useValue=="lb"){
    values = pmax(0, a[, 2] - a[, 3]*1.9)
  }else if(useValue=="q1"){
    values = a[, 4]
  }else if(useValue=="q3"){
    values = a[, 6]
  }else{
    values = a[, 2]
  }
  a$V1 = exp(max(log(a$V1)) + max(values^0.27)*(log(a$V1) - max(log(a$V1))));
  a$V1 = exp(log(a$V1)*0.61 + 10);
  a$V1 = a$V1 * read.length / 101;
  horiz.diff = LR / max(a$V1);
  a$V1 = a$V1 * horiz.diff;##
  horiz.diff = 1;
  if(is.na(libname)) {
    libname <- basename(file);
    if(substr(libname, nchar(libname)-3, nchar(libname))==".npo")
      libname <- substr(libname, 0, nchar(libname)-4);
  }

  # Check data
  out$C <- max(values);
  if(data.consistency){
    twenty.pc = which.max(a$V1[a$V1<=0.5*max(a$V1)]);
    if(a[twenty.pc, 5]==0){
      warning(paste("Median of the curve is zero at 50% of the reads, check",
        "parameters and re-run (e.g., decrease value of -L in nonpareil)."));
      return(out);
    }
    if(a[nrow(a), 2]<=1e-5){
      warning(paste("Curve too low, check parameters and re-run (e.g.,",
        "decrease value of -L in nonpareil)."));
      return(out);
    }
    if(a[2, 2]>=1-1e-5){
      warning(paste("Curve too steep, check parameters and re-run (e.g.,",
        "decrease value of -i in nonpareil or use -d)."));
      return(out);
    }
  }

  # For future calls (of this or other Nonpareil.* functions)
  Nonpareil.LastFactor <<- factor;

  # Draw it
  if(plot){
    if(new){
      if(!is.null(xlab)){     xlab <- as.character(xlab) }
      else if(factor==1){     xlab <- 'Sequencing effort (bp)' }
      else if(factor==1e-1){  xlab <- 'Sequencing effort (Dbp)' }
      else if(factor==1e-2){  xlab <- 'Sequencing effort (Hbp)' }
      else if(factor==1e-3){  xlab <- 'Sequencing effort (Kbp)' }
      else if(factor==1e-6){  xlab <- 'Sequencing effort (Mbp)' }
      else if(factor==1e-9){  xlab <- 'Sequencing effort (Gbp)' }
      else if(factor==1e-12){ xlab <- 'Sequencing effort (Tbp)' }
      else if(factor==1e-15){ xlab <- 'Sequencing effort (Pbp)' }
      else if(factor==1e-18){ xlab <- 'Sequencing effort (Ebp)' }
      else if(factor==1e-21){ xlab <- 'Sequencing effort (Zbp)' }
      else{ xlab <- paste('Sequencing effort (by ', factor, 'bp)', sep='') }
      if(is.null(ylab))       ylab <- 'Estimated average coverage';

      plot(1, t='n', xlim=c(xmin, xmax), ylim=c(ymin, ymax), bty="l",
        xlab=xlab, ylab=ylab, xaxs="i", yaxs="i", log=log, ...);
      abline(h=c(1,star/100), lty=2, col='red');
      abline(v=10^seq(0,15,by=3), lty=2, col='gray80')
    }

    if(!is.na(col)){
      r <- col2rgb(col)['red',1]/256;
      g <- col2rgb(col)['green',1]/256;
      b <- col2rgb(col)['blue',1]/256;
    }
    if(is.na(r)) r <- sample(200,1);
    if(is.na(g)) g <- sample(200,1);
    if(is.na(b)) b <- sample(200,1);
    if(r>1) r <- r/256;
    if(g>1) g <- g/256;
    if(b>1) b <- b/256;

    if(!modelOnly){
      if(!is.na(plotDispersion)){
        if(plotDispersion == 'sd'){
          err.y <- c(values+a$V3, rev(values-a$V3));
        }else if(plotDispersion == 'ci95'){
          err.y <- c(values+a$V3*1.9, rev(values-a$V3*1.9));
        }else if(plotDispersion == 'ci90'){
          err.y <- c(values+a$V3*1.64, rev(values-a$V3*1.64));
        }else if(plotDispersion == 'ci50'){
          err.y <- c(values+a$V3*.67, rev(values-a$V3*.67));
        }else if(plotDispersion == 'iq'){
          err.y <- c(a$V4, rev(a$V6));
        }
        polygon(horiz.diff*c(a$V1, rev(a$V1))*factor, border=NA,
          ifelse(err.y<=ymin*0.1, ymin*0.1, err.y), col=rgb(r,g,b,.2));
      }
      lines(horiz.diff*a$V1*factor, values, col=rgb(r,g,b,curve.alpha),
        lwd=curve.lwd);
    }

    # Save some info
    Nonpareil.LastXmax   <<- xmax;
    Nonpareil.OpenColors <<- c(Nonpareil.OpenColors, rgb(r,g,b));
    Nonpareil.OpenNames  <<- c(Nonpareil.OpenNames, libname);
  }

  # Model it
  sel  <- values>0 & values<0.9;
  x <- a$V1[sel];
  if((length(x)>10) | ! data.consistency){
    y <- values[sel];
    data <- list(x=x, y=y)
    if(is.na(weights.exp[1])){
      if(log_sampling==0){ weights.exp <- c(-1.1,-1.2,-0.9,-1.3,-1) }
      else{ weights.exp <- c(0, 1, -1, 1.3, -1.1, 1.5, -1.5) }
    }
    estModel <- TRUE;
    weights.i <- 0;
    while(estModel & !is.na(weights.exp[weights.i+1])){
      weights.i <- weights.i+1;
      suppressWarnings(model <- nls(y ~ Nonpareil.f(x, a, b), data=data,
        weights=(a$V3[sel]^weights.exp[weights.i]), start=list(a=1, b=0.1),
        lower=c(a=0, b=0), algorithm='port', control=nls.control(
          minFactor=1e-25000, tol=1e-15, maxiter=1024, warnOnly=T)));
      if(summary(model)$convInfo$isConv) estModel <- FALSE;
    }
    if(summary(model)$convInfo$isConv){
      model.lty=2;
      if(modelOnly) model.lty=1;
      if(plot & plotModel){
        model.x <- exp(seq(log(xmin), log(xmax), length.out=1e3));
        model.y <- predict(model, list(x=model.x));
        model.x <- model.x*horiz.diff*factor;
        lines(model.x, model.y, col=rgb(r,g,b,model.alpha), lty=model.lty,
          lwd=model.lwd);
        if(modelOnly)
          points(max(a$V1), predict(model, list(x=max(a$V1))), col=rgb(r,g,b),
            pch=21, bg='white');
      }
      if(returnModelValues) { out$model.x <- model.x ; out$model.y <- model.y; }
      if(returnModelParameters) { out$model=model }
      pa <- coef(model)[1];
      pb <- coef(model)[2];
      if(pa > 1) out$diversity <- (pa-1)/pb;
      if(plot & plotDiversity & out$diversity>0)
        arrows(x0=exp(out$diversity),
          y0=ifelse(log=='y' | log=='xy' | log=='yx', ymin*0.1, ymin-1),
          y1=ymin, length=0.1, col=rgb(r,g,b,model.alpha));
      out$LRstar <- Nonpareil.antif(star/100, pa, pb);
      out$modelR <- cor(y, predict(model, list(x=x)));
    }else{
      warning("Model didn't converge.");
    }
  }else{
    warning("Insufficient resolution below 90% coverage, skipping model");
  }
  return(out);
  ### A list with the following entries:
  ###
  ### kappa: "Redundancy" value of the entire dataset.
  ###
  ### C: Average coverage of the entire dataset.
  ###
  ### LRstar: Estimated sequencing effort required to reach the objective
  ###   average coverage (star, 95% by default).
  ###
  ### LR: Actual sequencing effort of the dataset.
  ###
  ### modelR: Pearson's R coefficient betweeen the rarefied data and the
  ###   projected model.
  ###
  ### diversity: Nonpareil-based diversity index. This value's units are the
  ###   natural logarithm of the units of sequencing effort (by default,
  ###   log-bp), and indicates the inflection point of the fitted model for the
  ###   Nonpareil curve. If the fit doesn't converge, or the model is not
  ###   estimated, the value is zero (0).
  ###
  ### model.x (if retturnModelValues=TRUE): Values of sequencing effort in the
  ###   projected model.
  ###
  ### model.y (if retturnModelValues=TRUE): Values of average coverage in the
  ###   projected model.
  ###
  ### model (if returnModelParameters=TRUE): Fitted model.
  ###
}

Nonpareil.legend <- function(
    ### Generates a legend for Nonpareil plots, after calling Nonpareil.curve()
    ### or Nonpareil.curve.batch().
    x=NULL,
    ### X coordinate, or any character string accepted by legend (e.g.,
    ### 'bottomright').
    y=.3,
    ### Y coordinate.
    ...
    ### Any other parameters supported by legend().
    ){
  if(is.null(Nonpareil.LastFactor) || is.null(Nonpareil.LastXmax))
    stop(paste("There must be at least one active plot to draw a legend. Use",
      "Nonpareil.curve()."));
  if(is.null(x)) x <- 0.75*Nonpareil.LastXmax;
  legend(x=x, y=y, legend=Nonpareil.OpenNames, fill=Nonpareil.OpenColors, ...);
}

## PREDICTIONS
Nonpareil.coverageFactor <- function(
    ### Factor to transform redundancy into coverage (internal function).
    overlap
    ### Value of overlap (-L).
    ){
  if(overlap==0) overlap <- 50 # for type kmer
  return(1-exp(2.23E-2*overlap-3.5698))

  ### A numeric scalar.
}