#' Run Independent Component Analysis on gene expression
#'
#' Runs one of the ICA algorithms (icafast, icaimax, or icajade) from the ica
#' package for ICA dimensionality reduction.
#' For details about stored ICA calculation parameters, see
#' \code{PrintICAParams}.
#'
#' @param object Seurat object
#' @param ic.genes Genes to use as input for ICA. Default is object@@var.genes
#' @param ics.compute Number of ICs to compute
#' @param use.imputed Run ICA on imputed values (FALSE by default)
#' @param rev.ica By default, computes the dimensional reduction on the cell x
#' gene matrix. Setting to true will compute it on the transpose (gene x cell
#' matrix).
#' @param print.results Print the top genes associated with each dimension
#' @param ics.print ICs to print genes for
#' @param genes.print Number of genes to print for each IC
#' @param ica.function ICA function from ica package to run (options: icafast,
#' icaimax, icajade)
#' @param seed.use Random seed to use for the ICA computation
#' @param reduction.name dimensional reduction name, specifies the position in the object@@dr list. ica by default
#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. IC by default
#' @param \dots Additional arguments to be passed to the chosen ICA function
#'
#' @importFrom methods new
#' @importFrom ica icafast icaimax icajade
#'
#' @return Returns Seurat object with an ICA calculation stored in
#' object@@dr$ica
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # Run ICA on variable genes (default)
#' pbmc_small <- RunICA(pbmc_small, ics.compute=5)
#' # Run ICA on different gene set (in this case all genes)
#' pbmc_small <- RunICA(pbmc_small, ic.genes = rownames(pbmc_small@data))
#' # Plot results
#' ICAPlot(pbmc_small)
#'
RunICA <- function(
  object,
  ic.genes = NULL,
  ics.compute = 50,
  use.imputed = FALSE,
  rev.ica = FALSE,
  print.results = TRUE,
  ics.print = 1:5,
  genes.print = 50,
  ica.function = "icafast",
  seed.use = 1,
  reduction.name = "ica",
  reduction.key = "IC",
  ...
) {
  data.use <- PrepDR(
    object = object,
    genes.use = ic.genes,
    use.imputed = use.imputed)
  set.seed(seed = seed.use)
  ics.compute <- min(ics.compute, ncol(x = data.use))
  ica.fxn <- match.fun(FUN = ica.function)
  if (rev.ica) {
    ica.results <- ica.fxn(data.use, nc = ics.compute, ...)
    cell.embeddings <- ica.results$M
  } else {
    ica.results <- ica.fxn(t(x = data.use), nc = ics.compute, ...)
    cell.embeddings <- ica.results$S
  }
  gene.loadings <- as.matrix(x = data.use) %*% as.matrix(x = cell.embeddings)
  colnames(x = gene.loadings) <- paste0(reduction.key, 1:ncol(x = gene.loadings))
  colnames(x = cell.embeddings) <- paste0(reduction.key, 1:ncol(x = cell.embeddings))
  ica.obj <- new(
    Class = "dim.reduction",
    gene.loadings = gene.loadings,
    cell.embeddings = cell.embeddings,
    sdev = sqrt(x = ica.results$vafs),
    key = "IC"
  )

  object@dr[[reduction.name]] <- ica.obj
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunICA"))]
  object <- SetCalcParams(object = object, calculation = "ICA", ... = parameters.to.store)
  if (is.null(object@calc.params$ICA$ic.genes)) {
    object@calc.params$ICA$ic.genes <- rownames(data.use)
  }
  if (print.results) {
    PrintDim(object = object, dims.print = ics.print, genes.print = genes.print, reduction.type = reduction.name)
  }
  return(object)
}
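
# Illustrative sketch (hypothetical helper, not part of the package): the
# loading projection RunICA performs above. Gene loadings come from projecting
# the (gene x cell) expression matrix onto the cell embeddings.
ProjectGeneLoadingsSketch <- function(data.use, cell.embeddings) {
  # data.use: genes x cells; cell.embeddings: cells x components
  gene.loadings <- as.matrix(x = data.use) %*% as.matrix(x = cell.embeddings)
  colnames(x = gene.loadings) <- colnames(x = cell.embeddings)
  return(gene.loadings)
}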

#' Run diffusion map
#'
#' NOTE: Prior to v2.3.4, this function used the R package diffusionMap to compute
#' the diffusion map components. This package was being archived and thus
#' RunDiffusion now uses the destiny package for the diffusion computations.
#' Please be aware that this will result in different default values as the two
#' underlying package implementations are different.
#'
#' @param object Seurat object
#' @param cells.use Which cells to analyze (default, all cells)
#' @param dims.use Which dimensions to use as input features
#' @param genes.use If set, run the diffusion map procedure on this subset of
#' genes (instead of running on a set of reduced dimensions). Not set (NULL) by
#' default
#' @param reduction.use Which dimensional reduction (PCA or ICA) to use for the
#' diffusion map input. Default is PCA
#' @param q.use Quantile to clip diffusion map components at. This addresses an
#' issue where 1-2 cells will have extreme values that obscure all other points.
#' 0.01 by default
#' @param max.dim Max dimension to keep from diffusion calculation
#' @param scale.clip Max/min value for scaled data. Default is 10
#' @param reduction.name dimensional reduction name, specifies the position in
#' the object@@dr list. dm by default
#' @param reduction.key dimensional reduction key, specifies the string before
#' the number for the dimension names. DM by default
#' @param ... Additional arguments to the DiffusionMap call
#'
#' @return Returns a Seurat object with a diffusion map
#'
#' @importFrom utils installed.packages
#' @importFrom stats dist quantile
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small
#' # Run diffusion map on variable genes
#' pbmc_small <- RunDiffusion(pbmc_small, genes.use = pbmc_small@var.genes)
#' # Run diffusion map on the first 10 PCs
#' pbmc_small <- RunDiffusion(pbmc_small, dims.use = 1:10)
#' # Plot results
#' DMPlot(pbmc_small)
#' }
#'
RunDiffusion <- function(
  object,
  cells.use = NULL,
  dims.use = 1:5,
  genes.use = NULL,
  reduction.use = 'pca',
  q.use = 0.01,
  max.dim = 2,
  scale.clip = 10,
  reduction.name = "dm",
  reduction.key = "DM",
  ...
) {
  # Check for destiny
  if (!'destiny' %in% rownames(x = installed.packages())) {
    stop("Please install destiny - learn more at https://bioconductor.org/packages/release/bioc/html/destiny.html")
  }
  cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data))
  if (is.null(x = genes.use)) {
    dim.code <- GetDimReduction(
      object = object,
      reduction.type = reduction.use,
      slot = 'key'
    )
    dim.codes <- paste0(dim.code, dims.use)
    data.use <- FetchData(object = object, vars.all = dim.codes)
  }
  if (! is.null(x = genes.use)) {
    genes.use <- intersect(x = genes.use, y = rownames(x = object@scale.data))
    data.use <- MinMax(
      data = t(x = object@scale.data[genes.use, cells.use]),
      min = -1 * scale.clip,
      max = scale.clip
    )
  }
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunDiffusion"))]
  object <- SetCalcParams(object = object,
                          calculation = "RunDiffusion",
                          ... = parameters.to.store)
  data.dist <- dist(data.use)
  data.diffusion <- data.frame(
    destiny::DiffusionMap(data = as.matrix(data.dist),
                          n_eigs = max.dim, ...)@eigenvectors
  )
  colnames(x = data.diffusion) <- paste0(reduction.key, 1:ncol(x = data.diffusion))
  rownames(x = data.diffusion) <- cells.use
  for (i in 1:max.dim) {
    x <- data.diffusion[, i]
    x <- MinMax(
      data = x,
      min = quantile(x = x, probs = q.use),
      max = quantile(x = x, probs = 1 - q.use)
    )
    data.diffusion[, i] <- x
  }
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.name,
    slot = "cell.embeddings",
    new.data = as.matrix(x = data.diffusion)
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.name,
    slot = "key",
    new.data = "DM"
  )
  return(object)
}
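
# Illustrative sketch (hypothetical helper, not part of the package): the
# per-component quantile clipping used in RunDiffusion above. Values beyond
# the q.use and 1 - q.use quantiles are pinned to those quantiles so a handful
# of extreme cells cannot dominate the embedding.
ClipToQuantilesSketch <- function(x, q.use = 0.01) {
  bounds <- quantile(x = x, probs = c(q.use, 1 - q.use))
  return(pmin(pmax(x, bounds[[1]]), bounds[[2]]))
}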

#' Run PHATE
#'
#' PHATE is a data reduction method specifically designed for visualizing
#' **high** dimensional data in **low** dimensional spaces.
#' To run, you must first install the `phate` python
#' package (e.g. via pip install phate). Details on this package can be
#' found here: \url{https://github.com/KrishnaswamyLab/PHATE}. For a more in depth
#' discussion of the mathematics underlying PHATE, see the bioRxiv paper here:
#' \url{https://www.biorxiv.org/content/early/2017/12/01/120378}.
#'
#' @param object Seurat object
#' @param cells.use Which cells to analyze (default, all cells)
#' @param genes.use If set, run PHATE on this subset of genes.
#' Not set (NULL) by default
#' @param assay.type Assay to pull data for (default: 'RNA')
#' @param max.dim Total number of dimensions to embed in PHATE.
#' @param k int, optional, default: 15
#' number of nearest neighbors on which to build kernel
#' @param alpha int, optional, default: 10
#' sets decay rate of kernel tails.
#' If NA, alpha decaying kernel is not used
#' @param use.alpha boolean, default: NA
#' forces the use of alpha decaying kernel
#' If NA, alpha decaying kernel is used for small inputs
#' (n_samples < n_landmark) and not used otherwise
#' @param n.landmark int, optional, default: 2000
#' number of landmarks to use in fast PHATE
#' @param potential.method string, optional, default: 'log'
#' choose from 'log' and 'sqrt'
#' which transformation of the diffusional operator is used
#' to compute the diffusion potential
#' @param t int, optional, default: 'auto'
#' power to which the diffusion operator is powered
#' sets the level of diffusion
#' @param knn.dist.method string, optional, default: 'euclidean'.
#' The desired distance function for calculating pairwise distances on the data.
#' If 'precomputed', `data` is treated as a
#' (n_samples, n_samples) distance or affinity matrix
#' @param mds.method string, optional, default: 'metric'
#' choose from 'classic', 'metric', and 'nonmetric'
#' which MDS algorithm is used for dimensionality reduction
#' @param mds.dist.method string, optional, default: 'euclidean'
#' recommended values: 'euclidean' and 'cosine'
#' @param t.max int, optional, default: 100.
#' Maximum value of t to test for automatic t selection.
#' @param npca int, optional, default: 100
#' Number of principal components to use for calculating
#' neighborhoods. For extremely large datasets, using
#' n_pca < 20 allows neighborhoods to be calculated in
#' log(n_samples) time.
#' @param plot.optimal.t boolean, optional, default: FALSE
#' If TRUE, produce a plot showing the Von Neumann Entropy
#' curve for automatic t selection.
#' @param verbose `int` or `boolean`, optional (default : 1)
#' If `TRUE` or `> 0`, print verbose updates.
#' @param n.jobs `int`, optional (default: 1)
#' The number of jobs to use for the computation.
#' If -1 all CPUs are used. If 1 is given, no parallel computing code is
#' used at all, which is useful for debugging.
#' For n_jobs below -1, (n.cpus + 1 + n.jobs) are used. Thus for
#' n_jobs = -2, all CPUs but one are used
#' @param seed.use int or `NA`, random state (default: `NA`)
#' @param reduction.name dimensional reduction name, specifies the position in
#' the object@@dr list. phate by default
#' @param reduction.key dimensional reduction key, specifies the string before
#' the number for the dimension names. PHATE by default
#' @param ... Additional arguments for `phateR::phate`
#'
#' @return Returns a Seurat object containing a PHATE representation
#'
#' @importFrom utils installed.packages
#' @export
#'
#' @references Moon K, van Dijk D, Wang Z, Burkhardt D, Chen W, van den Elzen A,
#' Hirn M, Coifman R, Ivanova N, Wolf G and Krishnaswamy S (2017).
#' "Visualizing Transitions and Structure for High Dimensional Data
#' Exploration." _bioRxiv_, pp. 120378. doi: 10.1101/120378
#' (URL: http://doi.org/10.1101/120378),
#' <URL: https://www.biorxiv.org/content/early/2017/12/01/120378>.
#' @examples
#' if (reticulate::py_module_available("phate")) {
#'
#' # Load data
#' pbmc_small
#'
#' # Run PHATE with default parameters
#' pbmc_small <- RunPHATE(object = pbmc_small)
#' # Plot results
#' DimPlot(object = pbmc_small, reduction.use = 'phate')
#'
#' # Try smaller `k` for a small dataset, and larger `t` for a noisy embedding
#' pbmc_small <- RunPHATE(object = pbmc_small, k = 4, t = 12)
#' # Plot results
#' DimPlot(object = pbmc_small, reduction.use = 'phate')
#'
#' # For increased emphasis on local structure, use sqrt potential
#' pbmc_small <- RunPHATE(object = pbmc_small, potential.method='sqrt')
#' # Plot results
#' DimPlot(object = pbmc_small, reduction.use = 'phate')
#' }
#'
RunPHATE <- function(
  object,
  cells.use = NULL,
  genes.use = NULL,
  assay.type = 'RNA',
  max.dim = 2L,
  k = 15,
  alpha = 10,
  use.alpha = NA,
  n.landmark = 2000,
  potential.method = "log",
  t = "auto",
  knn.dist.method = "euclidean",
  mds.method = "metric",
  mds.dist.method = "euclidean",
  t.max = 100,
  npca = 100,
  plot.optimal.t = FALSE,
  verbose = 1,
  n.jobs = 1,
  seed.use = NA,
  reduction.name = "phate",
  reduction.key = "PHATE",
  ...
) {
  if (!'phateR' %in% rownames(x = installed.packages())) {
    stop("Please install phateR")
  }
  data.use <- GetAssayData(object, assay.type = assay.type, slot = "scale.data")
  if (!is.null(x = cells.use)) {
    data.use <- data.use[, cells.use]
  }
  if (!is.null(x = genes.use)) {
    data.use <- data.use[genes.use, ]
  }
  data.use <- t(x = data.use)
  parameters.to.store <- as.list(x = environment(), all = TRUE)[names(x = formals(fun = "RunPHATE"))]
  object <- SetCalcParams(
    object = object,
    calculation = "RunPHATE",
    ... = parameters.to.store
  )
  phate_output <- phateR::phate(
    data.use,
    ndim = max.dim,
    k = k,
    alpha = alpha,
    use.alpha = use.alpha,
    n.landmark = n.landmark,
    potential.method = potential.method,
    t = t,
    knn.dist.method = knn.dist.method,
    init = NULL,
    mds.method = mds.method,
    mds.dist.method = mds.dist.method,
    t.max = t.max,
    npca = npca,
    plot.optimal.t = plot.optimal.t,
    verbose = verbose,
    n.jobs = n.jobs,
    seed = seed.use,
    ...
  )
  phate_output <- as.matrix(x = phate_output)
  colnames(x = phate_output) <- paste0(reduction.key, 1:ncol(x = phate_output))
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.name,
    slot = "cell.embeddings",
    new.data = phate_output
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.name,
    slot = "key",
    new.data = reduction.key
  )
  return(object)
}

# Check group exists either as an ident or that all cells passed as vector are
# present
#
# @param object    Seurat object
# @param group     Identity or vector of cell names
# @param group.id  Corresponds to either the group1 or group2 parameter from RunCCA
#
CheckGroup <- function(object, group, group.id) {
  if (all(group %in% unique(x = object@ident))) {
    cells.use <- WhichCells(object = object, ident = group)
  } else {
    if (all(group %in% object@cell.names)) {
      cells.use <- group
    } else {
      stop(paste(
        group.id,
        "must be either a vector of valid cell names or idents"
      ))
    }
  }
  if (length(cells.use) == 0) {
    stop(paste0("No cells present in group: ", group.id))
  }
  return(cells.use)
}

#' Gene expression markers of identity classes defined by a phylogenetic clade
#'
#' Finds markers (differentially expressed genes) based on a branching point (node) in
#' the phylogenetic tree. Markers that define clusters in the left branch are positive markers.
#' Markers that define the right branch are negative markers.
#'
#' @inheritParams FindMarkers
#' @param node The node in the phylogenetic tree to use as a branch point
#' @param tree.use Can optionally pass the tree to be used. Default uses the tree in object@@cluster.tree
#' @param assay.type Type of assay to fetch data for (default is RNA)
#' @param ... Additional arguments passed to FindMarkers
#'
#' @return Matrix containing a ranked list of putative markers, and associated
#' statistics (p-values, ROC score, etc.)
#'
#' @export
#'
#' @examples
#' FindMarkersNode(pbmc_small, 5)
#'
FindMarkersNode <- function(
  object,
  node,
  tree.use = NULL,
  genes.use = NULL,
  logfc.threshold = 0.25,
  test.use = "wilcox",
  assay.type = "RNA",
  ...
) {
  data.use <- GetAssayData(
    object = object,
    assay.type = assay.type
  )
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = data.use))
  tree <- SetIfNull(x = tree.use, default = object@cluster.tree[[1]])
  ident.order <- tree$tip.label
  nodes.1 <- ident.order[GetLeftDescendants(tree = tree, node = node)]
  nodes.2 <- ident.order[GetRightDescendants(tree = tree, node = node)]
  to.return <- FindMarkers(
    object = object,
    assay.type = assay.type,
    ident.1 = nodes.1,
    ident.2 = nodes.2,
    genes.use = genes.use,
    logfc.threshold = logfc.threshold,
    test.use = test.use,
    ...
  )
  return(to.return)
}
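
# Illustrative sketch (hypothetical helper, not part of the package): how the
# two clades below a branch point can be recovered from an ape phylo object,
# mirroring what GetLeftDescendants and GetRightDescendants supply to
# FindMarkersNode above. Assumes the standard phylo edge matrix (column 1 =
# parent, column 2 = child, tips numbered 1..Ntip).
NodeSplitSketch <- function(tree, node) {
  children <- tree$edge[tree$edge[, 1] == node, 2]
  tips.under <- function(n) {
    if (n <= length(x = tree$tip.label)) {
      return(tree$tip.label[n])
    }
    return(unlist(x = lapply(
      X = tree$edge[tree$edge[, 1] == n, 2],
      FUN = tips.under
    )))
  }
  return(list(left = tips.under(children[1]), right = tips.under(children[2])))
}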

globalVariables(names = c('myAUC', 'p_val'), package = 'Seurat', add = TRUE)
#' Find all markers for a node
#'
#' This function finds markers for all splits at or below the specified node
#'
#' @param object Seurat object. Must have object@@cluster.tree slot filled. Use BuildClusterTree() if not.
#' @param node Node from which to start identifying split markers, default is top node.
#' @param genes.use Genes to test. Default is to use all genes
#' @param logfc.threshold Limit testing to genes which show, on average, at least
#' X-fold difference (log-scale) between the two groups of cells.
#' @param test.use Denotes which test to use. Seurat currently implements
#' "wilcox" (Wilcoxon rank sum test, default), "bimod" (likelihood-ratio test
#' for single cell gene expression, McDavid et al., Bioinformatics, 2013),
#' "roc" (standard AUC classifier), "t" (Student's t-test), "tobit"
#' (Tobit-test for differential gene expression, as in Trapnell et al., Nature
#' Biotech, 2014), "poisson", and "negbinom". The latter two options should
#' only be used on UMI datasets, and assume an underlying
#' poisson or negative-binomial distribution.
#' @param min.pct Only test genes that are detected in a minimum fraction of min.pct cells
#' in either of the two populations. Meant to speed up the function by not testing genes that are very infrequently expressed
#' @param min.diff.pct Only test genes that show a minimum difference in the fraction of detection between the two groups. Set to 0.05 by default
#' @param only.pos Only return positive markers (FALSE by default)
#' @param print.bar Print a progress bar once expression testing begins (uses pbapply to do this)
#' @param max.cells.per.ident Down sample each identity class to a max number. Default is no downsampling.
#' @param random.seed Random seed for downsampling
#' @param return.thresh Only return markers that have a p-value < return.thresh, or a power > return.thresh (if the test is ROC)
#' @param do.print Print status updates
#' @param min.cells.gene Minimum number of cells expressing the gene in at least one
#' of the two groups, currently only used for poisson and negative binomial tests
#' @param min.cells.group Minimum number of cells in one of the groups
#' @param assay.type Type of assay to fetch data for (default is RNA)
#' @param \dots Additional parameters to pass to specific DE functions
#'
#' @return Returns a dataframe with a ranked list of putative markers for each node and associated statistics
#'
#' @importFrom ape drop.tip
#'
#' @export
#'
#' @examples
#' pbmc_small
#'
#' FindAllMarkersNode(pbmc_small)
#'
FindAllMarkersNode <- function(
  object,
  node = NULL,
  genes.use = NULL,
  logfc.threshold = 0.25,
  test.use = "wilcox",
  min.pct = 0.1,
  min.diff.pct = 0.05,
  print.bar = TRUE,
  only.pos = FALSE,
  max.cells.per.ident = Inf,
  return.thresh = 1e-2,
  do.print = FALSE,
  random.seed = 1,
  min.cells.gene = 3,
  min.cells.group = 3,
  assay.type = "RNA",
  ...
) {
  if (length(object@cluster.tree) == 0) {
    stop("Tree hasn't been built yet. Run BuildClusterTree to build.")
  }
  data.use <- GetAssayData(object = object, assay.type = assay.type, slot = "data")
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = data.use))
  node <- SetIfNull(x = node, default = object@cluster.tree[[1]]$edge[1, 1])
  tree.use <- object@cluster.tree[[1]]
  descendants <- DFT(tree = tree.use, node = node, path = NULL, include.children = TRUE)
  all.children <- sort(x = tree.use$edge[,2][!tree.use$edge[,2] %in% tree.use$edge[,1]])
  descendants <- MapVals(v = descendants, from = all.children, to = tree.use$tip.label)
  drop.children <- setdiff(tree.use$tip.label, descendants)
  keep.children <- setdiff(tree.use$tip.label, drop.children)
  orig.nodes <- c(node, as.numeric(setdiff(descendants, keep.children)))
  tree.use <- drop.tip(tree.use, drop.children)
  new.nodes <- unique(tree.use$edge[,1])
  if ((test.use == 'roc') && (return.thresh == 1e-2)) {
    return.thresh <- 0.7
  }
  genes.de <- list()
  for (i in ((tree.use$Nnode + 2):max(tree.use$edge))) {
    if (do.print) {
      message(paste("Calculating node", i))
    }
    genes.de[[i]] <- FindMarkersNode(
      object = object,
      assay.type = assay.type,
      node = i,
      tree.use = tree.use,
      genes.use = genes.use,
      logfc.threshold = logfc.threshold,
      test.use = test.use,
      min.pct = min.pct,
      min.diff.pct = min.diff.pct,
      print.bar = print.bar,
      only.pos = only.pos,
      max.cells.per.ident = max.cells.per.ident,
      random.seed = random.seed,
      min.cells.gene = min.cells.gene,
      min.cells.group = min.cells.group
    )
  }
  gde.all <- data.frame()
  for (i in ((tree.use$Nnode + 2):max(tree.use$edge))) {
    if (is.null(x = unlist(x = genes.de[i]))) {
      next
    }
    gde <- genes.de[[i]]
    if (nrow(x = gde) > 0) {
      if (test.use == 'roc') {
        gde <- subset(
          x = gde,
          subset = (myAUC > return.thresh | myAUC < (1 - return.thresh))
        )
      }
      if ((test.use == 'bimod') || (test.use == 't')) {
        gde <- gde[order(gde$p_val, -gde$avg_logFC), ]
        gde <- subset(x = gde, subset = p_val < return.thresh)
      }
      if (nrow(x = gde) > 0) {
        gde$cluster <- i
        gde$gene <- rownames(x = gde)
      }
      if (nrow(x = gde) > 0) {
        gde.all <- rbind(gde.all, gde)
      }
    }
  }
  gde.all$cluster <- MapVals(
    v = gde.all$cluster,
    from = new.nodes,
    to = orig.nodes
  )
  return(gde.all)
}

#' Negative binomial test for UMI-count based data (regularized version)
#'
#' Identifies differentially expressed genes between two groups of cells using
#' a likelihood ratio test of negative binomial generalized linear models where
#' the overdispersion parameter theta is determined by pooling information
#' across genes.
#'
#' @inheritParams FindMarkers
#' @param object Seurat object
#' @param cells.1 Group 1 cells
#' @param cells.2 Group 2 cells
#' @param genes.use Genes to use for test
#' @param latent.vars Latent variables to test
#' @param print.bar Print progress bar
#' @param min.cells Minimum number of cells threshold
#' @param assay.type Type of assay to fetch data for (default is RNA)
#'
#' @return Returns a p-value ranked data frame of test results.
#'
#' @importFrom stats p.adjust
#' @importFrom utils txtProgressBar setTxtProgressBar
#'
#' @export
#'
#' @examples
#' # Note, not recommended for particularly small datasets - expect warnings
#' NegBinomRegDETest(
#'   object = pbmc_small,
#'   cells.1 = WhichCells(object = pbmc_small, ident = 1),
#'   cells.2 = WhichCells(object = pbmc_small, ident = 2)
#' )
#'
NegBinomRegDETest <- function(
  object,
  cells.1,
  cells.2,
  genes.use = NULL,
  latent.vars = NULL,
  print.bar = TRUE,
  min.cells = 3,
  assay.type = "RNA"
) {
  if (!is.null(genes.use)) {
    message('Make sure that genes.use contains mostly genes that are not expected ',
            'to be differentially expressed to allow unbiased theta estimation')
  }
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = GetAssayData(object = object, assay.type = assay.type, slot = "data")))
  # check that the genes made it through any filtering that was done
  genes.use <- genes.use[genes.use %in% rownames(x = GetAssayData(object = object, assay.type = assay.type, slot = "data"))]
  message(
    sprintf(
      'NegBinomRegDETest for %d genes and %d and %d cells',
      length(x = genes.use),
      length(x = cells.1),
      length(x = cells.2)
    )
  )
  grp.fac <- factor(
    x = c(
      rep.int(x = 'A', times = length(x = cells.1)),
      rep.int(x = 'B', times = length(x = cells.2))
    )
  )
  to.test.data <- GetAssayData(object = object, assay.type = assay.type, slot = "raw.data")[genes.use, c(cells.1, cells.2), drop = FALSE]
  message('Calculating mean per gene per group')
  above.threshold <- pmax(
    apply(X = to.test.data[, cells.1] > 0, MARGIN = 1, FUN = mean),
    apply(X = to.test.data[, cells.2] > 0, MARGIN = 1, FUN = mean)
  ) >= 0.02
  message(
    sprintf(
      '%d genes are detected in at least 2%% of the cells in at least one of the groups and will be tested',
      sum(above.threshold)
    )
  )
  genes.use <- genes.use[above.threshold]
  to.test.data <- to.test.data[genes.use, , drop = FALSE]
  my.latent <- FetchData(
    object = object,
    vars.all = latent.vars,
    cells.use = c(cells.1, cells.2),
    use.raw = TRUE
  )
  to.test <- data.frame(my.latent, row.names = c(cells.1, cells.2))
  message(paste('Latent variables are', paste(latent.vars, collapse = " ")))
  # get regularized theta (ignoring group factor)
  theta.fit <- RegularizedTheta(
    cm = to.test.data,
    latent.data = to.test,
    min.theta = 0.01,
    bin.size = 128
  )
  message('Running NB regression model comparison')
  to.test$NegBinomRegDETest.group <- grp.fac
  bin.size <- 128
  bin.ind <- ceiling(1:length(x = genes.use) / bin.size)
  max.bin <- max(bin.ind)
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
  res <- c()
  for (i in 1:max.bin) {
    genes.bin.use <- genes.use[bin.ind == i]
    bin.out.lst <- parallel::mclapply(
      X = genes.bin.use,
      FUN = function(j) {
        return(NBModelComparison(
          y = to.test.data[j, ],
          theta = theta.fit[j],
          latent.data = to.test,
          com.fac = latent.vars,
          grp.fac = 'NegBinomRegDETest.group'
        ))
      }
    )
    res <- rbind(res, do.call(rbind, bin.out.lst))
    setTxtProgressBar(pb = pb, value = i)
  }
  close(pb)
  rownames(res) <- genes.use
  res <- as.data.frame(x = res)
  res$adj.pval <- p.adjust(p = res$pval, method = 'fdr')
  res <- res[order(res$pval, -abs(x = res$log2.fc)), ]
  return(res)
}
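
# Illustrative sketch (hypothetical names, not the Seurat API): the per-gene
# model comparison that NBModelComparison performs for NegBinomRegDETest
# above. Fit negative binomial GLMs with and without the group factor at a
# fixed, regularized theta, then compare them with a likelihood ratio test.
NBGroupLRTSketch <- function(y, latent.data, group, theta) {
  full <- glm(
    formula = y ~ .,
    data = cbind(latent.data, group = group),
    family = MASS::negative.binomial(theta = theta)
  )
  reduced <- glm(
    formula = y ~ .,
    data = latent.data,
    family = MASS::negative.binomial(theta = theta)
  )
  # twice the log-likelihood difference is chi-squared under the null, with
  # df equal to the difference in the number of fitted coefficients
  lr.stat <- 2 * as.numeric(x = logLik(full) - logLik(reduced))
  df.diff <- attr(logLik(full), 'df') - attr(logLik(reduced), 'df')
  return(pchisq(q = lr.stat, df = df.diff, lower.tail = FALSE))
}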

globalVariables(names = 'i', package = 'Seurat', add = TRUE)
# Regress out technical effects and cell cycle
#
# Remove unwanted effects from scale.data
#
# @keywords internal
# @param object Seurat object
# @param vars.to.regress effects to regress out
# @param genes.regress genes to run regression for (default is all genes)
# @param model.use Use a linear model or generalized linear model (poisson, negative binomial) for the regression. Options are 'linear' (default), 'poisson', and 'negbinom'
# @param use.umi Regress on UMI count data. Default is FALSE for linear modeling, but automatically set to TRUE if model.use is 'negbinom' or 'poisson'
# @param verbose display progress bar for regression procedure.
# @param do.par use parallel processing for regressing out variables faster.
# If set to TRUE, will use half of the machine's available cores (FALSE by default)
# @param num.cores If do.par = TRUE, specify the number of cores to use.
#
# @return Returns the residuals from the regression model
#
#' @import Matrix
#' @import doSNOW
#' @importFrom stats as.formula lm residuals glm
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom foreach foreach %dopar%
#
RegressOutResid <- function(
  object,
  vars.to.regress,
  genes.regress = NULL,
  model.use = 'linear',
  use.umi = FALSE,
  verbose = TRUE,
  do.par = FALSE,
  num.cores = 1
) {
  possible.models <- c("linear", "poisson", "negbinom")
  if (!model.use %in% possible.models) {
    stop(
      paste0(
        model.use,
        " is not a valid model. Please use one the following: ",
        paste0(possible.models, collapse = ", "),
        "."
      )
    )
  }
  genes.regress <- SetIfNull(x = genes.regress, default = rownames(x = object@data))
  genes.regress <- intersect(x = genes.regress, y = rownames(x = object@data))
  latent.data <- FetchData(object = object, vars.all = vars.to.regress)
  bin.size <- ifelse(test = model.use == 'negbinom', yes = 5, no = 100)
  bin.ind <- ceiling(x = 1:length(x = genes.regress) / bin.size)
  max.bin <- max(bin.ind)
  if (verbose) {
    message(paste("Regressing out:", paste(vars.to.regress, collapse = ", ")))
    pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
  }
  data.resid <- c()
  data.use <- object@data[genes.regress, , drop = FALSE]
  if (model.use != "linear") {
    use.umi <- TRUE
  }
  if (use.umi) {
    data.use <- object@raw.data[genes.regress, object@cell.names, drop = FALSE]
  }
  # input checking for parallel options
  if (do.par) {
    if (num.cores == 1) {
      num.cores <- detectCores() / 2
    } else if (num.cores > detectCores()) {
      num.cores <- detectCores() - 1
      warning(paste0("num.cores set greater than number of available cores(", detectCores(), "). Setting num.cores to ", num.cores, "."))
    }
  } else if (num.cores != 1) {
    num.cores <- 1
    warning("For parallel processing, please set do.par to TRUE.")
  }
  cl <- parallel::makeCluster(num.cores)#, outfile = "")
  # using doSNOW library because it supports progress bar update
  registerDoSNOW(cl)
  opts <- list()
  if (verbose) {
    # define progress bar function
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    time_elapsed <- Sys.time()
  }
  reg.mat.colnames <- c(colnames(x = latent.data), "GENE")
  fmla_str <- paste0("GENE ~ ", paste(vars.to.regress, collapse = "+"))
  if (model.use == "linear") {
    # In this code, we'll repeatedly regress different Y against the same X
    # (latent.data) in order to calculate residuals. Rather than repeatedly
    # calling lm, we avoid recalculating the QR decomposition of the
    # latent.data design matrix by computing it once here and reusing it with
    # qr.resid below.
    regression.mat <- cbind(latent.data, data.use[1, ])
    colnames(regression.mat) <- reg.mat.colnames
    qr <- lm(as.formula(fmla_str), data = regression.mat, qr = TRUE)$qr
    rm(regression.mat)
  }
  data.resid <- foreach(i = 1:max.bin, .combine = "c", .options.snow = opts) %dopar% {
    genes.bin.regress <- rownames(x = data.use)[bin.ind == i]
    gene.expr <- as.matrix(x = data.use[genes.bin.regress, , drop = FALSE])
    empty_char <- character(length = dim(gene.expr)[1]) # empty vector to reuse
    new.data <- sapply(
      X = genes.bin.regress,
      FUN = function(x) {
        # Fast path for standard linear models: reuse the precomputed QR
        if (model.use == "linear") {
          resid <- qr.resid(qr, gene.expr[x, ])
        } else {
          regression.mat <- cbind(latent.data, gene.expr[x, ])
          colnames(x = regression.mat) <- reg.mat.colnames
          fmla <- as.formula(fmla_str)
          resid <- switch(
            EXPR = model.use,
            'poisson' = residuals(
              object = glm(
                formula = fmla,
                data = regression.mat,
                family = "poisson"
              ),
              type = 'pearson'
            ),
            'negbinom' = NBResiduals(
              fmla = fmla,
              regression.mat = regression.mat,
              gene = x,
              return.mode = TRUE
            )
          )
        }
        if (!is.list(x = resid)) {
          resid <- list('resid' = resid, 'mode' = empty_char)
        }
        return(resid)
      }
    )
    new.data.resid <- new.data[seq.int(from = 1, to = length(x = new.data), by = 2)]
    new.data.resid <- matrix(unlist(new.data.resid), nrow = length(new.data.resid[[1]]))
    colnames(x = new.data.resid) <- genes.bin.regress
    new.data.mode <- unlist(x = new.data[seq.int(from = 2, to = length(x = new.data), by = 2)])
    names(x = new.data.mode) <- genes.bin.regress
    new.data <- list('resid' = new.data.resid, 'mode' = new.data.mode)
    return(new.data)
  }
  if (verbose) {
    time_elapsed <- Sys.time() - time_elapsed
    cat(paste("\nTime Elapsed: ",time_elapsed, units(time_elapsed)))
    close(pb)
  }
  stopCluster(cl)
  modes <- unlist(x = data.resid[seq.int(from = 2, to = length(x = data.resid), by = 2)])
  modes <- modes[modes == 'scale']
  names(x = modes) <- gsub(
    pattern = 'mode.',
    replacement = '',
    x = names(x = modes),
    fixed = TRUE
  )
  data.resid <- data.resid[seq.int(from = 1, to = length(x = data.resid), by = 2)]
  data.resid <- as.matrix(x = as.data.frame(x = data.resid))
  data.resid <- t(x = data.resid)
  if (length(x = modes)) {
    message(
      "The following genes failed with glm.nb, and fell back to scale(log(y+1))\n\t",
      paste(names(x = modes), collapse = ', ')
    )
  }
  rownames(x = data.resid) <- genes.regress
  suppressWarnings(expr = gc(verbose = FALSE))
  if (use.umi) {
    data.resid <- log1p(
      x = sweep(
        x = data.resid,
        MARGIN = 1,
        STATS = apply(X = data.resid, MARGIN = 1, FUN = min),
        FUN = "-"
      )
    )
  }
  return(data.resid)
}
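
# Illustrative sketch (hypothetical helper, not part of the package): the QR
# reuse trick described inside RegressOutResid. For a fixed design matrix, the
# QR decomposition is computed once, after which residuals for any number of
# response vectors come from qr.resid without refitting lm.
QRResidualsSketch <- function(latent.data, expr.matrix) {
  # latent.data: one row per cell; expr.matrix: genes x cells
  qr.obj <- qr(x = model.matrix(object = ~ ., data = latent.data))
  resid.matrix <- t(x = apply(
    X = expr.matrix,
    MARGIN = 1,
    FUN = function(y) qr.resid(qr.obj, y)
  ))
  return(resid.matrix)
}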

# Regress out technical effects and cell cycle using regularized Negative Binomial regression
#
# Remove unwanted effects from umi data and set scale.data to Pearson residuals
# Uses mclapply; set the number of cores it will use to n with options(mc.cores = n)
# If n.genes.step1 is set, only a (somewhat-random) subset of genes is used for estimating theta.
#
# @param object Seurat object
# @param latent.vars effects to regress out (character vector)
# @param n.genes.step1 number of genes to use when estimating theta (default uses all genes)
# @param genes.regress genes to run regression for (default uses all genes)
# @param res.clip.range numeric of length two specifying the min and max values the results will be clipped to
# @param min.theta minimum theta to use in NB regression
# @param residual.type string specifying the type of residual used (default is pearson)
# @param bin.size number of genes to put in each bin (to show progress)
# @param use.stored.theta skip the first step and use the fitted thetas from a previous run
# @param min.cells only use genes that have been observed in at least this many cells
#
# @return Returns Seurat object with the scale.data (object@scale.data) genes returning the residuals from the regression model
#
#' @import Matrix
#' @import parallel
#' @importFrom MASS theta.ml negative.binomial
#' @importFrom stats glm loess residuals approx
#' @importFrom utils txtProgressBar setTxtProgressBar
#
RegressOutNBregOld <- function(
  object,
  latent.vars,
  n.genes.step1 = NULL,
  genes.regress = NULL,
  res.clip.range = c(-30, 30),
  min.theta = 0.01,
  residual.type = 'pearson',
  bin.size = 128,
  use.stored.theta = FALSE,
  min.cells = 3
) {
  # in the first step we use all genes, except if n.genes.step1 has been set
  cm <- Matrix(object@raw.data[, colnames(x = object@data), drop = FALSE])
  gene.observed <- Matrix::rowSums(cm > 0)
  genes.regress <- SetIfNull(x = genes.regress, default = rownames(x = cm))
  genes.regress <- intersect(x = genes.regress, y = rownames(x = cm)[gene.observed >= min.cells])
  genes.step1 <- rownames(cm)[gene.observed >= min.cells]
  if (!is.null(n.genes.step1)) {
    # density-sample genes to speed up the first step
    raw.mean <- log10(rowMeans(cm[genes.step1, ]))
    raw.det.rate <- rowMeans(cm[genes.step1, ] > 0)
    dens <- apply(
      X = cbind(raw.mean, raw.det.rate),
      MARGIN = 2,
      FUN = function(y) {
        y.dens <- density(x = y, bw = 'nrd', adjust = 1)
        ret <- approx(x = y.dens$x, y = y.dens$y, xout = y)$y
        return(ret / sum(ret))
      }
    )
    sampling.prob <- 1 / apply(X = dens, MARGIN = 1, FUN = min)
    genes.step1 <- sample(x = genes.step1, size = n.genes.step1, prob = sampling.prob)
  }
  latent.data <- FetchData(object = object, vars.all = latent.vars)
  bin.ind <- ceiling(x = 1:length(x = genes.step1) / bin.size)
  max.bin <- max(bin.ind)
  message(paste("Regressing out", paste(latent.vars, collapse = ' ')))
  if (use.stored.theta) {
    message('Using previously fitted theta values for NB regression')
    theta.fit <- object@misc[['NBreg.theta.fit']]
  } else {
    message('First step: Poisson regression (to get initial mean), and estimate theta per gene')
    message('Using ', length(x = genes.step1), ' genes')
    pb <- txtProgressBar(min = 0, max = max.bin, style = 3)
    theta.estimate <- c()
    for (i in 1:max.bin) {
      genes.bin.regress <- genes.step1[bin.ind == i]
      bin.theta.estimate <- unlist(
        parallel::mclapply(
          X = genes.bin.regress,
          FUN = function(j) {
            as.numeric(
              x = MASS::theta.ml(
                as.numeric(x = unlist(x = cm[j, ])),
                glm(as.numeric(x = unlist(x = cm[j, ])) ~ ., data = latent.data, family = poisson)$fitted
              )
            )
          }
        ),
        use.names = FALSE
      )
      theta.estimate <- c(theta.estimate, bin.theta.estimate)
      setTxtProgressBar(pb, i)
    }
    close(pb)
    names(theta.estimate) <- genes.step1
    UMI.mean <- rowMeans(x = cm[genes.step1, ])
    var.estimate <- UMI.mean + (UMI.mean ^ 2) / theta.estimate
    fit <- loess(log10(var.estimate) ~ log10(UMI.mean), span = 0.33)
    genes.regress.mean <- rowMeans(x = cm[genes.regress, ])
    var.fit.log10 <- predict(fit, log10(genes.regress.mean))
    theta.fit <- (genes.regress.mean ^ 2) / (10 ^ var.fit.log10 - genes.regress.mean)
    to.fix <- theta.fit <= min.theta | is.infinite(x = theta.fit)
    if (any(to.fix)) {
      message(
        'Fitted theta below ',
        min.theta,
        ' for ',
        sum(to.fix),
        ' genes, setting them to ',
        min.theta
      )
      theta.fit[to.fix] <- min.theta
    }
    # save theta estimate and fitted theta in object
    object@misc <- as.list(object@misc)
    object@misc[['NBreg.theta.estimate']] <- theta.estimate
    object@misc[['NBreg.theta.fit']] <- theta.fit
  }
  message('Second step: NB regression with fixed theta for ', length(x = genes.regress), ' genes')
  bin.ind <- ceiling(x = 1:length(x = genes.regress) / bin.size)
  max.bin <- max(bin.ind)
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
  res <- c()
  for (i in 1:max.bin) {
    genes.bin.regress <- genes.regress[bin.ind == i]
    names(genes.bin.regress) <- genes.bin.regress
    bin.res.lst <- parallel::mclapply(
      X = genes.bin.regress,
      FUN = function(j) {
        fit <- 0
        try(
          fit <- glm(
            as.numeric(x = unlist(x = cm[j, ])) ~ .,
            data = latent.data,
            family = MASS::negative.binomial(theta = theta.fit[j])
          ),
          silent = TRUE
        )
        if (class(fit)[1] == 'numeric') {
          message <-
            sprintf(
              'glm and family=negative.binomial(theta=%f) failed for gene %s; falling back to scale(log10(y+1))',
              theta.fit[j],
              j
            )
          res <- scale(x = log10(as.numeric(x = unlist(x = cm[j, ])) + 1))[, 1]
        } else {
          message <- NULL
          res <- residuals(object = fit, type = residual.type)
        }
        return(list(res = res, message = message))
      }
    )
    # Print message to keep track of the genes for which glm failed to converge
    message <- unlist(x = lapply(X = bin.res.lst, FUN = function(l) { return(l$message) }), use.names = FALSE)
    if (!is.null(x = message)) {
      message(paste(message, collapse = "\n"))
    }
    bin.res.lst <- lapply(X = bin.res.lst, FUN = function(l) { return(l$res) })
    res <- rbind(res, do.call(rbind, bin.res.lst))
    setTxtProgressBar(pb, i)
  }
  close(pb)
  dimnames(x = res) <- list(genes.regress, colnames(x = cm))
  res[res < res.clip.range[1]] <- res.clip.range[1]
  res[res > res.clip.range[2]] <- res.clip.range[2]
  object@scale.data <- res
  return(object)
}
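
# Illustrative sketch (hypothetical helper, not part of the package): the
# theta pooling performed in step one of RegressOutNBregOld. Per-gene theta
# estimates imply the variance mu + mu^2 / theta; smoothing log10(variance)
# against log10(mean) with loess and inverting that relationship yields a
# regularized theta per gene.
RegularizeThetaSketch <- function(umi.mean, theta.estimate, min.theta = 0.01) {
  var.estimate <- umi.mean + (umi.mean ^ 2) / theta.estimate
  fit <- loess(formula = log10(var.estimate) ~ log10(umi.mean), span = 0.33)
  var.fit <- 10 ^ predict(object = fit, log10(umi.mean))
  theta.fit <- (umi.mean ^ 2) / (var.fit - umi.mean)
  # guard against tiny or infinite fits, as the full function does
  theta.fit[theta.fit <= min.theta | is.infinite(x = theta.fit)] <- min.theta
  return(theta.fit)
}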

#' Get Cluster Assignments
#'
#' Retrieve cluster IDs as a dataframe. First column will be the cell name,
#' second column will be the current cluster identity (pulled from
#' object@@ident).
#'
#' @param object Seurat object with cluster assignments
#' @return Returns a dataframe with cell names and cluster assignments
#' @export
#'
#' @examples
#' pbmc_small
#' clusters <- GetClusters(object = pbmc_small)
#' head(clusters)
#'
GetClusters <- function(object) {
  clusters <- data.frame(cell.name = names(object@ident), cluster = object@ident)
  rownames(clusters) <- NULL
  clusters$cell.name <- as.character(clusters$cell.name)
  return(clusters)
}

#' Set Cluster Assignments
#'
#' Easily set the cluster assignments using the output of GetClusters() ---
#' a dataframe with cell names as the first column and cluster assignments as
#' the second.
#'
#' @param object Seurat object
#' @param clusters A dataframe containing the cell names and cluster assignments
#' to set for the object.
#' @return Returns a Seurat object with the identities set to the cluster
#' assignments that were passed.
#' @export
#'
#' @examples
#' pbmc_small
#' # Get clusters as a dataframe with GetClusters.
#' clusters <- GetClusters(object = pbmc_small)
#' # Use SetClusters to set cluster IDs
#' pbmc_small <- SetClusters(object = pbmc_small, clusters = clusters)
#'
SetClusters <- function(object, clusters = NULL) {
  if(!(all(c("cell.name", "cluster") %in% colnames(clusters)))){
    stop("The clusters parameter must be the output from GetClusters (i.e.
         Columns must be cell.name and cluster)")
  }
  cells.use <- clusters$cell.name
  ident.use <- clusters$cluster
  object <- SetIdent(
    object = object,
    cells.use = cells.use,
    ident.use = ident.use
  )
  return(object)
}

#' Save cluster assignments to a TSV file
#'
#' @param object Seurat object with cluster assignments
#' @param file Path to file to write cluster assignments to
#'
#' @return No return value. Writes clusters assignments to specified file.
#'
#' @importFrom utils write.table
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small
#' file.loc <- "~/Desktop/cluster_assignments.tsv"
#' SaveClusters(object = pbmc_small, file = file.loc)
#' }
#'
SaveClusters <- function(object, file) {
  my.clusters <- GetClusters(object = object)
  write.table(my.clusters, file = file, sep = "\t", quote = FALSE, row.names = FALSE)
}

#' Convert the cluster labels to a numeric representation
#'
#' @param object Seurat object
#' @return Returns a Seurat object with the identities relabeled numerically
#' starting from 1.
#'
#' @export
#'
#' @examples
#' # Append "Cluster_" to cluster IDs to demonstrate numerical conversion
#' new.cluster.labels <- paste0("Cluster_", pbmc_small@ident)
#' pbmc_small <- SetIdent(
#'   object = pbmc_small,
#'   cells.use = pbmc_small@cell.names,
#'   ident.use = new.cluster.labels
#' )
#' unique(pbmc_small@ident)
#' # Now relabel the IDs numerically starting from 1
#' pbmc_small <- NumberClusters(pbmc_small)
#' unique(pbmc_small@ident)
#'
NumberClusters <- function(object) {
  clusters <- unique(x = object@ident)
  if (any(sapply(X = clusters, FUN = function(x) !grepl("\\D", x)))) {
    n <- as.numeric(x = max(clusters)) + 1
    for (i in clusters) {
      object <- SetIdent(
        object = object,
        cells.use = WhichCells(object = object, ident = i),
        ident.use = n
      )
      n <- n + 1
    }
    clusters <- unique(x = object@ident)
  }
  n <- 1
  for (i in clusters) {
    object <- SetIdent(
      object,
      cells.use = WhichCells(object = object, ident = i),
      ident.use = n
    )
    n <- n + 1
  }
  return(object)
}

#' Classify New Data
#'
#' Classify new data based on the cluster information of the provided object.
#' Random Forests are used as the basis of the classification.
#'
#' @param object Seurat object on which to train the classifier
#' @param classifier Random Forest classifier from BuildRFClassifier. If not provided,
#' it will be built from the training data provided.
#' @param training.genes Vector of genes to build the classifier on
#' @param training.classes Vector of classes to build the classifier on
#' @param new.data New data to classify
#' @param ... additional parameters passed to ranger
#'
#' @return Vector of cluster ids
#'
#' @import Matrix
#' @importFrom stats predict
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # take the first 10 cells as test data and train on the remaining 70 cells
#' test.pbmc <- SubsetData(object = pbmc_small, cells.use = pbmc_small@cell.names[1:10])
#' train.pbmc <- SubsetData(object = pbmc_small, cells.use = pbmc_small@cell.names[11:80])
#' predicted.classes <- ClassifyCells(
#'   object = train.pbmc,
#'   training.classes = train.pbmc@ident,
#'   new.data = test.pbmc@data
#' )
#'
ClassifyCells <- function(
  object,
  classifier,
  training.genes = NULL,
  training.classes = NULL,
  new.data = NULL,
  ...
) {
  PackageCheck('ranger')
  # build the classifier
  if (missing(classifier)) {
    classifier <- BuildRFClassifier(
      object = object,
      training.genes = training.genes,
      training.classes = training.classes,
      ...
    )
  }
  # run the classifier on the new data
  features <- classifier$forest$independent.variable.names
  genes.to.add <- setdiff(x = features, y = rownames(x = new.data))
  data.to.add <- matrix(
    data = 0,
    nrow = length(x = genes.to.add),
    ncol = ncol(x = new.data)
  )
  rownames(x = data.to.add) <- genes.to.add
  new.data <- rbind(new.data, data.to.add)
  new.data <- new.data[features, ]
  new.data <- as.matrix(x = t(x = new.data))
  message("Running Classifier ...")
  prediction <- predict(classifier, new.data)
  new.classes <- prediction$predictions
  return(new.classes)
}
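
# Illustrative sketch (hypothetical helper, not part of the package): the
# feature alignment step inside ClassifyCells above. Features the classifier
# was trained on but absent from the new data are padded with zero rows so the
# prediction matrix carries exactly the training features, in training order.
AlignClassifierFeaturesSketch <- function(new.data, features) {
  missing.features <- setdiff(x = features, y = rownames(x = new.data))
  pad <- matrix(
    data = 0,
    nrow = length(x = missing.features),
    ncol = ncol(x = new.data),
    dimnames = list(missing.features, colnames(x = new.data))
  )
  return(rbind(new.data, pad)[features, , drop = FALSE])
}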

#' Build Random Forest Classifier
#'
#' Train the random forest classifier
#'
#'
#' @param object Seurat object on which to train the classifier
#' @param training.genes Vector of genes to build the classifier on
#' @param training.classes Vector of classes to build the classifier on
#' @param verbose Additional progress print statements
#' @param ... additional parameters passed to ranger
#'
#' @return Returns the random forest classifier
#'
#' @import Matrix
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # Builds the random forest classifier to be used with ClassifyCells
#' # Useful if you want to use the same classifier with several sets of new data
#' classifier <- BuildRFClassifier(pbmc_small, training.classes = pbmc_small@ident)
#'
BuildRFClassifier <- function(
  object,
  training.genes = NULL,
  training.classes = NULL,
  verbose = TRUE,
  ...
) {
  PackageCheck('ranger')
  training.classes <- as.vector(x = training.classes)
  training.genes <- SetIfNull(
    x = training.genes,
    default = rownames(x = object@data)
  )
  training.data <- as.data.frame(
    x = as.matrix(
      x = t(
        x = object@data[training.genes, ]
      )
    )
  )
  training.data$class <- factor(x = training.classes)
  if (verbose) {
    message("Training Classifier ...")
  }
  classifier <- ranger::ranger(
    data = training.data,
    dependent.variable.name = "class",
    classification = TRUE,
    write.forest = TRUE,
    ...
  )
  return(classifier)
}

#' K-Means Clustering
#'
#' Perform k-means clustering on both genes and single cells
#'
#' K-means and heatmap are calculated on object@@scale.data
#'
#' @param object Seurat object
#' @param genes.use Genes to use for clustering
#' @param k.genes K value to use for clustering genes
#' @param k.cells K value to use for clustering cells (default is NULL, cells
#' are not clustered)
#' @param k.seed Random seed
#' @param do.plot Draw heatmap of clustered genes/cells (default is FALSE).
#' @param data.cut Clip all z-scores to have an absolute value below this.
#' Reduces the effect of huge outliers in the data.
#' @param k.cols Color palette for heatmap
#' @param set.ident If clustering cells (so k.cells>0), set the cell identity
#' class to its K-means cluster (default is TRUE)
#' @param do.constrained FALSE by default. If TRUE, use the constrained K-means function implemented in the tclust package.
#' @param assay.type Type of data to normalize for (default is RNA), but can be changed for multimodal analyses.
#' @param \dots Additional parameters passed to kmeans (or tkmeans)
#'
#' @importFrom methods new
#' @importFrom stats kmeans
#'
#' @return Seurat object where the k-means results for genes is stored in
#' object@@kmeans.obj[[1]], and the k-means results for cells is stored in
#' object@@kmeans.col[[1]]. The cluster for each cell is stored in object@@meta.data[,"kmeans.ident"]
#' and also object@@ident (if set.ident=TRUE)
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # Cluster on genes only
#' pbmc_small <- DoKMeans(pbmc_small, k.genes = 3)
#' # Cluster on genes and cells
#' pbmc_small <- DoKMeans(pbmc_small, k.genes = 3, k.cells = 3)
#'
DoKMeans <- function(
  object,
  genes.use = NULL,
  k.genes = NULL,
  k.cells = 0,
  k.seed = 1,
  do.plot = FALSE,
  data.cut = 2.5,
  k.cols = PurpleAndYellow(),
  set.ident = TRUE,
  do.constrained = FALSE,
  assay.type="RNA",
  ...
) {
  data.use.orig <- GetAssayData(
    object = object,
    assay.type = assay.type,
    slot = "scale.data"
  )
  data.use <- MinMax(data = data.use.orig, min = data.cut * (-1), max = data.cut)
  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
  genes.use <- genes.use[genes.use %in% rownames(x = data.use)]
  cells.use <- object@cell.names
  kmeans.data <- data.use[genes.use, cells.use]
  if (do.constrained) {
    set.seed(seed = k.seed)
    PackageCheck('tclust')
    kmeans.obj <- tclust::tkmeans(x = kmeans.data, k = k.genes, ...)
  } else {
    set.seed(seed = k.seed)
    kmeans.obj <- kmeans(x = kmeans.data, centers = k.genes, ...)
  }
  names(x = kmeans.obj$cluster) <- genes.use
  #if we are going to k-means cluster cells in addition to genes
  kmeans.col <- c()
  if (k.cells > 0) {
    kmeans.col <- kmeans(x = t(x = kmeans.data), centers = k.cells)
    names(x = kmeans.col$cluster) <- cells.use
  }
  object.kmeans <- new(
    Class = "kmeans.info",
    gene.kmeans.obj = kmeans.obj,
    cell.kmeans.obj = kmeans.col
  )
  object@kmeans <- object.kmeans
  if (k.cells > 0) {
    kmeans.code <- paste("kmeans", k.cells, "ident", sep = ".")
    object@meta.data[names(x = kmeans.col$cluster), kmeans.code] <- kmeans.col$cluster
  }
  if (set.ident && (k.cells > 0)) {
    object <- SetIdent(
      object = object,
      cells.use = names(x = kmeans.col$cluster),
      ident.use = kmeans.col$cluster
    )
  }
  if (do.plot) {
    KMeansHeatmap(object = object)
  }
  return(object)
}

globalVariables(
  names = 'WeightedEuclideanDist',
  package = 'Seurat',
  add = TRUE
)
#' Phylogenetic Analysis of Identity Classes
#'
#' Constructs a phylogenetic tree relating the 'average' cell from each
#' identity class. Tree is estimated based on a distance matrix constructed in
#' either gene expression space or PCA space.
#'
#' Note that the tree is calculated for an 'average' cell, so gene expression
#' or PC scores are averaged across all cells in an identity class before the
#' tree is constructed.
#'
#' @param object Seurat object
#' @param genes.use Genes to use for the analysis. Default is the set of
#' variable genes (object@@var.genes). Assumes pcs.use=NULL (tree calculated in
#' gene expression space)
#' @param pcs.use If set, tree is calculated in PCA space.
#' @param SNN.use If SNN is passed, build tree based on SNN graph connectivity between clusters
#' @param do.plot Plot the resulting phylogenetic tree
#' @param do.reorder Re-order identity classes (factor ordering), according to
#' position on the tree. This groups similar classes together which can be
#' helpful, for example, when drawing violin plots.
#' @param reorder.numeric Re-order identity classes according to position on
#' the tree, assigning a numeric value ('1' is the leftmost node)
#' @param show.progress Show progress updates
#'
#' @return A Seurat object where the cluster tree is stored in
#' object@@cluster.tree[[1]]
#'
#' @importFrom ape as.phylo
#' @importFrom stats dist hclust
#'
#' @export
#'
#' @examples
#' pbmc_small
#' pbmc_small <- BuildClusterTree(pbmc_small, do.plot = FALSE)
#'
BuildClusterTree <- function(
  object,
  genes.use = NULL,
  pcs.use = NULL,
  SNN.use = NULL,
  do.plot = TRUE,
  do.reorder = FALSE,
  reorder.numeric = FALSE,
  show.progress = TRUE
) {
  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
  ident.names <- as.character(x = unique(x = object@ident))
  if (! is.null(x = genes.use)) {
    genes.use <- intersect(x = genes.use, y = rownames(x = object@data))
    data.avg <- AverageExpression(
      object = object,
      genes.use = genes.use,
      show.progress = show.progress
    )
    data.dist <- dist(t(x = data.avg[genes.use, ]))
  }
  if (! is.null(x = pcs.use)) {
    data.pca <- AveragePCA(object = object)
    data.dist <- dist(t(x = data.pca[pcs.use,]))
  }
  if (! is.null(x = SNN.use)) {
    num.clusters <- length(x = ident.names)
    data.dist <- matrix(data = 0, nrow = num.clusters, ncol = num.clusters)
    rownames(data.dist) <- ident.names
    colnames(data.dist) <- ident.names
    for (i in 1:(num.clusters - 1)) {
      for (j in (i + 1):num.clusters) {
        subSNN <- SNN.use[
          match(
            x = WhichCells(object = object, ident = ident.names[i]),
            table = colnames(x = SNN.use)
          ), # Row
          match(
            x = WhichCells(object = object, ident = ident.names[j]),
            table = rownames(x = SNN.use)
          ) # Column
          ]
        d <- mean(subSNN)
        if (is.na(x = d)) {
          data.dist[i, j] <- 0
        } else {
          data.dist[i, j] <- d
        }
      }
    }
    diag(x = data.dist) <- 1
    data.dist <- dist(data.dist)
  }
  data.tree <- as.phylo(x = hclust(d = data.dist))
  object@cluster.tree[[1]] <- data.tree
  if (do.reorder) {
    old.ident.order <- sort(x = unique(x = object@ident))
    data.tree <- object@cluster.tree[[1]]
    all.desc <- GetDescendants(tree = data.tree, node = (data.tree$Nnode + 2))
    all.desc <- old.ident.order[all.desc[all.desc <= (data.tree$Nnode + 1)]]
    object@ident <- factor(x = object@ident, levels = all.desc, ordered = TRUE)
    if (reorder.numeric) {
      object <- SetIdent(
        object = object,
        cells.use = object@cell.names,
        ident.use = as.integer(x = object@ident)
      )
      object@meta.data[object@cell.names, "tree.ident"] <- as.integer(x = object@ident)
    }
    object <- BuildClusterTree(
      object = object,
      genes.use = genes.use,
      pcs.use = pcs.use,
      do.plot = FALSE,
      do.reorder = FALSE,
      show.progress = show.progress
    )
  }
  if (do.plot) {
    PlotClusterTree(object)
  }
  return(object)
}
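
# Illustrative sketch (hypothetical helper, not part of the package): the core
# of the expression-space tree built above, reduced to its essentials. Average
# expression per identity class, Euclidean distance between the averages,
# hierarchical clustering, then conversion to a phylo object with ape.
ClusterTreeSketch <- function(data.avg) {
  # data.avg: genes x identity classes
  return(ape::as.phylo(x = hclust(d = dist(x = t(x = data.avg)))))
}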

#' Perform spectral density clustering on single cells
#'
#' Find point clouds of single cells in a two-dimensional space using density clustering (DBSCAN).
#'
#' @param object Seurat object
#' @param dim.1 First dimension to use
#' @param dim.2 Second dimension to use
#' @param reduction.use Which dimensional reduction to use (either 'pca' or 'ica')
#' @param G.use Parameter for the density clustering. Lower value to get more fine-scale clustering
#' @param set.ident TRUE by default. Set identity class to the results of the density clustering.
#' Unassigned cells (cells that cannot be assigned a cluster) are placed in cluster 1, if there are any.
#' @param seed.use Random seed for the dbscan function
#' @param ... Additional arguments to be passed to the dbscan function
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # Density based clustering on the first two tSNE dimensions
#' pbmc_small <- DBClustDimension(pbmc_small)
#'
DBClustDimension <- function(
  object,
  dim.1 = 1,
  dim.2 = 2,
  reduction.use = "tsne",
  G.use = NULL,
  set.ident = TRUE,
  seed.use = 1,
  ...
) {
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, c(dim.1, dim.2))
  data.plot <- FetchData(object = object, vars.all = dim.codes)
  x1 <- paste0(dim.code, dim.1)
  x2 <- paste0(dim.code, dim.2)
  data.plot$x <- data.plot[, x1]
  data.plot$y <- data.plot[, x2]
  set.seed(seed = seed.use)
  data.mclust <- dbscan(data = data.plot[, c("x", "y")], eps = G.use, ...)
  to.set <- as.numeric(x = data.mclust$cluster + 1)
  data.names <- names(x = object@ident)
  object@meta.data[data.names, "DBclust.ident"] <- to.set
  if (set.ident) {
    object@ident <- factor(x = to.set)
    names(x = object@ident) <- data.names
  }
  return(object)
}

#' Perform spectral k-means clustering on single cells
#'
#' Find point clounds single cells in a low-dimensional space using k-means clustering.
#' Can be useful for smaller datasets, where graph-based clustering can perform poorly
#'
#' @param object A Seurat object
#' @param dims.use Dimensions to use for clustering
#' @param reduction.use Dimmensional Reduction to use for k-means clustering
#' @param k.use Number of clusters
#' @param set.ident Set identity of Seurat object
#' @param seed.use Random seed to use
#'
#' @return Object with clustering information
#'
#' @importFrom stats kmeans
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # K-means clustering on the first two tSNE dimensions
#' pbmc_small <- KClustDimension(pbmc_small)
#'
KClustDimension <- function(
  object,
  dims.use = c(1,2),
  reduction.use = "tsne",
  k.use = 5,
  set.ident = TRUE,
  seed.use = 1
) {
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, dims.use)
  data.plot <- FetchData(object = object, vars.all = dim.codes)
  set.seed(seed = seed.use)
  data.mclust <- ds <- kmeans(x = data.plot, centers = k.use)
  to.set <- as.numeric(x = data.mclust$cluster)
  data.names <- names(x = object@ident)
  object@meta.data[data.names, "kdimension.ident"] <- to.set
  if (set.ident) {
    object@ident <- factor(x = to.set)
    names(x = object@ident) <- data.names
  }
  return(object)
}

# This function calculates the pairwise connectivity of clusters.

# @param object  Seurat object containing the snn graph and cluster assignments
# @return        matrix with all pairwise connectivities calculated

CalcConnectivity <- function(object) {
  SNN <- object@snn
  cluster.names <- unique(x = object@ident)
  num.clusters <- length(x = cluster.names)
  connectivity <- matrix(data = 0, nrow = num.clusters, ncol = num.clusters)
  rownames(x = connectivity) <- cluster.names
  colnames(x = connectivity) <- cluster.names
  n <- 1
  for (i in cluster.names) {
    for (j in cluster.names[-(1:n)]) {
      subSNN <- SNN[
        match(x = WhichCells(object = object, ident = i), colnames(x = SNN)),
        match(x = WhichCells(object = object, ident = j), rownames(x = SNN))
        ]
      if (is.object(x = subSNN)) {
        connectivity[i, j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN))
      } else {
        connectivity[i, j] <- mean(x = subSNN)
      }
    }
    n <- n + 1
  }
  return(connectivity)
}

#
#' @importFrom stats optim
#
project_map <- function(
  z,
  x_old,
  sum_X_old,
  x_old_tsne,
  P_tol = 5e-6,
  perplexity = 30
) {
  sum_z <- sum(z ^ 2)
  #x_old=test2@pca.rot[,1:5]
  #sum_X_old=rowSums((x_old^2))
  D_org <- sum_z + (-2 * as.matrix(x = x_old) %*% t(x = z) + sum_X_old)
  P <- d2p_cell(D = D_org, u = perplexity)
  nn_points <- which(x = P> P_tol) # Use only a small subset of points to comupute embedding. This keeps all the points that are proximal to the new point
  X_nn_set <- x_old[nn_points, ]  #Original points
  y_nn_set <- x_old_tsne[nn_points, ]  #Computed embeddings
  P_nn_set <- P[nn_points, ] #Probabilities
  y_new0 <- (
    t(x = as.matrix(x = y_nn_set)) %*%
      t(x = as.matrix(x = rbind(P_nn_set, P_nn_set)))
  )[, 1] #Initial guess of point as a weighted average
  sink("/dev/null")
  y_new <- optim(
    par = y_new0,
    fn = KullbackFun,
    gr = NULL,
    y_nn_set,
    P_nn_set,
    method = "Nelder-Mead"
  )
  sink()
  #plot(test2@tsne.rot)
  #points(y_new$par[1],y_new$par[2],col="red",cex=1,pch=16)
  #points(test2@tsne.rot[cell.num,1],test2@tsne.rot[cell.num,2],col="blue",cex=1,pch=16)
  #return(dist(as.matrix(rbind(y_new$par,test2@tsne.rot[cell.num,]))))
  return(y_new$par)
}

d2p_cell <- function(D, u = 15, tol = 1e-4) {
  betamin = -Inf
  betamax = Inf
  tries = 0
  tol = 1e-4
  beta = 1
  beta.list <- Hbeta(D = D, beta = beta)
  h <- beta.list[[1]]
  thisP <- beta.list[[2]]
  flagP <- beta.list[[3]]
  hdiff <- h - log(x = u)
  while (abs(x = hdiff) > tol && tries < 50) {
    if (hdiff > 0) {
      betamin <- beta
      if (betamax == Inf) {
        beta <- beta * 2
      } else {
        beta <- (beta + betamax) / 2
      }
    } else {
      betamax <- beta
      if (betamin == -Inf) {
        beta <- beta / 2
      } else {
        beta <- (beta + betamin) / 2
      }
    }
    beta.list <- Hbeta(D = D, beta = beta)
    h <- beta.list[[1]]
    thisP <- beta.list[[2]]
    flagP <- beta.list[[3]]
    hdiff <- h - log(x = u)
    tries <- tries + 1
  }
  # set the final row of p
  P <- thisP
  #Check if there are at least 10 points that are highly similar to the projected point
  return(P)
}

KullbackFun <- function(z, y, P) {
  #Computes the Kullback-Leibler divergence cost function for the embedding x in a tSNE map
  #%P = params{1};              %Transition probabilities in the original space. Nx1 vector
  #%y = params{2};              %tSNE embeddings of the training set. Nx2 vector
  print(z)
  print(dim(x = y))
  Cost0 = sum(P * log(x = P))      #Constant part of the cost function
  #Compute pairwise distances in embedding space
  sum_z <- sum(z ^ 2)
  sum_y <- rowSums(x = (y ^ 2))
  D_yz <- sum_z +(
    -2 * as.matrix(x = y) %*% t(x = matrix(data = z, nrow = 1)) + sum_y
  )
  Q <- 1 / (1 + D_yz)
  Q <- Q / sum(Q)               #Transition probabilities in the embedded space
  Cost <- Cost0 - sum(P * log(x = Q)) #% - 100 * sum(Q .* log(Q));
  return(Cost)
}

Hbeta <- function(D, beta) {
  flagP <- 1
  P <- exp(x = -D * beta)
  sumP <- sum(P)
  if (sumP < 1e-8) { #In this case it means that no point is proximal.
    P <- rep(length(x = P), 1) / length(x = P)
    sumP <- sum(P)
    flagP <- 0
  }
  H <- log(x = sumP) + beta * sum(D * P) / sumP
  P <- P / sumP
  return(list(H, P, flagP))
}

# Function to map values in a vector `v` as defined in `from`` to the values
# defined in `to`.
#
# @param v     vector of values to map
# @param from  vector of original values
# @param to    vector of values to map original values to (should be of equal
#              length as from)
# @return      returns vector of mapped values
#
MapVals <- function(v, from, to) {
  if (length(from) != length(to)) {
    stop("from and to vectors are not the equal length.")
  }
  vals.to.match <- match(v, from)
  vals.to.match.idx  <- !is.na(vals.to.match)
  v[vals.to.match.idx] <- to[vals.to.match[vals.to.match.idx]]
  return(v)
}

# Fills slot in new object with equivalent slot in old object if it still exists
#
# @param slot.name   slot to fill
# @param old.object  object to get slot value from
# @param new.slot    object to set slot value in
#
#' @importFrom methods slot slot<-
#
# @return            returns new object with slot filled
#
FillSlot <- function(slot.name, old.object, new.object) {
  new.slot <- tryCatch(
    {
      slot(object = old.object, name = slot.name)
    },
    error = function(err){
      return(NULL)
    }
  )
  if (!is.null(x = new.slot)) {
    slot(new.object, slot.name) <- new.slot
  }
  return(new.object)
}

# Use Fisher's method (Fisher's combined probability test) to combine p-values
# into single statistic
#
# @param pvals vector of p-values
#
# @returns integrated value
#
#' @importFrom stats pchisq
#
FisherIntegrate <- function(pvals) {
  return(1 - pchisq(q = -2 * sum(log(x = pvals)), df = 2 * length(x = pvals)))
}

# Set CalcParam information
#
# @param object      A Seurat object
# @param calculation The name of the calculation that was done
# @param time store time of calculation as well
# @param ...  Parameters for the calculation
#
# @return object with the calc.param slot modified to either append this
# calculation or replace the previous instance of calculation with
# a new list of parameters
#
SetCalcParams <- function(object, calculation, time = TRUE, ...) {
  object@calc.params[calculation] <- list(...)
  object@calc.params[[calculation]]$object <- NULL
  object@calc.params[[calculation]]$object2 <- NULL
  if(time) {
    object@calc.params[[calculation]]$time <- Sys.time()
  }
  return(object)
}

# Delete CalcParam information
#
# @param object      A Seurat object
# @param calculation The name of the calculation to remove
#
# @return object with the calc.param slot modified to remove this
# calculation
#
RemoveCalcParams <- function(object, calculation){
  object@calc.params[calculation] <- NULL
  return(object)
}

# Set Single CalcParam information
#
# @param object      A Seurat object
# @param calculation The name of the calculation that was done
# @param parameter  Parameter for the calculation to set
# @param value  Value of parameter to set
#
# @return object with the calc.param slot modified to either append this
# calculation or replace the previous instance of calculation with
# a new list of parameters
#
SetSingleCalcParam <- function(object, calculation, parameter, value) {
  object@calc.params[[calculation]][parameter] <- value
  return(object)
}

# Get CalcParam information
#
# @param object      A Seurat object
# @param calculation The name of the calculation that was done
# @param parameter  Parameter for the calculation to pull
#
# @return parameter value for given calculation
#
GetCalcParam <- function(object, calculation, parameter){
  if(parameter == "time"){
    return(object@calc.params[[calculation]][parameter][[1]])
  }
  return(unname(unlist(object@calc.params[[calculation]][parameter])))
}

# Get All CalcParam information for given calculation
#
# @param object      A Seurat object
# @param calculation The name of the calculation that was done
#
# @return list of parameter values for given calculation
#
GetAllCalcParam <- function(object, calculation){
  return(object@calc.params[[calculation]])
}

# Has any info been stored for the given calculation?
#
# @param object A Seurat object
# @param calculation The name of the calculation to look for info about
#
# @return Returns a boolean - whether or not there is any info about given calc
# stored
#
CalcInfoExists <- function(object, calculation){
  return(!is.null(object@calc.params[[calculation]]))
}

# Return vector of whitespace
#
# @param n length of whitespace vector to return
#
# @return vector of whitespace
#
FillWhiteSpace <- function(n){
  if(n <= 0){
    n <- 1
  }
  return(paste0(rep(" ", n), collapse = ""))
}

# return average of all values greater than a threshold
#
# @param x Values
# @param min Minimum threshold
#
# @return The mean of x where x > min
#
MeanGreaterThan <- function(x, min = 0) {
  return(mean(x = x[x > min]))
}

# return variance of all values greater than a threshold
#
# @param x Values
# @param min Minimum threshold
#
# @return The variance of x where x > min
#
#' @importFrom stats var
#
VarianceGreaterThan <- function(x, min = 0) {
  return(var(x = x[x > min]))
}

# calculate the coefficient of variation
#
# @param x Values to calculate the coefficient of variation
#
# @return The coefficient of variation of x
#
#' @importFrom stats sd
#
CoefVar <- function(x) {
  return(sd(x = x) / mean(x = x))
}

# return la count of all values greater than a threshold
#
# @param x Values
# @param min Minimum threshold
#
# @return The length of x where x > min
#
CountGreaterThan <- function(x, min = 0) {
  return(sum(x > min))
}

# Weighted Euclidean Distance
#
# @param x Dataset 1
# @param y Dataset 2
# @param w Weights
#
# @return The Weighted Euclidian Distance (numeric)
#
WeightedEuclideanDistance <- function(x, y, w) {
  v.dist <- sum(sqrt(x = w * (x - y) ^ 2))
  return(v.dist)
}

# Function to get all the descendants on a tree left of a given node
#
# @param tree  Tree object (from ape package)
# @param node  Internal node in the tree
#
# @return      Returns all descendants left of the given node
#
GetLeftDescendants <- function(tree, node) {
  daughters <- tree$edge[which(tree$edge[, 1] == node), 2]
  if (daughters[1] <= (tree$Nnode+1)) {
    return(daughters[1])
  }
  daughter.use <- GetDescendants(tree, daughters[1])
  daughter.use <- daughter.use[daughter.use <= (tree$Nnode + 1)]
  return(daughter.use)
}

# Function to get all the descendants on a tree right of a given node
#
# @param tree  Tree object (from ape package)
# @param node  Internal node in the tree
#
# @return      Returns all descendants right of the given node
#
GetRightDescendants <- function(tree, node) {
  daughters <- tree$edge[which(x = tree$edge[, 1] == node), 2]
  if (daughters[2] <= (tree$Nnode + 1)) {
    return(daughters[2])
  }
  daughter.use <- GetDescendants(tree = tree, node = daughters[2])
  daughter.use <- daughter.use[daughter.use <= (tree$Nnode + 1)]
  return(daughter.use)
}

# Function to get all the descendants on a tree of a given node
#
# @param tree  Tree object (from ape package)
# @param node  Internal node in the tree
#
# @return      Returns all descendants of the given node
#
GetDescendants <- function(tree, node, curr = NULL) {
  if (is.null(x = curr)) {
    curr <- vector()
  }
  daughters <- tree$edge[which(x = tree$edge[, 1] == node), 2]
  curr <- c(curr, daughters)
  w <- which(x = daughters >= length(x = tree$tip))
  if (length(x = w) > 0) {
    for (i in 1:length(x = w)) {
      curr <- GetDescendants(tree = tree, node = daughters[w[i]], curr = curr)
    }
  }
  return(curr)
}

# Depth first traversal path of a given tree
#
# @param tree              Tree object (from ape package)
# @param node              Internal node in the tree
# @param path              Path through the tree (for recursion)
# @param include.children  Include children in the output path
# @param only.children     Only include children in the output path
# @return                  Returns a vector representing the depth first
#                          traversal path
#
DFT <- function(
  tree,
  node,
  path = NULL,
  include.children = FALSE,
  only.children = FALSE
) {
  if (only.children) {
    include.children = TRUE
  }
  children <- which(x = tree$edge[, 1] == node)
  child1 <- tree$edge[children[1], 2]
  child2 <- tree$edge[children[2], 2]
  if (child1 %in% tree$edge[, 1]) {
    if(! only.children){
      path <- c(path, child1)
    }
    path <- DFT(
      tree = tree,
      node = child1,
      path = path,
      include.children = include.children,
      only.children = only.children
    )
  } else {
    if (include.children) {
      path <-c(path, child1)
    }
  }
  if (child2 %in% tree$edge[, 1]) {
    if (! only.children) {
      path <- c(path, child2)
    }
    path <- DFT(
      tree = tree,
      node = child2,
      path = path,
      include.children = include.children,
      only.children = only.children
    )
  } else {
    if (include.children) {
      path <- c(path, child2)
    }
  }
  return(path)
}

# Function to check whether a given node in a tree has a child (leaf node)
#
# @param tree   Tree object (from ape package)
# @param node   Internal node in the tree
#
# @return       Returns a Boolean of whether the given node is connected to a
#               terminal leaf node

NodeHasChild <- function(tree, node) {
  children <- tree$edge[which(x = tree$edge[, 1] == node), ][, 2]
  return(any(children %in% tree$edge[, 2] && ! children %in% tree$edge[, 1]))
}

# Function to check whether a given node in a tree has only children(leaf nodes)
#
# @param tree   Tree object (from ape package)
# @param node   Internal node in the tree
#
# @return       Returns a Boolean of whether the given node is connected to only
#               terminal leaf nodes

NodeHasOnlyChildren <- function(tree, node) {
  children <- tree$edge[which(x = tree$edge[, 1] == node), ][, 2]
  return(! any(children %in% tree$edge[, 1]))
}

# Function to return all internal (non-terminal) nodes in a given tree
#
# @param tree   Tree object (from ape package)
#
# @return       Returns a vector of all internal nodes for the given tree
#
GetAllInternalNodes <- function(tree) {
  return(c(tree$edge[1, 1], DFT(tree = tree, node = tree$edge[1, 1])))
}

#' Shuffle a vector
#' @param x A vector
#' @return A vector with the same values of x, just in random order
#' @export
#'
#' @examples
#' v <- seq(10)
#' v2 <- Shuffle(x = v)
#' v2
#'
Shuffle <- function(x) {
  return(x[base::sample.int(
    n = base::length(x = x),
    size = base::length(x = x),
    replace = FALSE
  )])
}

#' Remove data from a table
#'
#' This function will remove any rows from a data frame or matrix
#' that contain certain values
#'
#' @param to.remove A vector of values that indicate removal
#' @param data A data frame or matrix
#'
#' @return A data frame or matrix with values removed by row
#'
#' @export
#'
#' @examples
#' df <- data.frame(
#'   x = rnorm(n = 100, mean = 20, sd = 2),
#'   y = rbinom(n = 100, size = 100, prob = 0.2)
#' )
#' nrow(x = df)
#' nrow (x = RemoveFromTable(to.remove = 20, data = df))
#'
RemoveFromTable <- function(to.remove, data) {
  remove.indecies <- apply(
    X = data,
    MARGIN = 2,
    FUN = function(col) {
      return(which(x = col %in% to.remove))
    }
  )
  remove.indecies <- unlist(x = remove.indecies)
  remove.indecies <- as.numeric(x = remove.indecies)
  if (length(x = remove.indecies) == 0) {
    return(data)
  } else {
    return(data[-remove.indecies, ])
  }
}

#' Make object sparse
#'
#' Converts stored data matrices to sparse matrices to save space. Converts
#' object@@raw.data and object@@data to sparse matrices.
#' @param object Seurat object
#'
#' @return Returns a seurat object with data converted to sparse matrices.
#'
#' @import Matrix
#' @importFrom methods as
#'
#' @export
#'
#' @examples
#' pbmc_raw <- read.table(
#'   file = system.file('extdata', 'pbmc_raw.txt', package = 'Seurat'),
#'   as.is = TRUE
#' )
#' pbmc_small <- CreateSeuratObject(raw.data = pbmc_raw)
#' class(x = pbmc_small@raw.data)
#' pbmc_small <- MakeSparse(object = pbmc_small)
#' class(x = pbmc_small@raw.data)
#'
MakeSparse <- function(object) {
  if (class(object@raw.data) == "data.frame") {
    object@raw.data <- as.matrix(x = object@raw.data)
  }
  if (class(object@data) == "data.frame") {
    object@data <- as.matrix(x = object@data)
  }
  object@raw.data <- as(object = object@raw.data, Class = "dgCMatrix")
  object@data <- as(object = object@data, Class = "dgCMatrix")
  return(object)
}

#' Return a subset of columns for a matrix or data frame
#'
#' @param data Matrix or data frame with column names
#' @param code Pattern for matching within column names
#' @param invert Invert the search?
#'
#' @return Returns a subset of data. If invert = TRUE, returns data where colnames
#' do not contain code, otherwise returns data where colnames contain code
#'
#' @export
#'
#' @examples
#' head(as.matrix(SubsetColumn(data = pbmc_small@raw.data, code = 'ATGC'))[, 1:4])
#'
SubsetColumn <- function(data, code, invert = FALSE) {
  return(data[, grep(pattern = code, x = colnames(x = data), invert = invert)])
}

#' Return a subset of rows for a matrix or data frame
#'
#' @param data Matrix or data frame with row names
#' @param code Pattern for matching within row names
#' @param invert Invert the search?
#'
#' @return Returns a subset of data. If invert = TRUE, returns data where rownames
#' do not contain code, otherwise returns data where rownames contain code
#'
#' @export
#'
#' @examples
#' cd_genes <- SubsetRow(data = pbmc_small@raw.data, code = 'CD')
#' head(as.matrix(cd_genes)[, 1:4])
#'
SubsetRow <- function(data, code, invert = FALSE) {
  return(data[grep(pattern = code, x = rownames(x = data), invert = invert), ])
}

#' Probability of detection by identity class
#'
#' For each gene, calculates the probability of detection for each identity
#' class.
#'
#' @param object Seurat object
#' @param thresh.min Minimum threshold to define 'detected' (log-scale)
#'
#' @return Returns a matrix with genes as rows, identity classes as columns.
#'
#' @export
#'
#' @examples
#' head(AverageDetectionRate(object = pbmc_small))
#'
AverageDetectionRate <- function(object, thresh.min = 0) {
  ident.use <- object@ident
  data.all <- data.frame(row.names = rownames(x = object@data))
  for (i in sort(x = unique(x = ident.use))) {
    temp.cells <- WhichCells(object = object, ident = i)
    data.temp <- apply(
      X = object@data[, temp.cells],
      MARGIN = 1,
      FUN = function(x) {
        return(sum(x > thresh.min)/length(x = x))
      }
    )
    data.all <- cbind(data.all, data.temp)
    colnames(x = data.all)[ncol(x = data.all)] <- i
  }
  colnames(x = data.all) <- sort(x = unique(x = ident.use))
  return(data.all)
}

#' Average PCA scores by identity class
#'
#' Returns the PCA scores for an 'average' single cell in each identity class
#'
#' @param object Seurat object
#'
#' @return Returns a matrix with genes as rows, identity classes as columns
#'
#' @export
#'
#' @examples
#'
#' head(AveragePCA(object = pbmc_small))
#'
AveragePCA <- function(object) {
  # ident.use <- object@ident
  embeddings <- Embeddings(object = object, reduction = 'pca')
  # embeddings <- GetDimReduction(
  #   object = object,
  #   reduction.type = 'pca',
  #   slot = 'cell.embeddings'
  # )
  data.all <- NULL
  for (i in levels(x = object)) {
    temp.cells <- WhichCells(object = object, idents = i)
    if (length(x = temp.cells) == 1) {
      data.temp <- apply(
        X = data.frame((embeddings[c(temp.cells, temp.cells), ])),
        MARGIN = 2,
        FUN = mean
      )
    }
    if (length(x = temp.cells) > 1) {
      data.temp <- apply(
        X = embeddings[temp.cells, ],
        MARGIN = 2,
        FUN = mean
      )
    }
    data.all <- cbind(data.all, data.temp)
    colnames(x = data.all)[ncol(x = data.all)] <- i
  }
  return(data.all)
}

#' Merge childen of a node
#'
#' Merge the childen of a node into a single identity class
#'
#' @param object Seurat object
#' @param node.use Merge children of this node
#' @param rebuild.tree Rebuild cluster tree after the merge?
#' @param ... Extra parameters to BuildClusterTree, used only if rebuild.tree = TRUE
#'
#' @seealso \code{BuildClusterTree}
#'
#' @export
#'
#' @examples
#' PlotClusterTree(object = pbmc_small)
#' pbmc_small <- MergeNode(object = pbmc_small, node.use = 7, rebuild.tree = TRUE)
#' PlotClusterTree(object = pbmc_small)
#'
MergeNode <- function(object, node.use, rebuild.tree = FALSE, ...) {
  object.tree <- object@cluster.tree[[1]]
  node.children <- DFT(
    tree = object.tree,
    node = node.use,
    include.children = TRUE
  )
  node.children <- intersect(x = node.children, y = levels(x = object@ident))
  children.cells <- WhichCells(object = object, ident = node.children)
  if (length(x = children.cells > 0)) {
    object <- SetIdent(
      object = object,
      cells.use = children.cells,
      ident.use = min(node.children)
    )
  }
  if (rebuild.tree) {
    object <- BuildClusterTree(object = object, ...)
  }
  return(object)
}

#' Calculate smoothed expression values
#'
#' Smooths expression values across the k-nearest neighbors based on dimensional reduction
#'
#' @inheritParams FeaturePlot
#' @inheritParams AddImputedScore
#' @param genes.fit Genes to calculate smoothed values for
#' @param dim.1 Dimension 1 to use for dimensional reduction
#' @param dim.2 Dimension 2 to use for dimensional reduction
#' @param reduction.use Dimensional reduction to use
#' @param k k-param for k-nearest neighbor calculation. 30 by default
#' @param do.log Whether to perform smoothing in log space. Default is false.
#' @param nn.eps Error bound when performing nearest neighbor seach using RANN;
#' default of 0.0 implies exact nearest neighbor search
#'
#' @importFrom RANN nn2
#'
#' @export
#'
#' @examples
#' pbmc_small <- AddSmoothedScore(object = pbmc_small, genes.fit = "MS4A1", reduction.use = "pca")
#'
AddSmoothedScore <- function(
  object,
  genes.fit = NULL,
  dim.1 = 1,
  dim.2 = 2,
  reduction.use = "tsne",
  k = 30,
  do.log = FALSE,
  do.print = FALSE,
  nn.eps = 0
) {
  genes.fit <- SetIfNull(x = genes.fit, default = object@var.genes)
  genes.fit <- genes.fit[genes.fit %in% rownames(x = object@data)]
  dim.code <- GetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = 'key'
  )
  dim.codes <- paste0(dim.code, c(dim.1, dim.2))
  data.plot <- FetchData(object = object, vars.all = dim.codes)
  # knn.smooth <- get.knn(data = data.plot, k = k)$nn.index
  knn.smooth <- nn2(data = data.plot, k = k, searchtype = 'standard', eps = nn.eps)$nn.idx
  avg.fxn <- ifelse(test = do.log, yes = mean, no = ExpMean)
  lasso.fits <- data.frame(t(x = sapply(
    X = genes.fit,
    FUN = function(g) {
      return(unlist(x = lapply(
        X = 1:nrow(x = data.plot),
        FUN = function(y) {
          avg.fxn(as.numeric(x = object@data[g, knn.smooth[y, ]]))
        }
      )))
    }
  )))
  colnames(x = lasso.fits) <- rownames(x = data.plot)
  genes.old <- genes.fit[genes.fit %in% rownames(x = object@imputed)]
  genes.new <- genes.fit[!genes.fit %in% rownames(x = object@imputed)]
  if (length(x = genes.old) > 0) {
    object@imputed[genes.old, ] <- lasso.fits[genes.old, ]
  }
  object@imputed <- rbind(object@imputed, lasso.fits[genes.new, ])
  return(object)
}

#' Calculate imputed expression values
#'
#' Uses L1-constrained linear models (LASSO) to impute single cell gene
#' expression values.
#'
#' @param object Seurat object
#' @param genes.use A vector of genes (predictors) that can be used for
#' building the LASSO models.
#' @param genes.fit A vector of genes to impute values for
#' @param s.use Maximum number of steps taken by the algorithm (lower values
#' indicate a greater degree of smoothing)
#' @param do.print Print progress (output the name of each gene after it has
#' been imputed).
#' @param gram The use.gram argument passed to lars
#'
#' @return Returns a Seurat object where the imputed values have been added to
#' object@@imputed
#'
#' @import lars
#'
#' @export
#'
#' @examples
#' pbmc_small <- AddImputedScore(object = pbmc_small, genes.fit = "MS4A1")
#'
AddImputedScore <- function(
  object,
  genes.use = NULL,
  genes.fit = NULL,
  s.use = 20,
  do.print = FALSE,
  gram = TRUE
) {
  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
  genes.fit <- SetIfNull(x = genes.fit, default = object@var.genes)
  genes.use <- genes.use[genes.use %in% rownames(x = object@data)]
  genes.fit <- genes.fit[genes.fit %in% rownames(x = object@data)]
  lasso.input <- t(x = object@data[genes.use, ])
  lasso.fits <- data.frame(t(
    x = sapply(
      X = genes.fit,
      FUN = function(x) {
        return(
          LassoFxn(
            lasso.input = as.matrix(t(x = object@data[genes.use[genes.use != x], ])),
            genes.obs = object@data[x, ],
            s.use = s.use,
            gene.name = x,
            do.print = do.print,
            gram = gram
          )
        )
      }
    )
  ))
  genes.old <- genes.fit[genes.fit %in% rownames(x = object@imputed)]
  genes.new <- genes.fit[! (genes.fit %in% rownames(x = object@imputed))]
  if (length(x = genes.old) > 0) {
    object@imputed[genes.old, ] <- lasso.fits[genes.old, ]
  }
  object@imputed <- rbind(object@imputed, lasso.fits[genes.new, ])
  return(object)
}

#' GenesInCluster
#'
#' After k-means analysis, previously run with DoKMeans, returns a set of genes associated with each cluster
#'
#' @param object Seurat object. Assumes DoKMeans has already been run
#' @param cluster.num K-means cluster(s) to return genes for
#' @param max.genes max number of genes to return
#' @return A vector of genes who are members in the cluster.num k-means cluster(s)
#'
#' @export
#'
#' @examples
#' pbmc_small
#' # Cluster on genes only
#' pbmc_small <- DoKMeans(object = pbmc_small, k.genes = 3)
#' pbmc_small <- GenesInCluster(object = pbmc_small, cluster.num = 1)
#'
GenesInCluster <- function(object, cluster.num, max.genes = 1e6) {
  toReturn <- unlist(
    x = lapply(
      X = cluster.num,
      FUN = function(x) {
        return(head(
          x = sort(x = names(x = which(x = object@kmeans@gene.kmeans.obj$cluster==x))),
          n = max.genes
        ))
      }
    )
  )
  return(toReturn)
}

#' Find gene terms from Enrichr
#'
#' Fing gene terms from Enrichr and return the XML information
#'
#' @param QueryGene A gene to query on Enrichr
#'
#' @export
#' @importFrom httr GET status_code content
#'
#' @return An XML document with information on the queried gene
#'
FindGeneTerms <- function(QueryGene = NULL) {
  if (is.null(x = QueryGene)) {
    stop("Missing query gene")
  }
  path.use <- "Enrichr/genemap"
  api.get <- GET(
    url = "http://amp.pharm.mssm.edu/",
    path = path.use,
    query = list(gene = QueryGene)
  )
  api.status <- status_code(x = api.get)
  if (api.status != 200) {
    stop("Error searching for terms")
  }
  api.data <- content(x = api.get)
  return (api.data)
}

# Run t-distributed Stochastic Neighbor Embedding
#
# Run t-SNE dimensionality reduction on selected features. Has the option of running in a reduced
# dimensional space (i.e. spectral tSNE, recommended), or running based on a set of genes
#
# @param object Seurat object
# @param cells.use Which cells to analyze (default, all cells)
# @param dims.use Which dimensions to use as input features
# @param k.seed Random seed for the t-SNE
# @param do.fast If TRUE, uses the Barnes-hut implementation, which runs
# faster, but is less flexible
# @param add.iter If an existing tSNE has already been computed, uses the
# current tSNE to seed the algorithm and then adds additional iterations on top of this
# @param genes.use If set, run the tSNE on this subset of genes
# (instead of running on a set of reduced dimensions). Not set (NULL) by default
# @param reduction.use Which dimensional reduction (PCA or ICA) to use for the tSNE. Default is PCA
# @param dim_embed The dimensional space of the resulting tSNE embedding (default is 2).
# For example, set to 3 for a 3d tSNE
# @param q.use Quantile to use
# @param max.dim Max dimension to keep from diffusion calculation
# @param scale.clip Max/min value for scaled data. Default is 3
# @param ... Additional arguments to the tSNE call. Most commonly used is
# perplexity (expected number of neighbors default is 30)
#
# @return Returns a Seurat object with a tSNE embedding in object@@tsne_rot
#
# @import Rtsne
# @import tsne
#
# Not currently supported
#
AddTSNE <- function(
  object,
  cells.use = NULL,
  pcs.use = 1:10,
  do.plot = TRUE,
  k.seed = 1,
  add.iter = 1000,
  ...
) {
  cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data))
  data.use <- object@pca.rot[cells.use, pcs.use]
  #data.dist=as.dist(mahalanobis.dist(data.use))
  set.seed(seed = k.seed)
  data.tsne <- data.frame(
    tsne(
      X = data.use,
      initial_config = as.matrix(x = object@tsne.rot[cells.use,]),
      max_iter = add.iter,
      ...
    )
  )
  colnames(x = data.tsne) <- paste0("tSNE_", 1:ncol(data.tsne))
  rownames(x = data.tsne) <- cells.use
  object@tsne.rot <- data.tsne
  return(object)
}

#calculate true positives, used in AUC
calcTP <- function(cutoff, data, score, real, nTP) {
  return(length(x = which(x = (data[, score] > cutoff) & (data[, real] > 0))) / nTP)
}

#calculate false positives, used in AUC
calcFP <- function(cutoff, data, score, real, nFP) {
  return(length(x = which(x = (data[, score] > cutoff) & (data[, real] == 0))) / nFP)
}

#i do not believe we use this function, but leaving it in to be safe
auc <- function(data, score, real, n = 20) {
  totalPos <- length(x = which(x = data[, real] == 1))
  totalNeg <- length(x = which(x = data[, real] == 0))
  scores <- data[, score]
  data$myScore <- (scores + min(scores)) / (max(scores) + min(scores))
  tp <- unlist(x = lapply(
    X = seq(from = -0.0001, to = 0.9999, by = 1 / n),
    FUN = calcTP,
    data = data,
    score = "myScore",
    real = real,
    nTP = totalPos
  ))
  fp <- unlist(x = lapply(
    X = seq(from = -0.0001, to = 0.9999, by = 1 / n),
    FUN = calcFP,
    data = data,
    score = "myScore",
    real = real,
    nFP = totalNeg
  ))
  plot(x = c(fp, 1), y = c(tp, 1), xlim = c(0, 1), ylim = c(0, 1))
  x1 <- c(1, fp)
  x2 <- c(1, tp)
  print(
    sum(diff(x = rev(x = x2)) * diff(x = rev(x = x1))) /
      2 + sum(diff(x = rev(x = x1)) * rev(x = x2[-1]))
  )
  return(list(c(1, fp), c(1, tp)))
}

#useful with FindAllMarkers, not ready for main package yet
returnTopX <- function(data, group.by, n.return, col.return = NA) {
  to.ret <- c()
  levels.use=unique(group.by); if (is.factor(group.by)) levels.use=levels(group.by)
  if (!is.na(col.return)) return(unlist(lapply(levels.use, function(x) head(data[group.by==x,col.return],n.return)))) else {
    return(unlist(lapply(levels.use, function(x) head(rownames(data[group.by==x,])))))
  }
}

#i like this, but not used too much yet
genes.ca.range <- function(object, my.min, my.max) {
  ca <- AverageDetectionRate(object = object)
  ca.min <- apply(X = ca, MARGIN = 1, FUN = min)
  ca.max <- apply(X = ca, MARGIN = 1, FUN = max)
  genes.1 <- names(x = ca.min[ca.min < my.max])
  genes.2 <- names(x = ca.max[ca.max > my.min])
  return(intersect(x = genes.1, y = genes.2))
}

# # Not currently supported, but a cool function for QC
# CalcNoiseModels <- function(
#   object,
#   cell.ids = NULL,
#   trusted.genes = NULL,
#   n.bin = 20,
#   drop.expr = 1
# ) {
#   object@drop.expr <- drop.expr
#   cell.ids <- SetIfNull(x = cell.ids, default = 1:ncol(x = object@data))
#   trusted.genes <- SetIfNull(x = trusted.genes, default = rownames(x = object@data))
#   trusted.genes <- trusted.genes[trusted.genes %in% rownames(x = object@data)]
#   object@trusted.genes <- trusted.genes
#   data <- object@data[trusted.genes, ]
#   idents <- data.frame(data[, 1])
#   code_humpAvg <- apply(X = data, MARGIN = 1, FUN = MeanGreaterThan, min = object@drop.expr)
#   code_humpAvg[code_humpAvg > 9] <- 9
#   code_humpAvg[is.na(x = code_humpAvg)] <- 0
#   idents$code_humpAvg <- code_humpAvg
#   data[data > object@drop.expr] <- 1
#   data[data < object@drop.expr] <- 0
#   data$bin <- cut(x = code_humpAvg, breaks = n.bin)
#   data$avg <- code_humpAvg
#   rownames(x = idents) <- rownames(x = data)
#   my.coefs <- data.frame(t(x = pbsapply(
#     X = colnames(x = data[1:(ncol(x = data) - 2)]),
#     FUN = getAB,
#     data = data,
#     data2 = idents,
#     status = "code",
#     code2 = "humpAvg",
#     hasBin = TRUE,
#     doPlot = FALSE
#   )))
#   colnames(x = my.coefs) <- c("a", "b")
#   object@drop.coefs <- my.coefs
#   return(object)
# }

# Visualize expression/dropout curve
#
# Plot the probability of detection vs average expression of a gene.
#
# Assumes that this 'noise' model has been precomputed with CalcNoiseModels
#
# @param object Seurat object
# @param cell.ids Cells to use
# @param col.use Color code or name
# @param lwd.use Line width for curve
# @param do.new Create a new plot (default) or add to existing
# @param x.lim Maximum value for X axis
# @param \dots Additional arguments to pass to lines function
# @return Returns no value, displays a plot
#
# PlotNoiseModel <- function(
#   object,
#   cell.ids = c(1, 2),
#   col.use = 'black',
#   lwd.use = 2,
#   do.new = TRUE,
#   x.lim = 10,
#   ...
# ) {
#   cell.coefs <- object@drop.coefs[cell.ids,]
#   if (do.new) {
#     plot(
#       x = 1,
#       y = 1,
#       pch = 16,
#       type = 'n',
#       xlab = 'Average expression',
#       ylab = 'Probability of detection',
#       xlim = c(0, x.lim),
#       ylim =c(0, 1)
#     )
#   }
#   unlist(
#     x = lapply(
#       X = 1:length(x = cell.ids),
#       FUN = function(y) {
#         x.vals <- seq(from = 0, to = x.lim, by = 0.05)
#         y.vals <- unlist(x = lapply(
#           X = x.vals,
#           FUN = calc.drop.prob,
#           a = cell.coefs[y, 1],
#           b = cell.coefs[y, 2])
#         )
#         lines(
#           x = x.vals,
#           y = y.vals,
#           lwd = lwd.use,
#           col = col.use,
#           ...
#         )
#       }
#     )
#   )
# }

# Deleted, but used in regression.sig (which was kept)...
vsubc <- function(data,code) {
  return(data[grep(pattern = code, x = names(x = data))])
}

regression.sig <- function(x, score, data, latent, code = "rsem") {
  if (var(x = as.numeric(x = SubsetColumn(data = data, code = code)[x, ])) == 0) {
    return(0)
  }
  latent <- latent[grep(pattern = code, x = names(x = data))]
  data <- rbind(SubsetColumn(data = data, code = code), vsubc(data = score, code = code))
  rownames(x = data)[nrow(x = data)] <- "score"
  data2 <- data[c(x, "score"), ]
  rownames(x = data2)[1] <- "fac"
  if (length(x = unique(x = latent)) > 1) {
    mylm <- lm(formula = score ~ fac + latent, data = data.frame(t(x = data2)))
  } else {
    mylm <- lm(formula = score ~ fac, data = data.frame(t(x = data2)))
  }
  return(coef(object = summary(object = mylm))["fac", 3])
}

# Documentation
# @export
#
RemovePC <- function(object, pcs.remove, use.full = FALSE, ...) {
  data.old <- object@data
  pcs.use <- setdiff(x = 1:ncol(x = object@pca.obj[[1]]$rotation), y = pcs.remove)
  if (use.full) {
    data.x <- as.matrix(
      x = object@pca.x.full[, intersect(
        x = pcs.use,
        y = 1:ncol(x = object@pca.x.full)
      )]
    )
  } else {
    data.x <- as.matrix(x = object@pca.obj[[1]]$x[, pcs.use])
  }
  data.1 <- data.x %*% t(x = as.matrix(x = object@pca.obj[[1]]$rotation[, pcs.use]))
  data.2 <- sweep(
    x = data.1,
    MARGIN = 2,
    STATS = colMeans(x = object@scale.data),
    FUN = "+"
  )
  data.3 <- sweep(
    x = data.2,
    MARGIN = 1,
    STATS = apply(X = object@data[rownames(x = data.2), ], MARGIN = 1, FUN = sd),
    FUN = "*"
  )
  data.3 <- sweep(
    X = data.3,
    MARGIN = 1,
    STATS = apply(X = object@data[rownames(x = data.2), ], MARGIN = 1, FUN = mean),
    FUN = "+"
  )
  object@scale.data <- (data.2)
  data.old <- data.old[rownames(x = data.3), ]
  data.4 <- data.3
  data.4[data.old == 0] <- 0
  data.4[data.4 < 0] <- 0
  object@data[rownames(x = data.4), ] <- data.frame(data.4)
  return(object)
}

# Documentation
# @export
#
CalinskiPlot <- function(
  object,
  pcs.use,
  pval.cut = 0.1,
  gene.max = 15,
  col.max = 25,
  use.full = TRUE
) {
  if (length(x = pcs.use) == 1) {
    pvals.min <- object@jackStraw.empP.full[, pcs.use]
  } else if (length(x = pcs.use) > 1) {
    pvals.min <- apply(
      X = object@jackStraw.empP.full[, pcs.use],
      MARGIN = 1,
      FUN = min
    )
  }
  names(x = pvals.min) <- rownames(x = object@jackStraw.empP.full)
  genes.use <- names(x = pvals.min)[pvals.min < pval.cut]
  genes.use <- genes.use[genes.use %in% rownames(do.NULL = object@scale.data)]
  par(mfrow = c(1, 2))
  mydata <- object@scale.data[genes.use, ]
  wss <- (nrow(x = mydata) - 1) * sum(apply(X = mydata, MARGIN = 2, FUN = var))
  for (i in 1:gene.max) {
    wss[i] <- sum(kmeans(x = mydata, centers=i)$withinss)
  }
  plot(
    x = 1:gene.max,
    y = wss,
    type = "b",
    xlab= "Number of Clusters for Genes",
    ylab = "Within groups sum of squares"
  )
  mydata <- t(x = object@scale.data[genes.use, ])
  wss <- (nrow(x = mydata) - 1) * sum(apply(X = mydata, MARGIN = 2, FUN = var))
  for (i in 1:col.max) {
    wss[i] <- sum(kmeans(x = mydata, centers = i)$withinss)
  }
  plot(
    x = 1:col.max,
    y = wss,
    type = "b",
    xlab = "Number of Clusters for Cells",
    ylab = "Within groups sum of squares"
  )
  ResetPar()
  return(object)
}

# Documentation
# #' @export
# #' @importFrom stats cor kmeans
# #' @importFrom NMF aheatmap
#
# CellCorMatrix <- function(
#   object,
#   cor.genes = NULL,
#   cell.inds = NULL,
#   do.k = FALSE,
#   k.seed = 1,
#   k.num = 4,
#   vis.low = (-1),
#   vis.high = 1,
#   vis.one = 0.8,
#   pcs.use = 1:3,
#   col.use = pyCols
# ) {
#   cor.genes <- SetIfNull(x = cor.genes, default = object@var.genes)
#   cell.inds <- SetIfNull(x = cell.inds, default = colnames(x = object@data))
#   cor.genes <- cor.genes[cor.genes %in% rownames(x = object@data)]
#   data.cor <- object@scale.data[cor.genes, cell.inds]
#   cor.matrix <- cor(x = data.cor)
#   set.seed(seed = k.seed)
#   kmeans.cor <- kmeans(x = cor.matrix, centers = k.num)
#   if (do.k) {
#     cor.matrix <- cor.matrix[order(kmeans.cor$cluster), order(kmeans.cor$cluster)]
#   }
#   kmeans.names <- rownames(x = cor.matrix)
#   row.annot <- data.frame(
#     cbind(
#       kmeans.cor$cluster[kmeans.names],
#       object@pca.rot[kmeans.names, pcs.use]
#     )
#   )
#   colnames(x = row.annot) <- c("K", paste0("PC", pcs.use))
#   cor.matrix[cor.matrix == 1] <- vis.one
#   cor.matrix <- MinMax(data = cor.matrix, min = vis.low, max = vis.high)
#   object@kmeans.cell <- list(kmeans.cor)
#   if (do.k) {
#     aheatmap(
#       x = cor.matrix,
#       col = col.use,
#       Rowv = NA,
#       Colv = NA,
#       annRow = row.annot
#     )
#   } else {
#     heatmap.2(
#       x = cor.matrix,
#       trace = "none",
#       Rowv = NA,
#       Colv = NA,
#       col = pyCols
#     )
#   }
#   return(object)
# }

# Documentation
# #' @export
#
# GeneCorMatrix <- function(
#   object,
#   cor.genes = NULL,
#   cell.inds = NULL,
#   do.k = FALSE,
#   k.seed = 1,
#   k.num = 4,
#   vis.low = (-1),
#   vis.high = 1,
#   vis.one = 0.8,
#   pcs.use = 1:3,
#   col.use = pyCols
# ) {
#   cor.genes <- SetIfNull(x = cor.genes, default = object@var.genes)
#   cell.inds <- SetIfNull(x = cell.inds, default = colnames(x = object@data))
#   cor.genes <- cor.genes[cor.genes %in% rownames(x = object@data)]
#   data.cor <- object@data[cor.genes, cell.inds]
#   cor.matrix <- cor(x = t(x = data.cor))
#   set.seed(seed = k.seed)
#   kmeans.cor <- kmeans(x = cor.matrix, centers = k.num)
#   cor.matrix <- cor.matrix[order(kmeans.cor$cluster), order(kmeans.cor$cluster)]
#   kmeans.names <- rownames(x = cor.matrix)
#   row.annot <- data.frame(
#     cbind(
#       kmeans.cor$cluster[kmeans.names],
#       object@pca.x[kmeans.names, pcs.use]
#     )
#   )
#   colnames(x = row.annot) <- c("K", paste0("PC", pcs.use))
#   cor.matrix[cor.matrix == 1] <- vis.one
#   cor.matrix <- MinMax(data = cor.matrix, min = vis.low, max = vis.high)
#   object@kmeans.gene <- list(kmeans.cor)
#   if (do.k) {
#     aheatmap(
#       x = cor.matrix,
#       col = col.use,
#       Rowv = NA,
#       Colv = NA,
#       annRow = row.annot
#     )
#   } else {
#     aheatmap(
#       x = cor.matrix,
#       col = col.use,
#       annRow = row.annot
#     )
#   }
#   return(object)
# }


#   Documentation
ProjectSamples <- function(object, new.samples) {
  genes.use <- rownames(x = object@data)
  genes.pca <- rownames(x = object@pca.x)
  data.project <- object@scale.data[genes.pca, ]
  data.project[is.na(x = data.project)] <- 0
  new.rot = t(x = data.project) %*% as.matrix(x = object@pca.x)
  scale.vec = apply(
    X = new.rot,
    MARGIN = 2,
    FUN = function(x) {
      return(sqrt(x = sum(x ^ 2)))
    }
  )
  new.rot.scale <- scale(x = new.rot, center = FALSE, scale = scale.vec)
  object@pca.rot <- as.data.frame(x = new.rot.scale)
  return(object)
}

# Documentation
#Cool, but not supported right now
SpatialDe <- function(object, marker.cells, genes.use = NULL) {
  embed.map <- object@tsne.rot
  mult.use <- 2
  mult.use.far <- 10
  if ((mult.use.far * length(x = marker.cells)) > nrow(x = embed.map)) {
    mult.use.far <- 1
    mult.use <- 1
  }
  genes.use <- SetIfNull(x = genes.use, default = object@var.genes)
  marker.pos <- apply(X = embed.map[marker.cells, ], MARGIN = 2, FUN = mean)
  embed.map <- rbind(embed.map, marker.pos)
  rownames(x = embed.map)[nrow(x = embed.map)] <- "marker"
  embed.dist <- sort(x = as.matrix(x = dist(x = (embed.map)))["marker", ])
  embed.diff <- names(x = embed.dist[! (names(x = embed.dist) %in% marker.cells)][1:(mult.use * length(x = marker.cells))][-1])
  embed.diff.far <- names(x = embed.dist[! (names(x = embed.dist) %in% marker.cells)][1:(mult.use.far * length(x = marker.cells))][-1])
  diff.genes <- rownames(
    x = subset(
      x = DiffExpTest(
        object = object,
        cells.1 = marker.cells,
        cells.2 = embed.diff,
        genes.use = genes.use
      ),
      subset = p_val < (1e-5)
    )
  )
  diff.genes <- subset(
    x = DiffExpTest(
      object = object,
      cells.1 = marker.cells,
      cells.2 = embed.diff,
      genes.use = diff.genes
    ),
    subset = p_val<(1e-10)
  )
  return(diff.genes)
}

#' Identify potential genes associated with batch effects
#'
#' Test for genes whose expression value is strongly predictive of batch (based
#' on ROC classification). Important note: Assumes that the 'batch' of each
#' cell is assigned to the cell's identity class (will be improved in a future
#' release)
#'
#' @param object Seurat object
#' @param idents.use Batch names to test
#' @param genes.use Gene list to test
#' @param auc.cutoff Minimum AUC needed to qualify as a 'batch gene'
#' @param thresh.use Limit testing to genes which show, on average, at least
#' X-fold difference (log-scale) in any one batch
#'
#' @return Returns a list of genes that are strongly correlated with batch.
#'
#' @export
#'
BatchGene <- function(
  object,
  idents.use,
  genes.use = NULL,
  auc.cutoff = 0.6,
  thresh.use = 0
) {
  batch.genes <- c()
  genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@data))
  for (ident in idents.use) {
    cells.1 <- names(x = object@ident)[object@ident == ident]
    cells.2 <- names(x = object@ident)[object@ident != ident]
    if ((length(x = cells.1) < 5) | (length(x = cells.2) < 5)) {
      break
    }
    markers.ident <- MarkerTest(
      object = object,
      cells.1 = cells.1,
      cells.2 = cells.2,
      genes.use = genes.use,
      print.bar = thresh.use
    )
    batch.genes <- unique(
      x = c(
        batch.genes,
        rownames(x = subset(x = markers.ident, subset = myAUC > auc.cutoff))
      )
    )
  }
  return(batch.genes)
}



# Documentation
#multicore version of jackstraw
#DOES NOT WORK WITH WINDOWS
# @export
#
JackStrawMC <- function(
  object,
  num.pc = 30,
  num.replicate = 100,
  prop.freq = 0.01,
  do.print = FALSE,
  num.cores = 8
) {
  pc.genes <- rownames(x = object@pca.x)
  if (length(x = pc.genes) < 200) {
    prop.freq <- max(prop.freq, 0.015)
  }
  md.x <- as.matrix(x = object@pca.x)
  md.rot <- as.matrix(x = object@pca.rot)
  if (do.print) {
    fake.pcVals.raw <- mclapply(
      X = 1:num.replicate,
      FUN = function(x) {
        print(x)
        return(JackRandom(
          scaled.data = object@scale.data[pc.genes, ],
          prop.use = prop.freq,
          r1.use = 1,
          r2.use = num.pc,
          seed.use = x
        ))
      },
      mc.cores = num.cores
    )
  } else {
    fake.pcVals.raw <- mclapply(
      X = 1:num.replicate,
      FUN = function(x) {
        return(JackRandom(
          scaled.data = object@scale.data[pc.genes, ],
          prop.use = prop.freq,
          r1.use = 1,
          r2.use = num.pc,
          seed.use=x
        ))
      }, mc.cores = num.cores
    )
  }
  fake.pcVals <- simplify2array(
    x = mclapply(
      X = 1:num.pc,
      FUN = function(x) {
        return(as.numeric(x = unlist(x = lapply(
          X = 1:num.replicate,
          FUN = function(y) {
            return(fake.pcVals.raw[[y]][, x])
          }
        ))))
      },
      mc.cores = num.cores
    )
  )
  object@jackStraw.fakePC <- data.frame(fake.pcVals)
  object@jackStraw.empP <- data.frame(
    simplify2array(
      x = mclapply(
        X = 1:num.pc,
        FUN = function(x) {
          return(unlist(x = lapply(
            X = abs(md.x[, x]),
            FUN = EmpiricalP,
            nullval = abs(x = fake.pcVals[, x])
          )))
        },
        mc.cores = num.cores
      )
    )
  )
  colnames(x = object@jackStraw.empP) <- paste0("PC", 1:ncol(x = object@jackStraw.empP))
  return(object)
}


# # Documentation
# JackStrawFull <- function(
#   object,
#   num.pc = 5,
#   num.replicate = 100,
#   prop.freq = 0.01
# ) {
#   pc.genes <- rownames(x = object@pca.x)
#   if (length(x = pc.genes) < 200) {
#     prop.freq <- max(prop.freq, 0.015)
#   }
#   md.x <- as.matrix(x = object@pca.x)
#   md.rot <- as.matrix(x = object@pca.rot)
#   real.fval <- sapply(
#     X = 1:num.pc,
#     FUN = function(x) {
#       return(unlist(x = lapply(
#         X = pc.genes,
#         FUN = JackF,
#         r1 = x,
#         r2 = x,
#         x = md.x,
#         rot = md.rot
#       )))
#     }
#   )
#   rownames(x = real.fval) <- pc.genes
#   object@real.fval <- data.frame(real.fval)
#   fake.fval <- sapply(
#     X = 1:num.pc,
#     FUN = function(x) {
#       return(unlist(x = replicate(
#         n = num.replicate,
#         expr = JackstrawF(
#           prop = prop.freq,
#           data = object@scale.data[pc.genes, ],
#           myR1 = x,
#           myR2 = x
#         ),
#         simplify = FALSE
#       )))
#     }
#   )
#   rownames(x = fake.fval) <- 1:nrow(x = fake.fval)
#   object@fake.fval <- data.frame(fake.fval)
#   object@emp.pval <- data.frame(
#     sapply(
#       X = 1:num.pc,
#       FUN = function(x) {
#         return(unlist(x = lapply(
#           X = object@real.fval[, x],
#           FUN = EmpiricalP,
#           nullval = object@fake.fval[, x]
#         )))
#       }
#     )
#   )
#   rownames(x = object@emp.pval) <- pc.genes
#   colnames(x = object@emp.pval) <- paste0("PC", 1:ncol(x = object@emp.pval),)
#   return(object)
# }

# Documentation
# #' @export
#
# JackStrawPermutationTest <- function(
#   object,
#   genes.use = NULL,
#   num.iter = 100,
#   thresh.use = 0.05,
#   do.print = TRUE,
#   k.seed = 1
# ) {
#   genes.use <- SetIfNull(x = genes.use, default = rownames(x = object@pca.x))
#   genes.use <- intersect(x = genes.use, y = rownames(x = object@scale.data))
#   data.use <- t(x = as.matrix(x = object@scale.data[genes.use, ]))
#   if (do.print) {
#     print(paste("Running", num.iter, "iterations"))
#   }
#   pa.object <- permutationPA(
#     data.use,
#     B = num.iter,
#     threshold = thresh.use,
#     verbose = do.print,
#     seed = k.seed
#   )
#   if (do.print) {
#     cat("\n\n")
#   }
#   if (do.print) {
#     print(paste("JackStraw returns", pa.object$r, "significant components"))
#   }
#   return(pa.object)
# }

#' Run Canonical Correlation Analysis (CCA) on multimodal data
#'
#' CCA finds a shared correlation structure betwen two different datasets, enabling integrated downstream analysis
#'
#' @param object Seurat object
#' @param assay.1 First assay for multimodal analysis. Default is RNA
#' @param assay.2 Second assay for multimodal analysis. Default is CITE for CITE-Seq analysis.
#' @param features.1 Features of assay 1 to consider (default is variable genes)
#' @param features.2 Features of assay 2 to consider (default is all features, i.e. for CITE-Seq, all antibodies)
#' @param num.cc Number of canonical correlations to compute and store. Default is 20, but will calculate less if either assay has <20 features.
#' @param normalize.variance Z-score the embedding of each CC to 1, so each CC contributes equally in downstream analysis (default is T)
#'
#' @return Returns object after CCA, with results stored in dimensional reduction cca.assay1 (ie. cca.RNA) and cca.assay2. For example, results can be visualized using DimPlot(object,reduction.use="cca.RNA")
#'
#' @export
#'
MultiModal_CCA <- function(
  object,
  assay.1 = "RNA",
  assay.2 = "CITE",
  features.1 = NULL,
  features.2 = NULL,
  num.cc = 20,
  normalize.variance = TRUE
) {
  #first pull out data, define features
  data.1 <- GetAssayData(
    object = object,
    assay.type = assay.1,
    slot = "scale.data"
  )
  data.2 <- GetAssayData(
    object = object,
    assay.type = assay.2,
    slot = "scale.data"
  )
  if (is.null(x = features.1)) {
    if ((assay.1 == "RNA") && length(x = object@var.genes) > 0) {
      features.1 <- object@var.genes
    } else {
      features.1 <- rownames(x = data.1)
    }
  }
  features.2 <- SetIfNull(x = features.2, default = rownames(x = data.2))
  data.1 <- t(x = data.1[features.1, ])
  data.2 <- t(x = data.2[features.2, ])
  num.cc <- min(20, min(length(x = features.1), length(x = features.2)))
  cca.data <- list(data.1, data.2)
  names(x = cca.data) <- c(assay.1, assay.2)
  # now run CCA
  out <- CanonCor(mat1 = cca.data[[1]],
                  mat2 = cca.data[[2]],
                  standardize = TRUE,
                  k = num.cc)
  cca.output <- list(out$u, out$v)
  embeddings.cca <- list()
  for (i in 1:length(x = cca.data)) {
    assay.use <- names(x = cca.data)[i]
    rownames(x = cca.output[[i]]) <- colnames(x = cca.data[[i]])
    embeddings.cca[[i]] <- cca.data[[i]] %*% cca.output[[i]]
    colnames(x = embeddings.cca[[i]]) <- paste0(
      assay.use,
      "CC",
      1:ncol(x = embeddings.cca[[i]])
    )
    colnames(x = cca.output[[i]]) <- colnames(x = embeddings.cca[[i]])
    if (normalize.variance) {
      embeddings.cca[[i]] <- scale(x = embeddings.cca[[i]])
    }
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay.use, "CCA"),
      slot = "cell.embeddings",
      new.data = embeddings.cca[[i]]
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay.use, "CCA"),
      slot = "key",
      new.data = paste0(assay.use, "CC")
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay.use, "CCA"),
      slot = "x",
      new.data =  cca.output[[i]]
    )
  }
  return(object)
}

#' Run coinertia analysis on multimodal data
#'
#' CIA finds a shared correlation structure betwen two different datasets, enabling integrated downstream analysis
#'
#' @param object Seurat object
#' @param assay.1 First assay for multimodal analysis. Default is RNA
#' @param assay.2 Second assay for multimodal analysis. Default is CITE for CITE-Seq analysis.
#' @param features.1 Features of assay 1 to consider (default is variable genes)
#' @param features.2 Features of assay 2 to consider (default is all features, i.e. for CITE-Seq, all antibodies)
#' @param num.axes Number of principal axes to compute and store. Default is 20, but will calculate less if either assay has <20 features.
#' @param normalize.variance Return the normalized row scares, so each aexis contributes equally in downstream analysis (default is T)
#'
#' @importFrom utils installed.packages
#'
#' @return Returns object after CIA, with results stored in dimensional reduction cia.assay1 (ie. cia.RNA) and cia.assay2. For example, results can be visualized using DimPlot(object,reduction.use="cia.RNA")
#'
#' @export
#'
MultiModal_CIA <- function(
  object,
  assay.1 = "RNA",
  assay.2 = "CITE",
  features.1 = NULL,
  features.2 = NULL,
  num.axes = 20,
  normalize.variance = TRUE
) {
  if (!'made4' %in% rownames(x = installed.packages())) {
    stop("Please install made4")
  }
  #first pull out data, define features
  data.1 <- GetAssayData(
    object = object,
    assay.type = assay.1,
    slot = "scale.data"
  )
  data.2 <- GetAssayData(
    object = object,
    assay.type = assay.2,
    slot = "scale.data"
  )
  if (is.null(x = features.1)) {
    if ((assay.1 == "RNA") && length(x = object@var.genes) > 0) {
      features.1 <- object@var.genes
    } else {
      features.1 <- rownames(x = data.1)
    }
  }
  features.2 <- SetIfNull(x = features.2, default = rownames(x = data.2))
  data.1 <- t(x = data.1[features.1, ])
  data.2 <- t(x = data.2[features.2, ])
  num.axes <- min(20, min(length(x = features.1), length(x = features.2)))
  cia.data <- list(data.1, data.2)
  names(x = cia.data) <- c(assay.1, assay.2)
  # now run cia
  out <- made4::cia(
    df1 = t(x = cia.data[[1]]),
    df2 = t(x = cia.data[[2]]),
    cia.nf = num.axes
  )
  out <- out$coinertia
  cia.output <- list(as.matrix(x = out$c1), as.matrix(x = out$l1))
  embeddings.cia.norm <- list(as.matrix(x = out$mX), as.matrix(x = out$mY))
  embeddings.cia <- list(as.matrix(x = out$lX), as.matrix(x = out$lY))
  for (i in 1:length(x = cia.data)) {
    assay.use <- names(x = cia.data)[i]
    #rownames(cia.output[[i]])=colnames(cia.data[[i]])
    if (normalize.variance) {
      embeddings.cia[[i]] <- (embeddings.cia.norm[[i]])
    }
    colnames(x = embeddings.cia[[i]]) <- paste0(
      assay.use,
      "CI",
      1:ncol(x = embeddings.cia[[i]])
    )
    colnames(x = cia.output[[i]]) <- colnames(x = embeddings.cia[[i]])
    object <- SetDimReduction(
      object = object,
      reduction.type = paste("cia", assay.use, sep="_"),
      slot = "cell.embeddings",
      new.data = embeddings.cia[[i]]
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste("cia", assay.use, sep="_"),
      slot = "key",
      new.data = paste0(assay.use," CI")
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste("cia", assay.use, sep="_"),
      slot = "x",
      new.data =  cia.output[[i]]
    )
  }
  return(object)
}

#' Reorder identity classes
#'
#' Re-assigns the identity classes according to the average expression of a
#' particular feature (i.e, gene expression, or PC score)
#' Very useful after clustering, to re-order cells, for example, based on PC
#' scores
#'
#' @param object Seurat object
#' @param feature Feature to reorder on. Default is PC1
#' @param rev Reverse ordering (default is FALSE)
#' @param aggregate.fxn Function to evaluate each identity class based on
#' (default is mean)
#' @param reorder.numeric Rename all identity classes to be increasing numbers
#' starting from 1 (default is FALSE)
#' @param \dots additional arguments (i.e. use.imputed=TRUE)
#'
#' @return A Seurat object where the identity classes have been re-ordered based
#' on the average.
#'
#' @export
#'
#' @examples
#' head(x = pbmc_small@ident)
#' pbmc_small <- ReorderIdent(object = pbmc_small)
#' head(x = pbmc_small@ident)
#'
ReorderIdent <- function(
  object,
  feature = "PC1",
  rev = FALSE,
  aggregate.fxn = mean,
  reorder.numeric = FALSE,
  ...
) {
  ident.use <- object@ident
  data.use <- FetchData(object = object, vars.all = feature, ...)[, 1]
  revFxn <- Same
  if (rev) {
    revFxn <- function(x) {
      return(max(x) + 1 - x)
    }
  }
  names.sort <- names(
    x = revFxn(
      sort(
        x = tapply(
          X = data.use,
          INDEX = (ident.use),
          FUN = aggregate.fxn
        )
      )
    )
  )
  ident.new <- factor(x = ident.use, levels = names.sort, ordered = TRUE)
  if (reorder.numeric) {
    ident.new <- factor(
      x = revFxn(
        rank(
          tapply(
            X = data.use,
            INDEX = as.numeric(x = ident.new),
            FUN = mean
          )
        )
      )[as.numeric(ident.new)],
      levels = 1:length(x = levels(x = ident.new)),
      ordered = TRUE
    )
  }
  names(x = ident.new) <- names(x = ident.use)
  object@ident <- ident.new
  return(object)
}

#' FastWhichCells
#' Identify cells matching certain criteria (limited to character values)
#' @param object Seurat object
#' @param group.by Group cells in different ways (for example, orig.ident).
#' Should be a column name in object@meta.data
#' @param subset.value  Return cells matching this value
#' @param invert Invert the cells to return. FALSE by default
#'
#' @export
#'
#' @examples
#' FastWhichCells(object = pbmc_small, group.by = 'res.1', subset.value = 1)
#'
FastWhichCells <- function(object, group.by, subset.value, invert = FALSE) {
  object <- SetAllIdent(object = object, id = group.by)
  cells.return <- WhichCells(object = object, ident = subset.value)
  if (invert) {
    cells.return <- setdiff(x = object@cell.names, y = cells.return)
  }
  return(cells.return)
}

#' Switch identity class definition to another variable
#'
#' @param object Seurat object
#' @param id Variable to switch identity class to (for example, 'DBclust.ident',
#' the output of density clustering) Default is orig.ident - the original
#' annotation pulled from the cell name.
#'
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @export
#'
#' @examples
#' head(x = pbmc_small@ident)
#' pbmc_small <- SetAllIdent(object = pbmc_small, id = 'orig.ident')
#' head(x = pbmc_small@ident)
#'
SetAllIdent <- function(object, id = NULL) {
  id <- SetIfNull(x = id, default = "orig.ident")
  if (id %in% colnames(x = object@meta.data)) {
    cells.use <- rownames(x = object@meta.data)
    ident.use <- object@meta.data[, id]
    object <- SetIdent(
      object = object,
      cells.use = cells.use,
      ident.use = ident.use
    )
  }
  return(object)
}

#' Rename one identity class to another
#'
#' Can also be used to join identity classes together (for example, to merge
#' clusters).
#'
#' @param object Seurat object
#' @param old.ident.name The old identity class (to be renamed)
#' @param new.ident.name The new name to apply
#'
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @export
#'
#' @examples
#' head(x = pbmc_small@ident)
#' pbmc_small <- RenameIdent(
#'   object = pbmc_small,
#'   old.ident.name = 0,
#'   new.ident.name = 'cluster_0'
#' )
#' head(x = pbmc_small@ident)
#'
RenameIdent <- function(object, old.ident.name = NULL, new.ident.name = NULL) {
  if (!old.ident.name %in% object@ident) {
    stop(paste("Error : ", old.ident.name, " is not a current identity class"))
  }
  new.levels <- old.levels <- levels(x = object@ident)
  if (new.ident.name %in% old.levels) {
    new.levels <- new.levels[new.levels != old.ident.name]
  }
  if (!(new.ident.name %in% old.levels)) {
    new.levels[new.levels == old.ident.name] <- new.ident.name
  }
  ident.vector <- as.character(x = object@ident)
  names(x = ident.vector) <- names(object@ident)
  ident.vector[WhichCells(object = object, ident = old.ident.name)] <- new.ident.name
  object@ident <- factor(x = ident.vector, levels = new.levels)
  return(object)
}

#' Transfer identity class information (or meta data) from one object to another
#'
#' Transfers identity class information (or meta data) from one object to
#' another, assuming the same cell barcode names are in each. Can be very useful
#' if you have multiple Seurat objects that share a subset of underlying data.
#'
#' @param object.from Seurat object to transfer information from
#' @param object.to Seurat object to transfer information onto
#' @param data.to.transfer What data should be transferred over? Default is the
#' identity class ("ident"), but can also include any column in
#' object.from@@meta.data
#' @param keep.existing For cells in object.to that are not present in
#' object.from, keep existing data? TRUE by default. If FALSE, set to NA.
#' @param add.cell.id1 Prefix to add (followed by an underscore) to cells in
#'  object.from. NULL by default, in which case no prefix is added.
#'
#' @return A Seurat object where object@@ident or object@@meta.data has been
#' appropriately modified
#'
#' @export
#'
#' @examples
#' # Duplicate the test object and assign random new idents to transfer
#' pbmc_small@@ident
#' pbmc_small2 <- SetIdent(object = pbmc_small, cells.use = pbmc_small@@cell.names,
#'  ident.use = sample(pbmc_small@@ident))
#' pbmc_small2@@ident
#' pbmc_small <- TransferIdent(object.from = pbmc_small2, object.to = pbmc_small)
#' pbmc_small@@ident
#'
TransferIdent <- function(object.from, object.to, data.to.transfer = "ident", keep.existing = TRUE, add.cell.id1 = NULL) {
  old_data <- as.character(FetchData(object = object.from, vars.all = data.to.transfer)[, 1])
  names(old_data) <- object.from@cell.names
  if (data.to.transfer %in% c("ident", colnames(object.to@meta.data))) {
    new_data <- FetchData(object = object.to, vars.all = data.to.transfer)
    if (!keep.existing) {
      new_data[, 1] <- "NA"
    }
    new_data <- as.character(new_data[, 1])
  }
  else {
    new_data <- rep("NA", length(object.to@cell.names))
  }
  names(new_data) <- object.to@cell.names
  if (!is.null(add.cell.id1)) {
    names(old_data) <- paste(names(old_data), add.cell.id1, sep = "_")
  }
  new_data[names(old_data)] <- old_data
  if (data.to.transfer == "ident") {
    object.to <- SetIdent(object.to, cells.use = names(new_data), ident.use = new_data)
  }
  else {
    object.to <- AddMetaData(object = object.to, metadata = new_data,col.name = data.to.transfer)
  }
  return(object.to)
}

#' Sets identity class information to be a combination of two object attributes
#'
#' Combines two attributes to define identity classes. Very useful if, for
#' example, you have multiple cell types and multiple replicates, and you want
#' to group cells based on combinations of both.
#'
#' @param object Seurat object
#' @param attribute.1 First attribute for combination. Default is "ident"
#' @param attribute.2 Second attribute for combination. Default is "orig.ident"
#' @return A Seurat object where object@@ident has been appropriately modified
#'
#' @export
#'
#' @examples
#' groups <- sample(c("group1", "group2", "group3"), size = 80, replace = TRUE)
#' celltype <- sample(c("celltype1", "celltype2", "celltype3"), size = 80, replace = TRUE)
#' new.metadata <- data.frame(groups = groups, celltype = celltype)
#' rownames(new.metadata) <- pbmc_small@@cell.names
#' pbmc_small <- AddMetaData(object = pbmc_small, metadata = new.metadata)
#' pbmc_small <- CombineIdent(object = pbmc_small, attribute.1 = "celltype", attribute.2 = "groups")
#' pbmc_small@@ident
#'
CombineIdent <- function(object, attribute.1 = "ident", attribute.2 = "orig.ident") {
  old_data <- FetchData(object = object, vars.all = c(attribute.1, attribute.2))
  new_ids <- sapply(X = 1:nrow(old_data), FUN = function(x){
    paste(as.character(old_data[x, 1]), as.character(old_data[x, 2]), sep = "_")
  })
  object <- SetIdent(object = object, cells.use = object@cell.names, ident.use = new_ids)
  return(object)
}

#' Add samples into existing Seurat object.
#'
#' @param object Seurat object
#' @param project Project name (string)
#' @param new.data Data matrix for samples to be added
#' @param min.cells Include genes with detected expression in at least this
#' many cells
#' @param min.genes Include cells where at least this many genes are detected
#' @param is.expr Expression threshold for 'detected' gene
#' @param do.normalize Normalize the data after merging. Default is TRUE.
#' If set, will perform the same normalization strategy as stored in the object
#' @param scale.factor scale factor in the log normalization
#' @param do.scale In object@@scale.data, perform row-scaling (gene-based z-score)
#' @param do.center In object@@scale.data, perform row-centering (gene-based
#' centering)
#' @param names.field For the initial identity class for each cell, choose this
#' field from the cell's column name
#' @param names.delim For the initial identity class for each cell, choose this
#' delimiter from the cell's column name
#' @param meta.data Additional metadata to add to the Seurat object. Should be
#' a data frame where the rows are cell names, and the columns are additional
#' metadata fields
#' @param add.cell.id String to be appended to the names of all cells in
#' new.data. E.g. if add.cell.id = "rep1", "cell1" becomes "rep1_cell1"
#'
#' @import Matrix
#' @importFrom dplyr full_join
#'
#' @export
#'
#' @examples
#' pbmc1 <- SubsetData(object = pbmc_small, cells.use = pbmc_small@cell.names[1:40])
#' pbmc1
#' pbmc2 <- SubsetData(object = pbmc_small, cells.use = pbmc_small@cell.names[41:80])
#' pbmc2_data <- pbmc2@data
#' dim(pbmc2_data)
#' pbmc_added <- AddSamples(object = pbmc1, new.data = pbmc2_data)
#' pbmc_added
#'
AddSamples <- function(
  object,
  new.data,
  project = NULL,
  min.cells = 0,
  min.genes = 0,
  is.expr = 0,
  do.normalize = TRUE,
  scale.factor = 1e4,
  do.scale = FALSE,
  do.center = FALSE,
  names.field = 1,
  names.delim = "_",
  meta.data = NULL,
  add.cell.id = NULL
) {
  if (length(x = object@raw.data) < 2) {
    stop("Object provided has an empty raw.data slot. Adding/Merging performed on raw count data.")
  }
  if (!is.null(x = add.cell.id)) {
    colnames(x = new.data) <- paste(
      add.cell.id,
      colnames(x = new.data),
      sep = "_"
    )
  }
  if (any(colnames(x = new.data) %in% object@cell.names)) {
    stop("Duplicate cell names, please provide 'add.cell.id' for unique names")
  }
  combined.data <- RowMergeSparseMatrices(
    mat1 = object@raw.data[, object@cell.names],
    mat2 = new.data
  )
  if (is.null(x = meta.data)) {
    filler <- matrix(NA, nrow = ncol(new.data), ncol = ncol(object@meta.data))
    rownames(filler) <- colnames(new.data)
    colnames(filler) <- colnames(object@meta.data)
    filler <- as.data.frame(filler)
    combined.meta.data <- rbind(object@meta.data, filler)
  } else {
    combined.meta.data <- suppressMessages(
      suppressWarnings(
        full_join(x = object@meta.data, y = meta.data)
      )
    )
  }
  combined.meta.data$nGene <- NULL
  combined.meta.data$nUMI <- NULL
  if (!is.null(x = add.cell.id)) {
    combined.meta.data$orig.ident <- factor(
      x = combined.meta.data$orig.ident,
      levels = c(levels(x = combined.meta.data$orig.ident), add.cell.id)
    )
    combined.meta.data[colnames(x = new.data), "orig.ident"] <- add.cell.id
  }
  project <- SetIfNull(x = project, default = object@project.name)
  new.object <- CreateSeuratObject(
    raw.data = combined.data,
    project = project,
    min.cells = min.cells,
    min.genes = min.genes,
    is.expr = is.expr,
    scale.factor = scale.factor,
    do.scale = FALSE,
    do.center = FALSE,
    names.field = names.field,
    names.delim = names.delim
  )
  if (do.normalize) {
    normalization.method.use = GetCalcParam(
      object = object,
      calculation = "NormalizeData",
      parameter = "normalization.method"
    )
    scale.factor.use = GetCalcParam(
      object = object,
      calculation = "NormalizeData",
      parameter = "scale.factor"
    )
    if (is.null(x = normalization.method.use)) {
      normalization.method.use <- "LogNormalize"
      scale.factor.use <- 10000
    }
    new.object <- NormalizeData(
      object = new.object,
      assay.type = "RNA",
      normalization.method = normalization.method.use,
      scale.factor = scale.factor.use
    )
  }
  if (do.scale | do.center) {
    new.object <- ScaleData(
      object = new.object,
      do.scale = do.scale,
      do.center = do.center
    )
  }
  new.object@meta.data$orig.ident <- NULL
  new.object@meta.data <- cbind(new.object@meta.data, combined.meta.data)
  return(new.object)
}

# add values in log-space
#
# @param x Values
#
# @return values added in log space
#
LogAdd <- function(x) {
  mpi <- max(x)
  return(mpi + log(x = sum(exp(x = x - mpi))))
}
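
# e.g., LogAdd(log(c(0.1, 0.2))) returns log(0.3): LogAdd is equivalent to
# log(sum(exp(x))) but stays numerically stable for very negative log-values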

# Set a default value if an object is null
#
# @param x An object to set if it's null
# @param default The value to provide if x is null
#
# @return default if x is null, else x
#
SetIfNull <- function(x, default) {
  if (is.null(x = x)) {
    return(default)
  } else {
    return(x)
  }
}

#internal function for spatial mapping
ShiftCell <- function(bin, x, y) {
  bin.y <- (bin - 1) %/% 8 + 1
  bin.x <- (bin - 1) %% 8 + 1
  new.x <- MinMax(data = bin.x + x, min = 1, max = 8)
  new.y <- MinMax(data = bin.y + y, min = 1, max = 8)
  new.bin <- 8 * (new.y - 1) + new.x
  return(new.bin)
}
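
# Bins index an 8x8 grid, with bin = 8 * (y - 1) + x; bin 1 is (x = 1, y = 1)
# and bin 9 is (x = 1, y = 2). e.g., ShiftCell(bin = 1, x = 1, y = 0) returns 2,
# while shifts past the border are clamped: ShiftCell(bin = 1, x = -1, y = 0)
# returns 1.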

NeighborCells <- function(bin) {
  return(unique(x = c(
    bin,
    ShiftCell(bin = bin, x = 0, y = 1),
    ShiftCell(bin = bin, x = 1, y = 0),
    ShiftCell(bin = bin, x = -1, y = 0),
    ShiftCell(bin = bin, x = 0, y = -1)
  )))
}

AllNeighborCells <- function(bin, dist = 1) {
  all.comb <- expand.grid(rep(x = list(-dist:dist), 2))
  return(unique(x = unlist(x = lapply(
    X = 1:nrow(x = all.comb),
    FUN = function(x) {
      return(ShiftCell(bin = bin, x = all.comb[x, 1], y = all.comb[x, 2]))
    }))))
}

#FetchClosest bin, used internally in spatial mapping
#
#' @importFrom stats dist
#
FetchClosest <- function(bin, all.centroids, num.cell) {
  bin.y <- (bin - 1) %/% 8 + 1
  bin.x <- (bin - 1) %% 8 + 1
  all.centroids <- rbind(all.centroids, c(bin.x, bin.y))
  all.dist <- as.matrix(x = dist(x = all.centroids))
  return(names(x = sort(x = all.dist[nrow(x = all.dist), ]))[2:(num.cell + 2)])
}

#calculate refined mapping probabilities based on a multivariate normal distribution
slimdmvnorm <- function (x, mean = rep(0, p), sigma = diag(p), log = FALSE) {
  x <- matrix(data = x, ncol = length(x = x))
  p <- ncol(x = x)
  dec <- tryCatch(chol(x = sigma), error = function(e) e)
  tmp <- backsolve(r = dec, t(x = x) - mean, transpose = TRUE)
  rss <- colSums(tmp ^ 2)
  logretval <- -sum(log(x = diag(x = dec))) - 0.5 * p * log(x = 2 * pi) - 0.5 * rss
  names(x = logretval) <- rownames(x = x)
  return(logretval)
}
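
# Note: the log argument is currently ignored; the log density is always returned.
# e.g., in one dimension, slimdmvnorm(x = 0, mean = 0, sigma = matrix(1)) equals
# dnorm(x = 0, log = TRUE)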

# Internal, not documented for now
#
#' @importFrom utils write.table
#
CalcInsitu <- function(
  object,
  gene,
  do.plot = TRUE,
  do.write = FALSE,
  write.dir = "~/window/insitu/",
  col.use = BlackAndWhite(),
  do.norm = FALSE,
  cells.use = NULL,
  do.return = FALSE,
  probs.min = 0,
  do.log = FALSE,
  use.imputed = FALSE,
  bleach.use = 0
) {
  cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@spatial@final.prob))
  probs.use <- object@spatial@final.prob
  if (use.imputed) {
    data.use <- exp(x = object@imputed) - 1
  } else {
    data.use <- exp(object@data) - 1
  }
  cells.use <- cells.use[cells.use %in% colnames(x = probs.use)]
  cells.use <- cells.use[cells.use %in% colnames(x = data.use)]
  #insilico.stain=matrix(unlist(lapply(1:64,function(x) sum(probs.use[x,]*data.use[gene,]))),nrow=8,ncol=8)
  insilico.vector <- unlist(
    x = lapply(
      X = 1:64,
      FUN = function(x) {
        return(sum(
          as.numeric(x = probs.use[x, cells.use]) *
            as.numeric(x = data.use[gene, cells.use])
        ))
      }
    )
  )
  probs.total <- apply(X = probs.use, MARGIN = 1, FUN = sum)
  probs.total[probs.total < probs.min] <- probs.min
  insilico.stain <- (matrix(data = insilico.vector / probs.total, nrow=8, ncol=8))
  if (do.log) {
    insilico.stain <- log(x = insilico.stain + 1)
  }
  if (bleach.use > 0) {
    insilico.stain <- insilico.stain - bleach.use
    insilico.stain <- MinMax(data = insilico.stain, min=0, max=1e6)
  }
  if (do.norm) {
    insilico.stain <- (insilico.stain - min(insilico.stain)) /
      (max(insilico.stain) - min(insilico.stain))
  }
  title.use <- gene
  if (gene %in% colnames(x = object@spatial@insitu.matrix)) {
    pred.use <- prediction(
      predictions = insilico.vector / probs.total,
      labels = object@spatial@insitu.matrix[, gene],
      label.ordering = 0:1
    )
    perf.use <- performance(prediction.obj = pred.use, measure = "auc")
    auc.use <- round(x = perf.use@y.values[[1]], digits = 3)
    title.use <- paste(gene, sep=" ")
  }
  if (do.write) {
    write.table(
      x = insilico.stain,
      file = paste0(write.dir, gene, ".txt"),
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE
    )
  }
  if (do.plot) {
    heatmap2NoKey(x = insilico.stain,
                  Rowv = NA,
                  Colv = NA,
                  trace = "none",
                  col = col.use,
                  dimTitle = title.use)
  }
  if (do.return) {
    return(as.vector(x = insilico.stain))
  }
  return(object)
}

# Internal, not documented for now
#
#' @importFrom stats dnorm
#
map.cell.score <- function(gene, gene.value, insitu.bin, mu, sigma, alpha) {
  code.1 <- paste(gene, insitu.bin, sep=".")
  mu.use <- mu[paste(code.1, "mu", sep="."), 1]
  sigma.use <- sigma[paste(code.1, "sigma", sep="."), 1]
  alpha.use <- alpha[paste(code.1, "alpha", sep="."), 1]
  bin.prob <- unlist(
    x = lapply(
      X = 1:length(x = insitu.bin),
      FUN = function(x) {
        return(dnorm(
          x = gene.value,
          mean = mu.use[x],
          sd = sigma.use[x],
          log = TRUE) + log(x = alpha.use[x])
        )
      }
    )
  )
  return(bin.prob)
}

#Internal, not documented for now
#
MapCell <- function(
  object,
  cell.name,
  do.plot = FALSE,
  safe.use = TRUE,
  text.val = NULL,
  do.rev = FALSE
) {
  insitu.matrix <- object@spatial@insitu.matrix
  insitu.genes <- colnames(x = insitu.matrix)
  insitu.genes <- insitu.genes[insitu.genes %in% rownames(x = object@imputed)]
  insitu.use <- insitu.matrix[, insitu.genes]
  imputed.use <- object@imputed[insitu.genes, ]
  if (safe.use) {
    safe_fxn <- LogAdd
  } else {
    safe_fxn <- sum
  }
  all.needed.cols <- unique(
    x = unlist(
      x = lapply(
        X = insitu.genes,
        FUN = function(x) {
          return(paste(x, insitu.use[, x], "post", sep="."))
        }
      )
    )
  )
  missing.cols <- which(! (all.needed.cols %in% colnames(x = object@spatial@mix.probs)))
  if (length(x = missing.cols) > 0) {
    stop(paste(
      "Error: ",
      all.needed.cols[missing.cols],
      "is missing from the mixture fits"
    ))
  }
  all.probs <- data.frame(
    sapply(
      X = insitu.genes,
      FUN = function(x) {
        return(
          log(x = as.numeric(x = object@spatial@mix.probs[
            cell.name, # Row
            paste(x, insitu.use[, x], "post", sep=".") # Column
            ])))
      }
    )
  )
  scale.probs <- t(
    x = t(x = all.probs) - apply(X = t(x = all.probs), MARGIN = 1, FUN = LogAdd)
  )
  scale.probs[scale.probs < (-9.2)] <- (-9.2)
  #head(scale.probs)
  total.prob <- exp(x = apply(X = scale.probs, MARGIN = 1, FUN = safe_fxn))
  total.prob <- total.prob / sum(total.prob)
  if (do.plot) {
    #plot(total.prob,main=cell.name)
    par(mfrow = c(1, 2))
    txt.matrix <- matrix(data = rep(x = "", 64), nrow=8, ncol=8)
    if (! is.null(x = text.val)) {
      txt.matrix[text.val] <- "X"
    }
    if (do.rev) {
      scale.probs <- scale.probs[unlist(
        x = lapply(
          X = 0:7,
          FUN = function(x) {
            return(seq(from = 1, to = 57, by = 8) + x)
          }
        )
      ), ]
    }
    heatmap2NoKey(x = matrix(data = total.prob, nrow=8, ncol=8),
                  Rowv = NA,
                  Colv = NA,
                  col = BlackAndWhite(),
                  trace = "none"
    )
    heatmap2NoKey(x = scale.probs,
                  Rowv = NA,
                  Colv = NA,
                  col = BlueAndRed(),
                  trace = "none",
                  symbreaks = FALSE
    )
    ResetPar()
  }
  return(total.prob)
}

#Internal, not documented for now
#
#' @importFrom stats dist
#
iter.k.fit <- function(scale.data, cell.ident, data.use) {
  means.all <- sapply(
    X = sort(x = unique(x = cell.ident)),
    FUN = function(x) {
      return(apply(
        X = scale.data[, cell.ident == x, drop = FALSE],
        MARGIN = 1,
        FUN = mean
      ))
    }
  )
  if (is.null(x = dim(x = means.all))) {
    dim(x = means.all) <- c(1, length(x = means.all))
  }
  all.dist <- data.frame(
    t(x = sapply(
      X = 1:ncol(x = scale.data),
      FUN = function(x) {
        return(unlist(x = lapply(
          X = sort(x = unique(x = cell.ident)),
          FUN = function(y) {
            return(dist(x = rbind(scale.data[, x, drop = FALSE], means.all[, y])))
          }
        )))
      }
    ))
  )
  cell.ident <- apply(X = all.dist, MARGIN = 1, FUN = which.min)
  cell.ident <- order(tapply(
    X = as.numeric(x = data.use),
    INDEX = cell.ident,
    FUN = mean
  ))[cell.ident]
  return(cell.ident)
}

#return cell centroid after spatial mappings (both X and Y)
CellCentroid <- function(cell.probs) {
  centroid.x <- XCellCentroid(cell.probs = cell.probs)
  centroid.y <- YCellCentroid(cell.probs = cell.probs)
  centroid.bin <- 8 * (centroid.y - 1) + centroid.x
  return(centroid.bin)
}

#return x-coordinate cell centroid
XCellCentroid <- function(cell.probs) {
  centroid.x <- round(x = sum(sapply(
    X = 1:64,
    FUN = function(x) {
      return((x - 1) %% 8 + 1)
    }
  ) * cell.probs))
  return(centroid.x)
}

#return y-coordinate cell centroid
YCellCentroid <- function(cell.probs) {
  centroid.y <- round(x = sum(sapply(
    X = 1:64,
    FUN = function(x) {
      return((x - 1) %/% 8 + 1)
    }
  ) * cell.probs))
  return(centroid.y)
}

#return x and y-coordinate cell centroid
ExactCellCentroid <- function(cell.probs) {
  # centroid.x=(sum(sapply(1:64,function(x)(x-1)%%8+1)*cell.probs))
  centroid.x <- XCellCentroid(cell.probs = cell.probs)
  # centroid.y=(sum(sapply(1:64,function(x)(x-1)%/%8+1)*cell.probs))
  centroid.y <- YCellCentroid(cell.probs = cell.probs)
  return(c(centroid.x, centroid.y))
}
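
# Worked example: a point mass on bin 10 corresponds to grid position (x = 2, y = 2)
# p <- rep(0, 64); p[10] <- 1
# ExactCellCentroid(p)  # c(2, 2); CellCentroid(p) recovers bin 10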

# Internal, not documented for now
#
#' @importFrom graphics plot
#' @importFrom mixtools plot.mixEM
#
FitGeneMix <- function(
  object,
  gene,
  do.k = 3,
  use.mixtools = TRUE,
  do.plot = FALSE,
  plot.with.imputed = TRUE,
  min.bin.size = 10
) {
  data.fit <- as.numeric(x = object@imputed[gene, ])
  mixtools.fit <- normalmixEM(x = data.fit, k = do.k)
  comp.order <- order(mixtools.fit$mu)
  mixtools.posterior <- data.frame(mixtools.fit$posterior[, comp.order])
  colnames(x = mixtools.posterior) <- unlist(
    x = lapply(
      X = 1:do.k,
      FUN = function(x) {
        return(paste(gene, x - 1, "post", sep="."))
      }
    )
  )
  #mixtools.mu=data.frame(mixtools.fit$mu[comp.order])
  #mixtools.sigma=data.frame(mixtools.fit$sigma[comp.order])
  #mixtools.alpha=data.frame(mixtools.fit$lambda[comp.order])
  #rownames(mixtools.mu)=unlist(lapply(1:do.k,function(x)paste(gene,x-1,"mu",sep=".")))
  #rownames(mixtools.sigma)=unlist(lapply(1:do.k,function(x)paste(gene,x-1,"sigma",sep=".")))
  #rownames(mixtools.alpha)=unlist(lapply(1:do.k,function(x)paste(gene,x-1,"alpha",sep=".")))
  #object@mix.mu = rbind(minusr(object@mix.mu,gene), mixtools.mu);
  #object@mix.sigma = rbind(minusr(object@mix.sigma,gene), mixtools.sigma);
  #o#bject@mu.alpha =rbind(minusr(object@mu.alpha,gene), mixtools.alpha);
  if (do.plot) {
    nCol <- 2
    num.row <- floor(x = (do.k + 1) / nCol - (1e-5)) + 1
    par(mfrow = c(num.row, nCol))
    plot.mixEM(x = mixtools.fit, whichplots = 2)
    plot.data <- as.numeric(x = object@imputed[gene, ])
    if (! plot.with.imputed) {
      plot.data <- as.numeric(x = object@data[gene, ])
    }
    unlist(
      x = lapply(
        X = 1:do.k,
        FUN = function(x) {
          plot(
            x = plot.data,
            y = mixtools.posterior[, x],
            ylab = paste0("Posterior for Component ", x - 1),
            xlab = gene,
            main = gene
          )
        }
      )
    )
  }
  new.mix.probs <- data.frame(
    SubsetColumn(
      data = object@spatial@mix.probs,
      code = paste0(gene, "."),
      invert = TRUE
    ),
    row.names = rownames(x = object@spatial@mix.probs)
  )
  colnames(x = new.mix.probs)[1] <- "nGene"
  object@spatial@mix.probs <- cbind(new.mix.probs, mixtools.posterior)
  return(object)
}

globalVariables(
  names = c('alignment.index1', 'dups', 'cc_data2'),
  package = 'Seurat',
  add = TRUE
)
#' Align subspaces using dynamic time warping (DTW)
#'
#' Aligns subspaces across a given grouping variable.
#'
#' Following is a description for the two group case but this can be extended to
#' arbitrarily many groups which works by performing pairwise alignment to a
#' reference group (the largest group). First, we identify genes that are driving
#' variation in both datasets by looking at the correlation of gene expression
#' with each projection vector (e.g. CC1) in both datasets. For this we use the
#' biweight midcorrelation (bicor) and choose the top num.genes with the strongest
#' bicor to construct a 'metagene' for each dataset. We then scale each metagene
#' to match its 95\% reference range and linearly shift them by the minimum
#' difference between the two metagenes over the 10-90 quantile range. We then
#' map each cell in the smaller dataset to a cell in the larger dataset using
#' dynamic time warping (DTW) and apply the same map to the projection vectors (
#' CC vectors) to place both datasets on a common aligned scale. We apply this
#' procedure to each pair (group) of vectors individually for all dimensions specified in
#' dims.align. For a full description of the method, see Butler et al 2017.
#'
#' @param object Seurat object
#' @param reduction.type Reduction to align scores for. Default is "cca".
#' @param grouping.var Name of the grouping variable for which to align the scores
#' @param dims.align Dims to align, default is all
#' @param num.possible.genes Number of possible genes to search when choosing
#' genes for the metagene. Set to 2000 by default. Lowering will decrease runtime
#' but may result in metagenes constructed on fewer than num.genes genes.
#' @param num.genes Number of genes to use in construction of "metagene" (default
#' is 30).
#' @param show.plots Show debugging plots
#' @param verbose Displays progress and other output
#' @param ... Additional parameters to ScaleData
#'
#' @return Returns Seurat object with the dims aligned, stored in
#'  object@@dr$reduction.type.aligned
#'
#' @importFrom dtw dtw
#' @importFrom pbapply pbsapply
#' @importFrom graphics plot lines points par
#' @importFrom stats density quantile
#' @importFrom dplyr %>% group_by mutate
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small
#' # Requires CCA to have previously been run
#' # As CCA requires two datasets, we will split our test object into two just for this example
#' pbmc1 <- SubsetData(pbmc_small,cells = pbmc_small@cell.names[1:40])
#' pbmc2 <- SubsetData(pbmc_small,cells = pbmc_small@cell.names[41:80])
#' pbmc1@meta.data$group <- "group1"
#' pbmc2@meta.data$group <- "group2"
#' pbmc_cca <- RunCCA(pbmc1,pbmc2)
#' pbmc_cca <- AlignSubspace(pbmc_cca,reduction.type = "cca", grouping.var = "group", dims.align = 1:2)
#' }
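#'
#' # A minimal sketch (toy vectors, not real CC embeddings) of the core DTW
#' # remapping step applied to each aligned dimension:
#' \dontrun{
#' m1 <- sort(x = rnorm(n = 100))
#' m2 <- sort(x = rnorm(n = 80, mean = 0.5))
#' al <- dtw::dtw(x = ReferenceRange(x = m1), y = ReferenceRange(x = m2),
#'                keep.internals = TRUE)
#' # al$index1 / al$index2 give the warping map between the two metagenes
#' }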
#'
AlignSubspace <- function(
  object,
  reduction.type = "cca",
  grouping.var,
  dims.align,
  num.possible.genes = 2000,
  num.genes = 30,
  show.plots = FALSE,
  verbose = TRUE,
  ...
) {
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("AlignSubspace"))]
  # object <- SetCalcParams(
  #   object = object,
  #   calculation = paste0("AlignSubspace.", reduction.type),
  #   ... = parameters.to.store
  # )
  # ident.orig <- object@ident
  ident.orig <- Idents(object = object)
  # object <- SetAllIdent(object = object, id = grouping.var)
  Idents(object) <- grouping.var
  # levels.split <- names(x = sort(x = table(object@ident), decreasing = T))
  levels.split <- names(x = sort(
    x = table(Idents(object = object)),
    decreasing = TRUE
  ))
  num.groups <- length(x = levels.split)
  objects <- list()
  for (i in 1:num.groups) {
    objects[[i]] <- subset(x = object, idents = levels.split[i])
  }
  Idents(object = object) <- ident.orig
  cc.loadings <- list()
  scaled.data <- list()
  cc.embeds <- list()
  for (i in 1:num.groups) {
    if (verbose) {
      cat(paste0("Rescaling group ", i, "\n"), file = stderr())
    }
    # objects[[i]] <- ScaleData(object = objects[[i]], display.progress = verbose, ...)
    sdi <- ScaleData(object = objects[[i]], verbose = verbose, ...)
    sdi[is.na(x = sdi)] <- 0
    objects[[i]] <- SetAssayData(
      object = objects[[i]],
      slot = 'scale.data',
      new.data = sdi
    )
    # objects[[i]]@scale.data[is.na(x = objects[[i]]@scale.data)] <- 0
    objects[[i]] <- ProjectDim(
      object = objects[[i]],
      reduction = reduction.type,
      verbose = FALSE
    )
    cc.loadings[[i]] <- Loadings(
      object = objects[[i]][[reduction.type]],
      projected = TRUE
    )
    # cc.loadings[[i]] <- GetGeneLoadings(
    #   object = objects[[i]],
    #   reduction.type = reduction.type,
    #   use.full = TRUE
    # )
    # cc.embeds[[i]] <- GetCellEmbeddings(
    #   object = objects[[i]],
    #   reduction.type = reduction.type
    # )
    cc.embeds[[i]] <- Embeddings(object = objects[[i]][[reduction.type]])
    # scaled.data[[i]] <- objects[[i]]@scale.data
    scaled.data[[i]] <- GetAssayData(object = objects[[i]], slot = 'scale.data')
  }
  # cc.embeds.all <- GetCellEmbeddings(
  #   object = object,
  #   reduction.type = reduction.type,
  #   dims = dims.align
  # )
  cc.embeds.all <- Embeddings(object = object[[reduction.type]])[, dims.align]
  colnames(x = cc.embeds.all) <- paste0("A", colnames(x = cc.embeds.all))
  cc.embeds.orig <- cc.embeds.all
  for (cc.use in dims.align) {
    for (g in 2:num.groups) {
      if (verbose) {
        cat(paste0("Aligning dimension ", cc.use, "\n"), file = stderr())
      }
      genes.rank <- data.frame(
        rank(x = abs(x = cc.loadings[[1]][, cc.use])),
        rank(x = abs(x = cc.loadings[[g]][, cc.use])),
        cc.loadings[[1]][, cc.use],
        cc.loadings[[g]][, cc.use]
      )
      genes.rank$min <- apply(X = genes.rank[,1:2], MARGIN = 1, FUN = min)
      genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
      genes.top <- rownames(x = genes.rank)[1:min(num.possible.genes, nrow(x = genes.rank))]
      bicors <- list()
      for (i in c(1, g)) {
        cc.vals <- cc.embeds[[i]][, cc.use]
        if (verbose) {
          bicors[[i]] <- pbsapply(
            X = genes.top,
            FUN = function(x) {
              return(BiweightMidcor(x = cc.vals, y = scaled.data[[i]][x, ]))
            }
          )
        } else {
          bicors[[i]] <- sapply(
            X = genes.top,
            FUN = function(x) {
              return(BiweightMidcor(x = cc.vals, y = scaled.data[[i]][x, ]))
            }
          )
        }
      }
      genes.rank <- data.frame(
        rank(x = abs(x = bicors[[1]])),
        rank(x = abs(x = bicors[[g]])),
        bicors[[1]],
        bicors[[g]]
      )
      genes.rank$min <- apply(X = abs(x = genes.rank[, 1:2]), MARGIN = 1, FUN = min)
      # genes must be correlated in same direction in both datasets
      genes.rank <- genes.rank[sign(genes.rank[,3]) == sign(genes.rank[,4]), ]
      genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
      genes.use <- rownames(x = genes.rank)[1:min(num.genes, nrow(genes.rank))]
      if (length(x = genes.use) == 0) {
        stop("Can't align group ", g, " for dimension ", cc.use)
      }
      metagenes <- list()
      multvar.data <- list()
      for (i in c(1, g)) {
        scaled.use <- sweep(
          x = scaled.data[[i]][genes.use, ],
          MARGIN = 1,
          STATS = sign(x = genes.rank[genes.use, which(c(1, g) == i) + 2]),
          FUN = "*"
        )
        scaled.use <- scaled.use[, names(x = sort(x = cc.embeds[[i]][, cc.use]))]
        metagenes[[i]] <- (
          cc.loadings[[i]][genes.use, cc.use] %*% scaled.data[[i]][genes.use, ]
        )[1, colnames(x = scaled.use)]
      }
      mean.difference <- mean(x = ReferenceRange(x = metagenes[[g]])) -
        mean(x = ReferenceRange(x = metagenes[[1]]))
      align.1 <- ReferenceRange(x = metagenes[[g]])
      align.2 <- ReferenceRange(x = metagenes[[1]])
      a1q <- sapply(
        X = seq(from = 0, to = 1, by = 0.001),
        FUN = function(x) {
          return(quantile(x = align.1, probs = x))
        }
      )
      a2q <- sapply(
        X = seq(from = 0, to = 1, by = 0.001),
        FUN = function(x) {
          quantile(x = align.2, probs = x)
        }
      )
      iqr <- (a1q - a2q)[100:900]
      iqr.x <- which.min(x = abs(x = iqr))
      iqrmin <- iqr[iqr.x]
      if (show.plots) {
        print(iqrmin)
      }
      align.2 <- align.2 + iqrmin
      alignment <- dtw(
        x = align.1,
        y = align.2,
        keep.internals = TRUE
      )
      alignment.map <- data.frame(alignment$index1, alignment$index2)
      alignment.map$cc_data1 <- sort(cc.embeds[[g]][, cc.use])[alignment$index1]
      alignment.map$cc_data2 <- sort(cc.embeds[[1]][, cc.use])[alignment$index2]
      alignment.map.orig <- alignment.map
      alignment.map$dups <- duplicated(x = alignment.map$alignment.index1) |
        duplicated(x = alignment.map$alignment.index1, fromLast = TRUE)
      alignment.map <- alignment.map %>%
        group_by(alignment.index1) %>%
        mutate(cc_data1_mapped = ifelse(test = dups, yes = mean(x = cc_data2), no = cc_data2))
      alignment.map <- alignment.map[!duplicated(x = alignment.map$alignment.index1), ]
      cc.embeds.all[names(x = sort(x = cc.embeds[[g]][, cc.use])), cc.use] <- alignment.map$cc_data1_mapped
      if (show.plots) {
        par(mfrow = c(3, 2))
        plot(x = ReferenceRange(x = metagenes[[1]]), main = cc.use)
        plot(x = ReferenceRange(x = metagenes[[g]]))
        plot(
          x = ReferenceRange(x = metagenes[[1]])[(alignment.map.orig$alignment.index2)],
          pch = 16
        )
        points(
          x = ReferenceRange(metagenes[[g]])[(alignment.map.orig$alignment.index1)],
          col = "red",
          pch = 16,
          cex = 0.4
        )
        plot(x = density(x = alignment.map$cc_data1_mapped))
        lines(x = density(x = sort(x = cc.embeds[[1]][, cc.use])), col = "red")
        plot(x = alignment.map.orig$cc_data1)
        points(x = alignment.map.orig$cc_data2, col = "red")
      }
    }
  }
  new.type <- paste0(reduction.type, ".aligned")
  new.key <- paste0("A", Key(object = object[[reduction.type]]))
  object[[new.type]] <- CreateDimReducObject(
    embeddings = scale(x = cc.embeds.all),
    key = new.key,
    assay = DefaultAssay(object = object[[reduction.type]])
  )
  # object <- SetDimReduction(
  #   object = object,
  #   reduction.type = new.type,
  #   slot = "cell.embeddings",
  #   new.data = scale(x = cc.embeds.all)
  # )
  # object <- SetDimReduction(
  #   object = object,
  #   reduction.type = new.type,
  #   slot = "key",
  #   new.data = new.key
  # )
  return(object)
}

#' Calculate an alignment score
#'
#' Calculates an alignment score to determine how well two (or more)
#' groups have been aligned. We first split the data into groups based on the
#' grouping.var provided and randomly downsample all groups to have as many cells
#' as in the smallest group. We then construct a nearest neighbor graph and ask
#' for each cell, how many of its neighbors have the same group identity as it
#' does. We then take the average over all cells, compare it to the expected
#' value for perfectly mixed neighborhoods, and scale it to range from 0 to 1.
#'
#' xbar is the average number of neighbors belonging to any cells' same group,
#' N is the number of groups in the given grouping.var, k is the number of
#' neighbors in the KNN graph.
#' \deqn{1 - \frac{\bar{x} - \frac{k}{N}}{k - \frac{k}{N}}}{1 - (xbar - k/N)/(k - k/N)}
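#'
#' For example, with k = 20 neighbors and N = 2 equally sized groups, perfectly
#' mixed neighborhoods give xbar = 10 and a score of 1, while fully separated
#' groups give xbar = 20 and a score of 0.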
#'
#' @param object Seurat object
#' @param reduction Stored dimensional reduction on which to build NN graph.
#' Usually going to be cca.aligned.
#' @param dims Dimensions to use in building the NN graph
#' @param grouping.var Grouping variable used in the alignment.
#' @param nn Number of neighbors to calculate in the NN graph construction
#' @param nn.eps Error bound when performing nearest neighbor search using RANN;
#' default of 0.0 implies exact nearest neighbor search
#'
#' @importFrom RANN nn2
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small
#' # As CCA requires two datasets, we will split our test object into two just for this example
#' pbmc1 <- SubsetData(pbmc_small, cells = pbmc_small@cell.names[1:40])
#' pbmc2 <- SubsetData(pbmc_small, cells = pbmc_small@cell.names[41:80])
#' pbmc1@meta.data$group <- "group1"
#' pbmc2@meta.data$group <- "group2"
#' pbmc_cca <- RunCCA(pbmc1,pbmc2)
#' pbmc_cca <- AlignSubspace(pbmc_cca, reduction.type = "cca",
#'                           grouping.var = "group", dims.align = 1:5)
#' CalcAlignmentMetric(pbmc_cca, reduction = "cca.aligned",
#'                     dims = 1:5, grouping.var =  "group")
#' }
#'
CalcAlignmentMetric <- function(
  object,
  reduction = "cca.aligned",
  dims,
  grouping.var,
  nn,
  nn.eps = 0
) {
  object <- SetAllIdent(object = object, id = grouping.var)
  object <- subset(
    object = object,
    select = levels(x = Idents(object = object)),
    downsample = min(table(object@ident))
  )
  num.groups <- length(x = unique(x = object@ident))
  if (missing(x = nn)) {
    nn <- ceiling(x = table(object@ident)[1] * 0.01 * num.groups)
  }
  dist.mat <- GetCellEmbeddings(
    object = object,
    reduction.type = reduction,
    dims = dims
  )
  # object.fnn <- get.knn(dist.mat, k = nn)
  object.fnn <- nn2(
    data = dist.mat,
    k = nn,
    searchtype = 'standard',
    eps = nn.eps
  )
  alignment.score <- sapply(
    X = 1:length(x = object@cell.names),
    FUN = function(x) {
      cell.id <- object@ident[x]
      return(length(x = which(x = object@ident[object.fnn$nn.idx[x, ]] == cell.id)))
    }
  )
  alignment.score <- 1 - ((mean(x = alignment.score) - nn / num.groups) / (nn - nn / num.groups))
  return(unname(obj = alignment.score))
}

#' Calculate the ratio of variance explained by ICA or PCA to CCA
#'
#' @param object Seurat object
#' @param reduction.type type of dimensional reduction to compare to CCA (pca,
#' pcafast, ica)
#' @param grouping.var variable to group by
#' @param dims Vector of dimensions to project onto (default is the 1:number
#' stored for cca)
#' @param verbose Display progress and other output
#'
#' @return Returns Seurat object with ratio of variance explained stored in
#' object@@meta.data$var.ratio
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small
#' # Requires CCA to have previously been run
#' # As CCA requires two datasets, we will split our test object into two just for this example
#' pbmc1 <- SubsetData(pbmc_small,cells = pbmc_small@cell.names[1:40])
#' pbmc2 <- SubsetData(pbmc_small,cells = pbmc_small@cell.names[41:80])
#' pbmc1@meta.data$group <- "group1"
#' pbmc2@meta.data$group <- "group2"
#' pbmc_cca <- RunCCA(pbmc1,pbmc2)
#' pbmc_cca <- CalcVarExpRatio(pbmc_cca,reduction.type = "pca", grouping.var = "group", dims = 1:5)
#'}
CalcVarExpRatio <- function(
  object,
  reduction.type = "pca",
  grouping.var,
  dims,
  verbose = TRUE
) {
  if (missing(x = grouping.var)) {
    stop("Need to provide grouping variable")
  }
  if (missing(x = dims)) {
    # dims <- 1:ncol(x = GetCellEmbeddings(object = object, reduction.type = "cca"))
    dims <- 1:ncol(x = object[['cca']])
  }
  # parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("CalcVarExpRatio"))]
  # object <- SetCalcParams(object = object,
  #                         calculation = "CalcVarExpRatio",
  #                         ... = parameters.to.store)
  groups <- as.vector(x = unique(x = FetchData(
    object = object,
    vars = grouping.var
  )[, 1]))
  # genes.use <- rownames(x = GetGeneLoadings(object = object, reduction.type = "cca"))
  genes.use <- rownames(x = object[['cca']])
  var.ratio <- data.frame()
  for (group in groups) {
    if (verbose) {
      cat(paste("Calculating for", group, "\n"), file = stderr())
    }
    # group.cells <- WhichCells(
    #   object = object,
    #   subset.name = grouping.var,
    #   accept.value = group
    # )
    group.cells <- colnames(x = object)[object[[grouping.var]] == group]
    if (verbose) {
      cat(paste("\t Separating", group, "cells\n"), file = stderr())
    }
    group.object <- subset(x = object, select = group.cells)
    if (verbose) {
      cat("\t Running Dimensional Reduction \n", file = stderr())
    }
    ldp.cca <- CalcLDProj(
      object = group.object,
      reduction.type = "cca",
      dims = dims,
      genes.use = genes.use
    )
    group.object <- CalcProjectedVar(
      object = group.object,
      low.dim.data = ldp.cca,
      reduction.type = "cca",
      dims = dims,
      genes.use = genes.use
    )
    if (reduction.type == "pca") {
      temp.matrix <- PrepDR(group.object, features = genes.use)
      group.object <- RunPCA(
        object = group.object,
        pc.genes = genes.use,
        do.print = FALSE,
        center = rowMeans(temp.matrix),
        pcs.compute = max(dims)
      )
      ldp.pca <- CalcLDProj(
        object = group.object,
        reduction.type = "pca",
        dims = dims,
        genes.use = genes.use
      )
      group.object <- CalcProjectedVar(
        object = group.object,
        low.dim.data = ldp.pca,
        reduction.type = "pca",
        dims = dims,
        genes.use = genes.use
      )
      group.var.ratio <- group.object@meta.data[, "cca.var", drop = FALSE] /
        group.object@meta.data[, "pca.var", drop = FALSE]
    } else if (reduction.type == "ica") {
      group.object <- RunICA(
        object = group.object,
        ic.genes = genes.use,
        print.results = FALSE,
        ics.compute = max(dims)
      )
      ldp.ica <- CalcLDProj(
        object = group.object,
        reduction.type = "ica",
        dims = dims,
        genes.use = genes.use
      )
      group.object <- CalcProjectedVar(
        object = group.object,
        low.dim.data = ldp.ica,
        reduction.type = "ica",
        dims = dims,
        genes.use = genes.use
      )
      group.var.ratio <- group.object@meta.data[, "cca.var", drop = FALSE] /
        group.object@meta.data[, "ica.var", drop = FALSE]
    } else {
      stop(paste("reduction.type", reduction.type, "not supported"))
    }
    var.ratio <- rbind(var.ratio, group.var.ratio)
  }
  var.ratio$cell.name <- rownames(x = var.ratio)
  object@meta.data[[paste0("var.ratio.", reduction.type)]] <- NULL
  colnames(x = var.ratio) <- c(
    paste0("var.ratio.", reduction.type),
    "cell.name"
  )
  object <- AddMetaData(object, metadata = var.ratio)
  object@meta.data$cell.name <- NULL
  return(object)
}

#' Canonical Correlation Analysis between modalities
#'
#' Runs a canonical correlation analysis between two assays (for example, RNA and ADT from CITE-seq)
#'
#' @param object Seurat object. Assumes that scale.data exist for both assays.
#' @param assay.1 The first assay for CCA.
#' @param assay.2 The second assay for CCA.
#' @param features.1 Features to use for the first assay; default is all features (use object@@var.genes if this is RNA).
#' @param features.2 Features to use for the second assay, default is all the features.
#' @param num.cc Number of canonical vectors to compute (capped at the number of features in either assay).
#' @param normalize.variance Whether to scale the embeddings. Default is TRUE.
#'
#' @return Returns Seurat object with two CCA results stored (for example, object@@dr$RNACCA and object@@dr$ADTCCA).
#'
#' @export
#'
#' @examples
#' \dontrun{
#' object <- MultiModal_CCA(object,assay.1 = "RNA",assay.2 = "ADT")
#' }
#'
MultiModal_CCA <- function(
  object,
  assay.1 = "RNA",
  assay.2 = "ADT",
  features.1 = NULL,
  features.2 = NULL,
  num.cc = 20,
  normalize.variance = TRUE
) {
  #first pull out data, define features
  data.1 <- GetAssayData(
    object = object,
    assay.type = assay.1,
    slot = "scale.data"
  )
  data.2 <- GetAssayData(
    object = object,
    assay.type = assay.2,
    slot = "scale.data"
  )
  if (is.null(x = features.1)) {
    if ((assay.1 == "RNA") && length(x = object@var.genes) > 0) {
      features.1 <- object@var.genes
    } else {
      features.1 <- rownames(x = data.1)
    }
  }
  features.2 <- SetIfNull(x = features.2, default = rownames(x = data.2))
  data.1 <- t(x = data.1[features.1, ])
  data.2 <- t(x = data.2[features.2, ])
  # drop features with zero variance before running CCA
  data.1 <- data.1[, apply(X = data.1, MARGIN = 2, FUN = var) > 0]
  data.2 <- data.2[, apply(X = data.2, MARGIN = 2, FUN = var) > 0]
  num.cc <- min(num.cc, ncol(x = data.1), ncol(x = data.2))
  cca.data <- list(data.1, data.2)
  names(x = cca.data) <- c(assay.1, assay.2)
  # now run CCA
  # CCA() here is the penalized CCA from the PMA package
  out <- PMA::CCA(
    x = cca.data[[1]],
    z = cca.data[[2]],
    typex = "standard",
    typez = "standard",
    K = num.cc,
    penaltyz = 1,
    penaltyx = 1
  )
  cca.output <- list(out$u, out$v)
  embeddings.cca <- list()
  for (i in 1:length(x = cca.data)) {
    assay <- names(x = cca.data)[i]
    rownames(x = cca.output[[i]]) <- colnames(x = cca.data[[i]])
    embeddings.cca[[i]] <- cca.data[[i]] %*% cca.output[[i]]
    colnames(x = embeddings.cca[[i]]) <- paste0(
      assay,
      "CC",
      1:ncol(x = embeddings.cca[[i]])
    )
    colnames(x = cca.output[[i]]) <- colnames(x = embeddings.cca[[i]])
    if (normalize.variance) {
      embeddings.cca[[i]] <- scale(x = embeddings.cca[[i]])
    }
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay, "CCA"),
      slot = "cell.embeddings",
      new.data = embeddings.cca[[i]]
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay, "CCA"),
      slot = "key",
      new.data = paste0(assay, "CC")
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay, "CCA"),
      slot = "gene.loadings",
      new.data =  cca.output[[i]]
    )
  }
  return(object)
}

# Regress out technical effects and cell cycle using regularized Negative Binomial regression
#
# Remove unwanted effects from umi data and set scale.data to Pearson residuals
# Uses mclapply; you can set the number of cores it will use to n with command options(mc.cores = n)
#
# @param object Seurat object
# @param latent.vars effects to regress out
# @param genes.regress gene to run regression for (default is all genes)
# @param pr.clip.range numeric of length two specifying the min and max values the results will be clipped to
# @param min.theta minimum value allowed for the regularized theta estimate
#
# @return Returns Seurat object with the scale.data (object@scale.data) genes returning the residuals from the regression model
#
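# Example (internal; assumes "nUMI" is present in object@meta.data):
# object <- RegressOutNB(object = object, latent.vars = "nUMI")
#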
#' @import Matrix
#' @import parallel
#' @importFrom stats glm residuals
#' @importFrom MASS theta.ml negative.binomial
#' @importFrom utils txtProgressBar setTxtProgressBar
#
RegressOutNB <- function(
  object,
  latent.vars,
  genes.regress = NULL,
  pr.clip.range = c(-30, 30),
  min.theta = 0.01
) {
  genes.regress <- SetIfNull(x = genes.regress, default = rownames(x = object@data))
  genes.regress <- intersect(x = genes.regress, y = rownames(x = object@data))
  cm <- object@raw.data[genes.regress, colnames(x = object@data), drop = FALSE]
  latent.data <- FetchData(object = object, vars = latent.vars)
  message(sprintf('Regressing out %s for %d genes\n', paste(latent.vars, collapse = ', '), length(x = genes.regress)))
  theta.fit <- RegularizedTheta(cm = cm, latent.data = latent.data, min.theta = min.theta, bin.size = 128)
  message('Second run NB regression with fixed theta')
  bin.size <- 128
  bin.ind <- ceiling(1:length(genes.regress)/bin.size)
  max.bin <- max(bin.ind)
  pb <- txtProgressBar(min = 0, max = max.bin, style = 3, file = stderr())
  pr <- c()
  for (i in 1:max.bin) {
    genes.bin.regress <- genes.regress[bin.ind == i]
    bin.pr.lst <- parallel::mclapply(
      X = genes.bin.regress,
      FUN = function(j) {
        fit <- 0
        try(
          expr = fit <- glm(
            cm[j, ] ~ .,
            data = latent.data,
            family = MASS::negative.binomial(theta = theta.fit[j])
          ),
          silent=TRUE
        )
        if (class(fit)[1] == 'numeric') {
          message(
            sprintf(
              'glm and family=negative.binomial(theta=%f) failed for gene %s; falling back to scale(log10(y+1))',
              theta.fit[j],
              j
            )
          )
          res <- scale(log10(cm[j, ] + 1))[, 1]
        } else {
          res <- residuals(fit, type = 'pearson')
        }
        return(res)
      }
    )
    pr <- rbind(pr, do.call(rbind, bin.pr.lst))
    setTxtProgressBar(pb, i)
  }
  close(pb)
  dimnames(x = pr) <- dimnames(x = cm)
  pr[pr < pr.clip.range[1]] <- pr.clip.range[1]
  pr[pr > pr.clip.range[2]] <- pr.clip.range[2]
  object@scale.data <- pr
  return(object)
}

#' Find spatial differential expression
#'
#' Calculate Moran's I for a set of genes
#'
#' @param object A Seurat object with spatial coordinates stored in the dr slot
#' @param assay Which assay to use
#' @param features List of features to compute spatial autocorrelation
#' @param cells Which cells to use (default all cells)
#' @param dims Which dimensions to use
#' @param sd Standard deviation for Gaussian kernel applied to cell distance weighting for Moran's I calculation (default 10).
#' Smaller standard deviation corresponds to a steeper drop-off in weights.
#' @param binsize Size of spatial regions (default single cell)
#' @param expression.limit Minimum total normalized counts to consider for spatial DE (genes with value lower than this will be skipped).
#' @param reduction Which dimension reduction to use (default spatial)
#' @param verbose Display messages (default TRUE)
#' @param scale.factor Scaling applied to the coordinates before binning (default 1)
#'
#' @return Returns a dataframe containing Moran's I for each gene
#'
#' @export
#'
#' @importFrom ape Moran.I
#' @importFrom dplyr mutate %>%
#' @importFrom tidyr unite
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom stats dist
#'
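#' @examples
#' \dontrun{
#' # A sketch of typical usage, assuming the object has a 'spatial' dimensional
#' # reduction storing x/y coordinates in its cell embeddings:
#' morans <- FindSpatialMarkers(object, dims = c(1, 2), sd = 10, binsize = 10)
#' head(morans)
#' }
#'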
FindSpatialMarkers <- function(
  object,
  assay = NULL,
  features = NULL,
  cells = NULL,
  dims = c(1, 2),
  sd = 10,
  binsize = NULL,
  expression.limit = 5,
  reduction = 'spatial',
  verbose = TRUE,
  scale.factor = 1
) {
  if (!(reduction %in% names(object@reductions))){
    stop('Object does not contain requested reduction')
  }
  assay <- assay %||% DefaultAssay(object)
  cells <- cells %||% colnames(object)
  features <- features %||% rownames(object)
  spatial_coords <- as.data.frame(Embeddings(object[[reduction]])[cells, dims])
  colnames(spatial_coords) <- paste0('spatial', dims)
  if(!is.null(binsize)){
    if (verbose)
      message("Binning spatial regions")
    spatial_coords <- as.data.frame(apply(spatial_coords, 2, function(x) round((x*scale.factor)/binsize)))
    if (verbose)
      message("Averaging gene expression in spatial bins")
    if (length(dims) == 2) {
      bins <- spatial_coords %>%
        unite(bin, spatial1, spatial2) %>%
        unique()
      exprs <- GetAssayData(object, assay = assay, slot = 'data')[features, cells]
      binned_expression <- matrix(0, nrow = length(features), ncol = nrow(bins))
      colnames(binned_expression) <- bins[, 1]
      rownames(binned_expression) <- features
      for(i in 0:max(spatial_coords[, 1])){
        for(j in 0:max(spatial_coords[, 2])){
          bin.cells <- rownames(spatial_coords[spatial_coords[, 1] == i & spatial_coords[, 2] == j,])
          if(length(bin.cells) == 1){
            binned_expression[,paste(i, j, sep = '_')] <- exprs[features, bin.cells]
          } else if(length(bin.cells) > 0){
            av_exp <- apply(exprs[features, bin.cells], 1, mean)
            binned_expression[,paste(i, j, sep = '_')] <- av_exp
          }
        }
      }
    } else if (length(dims) == 1) {
      bins <- unique(spatial_coords)
      exprs <- GetAssayData(object, assay = assay, slot = 'data')[features, cells]
      binned_expression <- matrix(0, nrow = length(features), ncol = nrow(bins))
      colnames(binned_expression) <- bins[, 1]
      rownames(binned_expression) <- features
      spatial_coords <- as.matrix(spatial_coords)
      for(i in 0:max(spatial_coords[, 1])){
        bin.cells <- names(spatial_coords[spatial_coords[, 1] == i, ])
        if(length(bin.cells) == 1){
          binned_expression[ , as.character(i)] <- exprs[features, bin.cells]
        } else if(length(bin.cells) > 0){
          av_exp <- apply(exprs[features, bin.cells], 1, mean)
          binned_expression[ , as.character(i)] <- av_exp
        }
      }
    } else {
      stop("Too many dimensions requested")
    }
  } else {
    binned_expression <- GetAssayData(object = object, assay = assay, slot = 'data')[features, cells]
  }
  if(verbose) message("Computing distance weights")
  if (!is.null(binsize)) {
    spatial_bins <- as.matrix(unique(spatial_coords[, dims]))
  } else {
    spatial_bins <- as.matrix(spatial_coords[, dims])
  }
  distances <- as.matrix(dist(spatial_bins))
  weights <- exp(-1*distances/(2*(1/sd))^2)
  diag(weights) <- 0
  weights[is.infinite(weights)] <- 0
  ivals <- data.frame()
  if(verbose) {
    message("Calculating spatial autocorrelation")
    pb <- txtProgressBar(min = 1, max = nrow(binned_expression), style = 3,  file = stderr())
  }
  for(i in 1:nrow(binned_expression)){
    if (sum(binned_expression[i, ]) > expression.limit) {
      mi <- Moran.I(binned_expression[i,], weight = weights)
      ivals[rownames(binned_expression)[i],'I'] <- mi$observed
      ivals[rownames(binned_expression)[i],'expected.I'] <- mi$expected
      ivals[rownames(binned_expression)[i],'sd'] <- mi$sd
      ivals[rownames(binned_expression)[i],'p.value'] <- mi$p.value
      ivals[rownames(binned_expression)[i],'gene'] <- rownames(binned_expression)[i]
    }
    if (verbose) {
      setTxtProgressBar(pb, i)
    }
  }
  if(verbose) message("")
  return(ivals[order(ivals$I, decreasing = TRUE), ])
}

globalVariables(
  names = 'cc',
  package = 'Seurat',
  add = TRUE
)
# Evaluate CCs
#
# Looks at the biweight midcorrelation of the Xth gene across the specified CCs
# for each group in the grouping.var.
#
# @param object A Seurat object
# @param grouping.var Grouping variable specified in alignment procedure
# @param dims.eval Dimensions to evaluate the bicor for
# @param gene.num Xth gene to look at bicor for
# @param num.possible.genes Number of possible genes to search when choosing
# genes for the metagene. Set to 2000 by default. Lowering will decrease runtime
# but may result in metagenes constructed on fewer than num.genes genes.
# @param display.progress Show progress bar
#
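# Example (internal; assumes RunCCA has been run and 'group' is a metadata column):
# bc <- EvaluateCCs(object, grouping.var = "group", dims.eval = 1:10,
#                   gene.num = 30, num.possible.genes = 2000,
#                   display.progress = TRUE)
#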
#' @importFrom tidyr gather
#' @importFrom utils txtProgressBar setTxtProgressBar
#
EvaluateCCs <- function(
  object,
  grouping.var,
  dims.eval,
  gene.num,
  num.possible.genes,
  display.progress
) {
  reduction.type <-  "cca"
  ident.orig <- Idents(object = object)
  # object <- SetAllIdent(object = object, id = grouping.var)
  Idents(object = object) <- grouping.var
  levels.split <- names(x = sort(
    x = table(Idents(object = object)),
    decreasing = TRUE
  ))
  num.groups <- length(x = levels.split)
  objects <- list()
  for (i in 1:num.groups) {
    objects[[i]] <- subset(x = object, idents = levels.split[i])
  }
  # object@ident <- ident.orig
  Idents(object = object) <- ident.orig
  cc.loadings <- list()
  scaled.data <- list()
  cc.embeds <- list()
  for (i in 1:num.groups) {
    cat(paste0("Rescaling group ", i, "\n"), file = stderr())
    objects[[i]] <- ScaleData(
      object = objects[[i]],
      block.size = 5000,
      display.progress = display.progress
    )
    objects[[i]] <- ProjectDim(
      object = objects[[i]],
      reduction = reduction.type,
      verbose = FALSE
    )
    cc.loadings[[i]] <- Loadings(
      object = objects[[i]][[reduction.type]],
      projected = TRUE
    )
    cc.embeds[[i]] <- Embeddings(object = objects[[i]][[reduction.type]])
    scaled.data[[i]] <- GetAssayData(object = objects[[i]], slot = 'scale.data')
  }
  bc.gene <- matrix(ncol = num.groups, nrow = length(dims.eval))
  if (display.progress) {
    cat(paste0("Evaluating dims: ", paste(dims.eval, collapse = " "),  "\n"), file = stderr())
    pb <- txtProgressBar(
      min = 0,
      max = length(x = dims.eval) * (num.groups - 1),
      style = 3
    )
    pb.idx <- 0
  }
  for (cc.use in dims.eval) {
    bc.gene.g1 <- c()
    for (g in 2:num.groups) {
      if (display.progress) {
        pb.idx <- pb.idx + 1
        setTxtProgressBar(pb = pb, value = pb.idx)
      }
      genes.rank <- data.frame(
        rank(x = abs(x = cc.loadings[[1]][, cc.use])),
        rank(x = abs(x = cc.loadings[[g]][, cc.use])),
        cc.loadings[[1]][, cc.use],
        cc.loadings[[g]][, cc.use]
      )
      genes.rank$min <- apply(X = genes.rank[,1:2], MARGIN = 1, FUN = min)
      genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
      genes.top <- rownames(x = genes.rank)[1:min(num.possible.genes, nrow(x = genes.rank))]
      bicors <- list()
      for (i in c(1, g)) {
        cc.vals <- cc.embeds[[i]][, cc.use]
        bicors[[i]] <- sapply(
          X = genes.top,
          FUN = function(x) {
            return(BiweightMidcor(x = cc.vals, y = scaled.data[[i]][x, ]))
          }
        )
      }
      genes.rank <- data.frame(
        rank(x = abs(x = bicors[[1]])),
        rank(x = abs(x = bicors[[g]])),
        bicors[[1]],
        bicors[[g]]
      )
      genes.rank$min <- apply(X = abs(x = genes.rank[, 1:2]), MARGIN = 1, FUN = min)
      # genes must be correlated in same direction in both datasets
      genes.rank <- genes.rank[sign(x = genes.rank[,3]) == sign(x = genes.rank[,4]), ]
      genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
      bc.gene[cc.use, g] <- mean(x = abs(x = genes.rank[1:gene.num, 4]))
      bc.gene.g1 <- c(bc.gene.g1, mean(x = abs(x = genes.rank[1:gene.num, 3])))
    }
    bc.gene[cc.use, 1] <- abs(x = mean(x = bc.gene.g1))
  }
  if (display.progress) {
    close(con = pb)
  }
  colnames(x = bc.gene) <- levels.split
  bc.gene <- as.data.frame(x = bc.gene)
  bc.gene$cc <- 1:nrow(x = bc.gene)
  bc.gene <- gather(data = bc.gene, key = "Group",  value = "bicor", -cc)
  return(bc.gene)
}
# Prepare a Seurat function for a workflow run
#
# Checks dependencies (and runs them if necessary), and then reads in parameter values
#
PrepareWorkflow <- function(object, workflow.name) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  command.name <- as.character(x = deparse(expr = sys.calls()[[sys.nframe() - 1]]))
  command.name <- gsub(pattern = ".Seurat", replacement = "", x = command.name, fixed = TRUE)
  command.name <- ExtractField(string = command.name, field = 1, delim = "\\(")
  depends <- slot(object = object[[workflow.name]], name = "depends")
  prereq.commands <- colnames(depends)[which(depends[command.name, ] == 1)]
  for(i in prereq.commands) {
    check.prereqs <- CheckWorkflowUpdate(
      object = object,
      workflow.name = workflow.name,
      command.name = i)
    if (check.prereqs) {
      # run the dependency
      workflow.name.quotes <- paste0('\"', workflow.name, '\"')
      new.cmd <- paste0("object <- ", i, "(object, workflow = ", workflow.name.quotes, ")")
      message(paste0("Updating ", i))
      message(new.cmd)
      eval(expr = parse(text = new.cmd))
    }
  }
  ReadWorkflowParams(object = object, workflow.name = workflow.name, depth = 2)
  return(object)
}
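
# A minimal sketch (hypothetical method name and body) of how a workflow-aware
# Seurat verb would bracket its work with PrepareWorkflow/UpdateWorkflow:
#   MyStep.Seurat <- function(object, workflow.name = NULL, ...) {
#     object <- PrepareWorkflow(object = object, workflow.name = workflow.name)
#     # ... the command's actual work goes here ...
#     object <- UpdateWorkflow(object = object, workflow.name = workflow.name)
#     return(object)
#   }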

# Read Seurat workflow parameters
#
# Reads parameters for a workflow and assigns them to the parent function call.
#
# @param depth is the depth of the call in the function stack. If called
# directly from, for example, ScaleData, then depth=1. If called indirectly
# from PrepareWorkflow, then depth=2
#
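#
# Example sketch of the depth convention (hypothetical callers):
#   ReadWorkflowParams(object, workflow.name, depth = 1)  # direct, e.g. from ScaleData.Seurat
#   ReadWorkflowParams(object, workflow.name, depth = 2)  # indirect, via PrepareWorkflow
#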
ReadWorkflowParams <- function(object, workflow.name, depth = 2) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  param.list <- names(formals(fun = sys.function(sys.parent(depth))))
  workflow.params <- slot(object = object[[workflow.name]], name = "params")

  # global variables
  to.set <- intersect(x = param.list, y = names(workflow.params$global))
  p.env <- parent.frame(depth)
  for(i in to.set) {
    if (workflow.params$global[[i]] == "FALSE") {
      workflow.params$global[[i]] = FALSE
    }
    if (workflow.params$global[[i]] == "TRUE") {
      workflow.params$global[[i]] = TRUE
    }
    assign(x = names(workflow.params$global[i]),
           value = workflow.params$global[[i]],
           envir = p.env)
  }

  # parameter-specific variables
  command.name <- as.character(x = deparse(expr = sys.calls()[[sys.nframe() - depth]]))
  command.name <- gsub(pattern = ".Seurat", replacement = "", x = command.name, fixed = TRUE)
  command.name <- ExtractField(string = command.name, field = 1, delim = "\\(")
  to.set <- intersect(x = param.list, y = names(workflow.params[[command.name]]))
  for(i in to.set) {
    assign(x = names(workflow.params[[command.name]][i]),
           value = workflow.params[[command.name]][[i]],
           envir = p.env)
  }

  # overwrite any arguments passed in on the command line
  argnames <- sys.call(which = depth)
  argList <- as.list(argnames[-1])
  args_ignore <- c("", "object", "workflow.name")
  args_use <- setdiff(x = names(argList), y = args_ignore)
  for(i in args_use) {
    if(as.character(unlist(argList[i])[[1]]) == "F") {
      arg.val <- FALSE
    } else if(as.character(unlist(argList[i])[[1]]) == "T") {
      arg.val <- TRUE
    } else {
      arg.val <- unlist(argList[i])[[1]]
    }
    assign(x = i, value = arg.val, envir = p.env)
  }
}

# UpdateWorkflow
#
# Updates a workflow object after a command is run, and makes sure timestamps are properly set.
#
UpdateWorkflow <- function(object, workflow.name, command.name = NULL) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  if (is.null(x = command.name)) {
    command.name <- as.character(x = deparse(expr = sys.calls()[[sys.nframe() - 1]]))
    command.name <- gsub(pattern = ".Seurat", replacement = "", x = command.name)
    command.name <- ExtractField(string = command.name, field = 1, delim = "\\(")
    command.name.seurat <- intersect(
      x = c(command.name, paste0(command.name, ".", DefaultAssay(object = object))),
      names(x = object@commands)
    )
  } else {
    command.name.seurat <- command.name
    command.name <- ExtractField(string = command.name, field = 1, delim = "\\.")
  }
  #TODO - Deal with Assay better
  seurat.timestamp <- Sys.time()
  if (length(x = command.name.seurat) == 1) {
    seurat.timestamp <- slot(
      object = object[[command.name.seurat]],
      name = "time.stamp"
    )
  }
  object <- TouchWorkflow(
    object = object,
    workflow.name = workflow.name,
    command.name = command.name,
    time.stamp = seurat.timestamp
  )
  return(object)
}

# Validates the workflow file that was provided.
#
# @param config results of reading in the ini file
#
# @return No return
#
ValidateWorkflowFile <- function(config) {
  sections <- names(x = config)
  if (!'dependencies' %in% sections) {
    stop("Workflow file is missing the dependencies section")
  }
}

#' The SeuratWorkflow class
#'
#' The SeuratWorkflow class aims to create a Makefile-like approach for Seurat
#' analysis. A full set of parameters can be defined with a single command, and
#' the workflow encodes the pipeline command structure (for example,
#' NormalizeData -> FindVariableFeatures -> ScaleData -> RunPCA -> FindClusters).
#' This leads to very simple workflows being encoded in just a couple commands,
#' without losing the flexibility to run each step for clarity if desired.
#'
#' @slot name Workflow name
#' @slot depends Dependency graph encoding the relationship between commands
#' (for example, RunPCA depends on ScaleData)
#' @slot update Vector specifying, for each command, whether it needs to be
#' updated/rerun
#' @slot params List of parameters used across the workflow
#' @slot mostRecent Vector giving, for each command, the most recent
#' timestamp. If the current timestamp matches this, the command should not need to be rerun.
#'
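#' @section Workflow files:
#' A workflow is read from an ini file whose \code{[dependencies]} section lists
#' each command's comma-separated prerequisites (this mirrors the parsing in
#' \code{CreateWorkflowObject}); a hypothetical minimal file:
#' \preformatted{
#' [dependencies]
#' FindVariableFeatures=NormalizeData
#' ScaleData=FindVariableFeatures
#' RunPCA=ScaleData
#' FindClusters=RunPCA
#'
#' [global]
#' assay=RNA
#'
#' [FindClusters]
#' resolution=0.8
#' }
#'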
#' @name SeuratWorkflow-class
#' @rdname SeuratWorkflow-class
#' @exportClass SeuratWorkflow
#'
SeuratWorkflow <- setClass(
  Class = 'SeuratWorkflow',
  slots = c(
    name = 'character',
    depends = 'ANY',
    update = 'ANY',
    params = 'ANY',
    mostRecent = 'ANY'
  )
)

#' Checks if a workflow is defined for a Seurat object
#'
#' Checks if a workflow is defined for a Seurat object
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using InitializeWorkflow
#'
#' @return TRUE if the workflow is defined; otherwise execution stops with an error
#'
#' @export
#'
#' @examples
#' \dontrun{
#' CheckWorkflow(pbmc_small, "cluster")
#' }
#'
CheckWorkflow <- function(object, workflow.name) {
  # Check if workflow is there
  workflow.present <- FALSE
  if (workflow.name %in% names(x = object@workflows)) {
    if (class(x = object[[workflow.name]])[[1]] == "SeuratWorkflow") {
      workflow.present <- TRUE
    }
  }
  if (!workflow.present) {
    stop("Workflow not present, initialize first.")
  }
  return(TRUE)
}

#' Check if workflow command needs update
#'
#' Compares the stored timestamp with the most recently recorded timestamp to see if a dependency has been updated
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using InitializeWorkflow
#' @param command.name Name of the command to check
#'
#' @return Returns TRUE if the dependency has changed (or has not been run), and an update is needed. FALSE otherwise
#'
#' @export
#'
#' @examples
#' \dontrun{
#' CheckWorkflowUpdate(object = pbmc_small,workflow.name = "cluster", command.name = "ScaleData")
#' }
#'
CheckWorkflowUpdate <- function(object, workflow.name, command.name) {
  CheckWorkflow(object = object, workflow.name = workflow.name)

  # According to the workflow, the most recent update
  mostRecent <- slot(object = object[[workflow.name]], name = "mostRecent")
  workflow.timestamp <- mostRecent[command.name]
  seurat.timestamp <- as.POSIXct(x = "1900-01-01")

  # a timestamp equal to the sentinel default means the Seurat command has
  # never been run in the workflow
  if (workflow.timestamp == seurat.timestamp) {
    return(TRUE)
  }
  # According to SeuratCommand, the most recent update
  # go to workflow to look up assay and DR
  params <- slot(object = object[[workflow.name]], name = "params")
  assay <- params$global$assay
  assay <- assay %iff% params[[command.name]]$assay
  assay <- assay %||% DefaultAssay(object)
  reduction <- params$global$reduction
  reduction <- reduction %iff% params[[command.name]]$reduction
  reduction <- reduction %||% formals(fun = paste0(command.name, ".Seurat"))$reduction
  command.name <- paste0(command.name, ".", assay, ".", reduction)
  command.name <- sub(pattern = "[\\.]+$", replacement = "", x = command.name, perl = TRUE)
  command.name <- sub(pattern = "\\.\\.", replacement = "\\.", x = command.name, perl = TRUE)
  if (!(command.name %in% names(object@commands))) {
    return(TRUE)
  }
  if (length(x = command.name) == 1) {
    seurat.timestamp <- slot(object = object@commands[[command.name]], name = "time.stamp")
  }
  if (seurat.timestamp == workflow.timestamp) {
    return(FALSE)
  }
  return(TRUE)
}

#' Create a workflow object
#'
#' @param file Path to workflow file
#'
#' @return A SeuratWorkflow object
#'
#' @export
#'
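#' @examples
#' \dontrun{
#' # Mirrors the call used in .onLoad to build the shipped clustering workflow
#' wf <- CreateWorkflowObject(file = system.file(
#'   'extdata/cluster.workflow.ini',
#'   package = 'Seurat'
#' ))
#' }
#'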
CreateWorkflowObject <- function(file) {
  if (!file.exists(file)) {
    stop("Provided workflow file does not exist.")
  }
  config <- read.ini(filepath = file)
  ValidateWorkflowFile(config = config)
  workflow.name <- gsub(
    pattern = ".workflow.ini",
    replacement = "",
    x = basename(path = file),
    fixed = TRUE
  )
  depend.fxns <- unlist(x = strsplit(
    x = unname(obj = unlist(x = config$dependencies)),
    split = ","
  ))
  fxns <- union(x = depend.fxns, y = names(x = config$dependencies))
  depends <- matrix(data = 0, nrow = length(x = fxns), ncol = length(x = fxns))
  rownames(x = depends) <- colnames(x = depends) <- fxns
  for (cmd in 1:length(x = config$dependencies)) {
    cmd.name <- names(x = config$dependencies[cmd])
    cmd.vals <- unlist(x = strsplit(x = config$dependencies[[cmd]], split = ","))
    for (cv in cmd.vals) {
      depends[cmd.name, cv] <- 1
    }
  }
  mostRecent <- rep(x = as.POSIXct(x = "1900-01-01"), length(x = fxns))
  names(x = mostRecent) <- fxns
  # No Seurat object is available here, so every command keeps the sentinel
  # "never run" timestamp assigned above; InitializeWorkflow syncs these
  # timestamps against an object's command log.
  params <- list()
  if (!is.null(x = config$global)) {
    params[["global"]] <- config$global
    for (p in 1:length(x = params$global)) {
      params$global[names(x = params$global[p])] <- ToNumeric(x = params$global[[p]])
    }
  }
  # set fxn specific params
  fxn.param.names <- setdiff(
    x = names(x = config),
    y = c("dependencies", "global")
  )
  if (length(x = fxn.param.names) > 0) {
    for (i in 1:length(x = fxn.param.names)) {
      params[fxn.param.names[i]] <- config[fxn.param.names[i]]
      for (p in 1:length(x = params[[fxn.param.names[i]]])) {
        params[[fxn.param.names[i]]][[p]] <- ToNumeric(x = params[[fxn.param.names[i]]][[p]])
      }
    }
  }
  seurat.workflow <- new(
    Class = 'SeuratWorkflow',
    name = workflow.name,
    depends = depends,
    params = params,
    mostRecent = mostRecent
  )
  return(seurat.workflow)
}

#' Initialize a Seurat workflow
#'
#' Reads dependencies from a file and initializes a Seurat workflow
#'
#' @param object Seurat object
#' @param file Ini configuration file. See cluster.workflow.ini for an example.
#'
#' @return Object with modified workflows
#'
#' @importFrom ini read.ini
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small <- InitializeWorkflow(object = pbmc_small, file = 'workflows/cluster.workflow.ini')
#' }
#'
InitializeWorkflow <- function(object, file) {
  if (!file.exists(file)) {
    stop("Provided workflow file does not exist.")
  }
  config <- read.ini(filepath = file)
  ValidateWorkflowFile(config = config)
  workflow.name <- gsub(
    pattern = ".workflow.ini",
    replacement = "",
    x = basename(path = file),
    fixed = TRUE
  )
  depend.fxns <- unlist(x = strsplit(
    x = unname(obj = unlist(x = config$dependencies)),
    split = ","
  ))
  fxns <- union(x = depend.fxns, y = names(x = config$dependencies))
  depends <- matrix(data = 0, nrow = length(x = fxns), ncol = length(x = fxns))
  rownames(x = depends) <- colnames(x = depends) <- fxns
  for (cmd in 1:length(x = config$dependencies)) {
    cmd.name <- names(x = config$dependencies[cmd])
    cmd.vals <- unlist(x = strsplit(x = config$dependencies[[cmd]], split = ","))
    for (cv in cmd.vals) {
      depends[cmd.name, cv] <- 1
    }
  }
  mostRecent <- rep(x = as.POSIXct(x = "1900-01-01"), length(x = fxns))
  names(x = mostRecent) <- fxns
  for (mr in names(x = mostRecent)) {
    assay <- config$global$assay
    assay <- assay %iff% config[[mr]]$assay
    assay <- assay %||% DefaultAssay(object = object)
    reduction <- config$global$reduction
    reduction <- reduction %iff% config[[mr]]$reduction
    reduction <- reduction %||% formals(fun = paste0(mr, ".Seurat"))$reduction
    command.name <- paste0(mr, ".", assay, ".", reduction)
    command.name <- sub(
      pattern = "[\\.]+$",
      replacement = "",
      x = command.name,
      perl = TRUE
    )
    command.name <- sub(
      pattern = "\\.\\.",
      replacement = "\\.",
      x = command.name,
      perl = TRUE
    )
    if (command.name %in% names(x = object@commands)) {
      seurat.timestamp <- slot(object = object[[command.name]], name = "time.stamp")
      mostRecent[mr] <- seurat.timestamp
    }
  }
  params <- list()
  if (!is.null(x = config$global)) {
    params[["global"]] <- config$global
    for (p in 1:length(x = params$global)) {
      params$global[names(x = params$global[p])] <- ToNumeric(x = params$global[[p]])
    }
  }
  # set fxn specific params
  fxn.param.names <- setdiff(
    x = names(x = config),
    y = c("dependencies", "global")
  )
  if (length(x = fxn.param.names) > 0) {
    for (i in 1:length(x = fxn.param.names)) {
      params[fxn.param.names[i]] <- config[fxn.param.names[i]]
      for (p in 1:length(x = params[[fxn.param.names[i]]])) {
        params[[fxn.param.names[i]]][[p]] <- ToNumeric(x = params[[fxn.param.names[i]]][[p]])
      }
    }
  }
  seurat.workflow <- new(
    Class = 'SeuratWorkflow',
    name = workflow.name,
    depends = depends,
    params = params,
    mostRecent = mostRecent
  )
  object[[workflow.name]] <- seurat.workflow
  return(object)
}

#' Output individual function calls to recreate workflow
#'
#' Outputs all commands needed to reproduce your analysis without shortcuts. This should enhance reproducibility, but can be confused by custom modifications, usage of SubsetData, etc.
#' We hope this will be very useful, but use with care and verify that it does indeed reproduce your work.
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using InitializeWorkflow
#' @param command.name Name of the command at the end of the workflow
#' @param depth depth of the recursive call. Only depth 1 outputs the parameters
#'
#' @return No return value; prints the call string for each command in the workflow
#'
#' @export
#'
#' @examples
#' \dontrun{
#' RecreateWorkflows(object = pbmc_small,workflow.name = "cluster", command.name = "FindClusters")
#' }
#'
RecreateWorkflows <- function(object, workflow.name, command.name, depth = 1) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  depends <- slot(object = object[[workflow.name]], name = "depends")
  prereq.commands <- colnames(x = depends)[which(x = depends[command.name, ] == 1)]
  if (depth == 1) {
    message(paste0("\tNeed to output SetParams"))
  }
  for (i in prereq.commands) {
    RecreateWorkflows(
      object = object,
      workflow.name = workflow.name,
      command.name = i,
      depth = depth + 1
    )
  }
  # TODO: deal with Assay better
  command.name <- intersect(
    x = c(command.name, paste0(command.name, ".", DefaultAssay(object = object))),
    y = names(x = object@commands)
  )
  if (length(x = command.name) == 1) {
    call.string <- slot(object = object[[command.name]], name = "call.string")
    message(paste0("\t", call.string))
  }
}

#' Set Seurat workflow parameters
#'
#' Sets parameters for a workflow
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using
#' InitializeWorkflow
#' @param fxn.param.names Name of the function and parameter to set (formatted as
#' FXNNAME_PARAM). Can take a vector to set multiple functions
#' @param fxn.param.values Value of the parameter to set. Should be of equal length
#' to fxn.param.names
#' @param \dots Global parameters to set, will be fed into workflow Seurat functions.
#' Parameters ending with a "." will populate all similar variable names (e.g.,
#' setting dims. will set both dims.compute and dims.cluster)
#'
#' @return Object with modified workflows
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small <- SetWorkflowParams(object = pbmc_small, workflow.name = "cluster", seed.use = 31, dims. = 20)
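#' # Function-specific parameters use the FXNNAME_PARAM format (hypothetical value):
#' pbmc_small <- SetWorkflowParams(object = pbmc_small, workflow.name = "cluster",
#'   fxn.param.names = "FindClusters_resolution", fxn.param.values = 0.8)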
#' }
#'
SetWorkflowParams <- function(
  object,
  workflow.name = NULL,
  ...,
  fxn.param.names = NULL,
  fxn.param.values = NULL
) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  if (length(x = fxn.param.names) != length(x = fxn.param.values)) {
    stop("length of fxn.parameter.names needs to equal length of fxn.parameter.values")
  }
  params <- slot(object = object[[workflow.name]], name = "params")
  # set global params
  global.params <- list(...)
  # seq_along() avoids iterating when no global parameters were passed
  for (gp in seq_along(along.with = global.params)) {
    params[["global"]][names(global.params[gp])] <- global.params[gp]
  }
  # set fxn specific params
  if (!is.null(fxn.param.names)) {
    for (i in 1:length(x = fxn.param.names)) {
      fxn <- unlist(strsplit(x = fxn.param.names[i], split = "_"))
      params[[fxn[1]]][fxn[2]] <- fxn.param.values[i]
    }
  }
  slot(object = object[[workflow.name]], name = "params") <- params
  return(object)
}

#' Run a command from a workflow
#'
#' Runs a workflow (or a portion of it), recursively running each command's
#' prerequisites from \code{start} through \code{end}
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using InitializeWorkflow
#' @param end Name of ending function within workflow
#' @param start Name of starting function within workflow
#' @param ... Other arguments
#'
#' @return Seurat object with updated workflow
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small <- RunWorkflow(object = pbmc_small, workflow.name = "cluster", end = "FindClusters")
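#' # Arguments passed through ... are set as global workflow parameters first
#' # (hypothetical value):
#' pbmc_small <- RunWorkflow(object = pbmc_small, workflow.name = "cluster",
#'   end = "FindClusters", resolution = 0.8)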
#' }
#'
RunWorkflow <- function(
  object,
  workflow.name = "cluster",
  end = "FindClusters",
  start = NULL,
  ...
) {
  start <- start %||% end
  if ((workflow.name == "cluster") && (!(workflow.name%in%names(object@workflows)))) {
    object[[workflow.name]] <- cluster.workflow
  }

  CheckWorkflow(object = object, workflow.name = workflow.name)
  depends <- slot(object@workflows[[workflow.name]], name = "depends")
  if (!start %in% rownames(x = depends)) {
    stop(paste0(start, " is not a command in the workflow"))
  }
  if (!end %in% rownames(x = depends)) {
    stop(paste0(end, " is not a command in the workflow"))
  }
  if (length(x = list(...)) > 0) {
    object <- SetWorkflowParams(object = object,workflow.name = workflow.name, ...)
  }
  current.command <- end
  depends <- slot(object = object[[workflow.name]], name = "depends")
  prereqs <- names(x = which(x = depends[current.command, ] > 0))
  if (current.command != start) {
    for (i in prereqs) {
      object <- RunWorkflow(
        object = object,
        workflow.name = workflow.name,
        end = i,
        start = start
      )
    }
  }
  workflow.args <- slot(object[[workflow.name]], name = "params")$global
  my.args <- formals(fun = paste0(current.command,".Seurat"))
  args_ignore <- c("", "object", "workflow.name")
  args_use <- setdiff(
    x = intersect(x = names(x = my.args), y = names(x = workflow.args)),
    y = args_ignore
  )
  for (i in args_use) {
    my.args[[i]] <- workflow.args[[i]]
  }
  args <- my.args[args_use]
  command <- paste0("object <-  ", current.command,"(object = object")
  if (length(x = args) > 0) {
    for (i in 1:length(x = args)) {
      # deparse so that character values keep their quotes in the generated call
      command <- paste0(command, ", ", names(x = args)[i], " = ", deparse(expr = args[[i]]))
    }
  }
  command <- paste0(command, ")")
  message("")
  message(paste0("\t", "Running command:"))
  message(paste0("\t", command))
  message("")
  eval(expr = parse(text = command))
  return(object)
}

#' Updates workflow timestamps
#'
#' Like the touch command in Linux. Updates a workflow command's timestamp and, recursively, the timestamps of all commands that depend on it
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using InitializeWorkflow
#' @param command.name Name of the command to touch
#' @param time.stamp Timestamp to assign
#'
#' @return Seurat object with updated workflow
#'
#' @export
#'
#' @examples
#' \dontrun{
#' TouchWorkflow(object = pbmc_small,workflow.name = "cluster", command.name = "ScaleData")
#' }
#'
TouchWorkflow <- function(object, workflow.name, command.name, time.stamp = Sys.time()) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  # Now update, recursively, all downstream commands that depend on this one
  depends <- slot(object = object[[workflow.name]], name = "depends")
  depend.commands <- colnames(x = depends)[which(x = depends[, command.name] == 1)]
  mostRecent <- slot(object = object[[workflow.name]], name = "mostRecent")
  mostRecent[command.name] <- time.stamp
  slot(object = object[[workflow.name]], name = "mostRecent") <- mostRecent
  for (i in depend.commands) {
    object <- TouchWorkflow(
      object = object,
      workflow.name = workflow.name,
      command.name = i,
      time.stamp = time.stamp
    )
  }
  return(object)
}

#' Output status of each command in the workflow
#'
#' For each command in the workflow, indicate whether it is up-to-date.
#'
#' @param object Seurat object
#' @param workflow.name Workflow name, should already be initialized using InitializeWorkflow
#'
#' @return No return value; prints the status of each command in the workflow
#'
#' @export
#'
#' @examples
#' \dontrun{
#' WorkflowStatus(object = pbmc_small, workflow.name = "cluster")
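#' # Prints one line per command, e.g. (illustrative):
#' #   ScaleData up to date
#' #   FindClusters is out of date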
#' }
#'
WorkflowStatus <- function(object, workflow.name) {
  CheckWorkflow(object = object, workflow.name = workflow.name)
  message(paste("Status  for", workflow.name, "workflow"))
  depends <- slot(object = object[[workflow.name]], name = "depends")
  all.cmds <- rownames(x = depends)
  for (i in all.cmds) {
    is.updated <- !CheckWorkflowUpdate(
      object = object,
      workflow.name = workflow.name,
      command.name = i
    )
    if (is.updated) {
      message(paste0("\t", i, " up to date"))
    } else {
      message(paste0("\t\t", i, " is out of date"))
    }
  }
}

.onLoad <- function(libname, pkgname) {
  delayedAssign(
    x = 'cluster.workflow',
    value = CreateWorkflowObject(file = system.file(
      'extdata/cluster.workflow.ini',
      package = 'Seurat'
    )),
    assign.env = asNamespace(ns = 'Seurat')
  )
}

# Documentation needed
#
# @param object ...
# @param workflow.name ...
# @param depth ...
#
ReadPlotParams <- function(object, workflow.name, depth = 1) {
  if (!("PlotParams" %in% names(object@misc))) {
    return()
  }
  command.name <- as.character(x = deparse(expr = sys.calls()[[sys.nframe() - depth]]))
  command.name <- gsub(pattern = ".Seurat", replacement = "", x = command.name, fixed = TRUE)
  command.name <- ExtractField(string = command.name, field = 1, delim = "\\(")
  param.list <- names(formals(fun = sys.function(sys.parent(depth))))
  paramsList <- slot(object = object, name = "misc")[["PlotParams"]]
  p.env <- parent.frame(depth)
  for(i in names(paramsList)) {
    split_arr <- unlist(x = strsplit(x = i, split = ":"))
    function.name <- split_arr[1]
    param.name <- split_arr[2]
    param.value <- paramsList[[i]]
    if ((function.name == command.name) && (param.name %in% param.list)) {
      assign(
        x = param.name,
        value = param.value,
        envir = p.env
      )
    }
  }
  # overwrite any arguments passed in on the command line
  argnames <- sys.call(which = depth)
  argList <- as.list(argnames[-1])
  args_ignore <- c("", "object", "workflow.name")
  args_use <- setdiff(x = names(argList), y = args_ignore)
  for(i in args_use) {
    if(as.character(unlist(argList[i])[[1]]) == "F") {
      arg.val <- FALSE
    } else if(as.character(unlist(argList[i])[[1]]) == "T") {
      arg.val <- TRUE
    } else {
      arg.val <- unlist(argList[i])[[1]]
    }
    assign(x = i, value = arg.val, envir = p.env)
  }
}

# Documentation needed
# Sets the parameter value for a plotting function as the default for an object
# @param object ...
# @param function.name ...
# @param param.name ...
# @param param.value ...
#
# Quick example: pbmc <- FixPlotParam(pbmc, "FeaturePlot", "pt.size", "0.1")
#
FixPlotParam <- function(object, function.name, param.name, param.value) {
  if ("PlotParams" %in% names(object@misc)) {
    object@misc[["PlotParams"]][paste(function.name, param.name, sep = ":")] <- param.value
  } else {
    plotParamsList <- list()
    plotParamsList[paste(function.name, param.name, sep = ":")] <- param.value
    object@misc[["PlotParams"]] <- plotParamsList
  }
  return(object)
}
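
# Example sketch of the FixPlotParam/ReadPlotParams pairing (assumes the
# plotting function calls ReadPlotParams at the top of its body):
#   pbmc <- FixPlotParam(pbmc, "FeaturePlot", "pt.size", "0.1")
#   # any later FeaturePlot(pbmc, ...) call would then pick up pt.size = 0.1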

# Documentation needed
# Internal, not documented for now
#
#' @importFrom lars lars predict.lars
#
LassoFxn <- function(
  lasso.input,
  genes.obs,
  s.use = 20,
  gene.name = NULL,
  do.print = FALSE,
  gram = TRUE
) {
  lasso.model <- lars(
    x = lasso.input,
    y = as.numeric(x = genes.obs),
    type = "lasso",
    max.steps = s.use * 2,
    use.Gram = gram
  )
  lasso.fits <- predict.lars(
    object = lasso.model,
    newx = lasso.input,
    type = "fit",
    s = s.use
  )$fit
  if (do.print) {
    print(gene.name)
  }
  return(lasso.fits)
}
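
# A toy sketch of LassoFxn (illustrative values only): predict one gene's
# expression from a cell x predictor-gene matrix
#   predictors <- matrix(rnorm(200), nrow = 20)  # 20 cells x 10 genes
#   observed <- predictors[, 1] + rnorm(20, sd = 0.1)
#   fits <- LassoFxn(lasso.input = predictors, genes.obs = observed, s.use = 5)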