smda <- function(x, ...) UseMethod("smda")

smda.default <- function(x, y, Z = NULL, Rj = NULL, lambda = 1e-6, stop,
                         maxIte = 50, trace = FALSE, tol = 1e-4, ...){
  ##
  ## smda performs Sparse Mixture Discriminant Analysis,
  ## solving: argmin ||Y*theta - X*b||_2^2 + t*||b||_1 + lambda*||b||_2^2
  ##
  ## INPUT:
  ## x      : matrix of n observations down the rows and p variable columns.
  ##          The columns are assumed normalized.
  ## y      : factor of class labels, or a class indicator matrix.
  ## Z      : matrix initializing the probabilities representing the groups.
  ## Rj     : K-length vector containing the number of subclasses in each of
  ##          the K classes.
  ## lambda : the weight on the L2-norm for elastic net regression.
  ##          Default: 1e-6.
  ## stop   : nonzero stop will perform elastic net regression with early
  ##          stopping. If stop is negative, its absolute value corresponds
  ##          to the desired number of variables. If stop is positive, it
  ##          corresponds to an upper bound on the L1-norm of the b
  ##          coefficients. There is a one-to-one correspondence between
  ##          stop and t.
  ## maxIte : maximum number of iterations. Default: 50.
  ## trace  : trace = FALSE turns printing of RSS off; trace = TRUE turns it on.
  ## tol    : tolerance for the stopping criterion (change in RSS).
  ##          Default: 1e-4.
  ##
  ## OUTPUT:
  ## $beta  : the regression parameters
  ## $theta : optimal scores
  ## $Z     : updated subclass probabilities
  ## $rss   : residual sum of squares at each iteration
  ##
  ## Author: Line H. Clemmensen, IMM, DTU, lhc@imm.dtu.dk
  ## Based on the elastic net algorithm by Hui Zou and Trevor Hastie.

  ## This is straight from nnet:::formula
  class.ind <- function(cl) {
    n <- length(cl)
    x <- matrix(0, n, length(levels(cl)))
    x[(1:n) + n * (as.vector(unclass(cl)) - 1)] <- 1
    dimnames(x) <- list(names(cl), levels(cl))
    x
  }

  if (is.factor(y)) {
    classes <- levels(y)
    factorY <- y
    y <- class.ind(y)
  } else {
    if (is.null(colnames(y))) colnames(y) <- paste("class", 1:ncol(y), sep = "")
    classes <- colnames(y)
    factorY <- factor(colnames(y)[apply(y, 1, which.max)])
  }

  if (is.null(Rj)) Rj <- rep(3, length(classes))
  if (length(Rj) == 1) Rj <- rep(Rj, length(classes))

  ## Build subclass labels of the form "class|subclassIndex"
  classKey <- rep(classes, times = Rj)
  subClasses <- classKey
  for (i in seq(along = classes)) {
    tmp <- subClasses[subClasses == classes[i]]
    subClasses[subClasses == classes[i]] <- paste(tmp, seq(along = tmp), sep = "|")
  }

  if (!is.matrix(x)) x <- as.matrix(x)
  predNames <- colnames(x)
  N <- dim(x)[1]
  p <- dim(x)[2]
  K <- length(Rj)   ## number of classes

  ## Make Z from y if no initialization was supplied
  if (is.null(Z)) {
    library(mda)
    tmp <- mda.start(x, factorY, subclasses = Rj, start.method = "lvq")
    Z <- matrix(0, nrow = nrow(x), ncol = sum(Rj))
    for (i in seq(along = tmp)) {
      colIndex <- which(classKey == names(tmp)[i])
      rowIndex <- which(factorY == names(tmp)[i])
      Z[rowIndex, colIndex] <- tmp[[i]]
    }
    rm(tmp)
  }
  colnames(Z) <- subClasses
  factorSubY <- factor(colnames(Z)[apply(Z, 1, which.max)])

  R <- dim(Z)[2]    ## number of subclasses
  RSSold <- 1e8
  RSS <- 1e6
  ite <- 0
  Zhat <- matrix(0, N, R - 1)
  Dp <- apply(Z, 2, sum)
  Dp_inv <- diag(1 / sqrt(Dp / N))   ## R x R
  theta <- 1 / sum(diag(Dp / N)) * diag(rep(1, R))[, 1:(R - 1)] / R
  Ztheta <- Z %*% theta              ## N x (R-1)
  rss <- rep(0, maxIte)
  b <- matrix(0, p, R - 1)

  ## Recycle a scalar stop value to one entry per score direction
  if (length(stop) < (R - 1)) stop <- rep(stop[1], R - 1)
  sparse <- if (stop[1] < 0) "varnum" else "penalty"

  while (abs(RSSold - RSS) / RSS > tol && ite < maxIte) {
    RSSold <- RSS
    ite <- ite + 1

    ## 1. Estimate beta: one elastic net regression per score direction
    for (j in 1:(R - 1)) {
      Zc <- Ztheta[, j]
      beta <- solvebeta(x, Zc, paras = c(lambda, abs(stop[j])), sparse = sparse)
      b[, j] <- t(beta)
      Zhat[, j] <- x %*% b[, j]
    }
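    ## Note (added sketch, not in the original source): step 2 below updates
    ## the optimal scores via a balanced Procrustes problem. With
    ## B = t(Z) %*% Zhat and SVD B = U D V', the update is
    ## theta = Dp_inv %*% U %*% t(V), where Dp_inv rescales by the square
    ## roots of the estimated subclass frequencies Dp/N so the scores stay
    ## on a comparable scale across subclasses.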
    ## 2. Optimal scores (balanced Procrustes problem)
    B <- t(Z) %*% Zhat
    sb <- svd(B, nu = R - 1, nv = R - 1)
    theta.old <- theta
    theta <- Dp_inv %*% sb$u %*% t(sb$v)
    Ztheta <- Z %*% theta
    RSS <- sum((Ztheta - Zhat) * (Ztheta - Zhat))
    rss[ite] <- RSS
    if (trace) {
      cat('ite: ', ite, ' RSS: ', RSS, '\n')
    }

    ## 3. Update parameter estimates: subclass means, pooled covariance,
    ##    and subclass probabilities Z
    Sigma <- matrix(0, R - 1, R - 1)
    mu <- array(0, c(R - 1, R, K))
    for (i in 1:K) {
      IK <- (sum(Rj[seq_len(i - 1)]) + 1):(sum(Rj[seq_len(i - 1)]) + Rj[i])
      Ik <- apply(Z[, IK, drop = FALSE] > 0, 1, any)
      Ik.length <- sum(Ik)
      for (j in 1:Rj[i]) {
        mu[, IK[j], i] <- apply(matrix(1, Ik.length, 1) %*% t(Z[Ik, IK[j]]) %*%
                                  Zhat[Ik, , drop = FALSE], 2, sum) / Dp[IK[j]]
        Sigma <- Sigma +
          t(Zhat[Ik, , drop = FALSE] -
              matrix(1, Ik.length, 1) %*% t(matrix(mu[, IK[j], i]))) %*%
          (Z[Ik, IK[j]] %*% matrix(1, 1, Ik.length)) %*%
          (Zhat[Ik, , drop = FALSE] -
             matrix(1, Ik.length, 1) %*% t(matrix(mu[, IK[j], i]))) /
          (Ik.length - Rj[i])
      }
    }
    Sigma_inv <- solve(Sigma + 1e-2 * diag(rep(1, R - 1)))
    for (i in 1:K) {
      IK <- (sum(Rj[seq_len(i - 1)]) + 1):(sum(Rj[seq_len(i - 1)]) + Rj[i])
      Ik <- apply(Z[, IK, drop = FALSE] > 0, 1, any)
      Ik.length <- sum(Ik)
      ## Mahalanobis distances of each observation to each subclass mean
      Dmahal_K <- matrix(0, Ik.length, Rj[i])
      for (j in 1:Rj[i]) {
        Dmahal_K[, j] <- diag((Zhat[Ik, , drop = FALSE] -
                                 matrix(1, Ik.length, 1) %*% t(matrix(mu[, IK[j], i]))) %*%
                                Sigma_inv %*%
                                t(Zhat[Ik, , drop = FALSE] -
                                    matrix(1, Ik.length, 1) %*% t(matrix(mu[, IK[j], i]))))
      }
      sum_K <- apply(Z[Ik, IK, drop = FALSE] * exp(-Dmahal_K / 2), 1, sum)
      for (j in 1:Rj[i]) {
        Z[Ik, IK[j]] <- Z[Ik, IK[j]] * exp(-Dmahal_K[, j] / 2) / (sum_K + 1e-6)
      }
      ## Renormalize so the subclass probabilities sum to one per observation
      Z[Ik, IK] <- Z[Ik, IK] / apply(Z[Ik, IK, drop = FALSE], 1, sum)
    }
    Ztheta <- Z %*% theta
    Dp <- apply(Z, 2, sum)
    Dp_inv <- diag(1 / sqrt(Dp / N))   ## R x R
  }

  ## Remove trivial directions
  Ik <- sb$d > 1e-6
  M <- sum(Ik)
  theta <- theta[, 1:M, drop = FALSE]
  Ztheta <- Z %*% theta
  b <- b[, 1:M, drop = FALSE]
  Zhat <- Zhat[, 1:M, drop = FALSE]
  for (j in 1:M) {
    Zc <- Ztheta[, j]
    beta <- solvebeta(x, Zc, paras = c(lambda, abs(stop[j])), sparse = sparse)
    b[, j] <- t(beta)
    Zhat[, j] <- x %*% b[, j]
  }
  if (trace) {
    RSS <- sum((Ztheta - Zhat) * (Ztheta - Zhat))
    cat('final update, RSS: ', RSS, '\n')
  }

  ## Keep only the predictors with at least one nonzero coefficient
  notZero <- apply(b, 1, function(x) any(x != 0))
  b <- b[notZero, , drop = FALSE]
  origP <- ncol(x)
  x <- x[, notZero, drop = FALSE]
  varNames <- colnames(x)

  sl <- x %*% b
  colnames(sl) <- paste("score", 1:ncol(sl), sep = "")
  lobj <- lda(sl, factorSubY, ...)

  structure(
    list(call = match.call(),
         beta = b,
         theta = theta,
         Z = Z,
         Zhat = Zhat,
         Rj = Rj,
         varNames = varNames,
         varIndex = which(notZero),
         origP = origP,
         rss = rss[1:ite],
         fit = lobj,
         classes = classes,
         lambda = lambda,
         stop = stop),
    class = "smda")
}

predict.smda <- function(object, newdata = NULL, ...) {
  if (!is.matrix(newdata)) newdata <- as.matrix(newdata)
  if (!is.null(object$varNames)) {
    newdata <- newdata[, object$varNames, drop = FALSE]
  } else {
    if (ncol(newdata) != object$origP)
      stop("dimensions of training and testing X different")
    newdata <- newdata[, object$varIndex, drop = FALSE]
  }
  x <- newdata %*% object$beta
  subPred <- predict(object$fit, newdata = x, ...)
  ## Collapse subclass predictions to class predictions: subclass labels have
  ## the form "class|subclass", so stripping the suffix recovers the class.
  subPred$class <- unlist(lapply(strsplit(as.character(subPred$class), "\\|"),
                                 function(x) x[1]))
  subPred$class <- factor(subPred$class, levels = object$classes)
  subPred
}
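## A small illustration (added sketch) of the subclass label convention
## assumed by predict.smda above. smda.default names subclasses as
## paste(class, index, sep = "|"), e.g. "versicolor|2", so
##   strsplit("versicolor|2", "\\|")[[1]][1]   ## -> "versicolor"
## recovers the parent class label.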
{ cat("\nCall:\n", deparse(x$call), "\n\n", sep = "") classInfo <- paste(paste(x$classes, " (", x$Rj, ")", sep = ""), collapse = ", ") if(all(x$stop < 0)) { stopVal <- paste(-x$stop[1], "variables") } else { stopVal <- paste( paste(format(x$stop, digits = digits), collapse = ", "), "L1 bounds") } cat("lambda =", format(x$lambda, digits = digits), "\nstop =", stopVal, "\nsubclasses =", classInfo, "\n\n") top <- if(!is.null(x$varNames)) x$varNames else paste("Predictor", x$varIndex, sep = "") varOrder <- if(is.matrix(x$beta)) order(apply(abs(x$beta), 1, sum)) else order(abs(x$beta)) top <- top[varOrder] top <- top[1:min(5, length(top))] top <- paste(top, collapse = ", ") if(nrow(x$beta) > 5) { cat("Top 5 predictors (out of ", length(x$varIndex), "):\n\t", top, sep = "") } else { cat("Predictors:\n\t", top, "\n", sep = "") } invisible(x) }