sda <- function(x, ...) UseMethod("sda")

sda.default <- function(x, y, lambda = 1e-6, stop = -p, maxIte = 100,
                        Q = K - 1, trace = FALSE, tol = 1e-6, ...){
  ##
  ## sda performs Sparse Linear Discriminant Analysis
  ## Solving: argmin{ ||y %*% theta - x %*% beta||_2^2
  ##                  + t * ||beta||_1 + lambda * ||beta||_2^2 }
  ##
  ## INPUT:
  ## x      : matrix of n observations down the rows and p variable columns.
  ##          The columns are assumed normalized.
  ## y      : matrix of dummy variables representing the groups, or a factor
  ##          of class labels.
  ## lambda : weight on the L2-norm for elastic net regression. Default: 1e-6.
  ## stop   : if stop is negative, its absolute value is the desired number of
  ##          variables. If stop is positive, it is an upper bound on the
  ##          L1-norm of the beta coefficients. There is a one-to-one
  ##          correspondence between stop and t.
  ## maxIte : maximum number of iterations. Default: 100.
  ## Q      : number of discriminative directions. Default: Q = K - 1 (the
  ##          maximum allowed).
  ## trace  : trace = TRUE prints the RSS at each iteration; trace = FALSE
  ##          (the default) turns printing off.
  ## tol    : tolerance for the stopping criterion (change in RSS).
  ##          Default: 1e-6.
  ##
  ## OUTPUT:
  ## $beta  : the sparse loadings
  ## $theta : optimal scores
  ## $rss   : residual sum of squares for each of the Q directions
  ##
  ## Author: Line H. Clemmensen, IMM, DTU, lhc@imm.dtu.dk
  ## Modified by Trevor Hastie to become sequential
  ## Based on the elastic net algorithm by Hui Zou and Trevor Hastie

  ## this is straight from nnet::class.ind
  class.ind <- function(cl) {
    Ik <- diag(length(levels(cl)))
    x <- Ik[as.numeric(cl), ]
    dimnames(x) <- list(names(cl), levels(cl))
    x
  }
  ## project theta onto the orthogonal complement (in the D_pi inner product)
  ## of the directions in Qj, then renormalize to unit D_pi-norm
  orth.Q <- function(dp, Qj, theta){
    Qjp <- Qj * as.vector(dp)
    qjtheta <- t(Qjp) %*% theta
    theta <- theta - Qj %*% qjtheta
    thetan <- sqrt(apply(dp * theta^2, 2, sum))
    scale(theta, FALSE, thetan)
  }
  ## random start vector with unit D_pi-norm
  rtheta <- function(K, dp){
    jj <- rnorm(K)
    jj / sqrt(sum(jj^2 * dp))
  }

  if (is.factor(y)) {
    classes <- levels(y)
    factorY <- y
    y <- class.ind(y)
  } else {
    classes <- colnames(y)
    factorY <- factor(colnames(y)[apply(y, 1, which.max)])
  }
  if (!is.matrix(x)) x <- as.matrix(x)
  x <- scale(x, TRUE, FALSE) ## this centering is essential for the trivial solution to disappear
  predNames <- colnames(x)
  n <- dim(x)[1]
  p <- dim(x)[2]
  K <- dim(y)[2]
  ones <- rep(1, K)
  if (Q > (K - 1)) stop("at most K-1 variates allowed")
  dpi <- as.vector(table(factorY)/n)
  ## Dpi <- diag(dpi) ## diagonal matrix of class priors
  ## Dpi_inv <- diag(1/sqrt(diag(Dpi)))

  ## set up the stop criterion for elasticnet
  if (length(stop) < Q) stop <- rep(stop[1], Q)
  if (stop[1] < 0) sparse <- "varnum" else sparse <- "penalty"

  Yhat <- matrix(0, n, Q)
  b <- matrix(0, p, Q)
  rss <- rep(0, Q)
  theta <- matrix(0, K, Q)
  Qj <- matrix(ones, K, 1)
  ydp <- scale(y, FALSE, dpi)

  for (j in 1:Q){
    RSS <- 1e6
    RSSold <- Inf
    ite <- 0
    thetaj <- rtheta(K, dpi)
    thetaj <- orth.Q(dpi, Qj, thetaj)
    while (abs(RSSold - RSS)/RSS > tol & ite < maxIte){
      RSSold <- RSS
      ite <- ite + 1
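      ## The two steps below are alternated until the penalized RSS converges:
      ## with thetaj fixed, beta solves a sparse elastic net regression of the
      ## scored response y %*% thetaj on x; with beta fixed, thetaj is refit
      ## from the fitted values and orthogonalized (in the D_pi inner product)
      ## against the directions already found.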
      ## 1. Estimate beta for fixed theta:
      Yc <- y %*% thetaj
      beta <- solvebeta(x, Yc, paras = c(lambda, abs(stop[j])),
                        sparse = sparse) # elasticnet step to estimate beta
      yhatj <- x %*% beta
      ## 2. Update theta for fixed beta:
      thetaj <- orth.Q(dpi, Qj, drop(t(ydp) %*% yhatj))
      RSS <- sum((yhatj - Yc)^2) + lambda * sum(beta^2)
      if (trace){
        cat('ite: ', ite, ' ridge cost: ', RSS, ' |b|_1: ', sum(abs(beta)), '\n')
      }
    }
    rss[j] <- RSS
    Qj <- cbind(Qj, thetaj)
    theta[, j] <- thetaj
    b[, j] <- beta
  }
  if (trace){
    cat('final update, total ridge cost: ', sum(rss), ' |b|_1: ', sum(abs(b)), '\n')
  }

  ## calculate the class information for LDA
  if (all(b == 0)){ # all values in b and sl are zero
    warning("no non-zero elements - try other regularization parameters")
    b <- matrix(0, p, 1)
    sl <- matrix(0, n, 1)
    notZero <- matrix(TRUE, p, 1) # fake notZero for alignment purposes
    colnames(sl) <- paste("score", 1:ncol(sl), sep = "")
    prior <- t(as.matrix(apply(y, 2, sum)/n))
    colnames(prior) <- classes
    varNames <- colnames(x)
    origP <- ncol(x)
    lobj <- list(prior = prior, means = NULL, x = x, covw = matrix(0, 1, 1))
  } else {
    ## remove predictors which are not included (have no non-zero parameter estimates)
    notZero <- apply(b, 1, function(x) any(x != 0))
    b <- as.matrix(b[notZero, ])
    origP <- ncol(x)
    x <- x[, notZero, drop = FALSE]
    varNames <- colnames(x)
    ## remove directions with only zero elements (this can be caused by
    ## too high a weight on the L1-penalty)
    if (is.vector(b)) b <- t(as.matrix(b))
    notZeroC <- apply(b, 2, function(x) any(x != 0))
    b <- as.matrix(b[, notZeroC])
    sl <- x %*% b
    colnames(sl) <- paste("score", 1:ncol(sl), sep = "")
    lobj <- lda(sl, factorY, ...)
  }

  structure(
    list(call = match.call(),
         beta = b,
         theta = theta,
         varNames = varNames,
         varIndex = which(notZero),
         origP = origP,
         rss = rss,
         fit = lobj,
         classes = classes,
         lambda = lambda,
         stop = stop),
    class = "sda")
}

predict.sda <- function(object, newdata = NULL, ...){
  if (!is.matrix(newdata)) newdata <- as.matrix(newdata)
  if (!is.null(object$varNames)){
    newdata <- newdata[, object$varNames, drop = FALSE]
  } else {
    if (ncol(newdata) != object$origP)
      stop("dimensions of training and testing X different")
    newdata <- newdata[, object$varIndex, drop = FALSE]
  }
  xnew <- newdata %*% object$beta
  pred <- predict(object$fit, xnew)
  pred
}

print.sda <- function(x, digits = max(3, getOption("digits") - 3), ...){
  cat("\nCall:\n", deparse(x$call), "\n\n", sep = "")
  classInfo <- paste(x$classes, collapse = ", ")
  if (all(x$stop < 0)){
    stopVal <- paste(-x$stop[1], "variables")
  } else {
    stopVal <- paste(paste(format(x$stop, digits = digits), collapse = ", "),
                     "L1 bounds")
  }
  cat("lambda =", format(x$lambda, digits = digits),
      "\nstop =", stopVal,
      "\nclasses =", classInfo,
      "\n\n")
  top <- if (!is.null(x$varNames)) x$varNames else paste("Predictor", x$varIndex, sep = "")
  ## order predictors by total absolute loading, largest first
  varOrder <- if (is.matrix(x$beta)) order(apply(abs(x$beta), 1, sum), decreasing = TRUE) else order(abs(x$beta), decreasing = TRUE)
  top <- top[varOrder]
  top <- top[1:min(5, length(top))]
  top <- paste(top, collapse = ", ")
  if (nrow(x$beta) > 5){
    cat("Top 5 predictors (out of ", length(x$varIndex), "):\n\t", top, sep = "")
  } else {
    cat("Predictors:\t", top, "\n", sep = "")
  }
  invisible(x)
}
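
## ---------------------------------------------------------------------------
## Minimal usage sketch (an illustration, not part of the original source).
## It assumes the MASS package (for lda) and the elasticnet package (for
## solvebeta) are attached, and uses the iris data purely as an example.
## Note that sda() centers x internally, so test data should be centered with
## the training means before calling predict().
##
## library(MASS)
## library(elasticnet)
## x <- scale(as.matrix(iris[, 1:4]))           # normalized predictors
## y <- iris$Species                            # factor with K = 3 classes
## fit <- sda(x, y, lambda = 1e-6, stop = -2)   # keep 2 variables per direction
## print(fit)
## pred <- predict(fit, newdata = x)            # predict.lda output: $class etc.
## table(pred$class, y)
## ---------------------------------------------------------------------------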