# This file contains functions for the Radial Big Data Model, following Khezrimotlagh et al. (2019) algorithm (we use KZCT as acronym here)
# D. Khezrimotlagh, J. Zhu and W.D. Cook et al. / European Journal of Operational Research 274 (2019) 1047–1054

"""
    initialsubset(X,Y)

Creates the initial subset as in KZCT algorithm:
1. Determines p the size of the subset and create an empty vector or matrix to fill out 
2. Determines the minimum value for each input m and the maximum value for each output s 
3. Select a subset with one DMU per input, with DMU's input equal to the minimum value for this input m and one DMU per output
with DMU's output equal to the maximum value for this output s: m + s DMU's selected
4. To fill out the subset up to p DMUs, implement the following algorithm:
    For each unselected DMU_j,
    {u = 0,
        {For each i and k_i,
            If x_ij <= k_ith percentile of x^i then
                u = u + 1}
            {For each r and k'_r 
                If y_rj >= k'th percentile of y^r then 
                    u = u + 1}
    pre_score_j = u }

Where x^i and y^r represent the ith inputs and the rth outputs of all DMUS.
The indexes k_i and k'_r are changed from 0 to 100 and refer to the percentiles of x^i and y^r respectively.
Sorting DMUs in descending order by the assigned pre-scores, the remaining DMUs (to construct the subsample with size p)
are selected as those having the greatest pre-scores. 

"""
function initialsubset(X::Union{Vector,Matrix},Y::Union{Vector,Matrix}, n::Int64, s::Int64, m::Int64)
    # Determines the size of the initial subset Dˢ
    p = convert(Int64, round(sqrt(n)))

    # Select DMU's with the minimum input or the maximum output
    dmus_selected = Vector{Int64}()
    for x in 1:m
        push!(dmus_selected, argmin(X[:,x]))
    end

    for y in 1:s
        push!(dmus_selected, argmax(Y[:,y]))
    end
    unique!(dmus_selected)

    initial_index = collect(1:n)
    dmus_unselected = initial_index[Not(dmus_selected)]

    # Algorithm implementation to fill out the subsample up to the size p 

    # select unselected DMUs 
    X_unselected = @view X[Not(dmus_selected), :]
    Y_unselected = @view Y[Not(dmus_selected), :]

    # create the pre-scores matrix (one column with the pre-score and the other to store the initial index)
    pre_scores = zeros(length(dmus_unselected))
    
    @inbounds for i in 1:size(X_unselected, 1)
        for x in 1:m 
            xquant = quantile!(X[:,x], 0.01:0.01:1)
            for k in 1:100 
                if X_unselected[i,x] <= xquant[k]
                    pre_scores[i] = pre_scores[i] + 1 
                end
            end
        end
        for y in 1:s 
            yquant = quantile!(Y[:,y], 0.01:0.01:1)
            for k in 1:100 
                if Y_unselected[i,y] >= yquant[k]
                    pre_scores[i] = pre_scores[i] + 1 
                end
            end
        end
    end

    # sort pre_scores to get the best DMUs in unselected DMUs 
    score_perm = sortperm(pre_scores, rev = true)
    dmus_unselected = dmus_unselected[score_perm]
    pre_scores = pre_scores[score_perm, :]

    # add the number of DMUs needed to fill out the subsample 
    z = p - length(dmus_selected)
    new_dmus = dmus_unselected[1:z]
    dmus_selected = vcat(dmus_selected, new_dmus)
    dmus_unselected = initial_index[Not(dmus_selected)]

    # Creates the subsets Dˢ and D_excluding_Dˢ
    return dmus_selected, dmus_unselected
end

"""
    bestpracticesfinder(Subset)

Identify best-practices in the subsample selected
"""
function bestpracticesfinder(scores::Vector, orient::Symbol, atol::Float64)
    index_bestpractices = Vector{Int64}()
    for i in 1:length(scores)
        if orient == :Input
            if scores[i] >= 1 - atol
                push!(index_bestpractices, i)
            end
        elseif orient == :Output
            if scores[i] <= 1 + atol
                push!(index_bestpractices, i)
            end
        end
    end
    return index_bestpractices
end

"""
    deabigdata(X, Y)

Compute the big data radial model using data envelopment analysis for inputs X and outputs Y.

# Optional Arguments
- `orient=:Input`: chooses the radially oriented input mode. For the radially oriented output model choose `:Output`.
- `rts=:CRS`: chooses constant returns to scale. For variable returns to scale choose `:VRS`.
- `atol=1e-6`: tolerance for DMU to be considered efficient.
- `names`: a vector of strings with the names of the decision making units.
"""
function deabigdata(X::Union{Matrix,Vector}, Y::Union{Matrix,Vector};
        orient::Symbol = :Input, rts::Symbol = :CRS, slack::Bool = true, atol::Float64 = 1e-6,
        names::Union{Vector{<: AbstractString},Nothing} = nothing,
        optimizer::Union{DEAOptimizer,Nothing} = nothing, progress::Bool = true)
    
    #=
    The algorithm is such as:
    1. Start 
    2. D <- get a sample of DMUs 
    3. D^S <- select a subsample of D 
    4. B^S <- find best-practices in D^S 
    5. E <- find exterior DMUs in D excluding D^S respect to the hull of B^S 
    6. If E = {}, then F = B^S and all DMUs are altready evaluated (go to step 7)
    Otherwise 
        6.1. F <- find best-practice DMUs in D^S Union E 
        6.2. Evaluate DMUs in D excluding (D^S Union E) respoect to the F's hull 
    7. End 
    =#

    # Check parameters
    nx, m = size(X, 1), size(X, 2)
    ny, s = size(Y, 1), size(Y, 2)

    if nx != ny
        throw(DimensionMismatch("number of@rows in X and Y ($nx, $ny) are not equal"));
    end

    # Default optimizer
    if optimizer === nothing 
        optimizer = DEAOptimizer(:LP)
    end

    n = nx
    disposX = :Strong
    disposY = :Strong

    # Select initial subset Dˢ using Khezrimotlagh et al. (2019) algorithm 
    Dˢ, D_excluding_Dˢ = initialsubset(X, Y, n, s, m)

    # Find the best-practices Bˢ in Dˢ
    Dˢ_evaluation = dea(X[Dˢ, :], Y[Dˢ, :], orient = orient, rts = rts,slack = slack, 
                    disposX = disposX, disposY = disposY, optimizer = optimizer, progress = progress)

    index_bestpractices = bestpracticesfinder(efficiency(Dˢ_evaluation), orient, atol)
    
    length(index_bestpractices) > 0 || throw(ErrorException("No efficient DMUs found in the initial subset. Consider increasing the tolerance `atol`"))

    Bˢ = Dˢ[index_bestpractices]

    # Find exterior DMUs in D excluding Dˢ respect to the hull of Bˢ 
    D_excluding_Dˢ_evaluation = dea(X[D_excluding_Dˢ, :], Y[D_excluding_Dˢ, :], orient = orient,
                        rts = rts, slack = slack, Xref = X[Bˢ, :], Yref = Y[Bˢ, :],
                        disposX = disposX, disposY = disposY, optimizer = optimizer, progress = progress)

                    
    index_exteriors = bestpracticesfinder(efficiency(D_excluding_Dˢ_evaluation), orient, atol)     

    E = D_excluding_Dˢ[index_exteriors]

    # If E is not empty, then we have to find best practice DMUs in D^S Union E, otherwise, F = B_S and all DMUs are already evaluated 
    if size(E,1) > 0
        # Create the subset Dˢ_union_E        
        Dˢ_union_E = vcat(Dˢ, E)
        
        sort!(Dˢ_union_E, rev = false)

        Dˢ_union_E_evaluation = dea(X[Dˢ_union_E, :], Y[Dˢ_union_E, :], orient = orient, rts = rts,slack = slack,
                        disposX = disposX, disposY = disposY,optimizer = optimizer, progress = progress)
        
        # Find the index of best practices DMUs 
        index_bestpractices = bestpracticesfinder(efficiency(Dˢ_union_E_evaluation), orient, atol)
        
        # Best practices DMUs are the efficient DMUs F of the sample D
        F = Dˢ_union_E[index_bestpractices]

        # # We then evaluate the rest of DMUs 
        initial_index = collect(1:n)
        
        D_excluding_Dˢ_union_E = initial_index[Not(Dˢ_union_E)]

        D_excluding_Dˢ_union_E_evaluation = dea(X[D_excluding_Dˢ_union_E, :], Y[D_excluding_Dˢ_union_E, :], orient = orient,
                            rts = rts, slack = slack, Xref = X[F, :], Yref = Y[F, :],
                            disposX = disposX, disposY = disposY, optimizer = optimizer, progress = progress)

        subset1  = Dˢ_union_E
        results1 = Dˢ_union_E_evaluation
        subset2  = D_excluding_Dˢ_union_E
        results2 = D_excluding_Dˢ_union_E_evaluation   
    else
        F = Bˢ

        subset1  = Dˢ
        results1 = Dˢ_evaluation
        subset2  = D_excluding_Dˢ
        results2 = D_excluding_Dˢ_evaluation
    end

    # Get results
    resultsperm = sortperm(vcat(subset1, subset2), rev = false)
    effi = vcat(efficiency(results1), efficiency(results2))
    effi = effi[resultsperm]

    if slack
        slackX = vcat(slacks(results1, :X), slacks(results2, :X))
        slackY = vcat(slacks(results1, :Y), slacks(results2, :Y))

        slackX = slackX[resultsperm, :]
        slackY = slackY[resultsperm, :]
    else
        slackX = Array{Float64}(undef, 0, 0)
        slackY = Array{Float64}(undef, 0, 0)
    end

    Xtarget = vcat(targets(results1, :X), targets(results2, :X))
    Ytarget = vcat(targets(results1, :Y), targets(results2, :Y))

    Xtarget = Xtarget[resultsperm, :]
    Ytarget = Ytarget[resultsperm, :]

    lambdas1 = peersmatrix(results1)
    lambdas2 = peersmatrix(results2)

    index_lambda_to_keep = findall(x -> x in F, subset1)
    lambdaeff = vcat(lambdas1[:,index_lambda_to_keep],lambdas2)
    lambdaindex = vcat(subset1, subset2)        
    lambda_matrix = lambdaeff[resultsperm, :]  

    # Create the sparse matrix for lambdas 
    lambdaeff = spzeros(n, n)
    lambdaeff[:, convert.(Int, lambdaindex[index_lambda_to_keep])] = lambda_matrix

    # return results
    return RadialDEAModel(n, m, s, orient, rts, disposX, disposY, names, effi, slackX, slackY, lambdaeff, Xtarget, Ytarget)  
end