Revision cc4814cbc080b465aebd3c9d2b1c7c41d851d7e2 authored by Clément Pernet on 14 October 2022, 15:01:33 UTC, committed by Clément Pernet on 14 October 2022, 15:01:33 UTC
1 parent 462d35d
Raw File
ffpack_ludivine.inl
/* ffpack/ffpack_ludivine.inl
 * Copyright (C) 2005 Clement Pernet
 *
 * Written by Clement Pernet <Clement.Pernet@imag.fr>
 *
 *
 * ========LICENCE========
 * This file is part of the library FFLAS-FFPACK.
 *
 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
 * it under the terms of the  GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * ========LICENCE========
 *.
 */

#ifndef __FFLASFFPACK_ffpack_ludivine_INL
#define __FFLASFFPACK_ffpack_ludivine_INL

#include "fflas-ffpack/fflas/fflas_bounds.inl"

namespace FFPACK {
    template<class Field>
    inline size_t
    LUdivine_gauss( const Field& F, const FFLAS::FFLAS_DIAG Diag,
                    const size_t M, const size_t N,
                    typename Field::Element_ptr A, const size_t lda, size_t*P,
                    size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag)
    {
        size_t MN = std::min(M,N);
        typename Field::Element_ptr Acurr = A;
        size_t r = 0;

        for (size_t k = 0; k < MN; ++k){
            size_t p = r;
            Acurr = A+k*lda+r;
            while ((p < N) && F.isZero (*(Acurr++)))
                p++;
            if (p < N){
                P[r] = p;
                if (r < k){
                    FFLAS::fassign (F, N-r, (A+k*lda+r),1, (A + r*(lda+1)), 1);
                    Acurr = A+r+k*lda;
                    for (size_t i=r; i<N; ++i)
                        F.assign(*(Acurr++),F.zero);
                }

                FFLAS::fswap (F, M, A+r, lda, A+p, lda);
                Q[r] = k;
                r++;
            }
            if (k+1<M){
                ftrsv (F, FFLAS::FflasUpper, FFLAS::FflasTrans, FFLAS::FflasNonUnit, r, A, lda, A+(k+1)*lda, 1);
                fgemv (F, FFLAS::FflasTrans, r, N-r, F.mOne, A+r, lda, A+(k+1)*lda, 1, F.one, A+(k+1)*lda+r, 1);
            }
            else
                break; // return r;
        }
        return r;
    }

    template<class Element>
    class callLUdivine_small;



    template<class Field>
    inline size_t
    LUdivine_small( const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans,
                    const size_t M, const size_t N,
                    typename Field::Element_ptr A, const size_t lda, size_t*P,
                    size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag)
    {
        return callLUdivine_small <typename Field::Element> ()
        (F, Diag, trans, M, N, A, lda, P, Q, LuTag);
    }

    template<class Element>
    class callLUdivine_small {
    public:
        template <class Field>
        inline size_t
        operator()( const Field& F, const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans,
                    const size_t M, const size_t N,
                    typename Field::Element_ptr A, const size_t lda, size_t*P,
                    size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag)
        {
            if ( !(M && N) ) return 0;
            typedef typename Field::Element elt;
            typedef typename Field::Element_ptr elt_ptr;
            elt_ptr Aini = A;
            elt_ptr Acurr;
            size_t rowp = 0;
            size_t R = 0;
            size_t k = 0;
            while ((rowp<M) && (k<N)){
                size_t colp;

                //Find non zero pivot
                colp = k;
                Acurr = Aini;
                while ((F.isZero(*Acurr)) || (F.isZero (F.reduce (*Acurr))))
                    if (++colp == N){
                        if (rowp==M-1)
                            break;
                        colp=k; ++rowp;
                        Acurr = Aini += lda;
                    }
                    else
                        ++Acurr;

                if ((rowp == M-1)&&(colp == N))
                    break;
                R++;
                P[k] = colp;
                Q[k] = rowp;

                // Permutation of the pivot column
                FFLAS::fswap (F, M, A+k, lda, A + colp , lda);

                //Normalization
                elt invpiv;
                F.init (invpiv);
                F.reduce (*Aini);
                F.inv (invpiv,*Aini);

                for (size_t j=1; j<N-k; ++j)
                    if (!F.isZero(*(Aini+j)))
                        F.reduce (*(Aini+j));
                for (size_t i=lda; i<(M-rowp)*lda; i+=lda)
                    if (!F.isZero(*(Aini+i)))
                        F.reduce (*(Aini+i));


                if (Diag == FFLAS::FflasUnit) {
                    // for (size_t j=1; j<N-k; ++j)
                    // if (!F.isZero(*(Aini+j)))
                    // F.mulin (*(Aini+j),invpiv);
                    FFLAS::fscalin(F,N-k-1,invpiv,Aini+1,1);
                }
                else {
                    // for (size_t i=lda; i<(M-rowp)*lda; i+=lda)
                    // if (!F.isZero(*(Aini+i)))
                    // F.mulin (*(Aini+i),invpiv);
                    FFLAS::fscalin(F,M-rowp-1,invpiv,Aini+lda,lda);
                }

                //Elimination
                //Or equivalently, but without delayed ops :
                FFLAS::fger (F, M-rowp-1, N-k-1, F.mOne, Aini+lda, lda, Aini+1, 1, Aini+(lda+1), lda);

                Aini += lda+1; ++rowp; ++k;
            }

            // Compression the U matrix
            size_t l;
            if (Diag == FFLAS::FflasNonUnit){
                Aini = A;
                l = N;
            }
            else {
                Aini = A+1;
                l=N-1;
            }
            for (size_t i=0; i<R; ++i, Aini += lda+1) {
                if (Q[i] > i){
                    FFLAS::fassign (F, l-i, Aini+(Q[i]-i)*lda, 1, Aini, 1);
                    for (size_t j=0; j<l-i; ++j)
                        F.assign (*(Aini+(Q[i]-i)*lda+j), F.zero);
                }
            }
            return R;
        }
    };

    template<>
    class callLUdivine_small<double> {
    public:
        template <class Field>
        inline size_t
        operator()( const Field& F,
                    const FFLAS::FFLAS_DIAG Diag,  const FFLAS::FFLAS_TRANSPOSE trans,
                    const size_t M, const size_t N,
                    typename Field::Element_ptr A, const size_t lda, size_t*P,
                    size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag)
        {

            if ( !(M && N) ) return 0;
            typedef typename Field::Element elt;
            elt * Aini = A;
            elt * Acurr;
            size_t rowp = 0;
            size_t R = 0;
            size_t k = 0;
            size_t delay =0;
            size_t kmax = FFLAS::Protected::DotProdBoundClassic (F, F.one) -1; // the max number of delayed operations
            while ((rowp<M) && (k<N)){
                size_t colp;

                //Find non zero pivot
                colp = k;
                Acurr = Aini;
                while ((F.isZero(*Acurr)) || (F.isZero (F.reduce (*Acurr))))
                    if (++colp == N){
                        if (rowp==M-1)
                            break;
                        colp=k; ++rowp;
                        Acurr = Aini += lda;
                    }
                    else
                        ++Acurr;

                if ((rowp == M-1)&&(colp == N))
                    break;
                R++;
                P[k] = colp;
                Q[k] = rowp;

                // Permutation of the pivot column
                FFLAS::fswap (F, M, A+k, lda, A + colp , lda);

                //Normalization
                elt invpiv;
                F.reduce (*Aini);
                F.inv (invpiv,*Aini);

                for (size_t j=1; j<N-k; ++j)
                    if (!F.isZero(*(Aini+j)))
                        F.reduce (*(Aini+j));
                for (size_t i=lda; i<(M-rowp)*lda; i+=lda)
                    if (!F.isZero(*(Aini+i)))
                        F.reduce(*(Aini+i));


                if (Diag == FFLAS::FflasUnit) {
                    // for (size_t j=1; j<N-k; ++j)
                    // if (!F.isZero(*(Aini+j)))
                    // F.mulin (*(Aini+j),invpiv);
                    FFLAS::fscalin(F,N-k-1,invpiv,Aini+1,1);
                }
                else {
                    // for (size_t i=lda; i<(M-rowp)*lda; i+=lda)
                    // if (!F.isZero(*(Aini+i)))
                    // F.mulin (*(Aini+i),invpiv);
                    FFLAS::fscalin(F,M-rowp-1,invpiv,Aini+lda,lda);
                }

                if (delay++ >= kmax){ // Reduction has to be done
                    delay = 0;
                    FFLAS::freduce (F, M-rowp-1,N-k-1, Aini+lda+1, lda);
                    // for (size_t i=1; i<M-rowp; ++i)
                    // 	for (size_t j=1; j<N-k; ++j)
                    // 		F.init(	*(Aini+i*lda+j),*(Aini+i*lda+j));
                }
                //Elimination
                for (size_t i=1; i<M-rowp; ++i)
                    for (size_t j=1; j<N-k; ++j)
                        *(Aini+i*lda+j) -= *(Aini+i*lda) * *(Aini+j);
                //Or equivalently, but without delayed ops :
                //FFLAS::fger (F, M-rowp-1, N-k-1, F.mOne, Aini+lda, lda, Aini+1, 1, Aini+(lda+1), lda);

                Aini += lda+1; ++rowp; ++k;
            }

            // Compression the U matrix
            size_t l;
            if (Diag == FFLAS::FflasNonUnit){
                Aini = A;
                l = N;
            }
            else {
                Aini = A+1;
                l=N-1;
            }
            for (size_t i=0; i<R; ++i, Aini += lda+1) {
                if (Q[i] > i){
                    FFLAS::fassign (F, l-i, Aini+(Q[i]-i)*lda, 1, Aini, 1);
                    for (size_t j=0; j<l-i; ++j)
                        F.assign (*(Aini+(Q[i]-i)*lda+j), F.zero);
                }
            }
            return R;
        }
    };

    template<>
    class callLUdivine_small<float> {
    public:
        template <class Field>
        inline size_t
        operator()( const Field& F,
                    const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans,
                    const size_t M, const size_t N,
                    typename Field::Element_ptr A, const size_t lda, size_t*P,
                    size_t *Q, const FFPACK::FFPACK_LU_TAG LuTag)
        {

            if ( !(M && N) ) return 0;
            typedef typename Field::Element elt;
            elt * Aini = A;
            elt * Acurr;
            size_t rowp = 0;
            size_t R = 0;
            size_t k = 0;
            size_t delay =0;
            size_t kmax = FFLAS::Protected::DotProdBoundClassic (F, F.one) -1; // the max number of delayed operations
            while ((rowp<M) && (k<N)){
                size_t colp;

                //Find non zero pivot
                colp = k;
                Acurr = Aini;
                while ((F.isZero(*Acurr)) || (F.isZero (F.reduce (*Acurr))))
                    if (++colp == N){
                        if (rowp==M-1)
                            break;
                        colp=k; ++rowp;
                        Acurr = Aini += lda;
                    }
                    else
                        ++Acurr;

                if ((rowp == M-1)&&(colp == N))
                    break;
                R++;
                P[k] = colp;
                Q[k] = rowp;

                // Permutation of the pivot column
                FFLAS::fswap (F, M, A+k, lda, A + colp , lda);

                //Normalization
                elt invpiv;
                F.init(invpiv);
                F.reduce (*Aini);
                F.inv (invpiv,*Aini);

                for (size_t j=1; j<N-k; ++j)
                    if (!F.isZero(*(Aini+j)))
                        F.reduce (*(Aini+j));
                for (size_t i=lda; i<(M-rowp)*lda; i+=lda)
                    if (!F.isZero(*(Aini+i)))
                        F.reduce (*(Aini+i));

                if (Diag == FFLAS::FflasUnit) {
                    // for (size_t j=1; j<N-k; ++j)
                    // if (!F.isZero(*(Aini+j)))
                    // F.mulin (*(Aini+j),invpiv);
                    FFLAS::fscalin(F,N-k-1,invpiv,Aini+1,1);
                }
                else {
                    // for (size_t i=lda; i<(M-rowp)*lda; i+=lda)
                    // if (!F.isZero(*(Aini+i)))
                    // F.mulin (*(Aini+i),invpiv);
                    FFLAS::fscalin(F,M-rowp-1,invpiv,Aini+lda,lda);
                }

                if (delay++ >= kmax){ // Reduction has to be done
                    delay = 0;
                    FFLAS::freduce (F, M-rowp-1, N-k-1, Aini+lda+1, lda);
                    // for (size_t i=1; i<M-rowp; ++i)
                    // 	for (size_t j=1; j<N-k; ++j)
                    // 		F.reduce (*(Aini+i*lda+j));
                }
                //Elimination
                for (size_t i=1; i<M-rowp; ++i)
                    for (size_t j=1; j<N-k; ++j)
                        *(Aini+i*lda+j) -= *(Aini+i*lda) * *(Aini+j);
                //Or equivalently, but without delayed ops :
                //FFLAS::fger (F, M-rowp-1, N-k-1, F.mOne, Aini+lda, lda, Aini+1, 1, Aini+(lda+1), lda);

                Aini += lda+1; ++rowp; ++k;
            }

            // Compression the U matrix
            size_t l;
            if (Diag == FFLAS::FflasNonUnit){
                Aini = A;
                l = N;
            }
            else {
                Aini = A+1;
                l=N-1;
            }
            for (size_t i=0; i<R; ++i, Aini += lda+1) {
                if (Q[i] > i){
                    FFLAS::fassign (F, l-i, Aini+(Q[i]-i)*lda, 1, Aini, 1);
                    for (size_t j=0; j<l-i; ++j)
                        F.assign (*(Aini+(Q[i]-i)*lda+j), F.zero);
                }
            }
            return R;
        }
    };

    template <class Field>
    inline size_t
    LUdivine (const Field& F,
              const FFLAS::FFLAS_DIAG Diag, const FFLAS::FFLAS_TRANSPOSE trans,
              const size_t M, const size_t N,
              typename Field::Element_ptr A, const size_t lda,
              size_t*P, size_t *Q
              , const FFPACK::FFPACK_LU_TAG LuTag // =FFPACK::FfpackSlabRecursive
              , const size_t cutoff // =__FFPACK_LUDIVINE_CUTOFF
             )
    {
        if ( !(M && N) ) return 0;
        typedef typename Field::Element elt;
        size_t MN = std::min(M,N);

        size_t incRow, incCol, rowDim, colDim;
        if (trans == FFLAS::FflasTrans){
            incRow = 1;
            incCol = lda;
            colDim = M;
            rowDim = N;
        }
        else {
            incRow = lda;
            incCol = 1;
            colDim = N;
            rowDim = M;
        }

        if ((rowDim < cutoff) && (colDim < 2*cutoff)) { // the coeff 2 is experimentally determined!
            return LUdivine_small (F, Diag, trans, M, N, A, lda, P, Q, LuTag);
        }
        else { // recursively :
            if (MN == 1){
                size_t ip=0;
                while (F.isZero (*(A+ip*incCol)))
                    if (++ip == colDim)
                        break;
                *Q=0;
                if (ip == colDim){ // current row is zero
                    *P=0;
                    if (colDim == 1){
                        //while (ip<M && !F.isUnit(*(A+ip*lda)))
                        while (ip<rowDim && F.isZero(*(A + ip*incRow))){
                            //	Q[ip]=ip;
                            ip++;
                        }
                        if (ip == rowDim) {
                            return 0;
                        }
                        else {
                            size_t oldip = ip;
                            if ( Diag == FFLAS::FflasNonUnit ){
                                elt invpiv;
                                F.init(invpiv);
                                F.inv(invpiv,*(A+ip*incRow));
                                if (++ip < rowDim)
                                    FFLAS::fscalin(F,rowDim-ip,invpiv,A+ip*incRow,incRow);
                                // elt tmp;
                                //								F.init(tmp);
                                //								F.assign(tmp, *(A+oldip*incRow));
                                //								F.assign( *(A+oldip*incRow), *A);
                                F.assign( *A,*(A+oldip*incRow));
                                F.assign( *(A+oldip*incRow), F.zero);
                            }
                            *Q=oldip;

                            return 1;
                        }
                    }
                    else{
                        *Q=0; return 0;
                    }
                }
                *P=ip;
                if (ip!=0){
                    // swap the pivot
                    typename Field::Element tmp;
                    F.init(tmp);
                    F.assign(tmp,*A);
                    F.assign(*A, *(A + ip*incCol));
                    F.assign(*(A + ip*incCol), tmp);
                }
                elt invpiv;
                F.init(invpiv);
                F.inv(invpiv, *A);
                if ( Diag == FFLAS::FflasUnit && colDim>1){
                    // Normalisation of the row
                    FFLAS::fscalin(F,colDim-1,invpiv,A+incCol,incCol);
                }
                else if ( (colDim==1) &&(Diag==FFLAS::FflasNonUnit) ){
                    if (++ip < rowDim){
                        FFLAS::fscalin(F,rowDim-ip,invpiv,A+ip*incRow,incRow);
                    }
                }
                return 1;
            }
            else { // MN>1
                size_t Nup = rowDim >> 1;
                size_t Ndown =  rowDim - Nup;
                // FFLASFFPACK_check(Ndown < rowDim);
                // Recursive call on NW
                size_t R, R2;
                if (trans == FFLAS::FflasTrans){
                    R = LUdivine (F, Diag, trans, colDim, Nup, A, lda, P, Q,
                                  LuTag, cutoff);

                    typename Field::Element_ptr Ar = A + Nup*incRow;   // SW
                    typename Field::Element_ptr Ac = A + R*incCol;     // NE
                    typename Field::Element_ptr An = Ar+ R*incCol;     // SE

                    if (!R){
                        if (LuTag == FFPACK::FfpackSingular )
                            return 0;
                    }
                    else {
                        FFPACK::applyP (F, FFLAS::FflasLeft, FFLAS::FflasNoTrans,
                                        Ndown, 0,(int) R, Ar, lda, P);
                        // Ar <- L1^-1 Ar
                        FFLAS::ftrsm( F, FFLAS::FflasLeft, FFLAS::FflasLower,
                                      FFLAS::FflasNoTrans, Diag, R, Ndown,
                                      F.one, A, lda, Ar, lda);
                        // An <- An - Ac*Ar
                        if (colDim>R)
                            fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, colDim-R, Ndown, R,
                                   F.mOne, Ac, lda, Ar, lda, F.one, An, lda);
                    }
                    // Recursive call on SE
                    R2 = LUdivine (F, Diag, trans, colDim-R, Ndown, An, lda, P + R, Q + Nup, LuTag, cutoff);
                    for (size_t i = R; i < R + R2; ++i)
                        P[i] += R;
                    if (R2) {
                        // An <- An.P2
                        FFPACK::applyP (F, FFLAS::FflasLeft, FFLAS::FflasNoTrans,
                                        Nup,(int) R, (int)(R+R2), A, lda, P);
                    }
                    else {
                        if (LuTag == FFPACK::FfpackSingular)
                            return 0;
                    }

                }
                else { // trans == FFLAS::FflasNoTrans
                    R = LUdivine (F, Diag, trans, Nup, colDim, A, lda, P, Q, LuTag, cutoff);
                    typename Field::Element_ptr Ar = A + Nup*incRow;   // SW
                    typename Field::Element_ptr Ac = A + R*incCol;     // NE
                    typename Field::Element_ptr An = Ar+ R*incCol;     // SE

                    if (!R){
                        if (LuTag == FFPACK::FfpackSingular )
                            return 0;
                    }
                    else { /*  R>0 */
                        // Ar <- Ar.P^T
                        FFPACK::applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans,
                                        Ndown, 0,(int) R, Ar, lda, P);
                        // Ar <- Ar.U1^-1
                        ftrsm( F, FFLAS::FflasRight, FFLAS::FflasUpper,
                               FFLAS::FflasNoTrans, Diag, Ndown, R,
                               F.one, A, lda, Ar, lda);
                        // An <- An - Ar*Ac
                        if (colDim>R)
                            fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, Ndown, colDim-R, R,
                                   F.mOne, Ar, lda, Ac, lda, F.one, An, lda );

                    }
                    // Recursive call on SE
                    R2=LUdivine (F, Diag, trans, Ndown, N-R, An, lda,P+R, Q+Nup, LuTag, cutoff);
                    for (size_t i = R; i < R + R2; ++i)
                        P[i] += R;
                    if (R2)
                        // An <- An.P2
                        FFPACK::applyP (F, FFLAS::FflasRight, FFLAS::FflasTrans,
                                        Nup,(int) R, (int)(R+R2), A, lda, P);
                    else if (LuTag == FFPACK::FfpackSingular)
                        return 0;

                }
                // Non zero row permutations
                for (size_t i = Nup; i < Nup + R2; i++)
                    Q[i] += Nup;
                if (R < Nup){
                    // Permutation of the 0 rows
                    if (Diag == FFLAS::FflasNonUnit){
                        for ( size_t i = Nup, j = R ; i < Nup + R2; ++i, ++j){
                            FFLAS::fassign( F, colDim - j, A + i*incRow + j*incCol, incCol, A + j * (lda + 1), incCol);
                            for (typename Field::Element_ptr Ai = A + i*incRow + j*incCol;
                                 Ai != A + i*incRow + colDim*incCol; Ai+=incCol)
                                F.assign (*Ai, F.zero);
                            ///@todo std::swap ?
                            size_t t = Q[j];
                            Q[j]=Q[i];
                            Q[i] = t;
                        }
                    }
                    else { // Diag == FFLAS::FflasUnit
                        for ( size_t i = Nup, j = R+1 ; i < Nup + R2; ++i, ++j){
                            FFLAS::fassign( F, colDim - j,
                                            A + i*incRow + j*incCol, incCol,
                                            A + (j-1)*incRow + j*incCol, incCol);
                            for (typename Field::Element_ptr Ai = A + i*incRow + j*incCol;
                                 Ai != A + i*incRow + colDim*incCol; Ai+=incCol)
                                F.assign (*Ai, F.zero);
                            size_t t = Q[j-1];
                            Q[j-1]=Q[i];
                            Q[i] = t;
                        }
                    }
                }
                return R + R2;
            }
        }
    }

    namespace Protected {

        //---------------------------------------------------------------------
        // LUdivine_construct: (Specialisation of LUdivine)
        // LUP factorisation of the Krylov base matrix of A^t and v.
        // When all rows have been factorized in A, and rank is full,
        // then new krylov vectors are computed and then triangularized
        // P is the permutation matrix stored in the lapack style
        // nRowX is the number of Krylov vectors already computed,
        // nUsedRowX is the number of Krylov vectors already triangularized
        //---------------------------------------------------------------------

        template <class Field>
        size_t
        LUdivine_construct( const Field& F, const FFLAS::FFLAS_DIAG Diag,
                            const size_t M, const size_t N,
                            typename Field::ConstElement_ptr A, const size_t lda,
                            typename Field::Element_ptr X, const size_t ldx,
                            typename Field::Element_ptr u, const size_t incu, size_t* P,
                            bool computeX
                            , const FFPACK::FFPACK_MINPOLY_TAG MinTag //= FFPACK::FfpackDense
                            , const size_t kg_mc// =0
                            , const size_t kg_mb// =0
                            , const size_t kg_j // =0
                          )
        {

            size_t MN = std::min(M,N);

            if (MN == 1){
                size_t ip=0;
                while (ip<N && F.isZero(*(X+ip))){ip++;}
                if (ip==N){ // current row is zero
                    *P=0;
                    return 0;
                }
                *P=ip;
                if (ip!=0){
                    // swap the pivot
                    F.assign(X[0],X[ip]);
                    F.assign(X[ip],F.zero);
                }
                if ( Diag == FFLAS::FflasUnit ){
                    typename Field::Element invpiv;
                    F.inv(invpiv, *X);

                    // Normalisation of the row
                    // for (size_t k=1; k<N; k++)
                    // F.mulin(*(X+k), invpiv);
                    FFLAS::fscalin(F,N-1,invpiv,X+1,1);
                }
                if (N==1 && M>1 && computeX)// Only appends when A is 1 by 1
                    F.mul(*(X+ldx),*X, *A);

                return 1;
            }
            else{ // MN>1
                size_t Nup = MN>>1;
                size_t Ndown =  M - Nup;

                // Recursive call on NW
                size_t R = LUdivine_construct(F, Diag, Nup, N, A, lda, X, ldx, u, incu,
                                              P, computeX, MinTag, kg_mc, kg_mb, kg_j );
                if (R==Nup){
                    typename Field::Element_ptr Xr = X + Nup*ldx; //  SW
                    typename Field::Element_ptr Xc = X + Nup;     //  NE
                    typename Field::Element_ptr Xn = Xr + Nup;    //  SE
                    typename Field::Element_ptr Xi = Xr;
                    if ( computeX ){
                        if (MinTag == FFPACK::FfpackDense)
                            for (size_t i=0; i< Ndown; ++i, Xi+=ldx){
                                fgemv(F, FFLAS::FflasNoTrans, N, N, F.one,
                                      A, lda, u, incu, F.zero, Xi,1);
                                FFLAS::fassign(F, N,Xi, 1, u,incu);
                            }
                        else // Keller-Gehrig Fast algorithm's matrix
                            for (size_t i=0; i< Ndown; ++i, Xi+=ldx){
                                FFPACK::Protected::fgemv_kgf( F, N, A, lda, u, incu, Xi, 1,
                                                              kg_mc, kg_mb, kg_j );
                                FFLAS::fassign(F, N,Xi, 1, u, incu);
                            }
                    }
                    // Apply the permutation on SW
                    FFPACK::applyP( F, FFLAS::FflasRight, FFLAS::FflasTrans,
                                    Ndown, 0,(int) R, Xr, ldx, P);
                    // Triangular block inversion of NW and apply to SW
                    // Xr <- Xr.U1^-1
                    ftrsm( F, FFLAS::FflasRight, FFLAS::FflasUpper, FFLAS::FflasNoTrans, Diag,
                           Ndown, R, F.one, X, ldx, Xr, ldx);

                    // Update of SE
                    // Xn <- Xn - Xr*Xc
                    fgemm( F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, Ndown, N-Nup, Nup,
                           F.mOne, Xr, ldx, Xc, ldx, F.one, Xn, ldx);

                    // Recursive call on SE

                    size_t R2 = LUdivine_construct(F, Diag, Ndown, N-Nup, A, lda,
                                                   Xn, ldx, u, incu, P + Nup,
                                                   false, MinTag, kg_mc, kg_mb, kg_j);
                    for ( size_t i=R;i<R+R2;++i) P[i] += R;

                    FFPACK::applyP( F, FFLAS::FflasRight, FFLAS::FflasTrans,
                                    Nup, (int)R, (int)(R+R2), X, ldx, P);

                    return R+=R2;
                }
                else
                    return R;
                // Rank deficient matrices can only be factorized
                // under the condition: the first R rows are linearly independent
                // If not, the lower block is never factorized as soon as the
                // upper block is rank defficient
            }
        }

    } // Protected

} // FFPACK

#endif //__FFLASFFPACK_ffpack_ludivine_INL

/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
back to top