Revision 518501af154696f1624a5d6f20ae7bda2407fb13 authored by Pierre Karpman on 08 April 2019, 17:44:39 UTC, committed by Pierre Karpman on 08 April 2019, 17:44:39 UTC
1 parent 685e8fe
Raw File
benchmark-pluq.C
/* Copyright (c) FFLAS-FFPACK
 * Written by Ziad Sultan <ziad.sultan@imag.fr>
 * ========LICENCE========
 * This file is part of the library FFLAS-FFPACK.
 *
 * FFLAS-FFPACK is free software: you can redistribute it and/or modify
 * it under the terms of the  GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * ========LICENCE========
 */

//#include "goto-def.h"

//#define __FFLASFFPACK_USE_OPENMP
//#define __FFLASFFPACK_USE_TBB

//#define __FFLASFFPACK_USE_DATAFLOW
//#define  __FFLASFFPACK_FORCE_SEQ
//#define WINOPAR_KERNEL
//#define CLASSIC_SEQ
// #define PROFILE_PLUQ
// #define MONOTONIC_CYCLES
// #define MONOTONIC_MOREPIVOTS
// #define MONOTONIC_FEWPIVOTS

#ifdef MONOTONIC_CYCLES
#define MONOTONIC_APPLYP
#endif
#ifdef MONOTONIC_MOREPIVOTS
#define MONOTONIC_APPLYP
#endif
#ifdef MONOTONIC_FEWPIVOTS
#define MONOTONIC_APPLYP
#endif

#include "fflas-ffpack/fflas-ffpack-config.h"
#include <givaro/modular.h>
#include <givaro/givranditer.h>
#include <iostream>

#include "fflas-ffpack/config-blas.h"
#include "fflas-ffpack/fflas/fflas.h"
#include "fflas-ffpack/utils/timer.h"
#include "fflas-ffpack/utils/fflas_randommatrix.h"
#include "fflas-ffpack/utils/args-parser.h"
#include "fflas-ffpack/ffpack/ffpack.h"

#ifdef __FFLASFFPACK_USE_KAAPI
#include "libkomp.h"
#endif

using namespace std;

typedef Givaro::ModularBalanced<double> Field;
//typedef Givaro::ModularBalanced<float> Field;
//typedef Givaro::ZRing<double> Field;
//typedef Givaro::UnparametricZRing<double> Field;

void verification_PLUQ(const Field & F, typename Field::Element * B, typename Field::Element * A,
                       size_t * P, size_t * Q, size_t m, size_t n, size_t R)
{

    FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Block,FFLAS::StrategyParameter::Threads> H;

    Field::Element_ptr X = FFLAS::fflas_new (F, m,n);
    Field::Element_ptr L, U;
    L = FFLAS::fflas_new(F, m,R);
    U = FFLAS::fflas_new(F, R,n);

    PARFOR1D (i, m*R,H, F.init(L[i], 0.0); );

    PARFOR1D (i,n*R,H, F.init(U[i], 0.0); );

    PARFOR1D (i,m*n,H, F.init(X[i], 0.0); );

    PARFOR1D (i,R,H,
              for (size_t j=0; j<i; ++j)
              F.assign ( *(U + i*n + j), F.zero);
              for (size_t j=i; j<n; ++j)
              F.assign (*(U + i*n + j), *(A+ i*n+j));
             );

    PARFOR1D (j,R,H,
              for (size_t i=0; i<=j; ++i )
              F.assign( *(L+i*R+j), F.zero);
              F.assign(*(L+j*R+j), F.one);
              for (size_t i=j+1; i<m; i++)
              F.assign( *(L + i*R+j), *(A+i*n+j));
             );

    PAR_BLOCK{
        SYNCH_GROUP(

                    TASK(MODE(CONSTREFERENCE(F,P,L)),
                         FFPACK::applyP( F, FFLAS::FflasLeft, FFLAS::FflasTrans, R,0,m, L, R, P););
                    TASK(MODE(CONSTREFERENCE(F,Q,U)),
                         FFPACK::applyP (F, FFLAS::FflasRight, FFLAS::FflasNoTrans, R,0,n, U, n, Q););
                    WAIT;
                    typename FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Block,FFLAS::StrategyParameter::Threads> pWH (MAX_THREADS);

                    TASK(MODE(CONSTREFERENCE(F,U,L,X)),
                         FFLAS::fgemm (F, FFLAS::FflasNoTrans, FFLAS::FflasNoTrans, m,n,R,
                                       F.one, L,R, U,n, F.zero, X,n, pWH););
                   );
    }
    bool fail = false;
    for(size_t i=0; i<m; ++i)
        for (size_t j=0; j<n; ++j)
            if (!F.areEqual (*(B+i*n+j), *(X+i*n+j))){
                std::cout << " Initial["<<i<<","<<j<<"] = " << (*(B+i*n+j))
                << " Result["<<i<<","<<j<<"] = " << (*(X+i*n+j))
                << std::endl;
                fail=true;
            }

    if (fail)
        std::cout<<"FAIL"<<std::endl;
    else
        std::cout<<"PASS"<<std::endl;

    FFLAS::fflas_delete( U);
    FFLAS::fflas_delete( L);
    FFLAS::fflas_delete( X);
}

void Rec_Initialize(Field &F, Field::Element * C, size_t m, size_t n, size_t ldc)
{
    if(std::min(m,n) <= ldc/NUM_THREADS){
        for(size_t i=0; i<m; i++)
            FFLAS::fzero(F, 1, n, C+i*n, n);
    }
    else{
        size_t M2 = m >> 1;
        size_t N2 = n >> 1;
        typename Field::Element * C2 = C + N2;
        typename Field::Element * C3 = C + M2*ldc;
        typename Field::Element * C4 = C3 + N2;

        SYNCH_GROUP(
                    TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C,M2,N2, ldc););
                    TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C2,M2,n-N2, ldc););
                    TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C3,m-M2,N2, ldc););
                    TASK(MODE(CONSTREFERENCE(F)), Rec_Initialize(F,C4,m-M2,n-N2, ldc););
                   );
    }
}

int main(int argc, char** argv) {

    size_t iter = 3 ;
    bool slab=false;
    int q = 131071 ;
    int m = 2000 ;
    int n = 2000 ;
    int r = 2000 ;
    int v = 0;
    int t=MAX_THREADS;
    int NBK = -1;
    bool par=false;
    bool grp =true;
    Argument as[] = {
        { 's', "-s S", "Use the Slab recursive algorithm (LUdivine)instead of the tile recursive algorithm (PLUQ).", TYPE_BOOL , &slab },
        { 'q', "-q Q", "Set the field characteristic (-1 for random).",         TYPE_INT , &q },
        { 'm', "-m M", "Set the row dimension of A.",      TYPE_INT , &m },
        { 'n', "-n N", "Set the col dimension of A.",      TYPE_INT , &n },
        { 'r', "-r R", "Set the rank of matrix A.",            TYPE_INT , &r },
        { 'g', "-g yes/no", "Generic rank profile (yes) or random rank profile (no).", TYPE_BOOL , &grp },
        { 'i', "-i I", "Set number of repetitions.",            TYPE_INT , &iter },
        { 'v', "-v V", "Set 1 if need verification of result else 0.",            TYPE_INT , &v },
        { 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t },
        { 'b', "-b B", "number of numa blocks per dimension for the numa placement", TYPE_INT , &NBK },
        { 'p', "-p P", "whether to run or not the parallel PLUQ", TYPE_BOOL , &par },
        END_OF_ARGUMENTS
    };
    FFLAS::parseArguments(argc,argv,as);
    Field F(q);
    if (r > std::min(m,n)){
        std::cerr<<"Warning: rank can not be greater than min (m,n). It has been forced to min (m,n)"<<std::endl;
        r=std::min(m,n);
    }
    if (!par) { t=1;NBK=1;}
    if (NBK==-1) NBK = t;

    Field::Element_ptr A,  Acop;
    A = FFLAS::fflas_new(F,m,n);

    PAR_BLOCK{
        Rec_Initialize(F, A, m, n, n);
        //              FFLAS::pfzero(F,m,n,A,m/NBK);
        if (grp){
            size_t * cols = FFLAS::fflas_new<size_t>(n);
            size_t * rows = FFLAS::fflas_new<size_t>(m);
            for (int i=0; i<n; ++i)
                cols[i] = i;
            for (int i=0; i<m; ++i)
                rows[i] = i;
            FFPACK::RandomMatrixWithRankandRPM (F, m, n ,r, A, n, rows, cols);
            FFLAS::fflas_delete(cols);
            FFLAS::fflas_delete(rows);
        } else
            FFPACK::RandomMatrixWithRankandRandomRPM (F, m, n ,r, A, n);
    }
    size_t R;
    FFLAS::Timer chrono;
    double *time=new double[iter];

    enum FFLAS::FFLAS_DIAG diag = FFLAS::FflasNonUnit;
    size_t maxP, maxQ;
    maxP = m;
    maxQ = n;

    size_t *P = FFLAS::fflas_new<size_t>(maxP);
    size_t *Q = FFLAS::fflas_new<size_t>(maxQ);

    FFLAS::ParSeqHelper::Parallel<FFLAS::CuttingStrategy::Recursive,FFLAS::StrategyParameter::TwoDAdaptive> H;

    Acop = FFLAS::fflas_new(F,m,n);
    PARFOR1D(i,(size_t)m,H,
             FFLAS::fassign(F, n, A + i*n, 1, Acop + i*n, 1);
             // for (size_t j=0; j<(size_t)n; ++j)
             //     Acop[i*n+j]= A[i*n+j];
            );
    for (size_t i=0;i<=iter;++i){

        PARFOR1D(j,maxP,H, P[j]=0; );
        PARFOR1D(j,maxQ,H, Q[j]=0; );
        PARFOR1D(k,(size_t)m,H,
                 FFLAS::fassign(F, n, Acop + k*n, 1, A + k*n, 1);
                 // for (size_t j=0; j<(size_t)n; ++j)
                 //     F.assign( A[k*n+j] , Acop[k*n+j]) ;
                );
        chrono.clear();

        if (i) chrono.start();
        if (par){


            PAR_BLOCK{
                R = FFPACK::pPLUQ(F, diag, m, n, A, n, P, Q, t);
            }
        }
        else{
            if (slab)
                R = FFPACK::LUdivine (F, diag, FFLAS::FflasNoTrans, m, n, A, n, P, Q);
            else
                R = FFPACK::PLUQ(F, diag, m, n, A, n, P, Q);
        }
        if (i) {chrono.stop(); time[i-1]=chrono.realtime();}

    }
    std::sort(time, time+iter);
    double mediantime = time[iter/2];
    delete[] time;
    // -----------
    // Standard output for benchmark - Alexis Breust 2014/11/14
#define CUBE(x) ((x)*(x)*(x))
    double gflop =  2.0/3.0*CUBE(double(r)/1000.0) +2*m/1000.0*n/1000.0*double(r)/1000.0  - double(r)/1000.0*double(r)/1000.0*(m+n)/1000;
    std::cout << "Time: " << mediantime
    << " Gfops: " << gflop / mediantime;
    FFLAS::writeCommandString(std::cout, as) << std::endl;

    //verification
    if(v)
        verification_PLUQ(F,Acop,A,P,Q,m,n,R);

    FFLAS::fflas_delete (P);
    FFLAS::fflas_delete (Q);
    FFLAS::fflas_delete (A);
    FFLAS::fflas_delete (Acop);

    return 0;
}

/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
back to top