https://github.com/linbox-team/fflas-ffpack
Raw File
Tip revision: 432071cb5f212d683607ab8f8ff42d31b437fc4a authored by Clement Pernet on 30 July 2016, 17:26:20 UTC
preparing release
Tip revision: 432071c
benchmark-fgemm.C
/* -*- mode: C++; tab-width: 4; indent-tabs-mode: t; c-basic-offset: 4 -*- */
// vim:sts=4:sw=4:ts=4:noet:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s
//#include "goto-def.h"

/* Copyright (c) FFLAS-FFPACK
* Written by Clément Pernet <clement.pernet@imag.fr>
* ========LICENCE========
* This file is part of the library FFLAS-FFPACK.
*
* FFLAS-FFPACK is free software: you can redistribute it and/or modify
* it under the terms of the  GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
* ========LICENCE========
*/

// Please do not commit with any of these defines on - AB 2015-01-12
//#define __FFLASFFPACK_USE_TBB
//#define __FFLASFFPACK_USE_OPENMP 
//#define __FFLASFFPACK_USE_DATAFLOW
//#define WINO_PARALLEL_TMPS
//#define __FFLASFFPACK_FORCE_SEQ
//#define PFGEMM_WINO_SEQ 32
//#define CLASSIC_SEQ
#define CLASSIC_HYBRID
//#define WINO_SEQ
//#define DEBUG 1
//#undef NDEBUG
//#define FFT_PROFILER
//#define PROFILE_FGEMM_MP
#include "fflas-ffpack/fflas-ffpack-config.h"
#include <iostream>
#include <givaro/modular-balanced.h>

#include "fflas-ffpack/config-blas.h"
#include "fflas-ffpack/fflas/fflas.h"
#include "fflas-ffpack/utils/timer.h"
#include "fflas-ffpack/utils/Matio.h"
#include "fflas-ffpack/utils/args-parser.h"

#ifdef __FFLASFFPACK_USE_KAAPI
#include "libkomp.h"
#endif


using namespace std;
using namespace FFLAS;


int main(int argc, char** argv) {

	size_t iter = 3 ;
	Givaro::Integer q = 131071 ;
	size_t m = 2000 ;
	size_t k = 2000 ;
	size_t n = 2000 ;
	int nbw = -1 ;
	int p=0;
	int t=MAX_THREADS;
	int NBK = -1;

	Argument as[] = {
		{ 'q', "-q Q", "Set the field characteristic (-1 for random).",         TYPE_INTEGER , &q },
		{ 'm', "-m M", "Set the row dimension of A.",      TYPE_INT , &m },
		{ 'k', "-k K", "Set the col dimension of A.",      TYPE_INT , &k },
		{ 'n', "-n N", "Set the col dimension of B.",      TYPE_INT , &n },
		{ 'w', "-w N", "Set the number of winograd levels (-1 for random).",    TYPE_INT , &nbw },
		{ 'i', "-i R", "Set number of repetitions.",       TYPE_INT , &iter },
		{ 'p', "-p P", "0 for sequential, 1 for 2D iterative, 2 for 2D rec, 3 for 2D rec adaptive, 4 for 3D rc in-place, 5 for 3D rec, 6 for 3D rec adaptive.", TYPE_INT , &p },
		{ 't', "-t T", "number of virtual threads to drive the partition.", TYPE_INT , &t },
		{ 'b', "-b B", "number of numa blocks per dimension for the numa placement", TYPE_INT , &NBK },
		END_OF_ARGUMENTS
	};

	parseArguments(argc,argv,as);

	if (NBK==-1) NBK = t;
//  typedef Givaro::Modular<Givaro::Integer> Field;
//  typedef Givaro::ModularBalanced<int32_t> Field;
//	typedef Givaro::ModularBalanced<float> Field;
	typedef Givaro::ModularBalanced<double> Field;
//	typedef Givaro::Modular<Givaro::Integer> Field;
	typedef Field::Element Element;

  Field F(q);

  Timer chrono, TimFreivalds;
  double time=0.0, timev=0.0;

  Element * A, * B, * C;

  Field::RandIter G(F);
  A = fflas_new(F,m,k,Alignment::CACHE_PAGESIZE);
//#pragma omp parallel for collapse(2) schedule(runtime) 
  PAR_BLOCK { pfrand(F,G, m,k,A,m/size_t(NBK)); }	
  
  B = fflas_new(F,k,n,Alignment::CACHE_PAGESIZE);
//#pragma omp parallel for collapse(2) schedule(runtime) 
  PAR_BLOCK { pfrand(F,G, k,n,B,k/NBK); }	

  C = fflas_new(F,m,n,Alignment::CACHE_PAGESIZE);
//#pragma omp parallel for collapse(2) schedule(runtime) 
  PAR_BLOCK { pfzero(F, m,n,C,m/NBK); }
  

  for (size_t i=0;i<=iter;++i){

	  // if (argc > 4){
	  // 	  A = read_field (F, argv[4], &n, &n);
	  // }
	  // else{

      chrono.clear();
      if (p && p!=7){
	      // CuttingStrategy meth = RECURSIVE;
	      // StrategyParameter strat = THREADS;

	      typedef CuttingStrategy::Block block;
	      typedef CuttingStrategy::Recursive rec;
	      typedef StrategyParameter::Threads threads;
	      typedef StrategyParameter::TwoD  twod;
	      typedef StrategyParameter::TwoDAdaptive  twoda;
	      typedef StrategyParameter::ThreeD  threed;
	      typedef StrategyParameter::ThreeDAdaptive  threeda;
	      typedef StrategyParameter::ThreeDInPlace  threedip;
	      PAR_BLOCK{
              if (i) { chrono.start(); }
              
	      switch (p){
		  case 1:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<block,threads> > WH(F,nbw, SPLITTER(t,block,threads));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;}
		  case 2:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<rec,twod> > WH(F,nbw, SPLITTER(t,rec,twod));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;
		  }
		  case 3:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<rec,twoda> > WH(F,nbw, SPLITTER(t,rec,twoda));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;
		  }
		  case 4:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<rec,threedip> > WH(F,nbw, SPLITTER(t,rec,threedip));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;
		  }
		  case 5:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<rec,threed> > WH(F,nbw, SPLITTER(t,rec,threed));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;
		  }
		  case 6:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<rec,threeda> > WH(F,nbw, SPLITTER(t,rec,threeda));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;
		  }
		  default:{
			  MMHelper<Field, MMHelperAlgo::Winograd, typename ModeTraits<Field>::value, ParSeqHelper::Parallel<block,threads> > WH(F,nbw, SPLITTER(t,block,threads));
			  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n, WH);
			  break;
		  }
	      }
		  }
	      if (i) {chrono.stop(); time+=chrono.realtime();}
      }else{
	      if(p==7){

			  int nrec = 0;
			  int dim = m;
			  //                      if(dim < 19000)
			  nrec--;
			  while(dim >= __FFLASFFPACK_WINOTHRESHOLD*2){
				  dim=dim/2;
				  nrec++;
			  }
			  nrec=std::max(1,nrec);
			  //			  std::cout<<" WINO_THREShold"<<__FFLASFFPACK_WINOTHRESHOLD<<" nrec = "<<nrec<<" dim = "<<dim<<std::endl;
			  if(nbw != -1)
				  nrec=nbw;
			  nbw=nrec;
			  if (i) chrono.start();
			  PAR_BLOCK
			  {
				  MMHelper<Field, MMHelperAlgo::WinogradPar,ModeTraits<Field>::value,ParSeqHelper::Parallel<> >  WH (F, nrec, ParSeqHelper::Parallel<>(t));
				  fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n,WH);
			  }
			  if (i) {chrono.stop(); time+=chrono.realtime();}
			  
			  
		      // MMHelper<Field, MMHelperAlgo::WinogradPar>
		      // 	      WH (F, nbw, ParSeqHelper::Sequential());
		      // 	  //		      cout<<"wino parallel"<<endl;
		      // if (i) chrono.start();
		      // PAR_BLOCK
		      // {
		      // 	      fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n,WH);
		      // }
		      // if (i) {chrono.stop(); time+=chrono.realtime();}
	      }
	      else{

		      MMHelper<Field,MMHelperAlgo::Winograd>//,
					  //typename FieldTraits<Field>::value,
					  //ParSeqHelper::Sequential>
			      WH (F, nbw, ParSeqHelper::Sequential());
		      if (i) chrono.start();
		      fgemm (F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, F.zero, C,n,WH);
		      if (i) {chrono.stop(); time+=chrono.realtime();}
	      }
  }

      TimFreivalds.clear();
      TimFreivalds.start();
      
      bool pass = freivalds(F, FflasNoTrans, FflasNoTrans, m,n,k, F.one, A, k, B, n, C,n);
      TimFreivalds.stop();
      timev+=TimFreivalds.usertime();
      if (!pass) 
	      std::cout<<"FAILED"<<std::endl;
}
  fflas_delete( A);
  fflas_delete( B);
  fflas_delete( C);
  
	// -----------
	// Standard output for benchmark - Alexis Breust 2014/11/14
	std::cout << "Time: " << time / double(iter)
			  << " Gflops: " << (2.*double(m)/1000.*double(n)/1000.*double(k)/1000.0) / time * double(iter);
	writeCommandString(std::cout, as) << std::endl;
  
#if DEBUG
        std::cout<<"Freivalds vtime: "<<timev/(double)iter<<std::endl;
#endif

  return 0;
}

back to top