https://hal.archives-ouvertes.fr/hal-02128878
Tip revision: 4201397494d9af8b687117e8ff4d85a8944f5c5a authored by Software Heritage on 11 June 2019, 10:15:02 UTC
hal: Deposit 298 in collection hal
hal: Deposit 298 in collection hal
Tip revision: 4201397
test-bini-p.C
/*
* Copyright (C) 2015 the FFLAS-FFPACK group
* Written by Brice Boyer (briceboyer) <boyer.brice@gmail.com>
*
* This file is Free Software and part of FFLAS-FFPACK.
*
* ========LICENCE========
* This file is part of the library FFLAS-FFPACK.
*
* FFLAS-FFPACK is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
* ========LICENCE========
*
*/
#include "fflas-ffpack/utils/timer.h"
#include "fflas-ffpack/fflas/fflas.h"
#include "fflas-ffpack/fflas-ffpack-config.h"
#include "fflas-ffpack/utils/test-utils.h"
#include "assert.h"
#include "fflas-ffpack/utils/args-parser.h"
#include "fflas-ffpack/utils/flimits.h"
#include <givaro/udl.h>
// using namespace FFPACK;
#define NEWWINO
// #define NOTRANDOM
//
#define DIVIDE_INTO(x,y) (((x) + (y) - 1)/(y))
const int algos = 6 ;
const int algos_k = 2 ;
using Givaro::Modular;
using Givaro::ModularBalanced;
using Givaro::Timer;
using FFLAS::FieldTraits;
typedef std::vector<Timer> time_v ;
typedef std::vector<int> int_v ;
const int selec[] = {
0
,1
,2
,3
,4
,5
};
const int selec_k[] = {
0
,1
};
const char * descr[] = {
"322 low mem"
, "322 first 1"
, "322 4 tmp "
, "223 low mem"
, "232 first 1"
, "232 all tmp"
, "comp left "
, "comp right "
// , "322 sqrt "
};
const char * descr_k[] = {
"comp left "
, "comp right "
};
namespace FFLAS { /* compression */
template<class Elem, int Num>
struct Packer ;
template<>
struct Packer<double,2> {
uint64_t bits = (limits<double>::digits()/2) ;
double base = (double) (1_ui64 << bits) ;
uint64_t mask = (1_ui64 << bits) - 1_ui64 ;
template<class T>
void accu(double * p, T * w) {
*p *= base ;
*p += (double)*w ;
}
} ;
/* ****** */
/* pack */
/* ****** */
/* pack nb words (a,b,c) -> [a|b|c] */
template<class wide_T, class pack_T, int Nb>
void pack_word( pack_T * packed,
const wide_T * words, int32_t stride,
Packer<pack_T,Nb> & packer) ;
template<class wide_T>
void pack_word/*<wide_T,double,2>*/( double * packed,
const wide_T * words, int32_t stride,
Packer<double,2> & packer)
{
// std::cout << "pack " << *words << '+' << *(words+stride) << " * " << (uint64_t) packer.base << " = ";
// words += stride ;
*packed = (double) *words ;
words += stride ;
packer.accu(packed,words);
// std::cout << (uint64_t) *packed << std::endl;
}
/* pack nb words (a,b) -> [a|b|0] filling with zeros */
template<class wide_T, class pack_T, int Nb>
void pack_word_part( pack_T * packed, int32_t nb,
const wide_T * words, int32_t stride,
Packer<pack_T,Nb> & packer) ;
template<class wide_T>
void pack_word_part/* <wide_T,double,2> */( double * packed, int32_t nb,
const wide_T * words, int32_t stride,
Packer<double,2> & packer)
{
assert(nb == 1);
*packed = (double) *words ;
// words += stride ;
// packer.accu(packed,words);
*packed *= packer.base ;
}
/* ****** */
/* unpack */
/* ****** */
template<class wide_T, class pack_T, int Nb>
void unpack_word( wide_T * words, int32_t stride,
const pack_T * packed,
Packer<pack_T,Nb> & packer);
template<class wide_T>
void unpack_word/* <wide_T,double,2> */( wide_T * words, int32_t stride,
const double * packed,
Packer<double ,2> & packer)
{
uint64_t pck = (uint64_t) *packed ;
words += stride ;
*words = (double) (pck & packer.mask) ;
words -= stride ;
pck >>= packer.bits ;
*words = (double) pck /* & packer.mask */ ;
}
template<class wide_T, class pack_T, int Nb>
void unpack_word_part( wide_T * words, int32_t stride,
const pack_T * packed, int32_t nb,
Packer<pack_T,Nb> & packer);
template<class wide_T>
void unpack_word_part/* <wide_T,double,2> */( wide_T * words, int32_t stride,
const double * packed, int32_t nb,
Packer<double,2> & packer)
{
assert(nb == 1);
words += stride ;
*words = 0 ;
words -= stride ;
uint64_t pck = (uint64_t) *packed ;
pck >>= packer.bits ;
*words = (double)pck /* & packer.mask */ ;
}
/* ****** */
/* pack */
/* ****** */
template<class wide_T, class pack_T, int Nb, bool row_packed>
void pack_matrix( pack_T * packed, int32_t row_p, int32_t col_p, int32_t ldm_p,
const wide_T * elemts, int32_t row_e, int32_t col_e, int32_t ldm_e,
Packer<pack_T,Nb> & packer)
{
if (row_packed == true) {
for (int32_t i = 0 ; i < row_e ; i++ ) {
const wide_T * e_p = elemts + i * ldm_e ;
pack_T * p_p = packed + i * ldm_p ;
int32_t j = 0 ;
for ( ; j < col_e/Nb*Nb ; j+=Nb, e_p+=Nb, p_p++) {
pack_word<wide_T>(p_p,e_p,1,packer);
}
if (j < col_e)
pack_word_part<wide_T>(p_p,col_e-j,e_p,1,packer);
}
}
else { /* col_packed */
int32_t i = 0 ;
int32_t ii = 0 ;
for ( ; i < row_e/Nb*Nb ; i += Nb , ii++) {
const wide_T * e_p = elemts + i * ldm_e ;
pack_T * p_p = packed + ii * ldm_p ;
for (int32_t j = 0 ; j < col_e ; j++, e_p++, p_p++) {
pack_word<wide_T>(p_p,e_p,ldm_e,packer);
}
}
if (i < row_e)
pack_word_part<wide_T>(packed+i*ldm_p,row_e-i,elemts+ii*ldm_e,ldm_e,packer);
}
}
/* ****** */
/* unpack */
/* ****** */
template<class wide_T, class pack_T, int Nb, bool row_packed>
void unpack_matrix( wide_T * elemts, int32_t row_e, int32_t col_e, int32_t ldm_e,
const pack_T * packed, int32_t row_p, int32_t col_p, int32_t ldm_p,
Packer<pack_T,Nb> & packer)
{
if (row_packed == true) {
for (int32_t i = 0 ; i < row_e ; i++ ) {
wide_T * e_p = elemts + i * ldm_e ;
const pack_T * p_p = packed + i * ldm_p ;
int32_t j = 0 ;
for ( ; j < col_e/Nb*Nb ; j+=Nb, e_p+=Nb, p_p++) {
unpack_word<wide_T>(e_p,1,p_p,packer);
}
if (j < col_e)
unpack_word_part<wide_T>(e_p,1,p_p,col_e-j,packer);
}
}
else { /* col_packed */
int32_t i = 0 ;
int32_t ii = 0 ;
for ( ; i < row_e/Nb*Nb ; i += Nb , ii++) {
wide_T * e_p = elemts + i * ldm_e ;
const pack_T * p_p = packed + ii * ldm_p ;
for (int32_t j = 0 ; j < col_e ; j++, e_p++, p_p++) {
unpack_word<wide_T>(e_p,ldm_e,p_p,packer);
}
}
if (i < row_e)
unpack_word_part<wide_T>(elemts+i*ldm_e,ldm_e,packed+ii*ldm_p,row_e-i,packer);
}
}
/* compress A */
template<class Field, bool left_compress >
void
fgemm_compressed(const Field & F,
int m, int n, int k,
const typename Field::Element * A, int lda,
const typename Field::Element * B, int ldb,
typename Field::Element * C, int ldc
)
{
Givaro::ZRing<double> NoField;
double * A_k, * B_k, * C_k ;
typedef typename Field::Element elem_t ;
Packer<elem_t,2> packer ;
int m_k = m , n_k = n , lda_k = lda, ldb_k = ldb, ldc_k = ldc ;
if (left_compress) {
m_k = DIVIDE_INTO(m,2)*2 ;
lda_k = m_k ;
ldc_k = n ;
A_k = FFLAS::fflas_new<double>(m_k*k) ;
//!@bug don't zero all, just the "border"
FFLAS::fzero(NoField,m_k,k,A_k,k);
B_k = const_cast<typename Field::Element *>(B) ;
pack_matrix<elem_t,elem_t,2,false>(A_k,m_k,k,lda_k,
A,m,k,lda,
packer);
}
else {
n_k = DIVIDE_INTO(n,2)*2 ;
ldb_k = n_k ;
ldc_k = n_k ;
A_k = const_cast<typename Field::Element *>(A) ;
B_k = FFLAS::fflas_new<double>(k*n_k) ;
//!@bug don't zero all, just the "border"
FFLAS::fzero(NoField,k,n_k,B_k,n_k);
pack_matrix<elem_t,elem_t,2,true>(B_k,k,n_k,ldb_k,
B,k,n,ldb,
packer);
}
C_k = FFLAS::fflas_new<double>(m_k*n_k) ;
//!@bug don't zero all, just the "border"
FFLAS::fzero(NoField,m_k,n_k,C_k,n_k);
pack_matrix<elem_t,elem_t,2,!left_compress>(C_k,m_k,n_k,ldc_k,
C,m,n,ldc,
packer);
#if 0
double * C_e = FFLAS::fflas_new<double>(m*ldc);
unpack_matrix<elem_t,elem_t,2,!left_compress>(C_e,m,n,ldc,
C_k,m_k,n_k,ldc_k,
packer);
int faux = 0 ;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j) {
if (! (C[i*ldc+j] == C_e[i*ldc+j]) ) {
++faux ;
}
}
}
if (faux) {
std::cout << "bad pack/unpack ; bad/all = " << faux << '/' << m*n << " ~~ " << (double)faux/(double)(m*n) << std::endl;
}
if (faux && (n<20)) {
std::cout << "IN " << std::endl;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j)
std::cout << C[i*ldc+j] << ' ';
std::cout << std::endl;
}
std::cout << "OUT" << std::endl;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j)
std::cout << C_e[i*ldc+j] << ' ';
std::cout << std::endl;
}
}
if (faux)
exit(-1);
#endif
Givaro::DoubleDomain G ;
fgemm(G,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,
m_k,n_k,k, 1, A_k,lda_k, B_k,ldb_k, 0, C_k, ldc_k);
// cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans,
// m_k,n_k,k, 1, A_k,lda_k, B_k,ldb_k, 0, C_k, ldc_k);
unpack_matrix<elem_t,elem_t,2,!left_compress>(C,m,n,ldc,
C_k,m_k,n_k,ldc_k,
packer);
if (left_compress)
FFLAS::fflas_delete(A_k);
else
FFLAS::fflas_delete(B_k);
FFLAS::fflas_delete(C_k);
}
}
namespace FFLAS { /* tools */
template<class Field>
void finit_fuzzy(Field & F, size_t m, size_t n, double * C, size_t ldc)
{
if (n == ldc)
// FFLAS::vectorised::modp<true,true>(C,C,m*n,p,invp,0,p-1);
FFLAS::vectorised::modp<Field,true>(F,C,m*n,C);
else
for (size_t i = 0 ; i < m ; ++i)
// FFLAS::vectorised::modp<true,true>(C+i*ldc,C+i*ldc,n,p,invp,0,p-1);
FFLAS::vectorised::modp<Field,true>(F,C+i*ldc,n,C+i*ldc);
}
// C = a*A + B
void add(const size_t m, const size_t n,
double a,
const double *A, const size_t lda,
const double *B, const size_t ldb,
double *C, const size_t ldc)
{
const double *Ai = A,*Bi = B;
double *Ci = C;
for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc)
for (size_t j = 0 ; j < n ; ++j)
Ci[j] = a * Ai[j] + Bi[j];
}
// C = C-(A+B)
void subadd(const size_t m, const size_t n,
const double *A, const size_t lda,
const double *B, const size_t ldb,
double *C, const size_t ldc)
{
const double *Ai = A,*Bi = B;
double *Ci = C;
for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc)
for (size_t j = 0 ; j < n ; ++j) {
Ci[j] = Ci[j] - Ai[j] - Bi[j] ;
}
}
// C = -(A+B)
void negadd(const size_t m, const size_t n,
const double *A, const size_t lda,
const double *B, const size_t ldb,
double *C, const size_t ldc)
{
const double *Ai = A,*Bi = B;
double *Ci = C;
for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc)
for (size_t j = 0 ; j < n ; ++j) {
Ci[j] = - Ai[j] - Bi[j] ;
}
}
// C = C+A-B
void addsub(const size_t m, const size_t n,
const double *A, const size_t lda,
const double *B, const size_t ldb,
double *C, const size_t ldc)
{
const double *Ai = A,*Bi = B;
double *Ci = C;
for (;Ai < A+m*lda ; Ai+=lda,Bi+=ldb,Ci+=ldc)
for (size_t j = 0 ; j < n ; ++j) {
Ci[j] = Ci[j] + Ai[j] - Bi[j] ;
}
}
// C = (C+B)/e
template<class Field>
void addscalinf(const Field & F, const size_t m, const size_t n,
const double *B, const size_t ldb,
double e,
double *C, const size_t ldc)
{
const double * Bi = B;
double * Ci = C;
for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb)
for (size_t j = 0 ; j < n ; ++j)
Ci[j]= (Ci[j]+Bi[j])*e ;
// F.init( Ci[j], (Ci[j]+Bi[j])/e );
}
// C = (C-B)/e
template<class Field>
void subscalinf(const Field & F, const size_t m, const size_t n,
const double *B, const size_t ldb,
double e,
double *C, const size_t ldc)
{
const double * Bi = B;
double * Ci = C;
for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb)
for (size_t j = 0 ; j < n ; ++j)
Ci[j]= (Ci[j]-Bi[j])*e ;
// F.init( Ci[j], (Ci[j]-Bi[j])/e );
}
// C = (D-B)/e
template<class Field>
void subscal(const Field & F, const size_t m, const size_t n,
const double *D, const size_t ldd,
const double *B, const size_t ldb,
double e,
double *C, const size_t ldc)
{
const double * Bi = B;
const double * Di = D;
double * Ci = C;
for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd)
for (size_t j = 0 ; j < n ; ++j)
Ci[j] = (Di[j]-Bi[j])*e ;
}
// C = (D+B)/e
template<class Field>
void addscal(const Field & F, const size_t m, const size_t n,
const double *D, const size_t ldd,
const double *B, const size_t ldb,
double e,
double *C, const size_t ldc)
{
const double * Bi = B;
const double * Di = D;
double * Ci = C;
for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd)
for (size_t j = 0 ; j < n ; ++j)
Ci[j] = (Di[j]+Bi[j])*e ;
}
// C = C + (D-B)/e
template<class Field>
void subscalacc(const Field & F, const size_t m, const size_t n,
const double *D, const size_t ldd,
const double *B, const size_t ldb,
double e,
double *C, const size_t ldc)
{
const double * Bi = B;
const double * Di = D;
double * Ci = C;
for (;Bi < B+m*ldb ; Ci+=ldc, Bi += ldb, Di += ldd)
for (size_t j = 0 ; j < n ; ++j)
Ci[j] += (Di[j]-Bi[j])*e ;
}
#ifndef TRE
// #ifndef NDEBUG
// #define TRE 1
// #else
#define TRE (size_t)(__FFLASFFPACK_WINOTHRESHOLD)
// #define TRE (size_t)(__FFLASFFPACK_WINOTHRESHOLD*0.9)
// #endif
#endif
template<class Field>
double * gemm_fflas(const Field & F,
const size_t m, const size_t n, const size_t k,
const double *A, size_t lda,
const double *B, size_t ldb,
double * C, size_t ldc,
int rec = 0)
{
Givaro::DoubleDomain R;
FFLAS::fgemm(R,
FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,
m,n,k,
1,
A,lda, B,ldb,
0,
C, ldc);
// cblas_dgemm(CblasRowMajor, CblasNoTrans,CblasNoTrans,
// m,n,k,1,A,lda,B,ldb,0,C,ldc);
return C;
}
} // FFLAS
namespace FFLAS { namespace Protected { namespace Rec {
// Field must be Givaro::Modular<double>
template<class Field>
double *
gemm_bini_322_0(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// std::cout << rec << ',' << M << std::endl;
// Field G(p*p);
if ( M < TRE || rec <= 0) {
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/2*2==k); // k divisible par 2
assert(n/2*2==n); // n divisible par 2
assert(m/3*3==m); // m divisible par 3
size_t n2 = n/2;
size_t k2 = k/2;
size_t m3 = m/3;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k2;
const double * A21 = A +lda*m3;
const double * A22 = A21 +k2;
const double * A31 = A21 +lda*m3;
const double * A32 = A31 +k2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n2;
double * C21 = C +ldc*m3;
double * C22 = C21 +n2;
double * C31 = C21 +ldc*m3;
double * C32 = C31 +n2;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n2;
const double * B21 = B +ldb*k2;
const double * B22 = B21 +n2;
FFLAS::fzero(NoField,m,n,C,ldc);
/*
* Algo :
* S1 := A11 +A22;
* S4 := e*A12+A22;
* S5 := A11 +e*A12;
* S6 := A21 +A32;
* S9 := A21 +e*A31;
* S10 := e*A31+A32;
*
* T1 := e*B11 +B22;
* T2 := B21 +B22;
* T4 := -e*B11+B21;
* T5 := e*B12 +B22;
* T6 := B11 +e*B22;
* T7 := B11 +B12;
* T9 := B12 -e*B22;
* T10 := B11 +e*B21;
*
* P1 := S1 *T1;
* P2 := A22*T2;
* P3 := A11*B22;
* P4 := S4 *T4;
* P5 := S5 *T5;
* P6 := S6 *T6;
* P7 := A21*T7;
* P8 := A32*B11;
* P9 := S9 *T9;
* P10:= S10*T10;
*
* C11 := (P1-P2-P3+P4)/e;
* C12 := (P3-P5)/(-e) ;
* C21 := P4+P6-P10 ;
* C22 := P1-P5+P9;
* C31 := (-P8+P10)/e;
* C32 := (P6-P7-P8+P9)/e;
*
*/
double * S1 = FFLAS::fflas_new<double>(m3*k2) ;
// double * C11t = FFLAS::fflas_new<double>(n2*m3) ;
// S1 := A11 +A22;
FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,S1,k2);
// T1 := e*B11 +B22;
double * T1 = FFLAS::fflas_new<double>(n2*k2) ; // ou aire
add(k2,n2,epsilon,B11,ldb,B22,ldb,T1,n2);
// P1 := S1 *T1; (dans C22)
gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C22,ldc,rec-1,epsilon);
// S4 := e*A12+A22;
double * eA12 = FFLAS::fflas_new<double >(m3*k2);
FFLAS::fscal(NoField,m3,k2,epsilon,A12,lda,eA12,k2) ;
FFLAS::fadd(NoField,m3,k2,eA12,k2,A22,lda,S1,k2);
// T4 := -e*B11+B21;
add(k2,n2,-epsilon,B11,ldb,B21,ldb,T1,n2);
// P4 := S4 *T4; (dans C21)
gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C21,ldc,rec-1,epsilon);
// C11 = P1+P4
FFLAS::fadd(NoField,m3,n2,C21,ldc,C22,ldc,C11,ldc);
// T2 := B21 +B22;
FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,T1,n2);
// P2 := A22*T2;
double * P1 = FFLAS::fflas_new<double>(n2*m3) ; // ou aire
gemm_bini_322_0(F,m3,n2,k2,A22,lda,T1,n2,P1,n2,rec-1,epsilon);
// P3 := A11*B22; (dans C12)
gemm_bini_322_0(F,m3,n2,k2,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon);
// C11 -= (P2+P3)
subadd(m3,n2,P1,n2,C12,ldc,C11,ldc);
// S5 := A11 +e*A12;
FFLAS::fadd(NoField,m3,k2,eA12,k2,A11,lda,S1,k2);
// T5 := e*B12 +B22;
add(k2,n2,epsilon,B12,ldb,B22,ldb,T1,n2);
// P5 := S5 *T5;
double * P2 = FFLAS::fflas_new<double>(n2*m3) ; // ou aire
gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P2,n2,rec-1,epsilon);
// C12 -= P5
subscalinf(NoField,m3,n2,P2,n2,-(double)1/epsilon,C12,ldc);
// S6 := A21 +A32;
FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,S1,k2);
// T6 := B11 +e*B22;
add(k2,n2,epsilon,B22,ldb,B11,ldb,T1,n2);
// P6 := S6 *T6; dans C32
gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,C32,ldc,rec-1,epsilon);
// C21+= P6
FFLAS::faddin(NoField,m3,n2,C32,ldc,C21,ldc);
// T7 := B11 +B12;
FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,T1,n2);
// P7 := A21*T7; !signe
gemm_bini_322_0(F,m3,n2,k2,A21,lda,T1,n2,P1,n2,rec-1,epsilon);
// P8 := A32*B11; dans C31 !signe
gemm_bini_322_0(F,m3,n2,k2,A32,lda,B11,ldb,C31,ldc,rec-1,epsilon);
// C32 -= P8+P7
subadd(m3,n2,P1,n2,C31,ldc,C32,ldc);
// S9 := A21 +e*A31;
double * eA31 = eA12 ;
FFLAS::fscal(NoField,m3,k2,epsilon,A31,lda,eA31,k2);
FFLAS::fadd(NoField,m3,k2,eA31,k2,A21,lda,S1,k2);
// T9 := B12 -e*B22;
add(k2,n2,-epsilon,B22,ldb,B12,ldb,T1,n2);
// P9 := S9 *T9;
gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P1,n2,rec-1,epsilon);
// C32= (C32+P9)/p
addscalinf(NoField,m3,n2,P1,n2,(double)1/epsilon,C32,ldc);
// C22+= P9-P5
addsub(m3,n2,P1,n2,P2,n2,C22,ldc);
FFLAS::fflas_delete( P2);
// S10 := e*A31+A32;
FFLAS::fadd(NoField,m3,k2,eA31,k2,A32,lda,S1,k2);
FFLAS::fflas_delete( eA12 );
// T10 := B11 +e*B21;
add(k2,n2,epsilon,B21,ldb,B11,ldb,T1,n2);
// P10:= S10*T10;
gemm_bini_322_0(F,m3,n2,k2,S1,k2,T1,n2,P1,n2,rec-1,epsilon);
FFLAS::fflas_delete( S1);
FFLAS::fflas_delete( T1);
// C21-= P10
FFLAS::fsubin(NoField,m3,n2,P1,n2,C21,ldc);
// C31= (C31-P10)/(-epsilon)
subscalinf(NoField,m3,n2,P1,n2,-(double)1/epsilon,C31,ldc);
FFLAS::fflas_delete( P1);
// C11 := (P1+P-P3+P4)/e;
FFLAS::fscalin(NoField,m3,n2,(double)1/epsilon,C11,ldc);
return C;
}
// Field must be Givaro::Modular<double>
template<class Field>
double *
gemm_bini_322_mem(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// std::cout << rec << ',' << M << std::endl;
// Field G(p*p);
if ( M < TRE || rec <= 0) {
// std::cout << "ffw" << std::endl;
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
// return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/2*2==k); // k divisible par 2
assert(n/2*2==n); // n divisible par 2
assert(m/3*3==m); // m divisible par 3
// std::cout << "tested" << std::endl;
size_t n2 = n/2;
size_t k2 = k/2;
size_t m3 = m/3;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k2;
const double * A21 = A +lda*m3;
const double * A22 = A21 +k2;
const double * A31 = A21 +lda*m3;
const double * A32 = A31 +k2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n2;
double * C21 = C +ldc*m3;
double * C22 = C21 +n2;
double * C31 = C21 +ldc*m3;
double * C32 = C31 +n2;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n2;
const double * B21 = B +ldb*k2;
const double * B22 = B21 +n2;
FFLAS::fzero(F,m,n,C,ldc);
/*
* Algo :
* S1 := A11 +A22;
* S4 := e*A12+A22;
* S5 := A11 +e*A12;
* S6 := A21 +A32;
* S9 := A21 +e*A31;
* S3 := e*A31+A32;
*
* T1 := e*B11 +B22;
* T2 := B21 +B22;
* T4 := -e*B11+B21;
* T5 := e*B12 +B22;
* T6 := B11 +e*B22;
* T7 := B11 +B12;
* T9 := B12 -e*B22;
* T3 := B11 +e*B21;
*
* P1 := S1 *T1;
* P2 := A22*T2;
* P10 := A11*B22;
* P4 := S4 *T4;
* P5 := S5 *T5;
* P6 := S6 *T6;
* P7 := A21*T7;
* P8 := A32*B11;
* P9 := S9 *T9;
* P3:= S3*T3;
*
* C11 := (P1-P2-P10+P4)/e;
* C12 := (P10-P5)/(-e) ;
* C21 := P4+P6-P3 ;
* C22 := P1-P5+P9;
* C31 := (-P8+P3)/e;
* C32 := (P6-P7-P8+P9)/e;
*
*/
// P10
gemm_bini_322_mem(F,m3,n2,k2,A11,lda,B22,ldb,C11,ldc,rec-1,epsilon);
// S5
double * X = FFLAS::fflas_new<double>(m3*k2);
add(m3,k2,epsilon,A12,lda,A11,lda,X,k2);
// T5
// double * Y = FFLAS::fflas_new<double>(std::max(k2,m3)*n2);
double * Y = FFLAS::fflas_new<double>(k2*n2);
add(k2,n2,epsilon,B12,ldb,B22,ldb,Y,n2);
// P5
gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C22,ldc,rec-1,epsilon);
// C12
subscal(NoField,m3,n2,C22,ldc,C11,ldc,(double)1/epsilon,C12,ldc);
// T2
FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,Y,n2);
// P2
gemm_bini_322_mem(F,m3,n2,k2,A22,lda,Y,n2,C31,ldc,rec-1,epsilon);
// C11
FFLAS::faddin(NoField,m3,n2,C31,ldc,C11,ldc);
// S1
FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,X,k2);
// T1
add(k2,n2,epsilon,B11,ldb,B22,ldb,Y,n2);
// P1
gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C21,ldc,rec-1,epsilon);
// C22
FFLAS::fsub(NoField,m3,n2,C21,ldc,C22,ldc,C22,ldc);
// C11
FFLAS::fsub(NoField,m3,n2,C21,ldc,C11,ldc,C11,ldc);
// S4
add(m3,k2,epsilon,A12,lda,A22,lda,X,k2);
// T4
add(k2,n2,-epsilon,B11,ldb,B21,ldb,Y,n2);
// P4
gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C21,ldc,rec-1,epsilon);
// C11
addscalinf(NoField,m3,n2,C21,ldc,(double)1/epsilon,C11,ldc);
// S9
add(m3,k2,epsilon,A31,lda,A21,lda,X,k2);
// T9
add(k2,n2,-epsilon,B22,ldb,B12,ldb,Y,n2);
// P9
gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C32,ldc,rec-1,epsilon);
// C22
FFLAS::faddin(NoField,m3,n2,C32,ldc,C22,ldc);
// S6
FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,X,k2);
// T6
add(k2,n2,epsilon,B22,ldb,B11,ldb,Y,n2);
// P6
gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon);
// C21
FFLAS::faddin(NoField,m3,n2,C31,ldc,C21,ldc);
// C32
FFLAS::faddin(NoField,m3,n2,C31,ldc,C32,ldc);
// T7
FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,Y,n2);
// P7
gemm_bini_322_mem(F,m3,n2,k2,A21,lda,Y,n2,C31,ldc,rec-1,epsilon);
// if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m3,n2,C31,ldc);}
// C32
FFLAS::fsubin(NoField,m3,n2,C31,ldc,C32,ldc);
// S3
add(m3,k2,epsilon,A31,lda,A32,lda,X,k2);
// T3
add(k2,n2,epsilon,B21,ldb,B11,ldb,Y,n2);
// P3
gemm_bini_322_mem(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon);
FFLAS::fflas_delete( X);
FFLAS::fflas_delete( Y );
// C21
FFLAS::fsubin(NoField,m3,n2,C31,ldc,C21,ldc);
// P8
Y = FFLAS::fflas_new<double>(m3*n2);
gemm_bini_322_mem(F,m3,n2,k2,A32,lda,B11,ldb,Y,n2,rec-1,epsilon);
// C31
subscalinf(NoField,m3,n2,Y,n2,(double)1/epsilon,C31,ldc);
// FFLAS::fsubin(NoField,m3,n2,Y,n2,C31,ldc);
// C32
subscalinf(NoField,m3,n2,Y,n2,(double)1/epsilon,C32,ldc);
// FFLAS::fsubin(NoField,m3,n2,Y,n2,C32,ldc);
// FFLAS::fscalin(NoField,m3,n,(double)1/epsilon,C31,ldc);
FFLAS::fflas_delete( Y );
return C;
}
// Field must be Givaro::Modular<double>
template<class Field>
double *
gemm_bini_223_mem(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// std::cout << rec << ',' << M << std::endl;
// Field G(p*p);
if ( M < TRE || rec <= 0) {
// std::cout << "ffw" << std::endl;
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
// return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/2*2==k); // k divisible par 2
assert(n/3*3==n); // n divisible par 2
assert(m/2*2==m); // m divisible par 3
// std::cout << "tested" << std::endl;
size_t m2 = m/2;
size_t k2 = k/2;
size_t n3 = n/3;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k2;
const double * A21 = A +lda*m2;
const double * A22 = A21 +k2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n3;
double * C13 = C +2*n3;
double * C21 = C +ldc*m2;
double * C22 = C21 +n3;
double * C23 = C21 +2*n3;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n3;
const double * B13 = B +2*n3;
const double * B21 = B +ldb*k2;
const double * B22 = B21 +n3;
const double * B23 = B21 +2*n3;
FFLAS::fzero(F,m,n,C,ldc);
/*
* Algo :
* S1 := B11 +B22;
* S4 := e*B21+B22;
* S5 := B11 +e*B21;
* S6 := B12 +B23;
* S9 := B12 +e*B13;
* S3 := e*B13+B23;
*
* T1 := e*A11 +A22;
* T2 := A12 +A22;
* T4 := -e*A11+A12;
* T5 := e*A21 +A22;
* T6 := A11 +e*A22;
* T7 := A11 +A21;
* T9 := A21 -e*A22;
* T3 := A11 +e*A12;
*
* P1 := S1 *T1;
* P2 := T2 * B22;
* P10 := A22 * B11;
* P4 := S4 *T4;
* P5 := S5 *T5;
* P6 := S6 *T6;
* P7 := T7*B12;
* P8 := A11*B23;
* P9 := S9 *T9;
* P3 := S3*T3;
*
* C11 := (P1-P2-P10+P4)/e;
* C21 := (P10-P5)/(-e) ;
* C12 := P4+P6-P3 ;
* C22 := P1-P5+P9;
* C13 := (-P8+P3)/e;
* C23 := (P6-P7-P8+P9)/e;
*
*/
// P10
gemm_bini_223_mem(F,m2,n3,k2,A22,lda,B11,ldb,C11,ldc,rec-1,epsilon);
// S5
double * Y = FFLAS::fflas_new<double>(k2*n3);
add(k2,n3,epsilon,B21,ldb,B11,ldb,Y,n3);
// T5
double * X = FFLAS::fflas_new<double>(m2*k2);
add(m2,k2,epsilon,A21,lda,A22,lda,X,k2);
// P5
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C22,ldc,rec-1,epsilon);
// C12
subscal(NoField,m2,n3,C22,ldc,C11,ldc,(double)1/epsilon,C21,ldc);
// T2
FFLAS::fadd(NoField,m2,k2,A12,lda,A22,lda,X,k2);
// P2
gemm_bini_223_mem(F,m2,n3,k2,X,k2,B22,ldb,C13,ldc,rec-1,epsilon);
// C11
FFLAS::faddin(NoField,m2,n3,C13,ldc,C11,ldc);
// S1
FFLAS::fadd(NoField,k2,n3,B11,ldb,B22,ldb,Y,n3);
// T1
add(m2,k2,epsilon,A11,lda,A22,lda,X,k2);
// P1
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon);
// C22
FFLAS::fsub(NoField,m2,n3,C12,ldc,C22,ldc,C22,ldc);
// C11
FFLAS::fsub(NoField,m2,n3,C12,ldc,C11,ldc,C11,ldc);
// S4
add(k2,n3,epsilon,B21,ldb,B22,ldb,Y,n3);
// T4
add(m2,k2,-epsilon,A11,lda,A12,lda,X,k2);
// P4
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon);
// C11
addscalinf(NoField,m2,n3,C12,ldc,(double)1/epsilon,C11,ldc);
// S9
add(k2,n3,epsilon,B13,ldb,B12,ldb,Y,n3);
// T9
add(m2,k2,-epsilon,A22,lda,A21,lda,X,k2);
// P9
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C23,ldc,rec-1,epsilon);
// C22
FFLAS::faddin(NoField,m2,n3,C23,ldc,C22,ldc);
// S6
FFLAS::fadd(NoField,k2,n3,B12,ldb,B23,ldb,Y,n3);
// T6
add(m2,k2,epsilon,A22,lda,A11,lda,X,k2);
// P6
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon);
// C21
FFLAS::faddin(NoField,m2,n3,C13,ldc,C12,ldc);
// C32
FFLAS::faddin(NoField,m2,n3,C13,ldc,C23,ldc);
// T7
FFLAS::fadd(NoField,m2,k2,A11,lda,A21,lda,X,k2);
// P7
gemm_bini_223_mem(F,m2,n3,k2,X,k2,B12,ldb,C13,ldc,rec-1,epsilon);
// if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m2,n3,C31,ldc);}
// C32
FFLAS::fsubin(NoField,m2,n3,C13,ldc,C23,ldc);
// S3
add(k2,n3,epsilon,B13,ldb,B23,ldb,Y,n3);
// T3
add(m2,k2,epsilon,A12,lda,A11,lda,X,k2);
// P3
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon);
FFLAS::fflas_delete( Y );
FFLAS::fflas_delete( X );
// C21
FFLAS::fsubin(NoField,m2,n3,C13,ldc,C12,ldc);
// P8
Y = FFLAS::fflas_new<double>(m2*n3);
gemm_bini_223_mem(F,m2,n3,k2,A11,lda,B23,ldb,Y,n3,rec-1,epsilon);
// C31
subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C13,ldc);
// C32
subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C23,ldc);
FFLAS::fflas_delete( Y );
return C;
}
// Field must be Givaro::Modular<double>
template<class Field>
double *
gemm_bini_322_2(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// std::cout << rec << ',' << M << std::endl;
// Field G(p*p);
if ( M < TRE || rec <= 0) {
// std::cout << "ffw" << std::endl;
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
// return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/2*2==k); // k divisible par 2
assert(n/2*2==n); // n divisible par 2
assert(m/3*3==m); // m divisible par 3
// std::cout << "tested" << std::endl;
size_t n2 = n/2;
size_t k2 = k/2;
size_t m3 = m/3;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k2;
const double * A21 = A +lda*m3;
const double * A22 = A21 +k2;
const double * A31 = A21 +lda*m3;
const double * A32 = A31 +k2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n2;
double * C21 = C +ldc*m3;
double * C22 = C21 +n2;
double * C31 = C21 +ldc*m3;
double * C32 = C31 +n2;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n2;
const double * B21 = B +ldb*k2;
const double * B22 = B21 +n2;
FFLAS::fzero(F,m,n,C,ldc);
/*
* Algo :
* S1 := A11 +A22;
* S4 := e*A12+A22;
* S5 := A11 +e*A12;
* S3 := e*A31+A32;
* S6 := A21 +A32;
* S9 := A21 +e*A31;
*
* T1 := e*B11 +B22;
* T2 := B21 +B22;
* T3 := B11 +e*B21;
* T4 := -e*B11+B21;
* T5 := e*B12 +B22;
* T6 := B11 +e*B22;
* T7 := B11 +B12;
* T9 := B12 -e*B22;
*
* P1 := S1 *T1;
* P2 := A22*T2;
* P10 := A11*B22;
* P4 := S4 *T4;
* P5 := S5 *T5;
* P6 := S6 *T6;
* P7 := A21*T7;
* P8 := A32*B11;
* P9 := S9 *T9;
* P3:= S3*T3;
*
* C11 := (P1-P2-P10+P4)/e;
* C12 := (P10-P5)/(-e) ;
* C21 := P4+P6-P3 ;
* C22 := P1-P5+P9;
* C31 := (-P8+P3)/e;
* C32 := (P6-P7-P8+P9)/e;
*
*/
double * U = FFLAS::fflas_new<double>(m3*n2);
double * V = FFLAS::fflas_new<double>(m3*n2);
double * X = FFLAS::fflas_new<double>(m3*std::max(k2,n2));
double * Y = FFLAS::fflas_new<double>(std::max(k2,m3)*n2);
// S4
add(m3,k2,epsilon,A12,lda,A22,lda,X,k2);
// T4
add(k2,n2,-epsilon,B11,ldb,B21,ldb,Y,n2);
// P4
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon);
// S9
add(m3,k2,epsilon,A31,lda,A21,lda,X,k2);
// T9
add(k2,n2,-epsilon,B22,ldb,B12,ldb,Y,n2);
// P9
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,V,n2,rec-1,epsilon);
// S5
add(m3,k2,epsilon,A12,lda,A11,lda,X,k2);
// T5
add(k2,n2,epsilon,B12,ldb,B22,ldb,Y,n2);
// P5
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,C12,ldc,rec-1,epsilon);
// S3
add(m3,k2,epsilon,A31,lda,A32,lda,X,k2);
// T3
add(k2,n2,epsilon,B21,ldb,B11,ldb,Y,n2);
// P3
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,C31,ldc,rec-1,epsilon);
// C22 = P9-P5
FFLAS::fsub(NoField,m3,n2,V,n2,C12,ldc,C22,ldc);
// C21 = P4-P3
FFLAS::fsub(NoField,m3,n2,U,n2,C31,ldc,C21,ldc);
// T2
FFLAS::fadd(NoField,k2,n2,B21,ldb,B22,ldb,Y,n2);
// P2
gemm_bini_322_2(F,m3,n2,k2,A22,lda,Y,n2,X,n2,rec-1,epsilon);
// XXX approximate
// C11 = (P4 - P2) / e
subscal(NoField,m3,n2,U,n2,X,n2,1./epsilon,C11,ldc);
// T7
FFLAS::fadd(NoField,k2,n2,B11,ldb,B12,ldb,Y,n2);
// P7
gemm_bini_322_2(F,m3,n2,k2,A21,lda,Y,n2,X,n2,rec-1,epsilon);
// XXX approximate
// C32 = (P9-P7) / e
subscal(NoField,m3,n2,V,n2,X,n2,1./epsilon,C32,ldc);
// S1
FFLAS::fadd(NoField,m3,k2,A11,lda,A22,lda,X,k2);
// T1
add(k2,n2,epsilon,B11,ldb,B22,ldb,Y,n2);
// P1
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon);
// C22 += P1
FFLAS::faddin(NoField,m3,n2,U,n2,C22,ldc);
// P10
gemm_bini_322_2(F,m3,n2,k2,A11,lda,B22,ldb,V,n2,rec-1,epsilon);
// C12 = (P5-P10)/e
subscalinf(NoField,m3,n2,V,n2,1./epsilon,C12,ldc);
// XXX approximate
// C11 = C11 + (P1-P10)/e
subscalacc(NoField,m3,n2,U,n2,V,n2,1./epsilon,C11,ldc);
// S6
FFLAS::fadd(NoField,m3,k2,A21,lda,A32,lda,X,k2);
// T6
add(k2,n2,epsilon,B22,ldb,B11,ldb,Y,n2);
// P6
gemm_bini_322_2(F,m3,n2,k2,X,k2,Y,n2,U,n2,rec-1,epsilon);
// C21 += P6
FFLAS::faddin(NoField,m3,n2,U,n2,C21,ldc);
// P8
gemm_bini_322_2(F,m3,n2,k2,A32,lda,B11,ldb,V,n2,rec-1,epsilon);
// C31 = (P3-P8)/2
subscalinf(NoField,m3,n2,V,n2,1./epsilon,C31,ldc);
// XXX approximate
// C32 = C32 + (P6-P8)/e
subscalacc(NoField,m3,n2,U,n2,V,n2,1./epsilon,C32,ldc);
FFLAS::fflas_delete( X);
FFLAS::fflas_delete( Y );
FFLAS::fflas_delete( U);
FFLAS::fflas_delete( V);
return C;
}
// Field must be Givaro::Modular<double>
template<class Field>
double *
gemm_bini_232_2(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// Field G(p*p);
if ( M < TRE || rec <= 0) {
// std::cout << "ffw" << std::endl;
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
// return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/3*3==k); // k divisible par 3
assert(n/2*2==n); // n divisible par 2
assert(m/2*2==m); // m divisible par 2
// std::cout << "tested" << std::endl;
size_t n2 = n/2;
size_t k3 = k/3;
size_t m2 = m/2;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n2;
const double * B21 = B +ldb*k3;
const double * B22 = B21 +n2;
const double * B31 = B21 +ldb*k3;
const double * B32 = B31 +n2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n2;
double * C21 = C +ldc*m2;
double * C22 = C21 +n2;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k3;
const double * A13 = A +2*k3;
const double * A21 = A +lda*m2;
const double * A22 = A21 +k3;
const double * A23 = A21 +2*k3;
FFLAS::fzero(F,m,n,C,ldc);
/*
* Algo :
*
* S1 := A11 +A22*e;
* S3 := -(A11+A21);
* S4 := A11+A12*e;
* S5 := A21 - A22*e;
* S6 := A12*e + A23;
* S8 := -(A13+A23):
* S9 := A22*e + A23;
* S10 := -A12*e+A13;
*
* T1 := B11 +B22;
* T4 := e*B12+B22;
* T5 := B11 +e*B12;
* T6 := B21 +B32;
* T9 := B21 + e*B31;
* T10 := e*B31 +B32;
*
* P1 := Bini232(S1,T1 ,e);
* P2 := Bini232(A11,B22 ,e);
* P3 := Bini232(S3,B11,e);
* P4 := Bini232(S4,T4 ,e);
* P5 := Bini232(S5,T5 ,e);
* P6 := Bini232(S6,T6 ,e);
* P7 := Bini232(A23,B21 ,e);
* P8 := Bini232(S8,B32,e);
* P9 := Bini232(S9,T9 ,e);
* P10:= Bini232(S10,T10,e);
*
*
* C11 := evalm(P1-P4+(P6-P7+P8+P10)/e);
* C12 := evalm((-P2+P4)/e+P10) ;
* C21 := evalm(P5+(-P7+P9)/e) ;
* C22 := evalm((P1-P2+P3+P5)/e+P6-P9);
*
*/
double * U = FFLAS::fflas_new<double>(m2*n2);
double * V = FFLAS::fflas_new<double>(m2*n2);
double * X = FFLAS::fflas_new<double>(m2*k3);
double * Y = FFLAS::fflas_new<double>(k3*n2);
// S1
add(m2,k3,epsilon,A22,lda,A11,lda,X,k3);
// T1
FFLAS::fadd(NoField,k3,n2,B11,ldb,B22,ldb,Y,n2);
// P1 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// S3
negadd(m2,k3,A11,lda,A21,lda,X,k3);
// P3 (in V)
gemm_bini_232_2(F,m2,n2,k3,X,k3,B11,ldb,V,n2,rec-1,epsilon);
// C22 = (P1+P3)/e
// FFLAS::fadd(NoField,m2,n2,U,n2,V,n2,C22,ldc); // XXX acc
addscal(NoField,m2,n2,U,n2,V,n2,(double)1/epsilon,C22,ldc);
// S6
add(m2,k3,epsilon,A12,lda,A23,lda,X,k3);
// T6
FFLAS::fadd(NoField,k3,n2,B21,ldb,B32,ldb,Y,n2);
// P6 (in V)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon);
// C22 += P6
FFLAS::faddin(NoField,m2,n2,V,n2,C22,ldc);
// S8
negadd(m2,k3,A13,lda,A23,lda,X,k3);
// P8 (in C11)
gemm_bini_232_2(F,m2,n2,k3,X,k3,B32,ldb,C11,ldc,rec-1,epsilon);
// C11 = (P8+P6)/e
addscalinf(NoField,m2,n2,V,n2,(double)1/epsilon,C11,ldc);
// C11 += P1
FFLAS::faddin(NoField,m2,n2,U,n2,C11,ldc);
// S4
add(m2,k3,epsilon,A12,lda,A11,lda,X,k3);
// T4
add(k3,n2,epsilon,B12,ldb,B22,ldb,Y,n2);
// P4 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// C11 -= P4
FFLAS::fsubin(NoField,m2,n2,U,n2,C11,ldc);
// P2 (in C12)
gemm_bini_232_2(F,m2,n2,k3,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon);
// S5
add(m2,k3,-epsilon,A22,lda,A21,lda,X,k3);
// T5
add(k3,n2,epsilon,B12,ldb,B11,ldb,Y,n2);
// P5 (in V)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon);
// C22 += (P5-P2)/e
subscalacc(NoField,m2,n2,V,n2,C12,ldc,(double)1/epsilon,C22,ldc);
// C12 = (P4-P2)/e
subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C12,ldc);
// S9
add(m2,k3,epsilon,A22,lda,A23,lda,X,k3);
// T9
add(k3,n2,epsilon,B31,ldb,B21,ldb,Y,n2);
// P9 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// C22 -= P9
FFLAS::fsubin(NoField,m2,n2,U,n2,C22,ldc);
// P7 (in C21)
gemm_bini_232_2(F,m2,n2,k3,A23,lda,B21,ldb,C21,ldc,rec-1,epsilon);
// C11 = C11 - P7/e
add(m2,n2,-(double)1/epsilon,C21,ldc,C11,ldc,C11,ldc);
// C21 = (P9-P7)/e
subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C21,ldc);
// C21 += P5
FFLAS::faddin(NoField,m2,n2,V,n2,C21,ldc);
// S10
add(m2,k3,-epsilon,A12,lda,A13,lda,X,k3);
// T10
add(k3,n2,epsilon,B31,ldb,B32,ldb,Y,n2);
// P10 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// C12 += P10
FFLAS::faddin(NoField,m2,n2,U,n2,C12,ldc);
// C11 += P10/e
add(m2,n2,(double)1/epsilon,U,n2,C11,ldc,C11,ldc);
FFLAS::fflas_delete( X );
FFLAS::fflas_delete( Y );
FFLAS::fflas_delete( U );
FFLAS::fflas_delete( V );
return C;
}
template<class Field>
double *
gemm_bini_232_3_acc(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
if (rec != 0)
exit(-1);
Givaro::DoubleDomain R;
FFLAS::fgemm(R,
FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,
m,n,k,
1,
A,lda, B,ldb,
1,
C, ldc);
}
template<class Field>
double *
gemm_bini_232_3(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// Field G(p*p);
if ( M < TRE || rec <= 0) {
// std::cout << "ffw" << std::endl;
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
// return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/3*3==k); // k divisible par 3
assert(n/2*2==n); // n divisible par 2
assert(m/2*2==m); // m divisible par 2
// std::cout << "tested" << std::endl;
size_t n2 = n/2;
size_t k3 = k/3;
size_t m2 = m/2;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n2;
const double * B21 = B +ldb*k3;
const double * B22 = B21 +n2;
const double * B31 = B21 +ldb*k3;
const double * B32 = B31 +n2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n2;
double * C21 = C +ldc*m2;
double * C22 = C21 +n2;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k3;
const double * A13 = A +2*k3;
const double * A21 = A +lda*m2;
const double * A22 = A21 +k3;
const double * A23 = A21 +2*k3;
FFLAS::fzero(F,m,n,C,ldc);
/*
* Algo :
*
* S1 := A11 +A22*e;
* S3 := -(A11+A21);
* S4 := A11+A12*e;
* S5 := A21 - A22*e;
* S6 := A12*e + A23;
* S8 := -(A13+A23):
* S9 := A22*e + A23;
* S10 := -A12*e+A13;
*
* T1 := B11 +B22;
* T4 := e*B12+B22;
* T5 := B11 +e*B12;
* T6 := B21 +B32;
* T9 := B21 + e*B31;
* T10 := e*B31 +B32;
*
* P1 := Bini232(S1,T1 ,e);
* P2 := Bini232(A11,B22 ,e);
* P3 := Bini232(S3,B11,e);
* P4 := Bini232(S4,T4 ,e);
* P5 := Bini232(S5,T5 ,e);
* P6 := Bini232(S6,T6 ,e);
* P7 := Bini232(A23,B21 ,e);
* P8 := Bini232(S8,B32,e);
* P9 := Bini232(S9,T9 ,e);
* P10:= Bini232(S10,T10,e);
*
*
* C11 := evalm(P1-P4+(P6-P7+P8+P10)/e);
* C12 := evalm((-P2+P4)/e+P10) ;
* C21 := evalm(P5+(-P7+P9)/e) ;
* C22 := evalm((P1-P2+P3+P5)/e+P6-P9);
*
*/
// could be just one band for the scalings
double * U = FFLAS::fflas_new<double>(m2*n2);
double * V = FFLAS::fflas_new<double>(std::max(k3,m2)*n2);
double * X = FFLAS::fflas_new<double>(m2*k3);
double * Y = FFLAS::fflas_new<double>(k3*n2);
// S1
double * eA22 = FFLAS::fflas_new<double>(std::max(m2,n2)*k3);
FFLAS::fscal(NoField,m2,k3,epsilon,A22,lda,eA22,k3);
FFLAS::fadd(NoField,m2,k3,eA22,k3,A11,lda,X,k3);
// T1
FFLAS::fadd(NoField,k3,n2,B11,ldb,B22,ldb,Y,n2);
// P1 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// S3
negadd(m2,k3,A11,lda,A21,lda,X,k3);
// P3 (in V)
gemm_bini_232_2(F,m2,n2,k3,X,k3,B11,ldb,V,n2,rec-1,epsilon);
// C22 = (P1+P3)/e
addscal(NoField,m2,n2,U,n2,V,n2,(double)1/epsilon,C22,ldc);
// S6
double * eA12 = FFLAS::fflas_new<double>(m2*k3);
FFLAS::fscal(NoField,m2,k3,epsilon,A12,lda,eA12,k3);
FFLAS::fadd(NoField,m2,k3,eA12,k3,A23,lda,X,k3);
// T6
FFLAS::fadd(NoField,k3,n2,B21,ldb,B32,ldb,Y,n2);
// P6 (in V)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon);
// C22 += P6
FFLAS::faddin(NoField,m2,n2,V,n2,C22,ldc);
// S8
negadd(m2,k3,A13,lda,A23,lda,X,k3);
// P8 (in C11)
gemm_bini_232_2(F,m2,n2,k3,X,k3,B32,ldb,C11,ldc,rec-1,epsilon);
// C11 = (P8+P6)/e
addscalinf(NoField,m2,n2,V,n2,(double)1/epsilon,C11,ldc);
// C11 += P1
FFLAS::faddin(NoField,m2,n2,U,n2,C11,ldc);
// S4
FFLAS::fadd(NoField,m2,k3,eA12,k3,A11,lda,X,k3);
// T4
double * eB12 = V ; // FFLAS::fflas_new<double>(n2*k3);
FFLAS::fscal(NoField,k3,n2,epsilon,B12,ldb,eB12,n2);
FFLAS::fadd(NoField,k3,n2,eB12,n2,B22,ldb,Y,n2);
// P4 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// C11 -= P4
FFLAS::fsubin(NoField,m2,n2,U,n2,C11,ldc);
// P2 (in C12)
gemm_bini_232_2(F,m2,n2,k3,A11,lda,B22,ldb,C12,ldc,rec-1,epsilon);
// S5
FFLAS::fsub(NoField,m2,k3,A21,lda,eA22,k3,X,k3);
// T5
FFLAS::fadd(NoField,k3,n2,eB12,n2,B11,ldb,Y,n2);
// FFLAS::fflas_delete( eB12);
// P5 (in V)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,V,n2,rec-1,epsilon);
// C22 += (P5-P2)/e
subscalacc(NoField,m2,n2,V,n2,C12,ldc,(double)1/epsilon,C22,ldc);
// C12 = (P4-P2)/e
subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C12,ldc);
// S9
FFLAS::fadd(NoField,m2,k3,eA22,k3,A23,lda,X,k3);
double * eB31 = eA22 ;
FFLAS::fscal(NoField,k3,n2,epsilon,B31,ldb,eB31,n2);
// T9
FFLAS::fadd(NoField,k3,n2,eB31,n2,B21,ldb,Y,n2);
// P9 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// C22 -= P9
FFLAS::fsubin(NoField,m2,n2,U,n2,C22,ldc);
// P7 (in C21)
gemm_bini_232_2(F,m2,n2,k3,A23,lda,B21,ldb,C21,ldc,rec-1,epsilon);
// C11 = C11 - P7/e
add(m2,n2,-(double)1/epsilon,C21,ldc,C11,ldc,C11,ldc);
// C21 = (P9-P7)/e
subscalinf(NoField,m2,n2,U,n2,-(double)1/epsilon,C21,ldc);
// C21 += P5
FFLAS::faddin(NoField,m2,n2,V,n2,C21,ldc);
// S10
FFLAS::fsub(NoField,m2,k3,A13,lda,eA12,k3,X,k3);
FFLAS::fflas_delete( eA12);
// T10
FFLAS::fadd(NoField,k3,n2,eB31,n2,B32,ldb,Y,n2);
FFLAS::fflas_delete( eA22);
// P10 (in U)
gemm_bini_232_2(F,m2,n2,k3,X,k3,Y,n2,U,n2,rec-1,epsilon);
// C12 += P10
FFLAS::faddin(NoField,m2,n2,U,n2,C12,ldc);
// C11 += P10/e
add(m2,n2,(double)1/epsilon,U,n2,C11,ldc,C11,ldc);
FFLAS::fflas_delete( X );
FFLAS::fflas_delete( Y );
FFLAS::fflas_delete( U );
FFLAS::fflas_delete( V );
return C;
}
#if 0
template<class Field>
double *
gemm_bini_322_sqrt(const Field & F
, const size_t m
, const size_t n
, const size_t k
, const double *A , const size_t lda
, const double *B , const size_t ldb
, double *C , const size_t ldc
, int rec
, const double & epsilon
)
{
Givaro::ZRing<double> NoField;
// const double p = (double)F.characteristic();
size_t M = (n>m)?std::min(k,m):std::min(k,n);
// std::cout << rec << ',' << M << std::endl;
// Field G(p*p);
if ( M < TRE || rec <= 0) {
// std::cout << "ffw" << std::endl;
return gemm_fflas(F, m,n,k, A,lda, B,ldb, C, ldc);
// return gemm_fflas(NoField, m,n,k, A,lda, B,ldb, C, ldc);
}
assert(k/2*2==k); // k divisible par 2
assert(n/3*3==n); // n divisible par 2
assert(m/2*2==m); // m divisible par 3
// std::cout << "tested" << std::endl;
size_t m2 = m/2;
size_t k2 = k/2;
size_t n3 = n/3;
// std::cout << "€ = " << epsilon << std::endl;
// sub matrices in A
const double * A11 = A;
const double * A12 = A +k2;
const double * A21 = A +lda*m2;
const double * A22 = A21 +k2;
// sub matrices in C
double * C11 = C;
double * C12 = C +n3;
double * C13 = C +2*n3;
double * C21 = C +ldc*m2;
double * C22 = C21 +n3;
double * C23 = C21 +2*n3;
// sub matrices in B
const double * B11 = B;
const double * B12 = B +n3;
const double * B13 = B +2*n3;
const double * B21 = B +ldb*k2;
const double * B22 = B21 +n3;
const double * B23 = B21 +2*n3;
FFLAS::fzero(F,m,n,C,ldc);
/*
* Algo :
* S1 := B11 +B22;
* S4 := e*B21+B22;
* S5 := B11 +e*B21;
* S6 := B12 +B23;
* S9 := B12 +e*B13;
* S3 := e*B13+B23;
*
* T1 := e*A11 +A22;
* T2 := A12 +A22;
* T4 := -e*A11+A12;
* T5 := e*A21 +A22;
* T6 := A11 +e*A22;
* T7 := A11 +A21;
* T9 := A21 -e*A22;
* T3 := A11 +e*A12;
*
* P1 := S1 *T1;
* P2 := T2 * B22;
* P10 := A22 * B11;
* P4 := S4 *T4;
* P5 := S5 *T5;
* P6 := S6 *T6;
* P7 := T7*B12;
* P8 := A11*B23;
* P9 := S9 *T9;
* P3 := S3*T3;
*
* C11 := (P1-P2-P10+P4)/e;
* C21 := (P10-P5)/(-e) ;
* C12 := P4+P6-P3 ;
* C22 := P1-P5+P9;
* C13 := (-P8+P3)/e;
* C23 := (P6-P7-P8+P9)/e;
*
*/
// P10
gemm_bini_223_mem(F,m2,n3,k2,A22,lda,B11,ldb,C11,ldc,rec-1,epsilon);
// S5
double * Y = FFLAS::fflas_new<double>(k2*n3);
add(k2,n3,epsilon,B21,ldb,B11,ldb,Y,n3);
// T5
double * X = FFLAS::fflas_new<double>(m2*k2);
add(m2,k2,epsilon,A21,lda,A22,lda,X,k2);
// P5
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C22,ldc,rec-1,epsilon);
// C12
subscal(NoField,m2,n3,C22,ldc,C11,ldc,(double)1/epsilon,C21,ldc);
// T2
FFLAS::fadd(NoField,m2,k2,A12,lda,A22,lda,X,k2);
// P2
gemm_bini_223_mem(F,m2,n3,k2,X,k2,B22,ldb,C13,ldc,rec-1,epsilon);
// C11
FFLAS::faddin(NoField,m2,n3,C13,ldc,C11,ldc);
// S1
FFLAS::fadd(NoField,k2,n3,B11,ldb,B22,ldb,Y,n3);
// T1
add(m2,k2,epsilon,A11,lda,A22,lda,X,k2);
// P1
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon);
// C22
FFLAS::fsub(NoField,m2,n3,C12,ldc,C22,ldc,C22,ldc);
// C11
FFLAS::fsub(NoField,m2,n3,C12,ldc,C11,ldc,C11,ldc);
// S4
add(k2,n3,epsilon,B21,ldb,B22,ldb,Y,n3);
// T4
add(m2,k2,-epsilon,A11,lda,A12,lda,X,k2);
// P4
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C12,ldc,rec-1,epsilon);
// C11
addscalinf(NoField,m2,n3,C12,ldc,(double)1/epsilon,C11,ldc);
// S9
add(k2,n3,epsilon,B13,ldb,B12,ldb,Y,n3);
// T9
add(m2,k2,-epsilon,A22,lda,A21,lda,X,k2);
// P9
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C23,ldc,rec-1,epsilon);
// C22
FFLAS::faddin(NoField,m2,n3,C23,ldc,C22,ldc);
// S6
FFLAS::fadd(NoField,k2,n3,B12,ldb,B23,ldb,Y,n3);
// T6
add(m2,k2,epsilon,A22,lda,A11,lda,X,k2);
// P6
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon);
// C21
FFLAS::faddin(NoField,m2,n3,C13,ldc,C12,ldc);
// C32
FFLAS::faddin(NoField,m2,n3,C13,ldc,C23,ldc);
// T7
FFLAS::fadd(NoField,m2,k2,A11,lda,A21,lda,X,k2);
// P7
gemm_bini_223_mem(F,m2,n3,k2,X,k2,B12,ldb,C13,ldc,rec-1,epsilon);
// if (epsilon > 1 && rec == 2) { FFLAS::finit(G,m2,n3,C31,ldc);}
// C32
FFLAS::fsubin(NoField,m2,n3,C13,ldc,C23,ldc);
// S3
add(k2,n3,epsilon,B13,ldb,B23,ldb,Y,n3);
// T3
add(m2,k2,epsilon,A12,lda,A11,lda,X,k2);
// P3
gemm_bini_223_mem(F,m2,n3,k2,X,k2,Y,n3,C13,ldc,rec-1,epsilon);
FFLAS::fflas_delete( Y );
FFLAS::fflas_delete( X );
// C21
FFLAS::fsubin(NoField,m2,n3,C13,ldc,C12,ldc);
// P8
Y = FFLAS::fflas_new<double>(m2*n3);
gemm_bini_223_mem(F,m2,n3,k2,A11,lda,B23,ldb,Y,n3,rec-1,epsilon);
// C31
subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C13,ldc);
// C32
subscalinf(NoField,m2,n3,Y,n3,(double)1/epsilon,C23,ldc);
FFLAS::fflas_delete( Y );
return C;
}
#endif
} // Rec
} // Protected
} // FFLAS
namespace FFLAS { namespace Protected {
template<class Field>
typename Field::Element *
gemm_bini_p(const Field &F
, const size_t m
, const size_t n
, const size_t k
, const typename Field::Element *A
, const size_t lda
, const typename Field::Element *B
, const size_t ldb
, typename Field::Element *C
, const size_t ldc
, int rec
, size_t algo
)
{
assert(k/6*6==k); // k divisible par 6
assert(n/6*6==n); // n divisible par 6
assert(m/6*6==m); // m divisible par 6
// e-formule
double epsilon = (double) F.characteristic() ;
switch(algo) {
case 0 :
Rec::gemm_bini_322_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
// FFLAS::finit(F,m,n,C,ldc);
break;
case 1 :
Rec::gemm_bini_322_0(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
// FFLAS::finit(F,m,n,C,ldc);
break;
case 2 :
Rec::gemm_bini_322_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
break;
case 3 :
Rec::gemm_bini_223_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
// FFLAS::finit(F,m,n,C,ldc);
break;
case 4 :
Rec::gemm_bini_232_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
break;
case 5 :
Rec::gemm_bini_232_3(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
break;
#if 0
case 8 : {
double epsilon2 = sqrt((double)epsilon);
std::cout << epsilon2 << std::endl;
Rec::gemm_bini_322_sqrt(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon2);
// FFLAS::finit_fuzzy(F,m,n,C,ldc);
for(size_t i = 0 ; i < m ; ++i) {
for(size_t j = 0 ; j < n ; ++j)
C[i*ldc+j] = rint(fmod(C[i*ldc+j],epsilon2));
}
break;
}
#endif
default :
std::cout << " not an algo :" << algo << std::endl;;
exit(-1);
}
return C;
}
template<class Field>
typename Field::Element *
gemm_bini_e(const Field &F
, const size_t m
, const size_t n
, const size_t k
, const typename Field::Element *A
, const size_t lda
, const typename Field::Element *B
, const size_t ldb
, typename Field::Element *C
, const size_t ldc
, int rec
, size_t algo
)
{
assert(k/2*2==k); // k divisible par 2
assert(n/2*2==n); // n divisible par 2
assert(m/3*3==m); // m divisible par 3
// e-formule
double epsilon = 1./(1<<27);
switch(algo) {
case 0 :
Rec::gemm_bini_322_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
break;
case 1 :
Rec::gemm_bini_322_0(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
break;
case 2 :
Rec::gemm_bini_322_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
break;
case 3 :
Rec::gemm_bini_223_mem(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
break;
case 4 :
Rec::gemm_bini_232_2(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
break;
case 5 :
Rec::gemm_bini_232_3(F,m,n,k,A,lda,B,ldb,C,ldc,rec,epsilon);
break;
default :
std::cout << " not an algo :" << algo << std::endl;;
exit(-1);
}
// vire les e.
// FFLAS::finit_fuzzy(F,m,n,C,ldc);
FFLAS::finit_fuzzy(F,m,n,C,ldc);
return C;
}
template<class Field>
typename Field::Element *
gemm_compress(const Field &F
, const size_t m
, const size_t n
, const size_t k
, const typename Field::Element *A
, const size_t lda
, const typename Field::Element *B
, const size_t ldb
, typename Field::Element *C
, const size_t ldc
, int rec
, size_t algo
)
{
assert(k/6*6==k); // k divisible par 6
assert(n/6*6==n); // n divisible par 6
assert(m/6*6==m); // m divisible par 6
switch(algo) {
case 0 :
fgemm_compressed<Field,true>(F,(int)m,(int)n,(int)k,A,(int)lda,B,(int)ldb,C,(int)ldc);
FFLAS::freduce(F,m,n,C,ldc);
break;
case 1 :
fgemm_compressed<Field,false>(F,(int)m,(int)n,(int)k,A,(int)lda,B,(int)ldb,C,(int)ldc);
FFLAS::freduce(F,m,n,C,ldc);
break;
default :
std::cout << " not an algo :" << algo << std::endl;;
exit(-1);
}
return C;
}
} // Protected
} // FFLAS
template<class Field>
void check_equal(const Field & F,int m,int n,
typename Field::Element * D,int ldd,
typename Field::Element * E,int lde,
const char * nomalgo, const char * madescr, int & ok_p)
{
int faux = 0 ;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j) {
if (!F.areEqual(D[i*ldd+j],E[i*lde+j])) {
++faux ;
}
}
}
if (faux) {
std::cout << nomalgo << " " << madescr << " : bad/all = " << faux << '/' << m*n << " ~~ " << (double)faux/(double)(m*n) << std::endl;
}
else ok_p ++ ;
#if 1
if (faux && (n<20)) {
std::cout << "OK" << std::endl;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j)
std::cout << D[i*ldd+j] << ' ';
std::cout << std::endl;
}
std::cout << "KO" << std::endl;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j)
std::cout << E[i*lde+j] << ' ';
std::cout << std::endl;
}
std::cout << "Diff" << std::endl;
for (int i = 0 ; i < m ; ++i) {
for (int j = 0 ; j < n ; ++j)
std::cout << D[i*ldd+j]-E[i*lde+j] << ' ';
std::cout << std::endl;
}
}
#endif
}
template<class Field>
void test_algos(const Field &F, int m, int n, int k
, const typename Field::Element * A, int lda
, const typename Field::Element * B, int ldb
, int r
, time_v & tim_k, time_v & tim_e , time_v & tim_p
, int_v & ok_k, int_v & ok_e, int_v & ok_p
, FFLAS::Timer & tim_wd, int & nb_wd
, bool with_e
, bool with_k
)
{
FFLAS::Timer tmp ;
typedef typename Field::Element Element;
Element * D = FFLAS::fflas_new<Element>(m*n);
Element * C = FFLAS::fflas_new<Element>(m*n);
tmp.clear();tmp.start();
fgemm(F,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,
m,n,k, 1, A,k, B,n, 0, D, n);
tmp.stop(); tim_wd += tmp ; nb_wd ++;
/* bini_p */
if (with_e) {
for (int algo = 0 ; algo < algos ; ++algo) {
tmp.clear();tmp.start();
FFLAS::Protected::gemm_bini_e(F,m,n,k,A,k,B,n,C,n,r,selec[algo]);
tmp.stop(); tim_e[algo] += tmp ;
/* checking */
check_equal(F,m,n,D,n,C,n,"bini_e",descr[algo],ok_e[algo]) ;
}
}
/* compress */
if (with_k && std::is_same<typename FieldTraits<Field>::category,FFLAS::FieldCategories::ModularTag>::value && (! FieldTraits<Field>::balanced)) {
for (int algo = 0 ; algo < algos_k ; ++algo) {
tmp.clear();tmp.start();
FFLAS::Protected::gemm_compress(F,m,n,k,A,k,B,n,C,n,r,selec_k[algo]);
tmp.stop(); tim_k[algo] += tmp ;
/* checking */
check_equal(F,m,n,D,n,C,n,"compress",descr_k[algo],ok_k[algo]) ;
}
}
/* bini_p */
for (int algo = 0 ; algo < algos ; ++algo) {
tmp.clear();tmp.start();
FFLAS::Protected::gemm_bini_p(F,m,n,k,A,k,B,n,C,n,r,selec[algo]);
tmp.stop(); tim_p[algo] += tmp ;
/* checking */
check_equal(F,m,n,D,n,C,n,"bini_p",descr[algo],ok_p[algo]) ;
}
FFLAS::fflas_delete(C);
FFLAS::fflas_delete(D);
}
template<class T>
struct changeField {
typedef T other ;
};
template<>
struct changeField<Modular<double> > {
typedef Givaro::Modular<float> other;
};
template<>
struct changeField<ModularBalanced<double> > {
typedef ModularBalanced<float> other;
};
double descrip(int algo, int_v & ok_e, time_v & tim_e, int iters, const char ** madescr, const char * nom)
{
int min_e = -1 ;
double bini_e = -1 ;
for (int i = 0 ; i < algo ; ++i){
if (ok_e[i] == (int)iters) {
double bini1 = tim_e[i].usertime()/(double)ok_e[i] ;
if (bini_e < 0) {
bini_e = bini1;
min_e = (int) i ;
}
else if (bini1 < bini_e) {
min_e = (int)i ;
bini_e = bini1 ;
}
}
}
for (int i = 0 ; i < algo ; ++i){
if (ok_e[i] == (int)iters) {
double bini1 = tim_e[i].usertime()/(double)ok_e[i] ;
std::cout << nom << " ( " << madescr[i] << " ) : " ;
if ((int)i == min_e) std::cout << " * " ;
else std::cout << " ";
std::cout << bini1 << 's'<< std::endl;
}
}
return bini_e ;
}
template<class Field>
void test(int m, int k, int n, int p, int r, bool with_e, bool with_k, int iters = 4, uint64_t seed=0)
{
typedef typename Field::Element Element;
Element * A = FFLAS::fflas_new<Element>(m*k);
Element * B = FFLAS::fflas_new<Element>(n*k);
Field F(p);
typename Field::RandIter G(F,0,seed);
F.write(std::cout<< " * Field " ) << std::endl;
typedef typename changeField<Field>::other Field_f ;
typedef typename Field_f::Element Element_f ;
Field_f F_f(p);
Element_f * A_f = FFLAS::fflas_new<Element_f>(m*k);
Element_f * B_f = FFLAS::fflas_new<Element_f>(n*k);
Element_f * C_f = FFLAS::fflas_new<Element_f>(m*n);
#if defined(NOTRANDOM)
int i0 ;
int j0 ;
Element p2 ; F.init(p2,(int)F.mOne/2);
std::cout << p2 << std::endl;
#warning "not random"
for (int i = 0 ; i < m ; ++i)
for (int j = 0 ; j < k ; ++j) {
i0 = i/(m/3);
j0 = j/(k/2);
if (i0 == 0 and j0 == 0) A[i*k+j] = F.mOne ;
else if (i0 == 0 and j0 == 1) A[i*k+j] = F.zero ;
else if (i0 == 1 and j0 == 0) A[i*k+j] = F.mOne ;
else if (i0 == 1 and j0 == 1) A[i*k+j] = F.mOne ;
else if (i0 == 2 and j0 == 0) A[i*k+j] = F.mOne ;
else if (i0 == 2 and j0 == 1) A[i*k+j] = F.mOne ;
else A[i*k+j] = F.mOne ;
}
for (int i = 0 ; i < k ; ++i)
for (int j = 0 ; j < n ; ++j) {
i0 = i/(k/2);
j0 = j/(n/2);
if (i0 == 0 and j0 == 0) B[i*n+j] = F.mOne ;
else if (i0 == 0 and j0 == 1) B[i*n+j] = F.mOne ;
else if (i0 == 1 and j0 == 0) B[i*n+j] = F.mOne ;
else if (i0 == 1 and j0 == 1) B[i*n+j] = F.zero ;
else B[i*n+j] = F.mOne ;
}
#endif
time_v tim_e(algos), tim_p(algos), tim_k(algos_k);
FFLAS::Timer tim_wd; tim_wd.clear();
FFLAS::Timer tim_wf; tim_wf.clear();
FFLAS::Timer tmp;
for (int i = 0 ; i < algos ; ++i) {
tim_e[i].clear();
tim_p[i].clear();
}
for (int i = 0 ; i < algos_k ; ++i) {
tim_k[i].clear();
}
int_v ok_p(algos,0), ok_e(algos,0), ok_k(algos_k,0);
int nb_wd = 0 , nb_wf = 0 ;
for (int b = 0 ; b < iters ; ++b) {
std::cout << "iter " << b+1 << " of " << iters << std::endl;
#if not defined(NOTRANDOM)
FFPACK::RandomMatrix(F, m, k, A, k, G);
FFPACK::RandomMatrix(F, k, n, B, n, G);
#endif
FFLAS::finit(F_f,m,k,A,k,A_f,k);
FFLAS::finit(F_f,k,n,B,n,B_f,n);
tmp.clear();tmp.start();
fgemm(F_f,FFLAS::FflasNoTrans,FFLAS::FflasNoTrans,
m,n,k, 1, A_f,k, B_f,n, 0, C_f, n);
tmp.stop(); tim_wf += tmp ; nb_wf ++ ;
test_algos(F,m,n,k,A,k,B,n,r,
tim_k,tim_e,tim_p,
ok_k,ok_e,ok_p,
tim_wd,nb_wd,
with_e,with_k);
}
std::cout << std::endl << "results" << std::endl;
double bini_e = descrip(algos,ok_e,tim_e,iters,descr,"Bini_e");
double bini_p = descrip(algos,ok_p,tim_p,iters,descr,"Bini_p");
double bini_k = descrip(algos_k,ok_k,tim_k,iters,descr_k,"Bini_k");
double t_wd = tim_wd.usertime()/(double)(nb_wd);
double t_wf = tim_wf.usertime()/(double)(nb_wf);
std::cout << "Wino d : " << t_wd << 's'<< std::endl;
std::cout << "Wino f : " << t_wf << 's'<< std::endl;
double wino = std::min(t_wd,t_wf) ;
if (bini_e>=0)
std::cout << "Gain e: " << ((bini_e-wino)/wino)*100 << '%' << std::endl;
if (bini_p>=0)
std::cout << "Gain p: " << ((bini_p-wino)/wino)*100 << '%' << std::endl;
if (bini_k>=0)
std::cout << "Gain k: " << ((bini_k-wino)/wino)*100 << '%' << std::endl;
FFLAS::fflas_delete( A );
FFLAS::fflas_delete( B);
FFLAS::fflas_delete( A_f );
FFLAS::fflas_delete( B_f);
FFLAS::fflas_delete( C_f);
}
int main(int ac, char **av) {
static int m = 36 ;
static int n = 12 ;
static int k = 18 ;
static int p = 101;
bool eps = false ;
bool kom = false ;
int r = 1 ;
uint64_t seed = getSeed();
int iters = 4;
static Argument as[] = {
{ 'p', "-p P", "Set the field characteristic.", TYPE_INT , &p },
{ 'n', "-n N", "Set the number of cols in C.", TYPE_INT , &n },
{ 'm', "-m N", "Set the number of rows in C.", TYPE_INT , &m },
{ 'k', "-k N", "Set the number of rows in B.", TYPE_INT , &k },
{ 'r', "-k N", "Set the recursive number Bini.", TYPE_INT , &r },
{ 'i', "-i N", "Set the numebr of iterations.", TYPE_INT , &iters },
{ 's', "-s N", "Set the seed .", TYPE_UINT64 , &seed },
{ 'e', "-e " , "epsilon .", TYPE_NONE , &eps },
{ 'c', "-c " , "compress .", TYPE_NONE , &kom},
END_OF_ARGUMENTS
};
FFLAS::parseArguments(ac,av,as);
srand(seed);
srand48(seed);
std::cout << ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>" << std::endl;
std::cout << "size: " << m << ',' << k << ',' << n << std::endl;
std::cout << "mod : " << p << std::endl;
std::cout << "rec : " << r << std::endl;
std::cout << "seed: " << seed << std::endl;
std::cout << "thre: " << TRE << std::endl;
std::cout << "=====================================================" << std::endl;
test<Modular<double> > (m,k,n,p,r,eps,kom,iters,seed);
std::cout << "=====================================================" << std::endl;
test<ModularBalanced<double> > (m,k,n,p,r,eps,kom,iters,seed);
std::cout << "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" << std::endl;
return 0;
}
/* -*- mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
// vim:sts=4:sw=4:ts=4:et:sr:cino=>s,f0,{0,g0,(0,\:0,t0,+0,=s