Content - 428131fcfcf94fb7db215a461b771ecca7c7f95e - 56f1d1b/src/operators.txx

visit type:
Tip revision: 20385423c077704ac4c176b4380c070a33670267 authored by Malte Brunn on 05 November 2020, 11:48:01 UTC
removed unsupported compute capabilities and added new ones
Tip revision: 2038542
operators.txx
/*
 *  Copyright (c) 2014-2015, Amir Gholami, George Biros
 *  All rights reserved.
 *  This file is part of AccFFT library.
 *
 *  AccFFT is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  AccFFT is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with AccFFT.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#ifndef OPERATORS_TXX
#define OPERATORS_TXX
#include "accfft_operators.h"
template<typename Tc>
static void grad_mult_wave_numberx(Tc* wA, Tc* A, int* N, MPI_Comm c_comm,
	int* size, int* start,	std::bitset<3> xyz) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	double scale = 1;
	if (xyz[0])
		scale *= N[0];
	if (xyz[1])
		scale *= N[1];
	if (xyz[2])
		scale *= N[2];
	scale = 1. / scale;

//#pragma omp parallel
	{
		long int X, wave;
		long int ptr = 0;
//#pragma omp for
		for (int i = 0; i < size[0]; i++) {
      ptr = i * size[1] * size[2];
			for (int j = 0; j < size[1] * size[2]; j++) {
					X = (i + start[0]);
					wave = X;

					if (X > N[0] / 2)
						wave -= N[0];
					if (X == N[0] / 2)
						wave = 0; // Filter Nyquist

					//ptr = (i * size[1] + j) * size[2] + k;
					wA[ptr][0] = -scale * wave * A[ptr][1];
					wA[ptr][1] = scale * wave * A[ptr][0];
          ++ptr;
			}
		}
	}

	return;
}

template<typename Tc>
static void grad_mult_wave_numbery(Tc* wA, Tc* A, int* N, MPI_Comm c_comm,
		int* size, int* start, std::bitset<3> xyz) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);

	double scale = 1;
	if (xyz[0])
		scale *= N[0];
	if (xyz[1])
		scale *= N[1];
	if (xyz[2])
		scale *= N[2];
	//PCOUT<<scale<<std::endl;
	scale = 1. / scale;


//#pragma omp parallel
	{
		long int X, Y, Z, wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < size[0]; i++) {
			for (int j = 0; j < size[1]; j++) {
				for (int k = 0; k < size[2]; k++) {
					//X = (i + start[0]);
					Y = (j + start[1]);
					//Z = (k + start[2]);

					wave = Y;

					if (Y > N[1] / 2)
						wave -= N[1];
					if (Y == N[1] / 2)
						wave = 0; // Filter Nyquist

					ptr = (i * size[1] + j) * size[2] + k;
					wA[ptr][0] = -scale * wave * A[ptr][1];
					wA[ptr][1] = scale * wave * A[ptr][0];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void grad_mult_wave_numberz(Tc* wA, Tc* A, int* N, MPI_Comm c_comm,
		int* size, int* start, std::bitset<3> xyz) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	double scale = 1;
	if (xyz[0])
		scale *= N[0];
	if (xyz[1])
		scale *= N[1];
	if (xyz[2])
		scale *= N[2];
	//PCOUT<<scale<<std::endl;
	scale = 1. / scale;

	// int istart[3], isize[3], osize[3], ostart[3];
	// accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);
	//PCOUT<<osize[0]<<'\t'<<osize[1]<<'\t'<<osize[2]<<std::endl;

//#pragma omp parallel
	{
		long int  Z, wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < size[0]; i++) {
			for (int j = 0; j < size[1]; j++) {
				for (int k = 0; k < size[2]; k++) {
					//X = (i + start[0]);
					//Y = (j + start[1]);
					Z = (k + start[2]);

					wave = Z;

					if (Z > N[2] / 2)
						wave -= N[2];
					if (Z == N[2] / 2)
						wave = 0; // Filter Nyquist

					ptr = (i * size[1] + j) * size[2] + k;
					wA[ptr][0] = -scale * wave * A[ptr][1];
					wA[ptr][1] = scale * wave * A[ptr][0];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void grad_mult_wave_number_laplace(Tc* wA, Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz, wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					if (X == N[0] / 2)
						wx = 0;

					if (Y > N[1] / 2)
						wy -= N[1];
					if (Y == N[1] / 2)
						wy = 0;

					if (Z > N[2] / 2)
						wz -= N[2];
					if (Z == N[2] / 2)
						wz = 0;

					wave = -wx * wx - wy * wy - wz * wz;

					ptr = (i * osize[1] + j) * osize[2] + k;
					wA[ptr][0] = scale * wave * A[ptr][0];
					wA[ptr][1] = scale * wave * A[ptr][1];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void biharmonic_mult_wave_number(Tc* wA, Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz, wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					if (X == N[0] / 2)
						wx = 0;

					if (Y > N[1] / 2)
						wy -= N[1];
					if (Y == N[1] / 2)
						wy = 0;

					if (Z > N[2] / 2)
						wz -= N[2];
					if (Z == N[2] / 2)
						wz = 0;

					wave = -wx * wx - wy * wy - wz * wz;
					wave *= wave;

					ptr = (i * osize[1] + j) * osize[2] + k;
					wA[ptr][0] = scale * wave * A[ptr][0];
					wA[ptr][1] = scale * wave * A[ptr][1];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void mult_wave_number_inv_laplace(Tc* wA, Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz;
		double wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					if(X == N[0] / 2)
					  wx=0;

					if (Y > N[1] / 2)
						wy -= N[1];
					if(Y == N[1] / 2)
					  wy=0;

					if (Z > N[2] / 2)
						wz -= N[2];
					if(Z == N[2] / 2)
					  wz=0;

					wave = -wx * wx - wy * wy - wz * wz;
					if (wave == 0)
						wave = 0;
          else
					  wave = 1. / wave;

					ptr = (i * osize[1] + j) * osize[2] + k;
					wA[ptr][0] = scale * wave * A[ptr][0];
					wA[ptr][1] = scale * wave * A[ptr][1];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void mult_wave_number_inv_biharmonic(Tc* wA, Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz;
		double wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					//if(X==N[0]/2)
					//  wx=0;

					if (Y > N[1] / 2)
						wy -= N[1];
					//if(Y==N[1]/2)
					//  wy=0;

					if (Z > N[2] / 2)
						wz -= N[2];
					//if(Z==N[2]/2)
					//  wz=0;

					wave = -wx * wx - wy * wy - wz * wz;
					wave *= wave;
					if (wave == 0)
						wave = 1;
					wave = 1. / wave;
					ptr = (i * osize[1] + j) * osize[2] + k;
					wA[ptr][0] = scale * wave * A[ptr][0];
					wA[ptr][1] = scale * wave * A[ptr][1];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void grad_mult_wave_numberx_inplace(Tc* A, int* N, MPI_Comm c_comm,
	int* size, int* start,	std::bitset<3> xyz) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	double scale = 1;
	if (xyz[0])
		scale *= N[0];
	if (xyz[1])
		scale *= N[1];
	if (xyz[2])
		scale *= N[2];
	scale = 1. / scale;
#pragma omp parallel
	{
		long int X, wave;
		long int ptr = 0;
    Tc tmp_c;
#pragma omp for
		for (int i = 0; i < size[1]; i++) {
			for (int j = 0; j < size[0] * size[2]; j++) {
          ptr = i + j* size[1];
					X = i;
					wave = X;

					if (X > N[0] / 2)
						wave -= N[0];
					if (X == N[0] / 2)
						wave = 0; // Filter Nyquist

					//ptr = (i * size[1] + j) * size[2] + k;
          tmp_c[0] = A[ptr][0];
          tmp_c[1] = A[ptr][1];
					A[ptr][0] = -scale * wave * tmp_c[1];
					A[ptr][1] = scale * wave * tmp_c[0];
			}
		}
	}

//	{
//		long int X, wave;
//		long int ptr = 0;
//    Tc tmp_c;
////#pragma omp for
//		for (int i = 0; i < size[1]; i++) {
//      ptr = i * size[0] * size[2];
//			for (int j = 0; j < size[0] * size[2]; j++) {
//					X = i;
//					wave = X;
//
//					if (X > N[0] / 2)
//						wave -= N[0];
//					if (X == N[0] / 2)
//						wave = 0; // Filter Nyquist
//
//					//ptr = (i * size[1] + j) * size[2] + k;
//          tmp_c[0] = A[ptr][0];
//          tmp_c[1] = A[ptr][1];
//					A[ptr][0] = -scale * wave * tmp_c[1];
//					A[ptr][1] = scale * wave * tmp_c[0];
//          ++ptr;
//			}
//		}
//	}
	return;
}

template<typename Tc>
static void grad_mult_wave_numbery_inplace(Tc* A, int* N, MPI_Comm c_comm,
		int* size, int* start, std::bitset<3> xyz) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);

	double scale = 1;
	if (xyz[0])
		scale *= N[0];
	if (xyz[1])
		scale *= N[1];
	if (xyz[2])
		scale *= N[2];
	//PCOUT<<scale<<std::endl;
	scale = 1. / scale;
#pragma omp parallel
	{
		long int Y, wave;
		long int ptr = 0;
    Tc tmp_c;
#pragma omp for
		for (int i = 0; i < size[2]; i++) {
			for (int j = 0; j < size[0] * size[1]; j++) {
        ptr = i + j* size[2];
					Y = i;
					wave = Y;

					if (Y > N[1] / 2)
						wave -= N[1];
					if (Y == N[1] / 2)
						wave = 0; // Filter Nyquist

					//ptr = (i * size[1] + j) * size[2] + k;
          tmp_c[0] = A[ptr][0];
          tmp_c[1] = A[ptr][1];
					A[ptr][0] = -scale * wave * tmp_c[1];
					A[ptr][1] = scale * wave * tmp_c[0];
          ++ptr;
			}
		}
	}



	return;
}

template<typename Tc>
static void grad_mult_wave_numberz_inplace(Tc* A, int* N, MPI_Comm c_comm,
		int* size, int* start, std::bitset<3> xyz) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	double scale = 1;
	if (xyz[0])
		scale *= N[0];
	if (xyz[1])
		scale *= N[1];
	if (xyz[2])
		scale *= N[2];
	//PCOUT<<scale<<std::endl;
	scale = 1. / scale;

	// int istart[3], isize[3], osize[3], ostart[3];
	// accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);
	//PCOUT<<osize[0]<<'\t'<<osize[1]<<'\t'<<osize[2]<<std::endl;

#pragma omp parallel
	{
		long int  Z, wave;
		long int ptr;
    Tc tmp_c;
#pragma omp for
		for (int i = 0; i < size[0]; i++) {
			for (int j = 0; j < size[1]; j++) {
				for (int k = 0; k < size[2]; k++) {
					//X = (i + start[0]);
					//Y = (j + start[1]);
					Z = (k + start[2]);

					wave = Z;

					if (Z > N[2] / 2)
						wave -= N[2];
					if (Z == N[2] / 2)
						wave = 0; // Filter Nyquist

					ptr = (i * size[1] + j) * size[2] + k;
          tmp_c[0] = A[ptr][0];
          tmp_c[1] = A[ptr][1];
					A[ptr][0] = -scale * wave * tmp_c[1];
					A[ptr][1] = scale * wave * tmp_c[0];
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void grad_mult_wave_number_laplace_inplace(Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz, wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					if (X == N[0] / 2)
						wx = 0;

					if (Y > N[1] / 2)
						wy -= N[1];
					if (Y == N[1] / 2)
						wy = 0;

					if (Z > N[2] / 2)
						wz -= N[2];
					if (Z == N[2] / 2)
						wz = 0;

					wave = -wx * wx - wy * wy - wz * wz;

					ptr = (i * osize[1] + j) * osize[2] + k;
					A[ptr][0] *= scale * wave;
					A[ptr][1] *= scale * wave;
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void biharmonic_mult_wave_number_inplace(Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz, wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					if (X == N[0] / 2)
						wx = 0;

					if (Y > N[1] / 2)
						wy -= N[1];
					if (Y == N[1] / 2)
						wy = 0;

					if (Z > N[2] / 2)
						wz -= N[2];
					if (Z == N[2] / 2)
						wz = 0;

					wave = -wx * wx - wy * wy - wz * wz;
					wave *= wave;

					ptr = (i * osize[1] + j) * osize[2] + k;
					A[ptr][0] *= scale * wave;
					A[ptr][1] *= scale * wave;
				}
			}
		}
	}

	return;
}

template<typename Tc>
static void mult_wave_number_inv_laplace_inplace(Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz;
		double wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					if(X == N[0] / 2)
					  wx=0;

					if (Y > N[1] / 2)
						wy -= N[1];
					if(Y == N[1] / 2)
					  wy=0;

					if (Z > N[2] / 2)
						wz -= N[2];
					if(Z == N[2] / 2)
					  wz=0;

					wave = -wx * wx - wy * wy - wz * wz;
					if (wave == 0)
						wave = 0;
          else
					  wave = 1. / wave;

					ptr = (i * osize[1] + j) * osize[2] + k;
					A[ptr][0] *= scale * wave;
					A[ptr][1] *= scale * wave;
				}
			}
		}
	}

	return;
}


template<typename Tc>
static void mult_wave_number_inv_biharmonic_inplace(Tc* A, int* N,
		MPI_Comm c_comm) {

	int procid;
	MPI_Comm_rank(c_comm, &procid);
	const double scale = 1. / (N[0] * N[1] * N[2]);

	int istart[3], isize[3], osize[3], ostart[3];
	accfft_local_size_dft_r2c_t<Tc>(N, isize, istart, osize, ostart, c_comm);

//#pragma omp parallel
	{
		long int X, Y, Z, wx, wy, wz;
		double wave;
		long int ptr;
//#pragma omp for
		for (int i = 0; i < osize[0]; i++) {
			for (int j = 0; j < osize[1]; j++) {
				for (int k = 0; k < osize[2]; k++) {
					X = (i + ostart[0]);
					Y = (j + ostart[1]);
					Z = (k + ostart[2]);

					wx = X;
					wy = Y;
					wz = Z;

					if (X > N[0] / 2)
						wx -= N[0];
					//if(X==N[0]/2)
					//  wx=0;

					if (Y > N[1] / 2)
						wy -= N[1];
					//if(Y==N[1]/2)
					//  wy=0;

					if (Z > N[2] / 2)
						wz -= N[2];
					//if(Z==N[2]/2)
					//  wz=0;

					wave = -wx * wx - wy * wy - wz * wz;
					wave *= wave;
					if (wave == 0)
						wave = 1;
					wave = 1. / wave;
					ptr = (i * osize[1] + j) * osize[2] + k;
					A[ptr][0] *= scale * wave;
					A[ptr][1] *= scale * wave;
				}
			}
		}
	}

	return;
}

template<typename T, typename Tp>
void accfft_grad_slow_t(T* A_x, T* A_y, T*A_z, T* A, Tp* plan, std::bitset<3>* pXYZ,
		double* timer) {
	typedef T Tc[2];
	int procid;
	MPI_Comm c_comm = plan->c_comm;
	MPI_Comm_rank(c_comm, &procid);
	if (!plan->r2c_plan_baked) {
		PCOUT << "Error in accfft_grad! plan is not correctly made."
				<< std::endl;
		return;
	}
	std::bitset < 3 > XYZ;
	if (pXYZ != NULL) {
		XYZ = *pXYZ;
	} else {
		XYZ[0] = 1;
		XYZ[1] = 1;
		XYZ[2] = 1;
	}
	double timings[7] = { 0 };

	double self_exec_time = -MPI_Wtime();
	int *N = plan->N;

	int isize[3], osize[3], istart[3], ostart[3];
	long long int alloc_max;
	/* Get the local pencil size and the allocation size */
	alloc_max = accfft_local_size_dft_r2c_t<T>(N, isize, istart, osize, ostart,
			c_comm);
	//PCOUT<<"istart[0]= "<<istart[0]<<" istart[1]= "<<istart[1]<<" istart[2]="<<istart[2]<<std::endl;
	//PCOUT<<"ostart[0]= "<<ostart[0]<<" ostart[1]= "<<ostart[1]<<" ostart[2]="<<ostart[2]<<std::endl;

	Tc* A_hat = (Tc*) accfft_alloc(alloc_max);
	Tc* tmp = (Tc*) accfft_alloc(alloc_max);
	std::bitset < 3 > scale_xyz(0);
	scale_xyz[0] = 1;
	scale_xyz[1] = 1;
	scale_xyz[2] = 1;

	//MPI_Barrier(c_comm);

	/* Forward transform */
	accfft_execute_r2c_t<T, Tc, Tp>(plan, A, A_hat, timings);

	/* Multiply x Wave Numbers */
	if (XYZ[0]) {
    timings[6] += -MPI_Wtime();
		grad_mult_wave_numberx<Tc>(tmp, A_hat, N, c_comm, osize, ostart, scale_xyz);
    timings[6] += +MPI_Wtime();
		//MPI_Barrier(c_comm);

		/* Backward transform */
		accfft_execute_c2r_t<Tc, T, Tp>(plan, tmp, A_x, timings);
	}
	/* Multiply y Wave Numbers */
	if (XYZ[1]) {
    timings[6] += -MPI_Wtime();
		grad_mult_wave_numbery<Tc>(tmp, A_hat, N, c_comm, osize, ostart, scale_xyz);
    timings[6] += +MPI_Wtime();
		/* Backward transform */
		accfft_execute_c2r_t<Tc, T, Tp>(plan, tmp, A_y, timings);
	}

	/* Multiply z Wave Numbers */
	if (XYZ[2]) {
    timings[6] += -MPI_Wtime();
		grad_mult_wave_numberz<Tc>(tmp, A_hat, N, c_comm, osize, ostart, scale_xyz);
    timings[6] += +MPI_Wtime();
		/* Backward transform */
		accfft_execute_c2r_t<Tc, T, Tp>(plan, tmp, A_z, timings);
	}

	accfft_free(A_hat);
	accfft_free(tmp);

	self_exec_time += MPI_Wtime();

	if (timer == NULL) {
		//delete [] timings;
	} else {
		timer[0] += timings[0];
		timer[1] += timings[1];
		timer[2] += timings[2];
		timer[3] += timings[3];
		timer[4] += timings[4];
		timer[6] += timings[6];
	}
	return;
} // end of accfft_grad_slow_t

template<typename T, typename Tp>
void accfft_grad_t(T* A_x, T* A_y, T*A_z, T* A, Tp* plan, std::bitset<3>* pXYZ,
		double* timer) {
	typedef T Tc[2];
	double self_exec_time = -MPI_Wtime();
	// int procid;
	MPI_Comm c_comm = plan->c_comm;
	// MPI_Comm_rank(c_comm, &procid);
	if (!plan->r2c_plan_baked) {
    std::cout << "Error in accfft_grad! plan is not correctly made."
				<< std::endl;
		return;
	}
	std::bitset < 3 > XYZ;
	if (pXYZ != NULL) {
		XYZ = *pXYZ;
	} else {
		XYZ[0] = 1;
		XYZ[1] = 1;
		XYZ[2] = 1;
	}
	double timings[7] = { 0 };
	int *N = plan->N;


  Tc* A_hat = (Tc*)plan->Mem_mgr->operator_buffer_1;
	std::bitset < 3 > scale_xyz(0);
	//MPI_Barrier(c_comm);

	/* Multiply x Wave Numbers */
	if (XYZ[0]) {
	  accfft_execute_r2c_x_t<T, Tc>(plan, A, A_hat, timings);
	  scale_xyz[0] = 1;
	  scale_xyz[1] = 0;
	  scale_xyz[2] = 0;
    timings[6] += -MPI_Wtime();
    grad_mult_wave_numberx_inplace<Tc>(A_hat, N, c_comm, plan->osize_xi, plan->ostart_2, scale_xyz);
    timings[6] += +MPI_Wtime();

    /* Backward transform */
    accfft_execute_c2r_x_t<Tc, T>(plan, A_hat, A_x, timings);
  }
  /* Multiply y Wave Numbers */
  if (XYZ[1]) {
    accfft_execute_r2c_y_t<T, Tc>(plan, A, A_hat, timings);
    scale_xyz[0] = 0;
    scale_xyz[1] = 1;
    scale_xyz[2] = 0;
    timings[6] += -MPI_Wtime();
    grad_mult_wave_numbery_inplace<Tc>(A_hat, N, c_comm,
        plan->osize_yi, plan->ostart_y, scale_xyz);
    timings[6] += +MPI_Wtime();
    /* Backward transform */
    accfft_execute_c2r_y_t<Tc, T>(plan, A_hat, A_y, timings);
  }

  /* Multiply z Wave Numbers */
  if (XYZ[2]) {
    accfft_execute_r2c_z_t<T, Tc>(plan, A, A_hat, timings);
    scale_xyz[0] = 0;
    scale_xyz[1] = 0;
    scale_xyz[2] = 1;
    timings[6] += -MPI_Wtime();
    grad_mult_wave_numberz_inplace<Tc>(A_hat, N, c_comm,
        plan->osize_0, plan->ostart_0, scale_xyz);
    timings[6] += +MPI_Wtime();
    /* Backward transform */
    accfft_execute_c2r_z_t<Tc, T>(plan, A_hat, A_z, timings);
  }
  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
    timer[5] += self_exec_time;
		timer[6] += timings[6];
  }
  return;
}

template<typename T, typename Tp>
void accfft_laplace_t(T* LA, T* A, Tp* plan, double* timer) {
  typedef T Tc[2];
  // int procid;
  MPI_Comm c_comm = plan->c_comm;
  // MPI_Comm_rank(c_comm, &procid);
  if (!plan->r2c_plan_baked) {
    std::cout << "Error in accfft_grad! plan is not correctly made."
      << std::endl;
    return;
  }

  double timings[7] = { 0 };

  double self_exec_time = -MPI_Wtime();
  int *N = plan->N;

  Tc* A_hat = (Tc*)plan->Mem_mgr->operator_buffer_1;

  //MPI_Barrier(c_comm);

  /* Forward transform */
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A, A_hat, timings);

  /* Multiply x Wave Numbers */
  timings[6] += -MPI_Wtime();
  grad_mult_wave_number_laplace_inplace<Tc>(A_hat, N, c_comm);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);

  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, A_hat, LA, timings);

  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
		timer[6] += timings[6];
  }
  return;
}

template<typename T, typename Tp>
void accfft_divergence_slow_t(T* div_A, T* A_x, T* A_y, T* A_z, Tp* plan,
    double* timer) {
  double self_exec_time = -MPI_Wtime();
  typedef T Tc[2];
  int procid;
  MPI_Comm c_comm = plan->c_comm;
  MPI_Comm_rank(c_comm, &procid);
  if (!plan->r2c_plan_baked) {
    PCOUT << "Error in accfft_grad! plan is not correctly made."
      << std::endl;
    return;
  }

  double timings[7] = { 0 };

  int *N = plan->N;

  int isize[3], osize[3], istart[3], ostart[3];
  long long int alloc_max;
  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_t<T>(N, isize, istart, osize, ostart,
      c_comm);

  Tc* A_hat = (Tc*) accfft_alloc(alloc_max);
  Tc* tmp = (Tc*) accfft_alloc(alloc_max);
  T* tmp2 = (T*) accfft_alloc(alloc_max);
  std::bitset < 3 > xyz(0);

  //MPI_Barrier(c_comm);

  /* Forward transform in x direction*/
  xyz[0] = 1;
  xyz[1] = 0;
  xyz[2] = 1;
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A_x, A_hat, timings, xyz);
  /* Multiply x Wave Numbers */
  timings[6] += -MPI_Wtime();
  grad_mult_wave_numberx<T[2]>(tmp, A_hat, N, c_comm, osize, ostart, xyz);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);
  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, tmp, div_A, timings, xyz);

  // memcpy(div_A, tmp2, isize[0] * isize[1] * isize[2] * sizeof(T));

  /* Forward transform in y direction*/
  xyz[0] = 0;
  xyz[1] = 1;
  xyz[2] = 1;
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A_y, A_hat, timings, xyz);
  /* Multiply y Wave Numbers */
  timings[6] += -MPI_Wtime();
  grad_mult_wave_numbery<T[2]>(tmp, A_hat, N, c_comm, osize, ostart, xyz);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);
  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, tmp, tmp2, timings, xyz);

  timings[6] += -MPI_Wtime();
#pragma ivdep
  for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i)
    div_A[i] += tmp2[i];
  timings[6] += +MPI_Wtime();

  /* Forward transform in z direction*/
  xyz[0] = 0;
  xyz[1] = 0;
  xyz[2] = 1;
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A_z, A_hat, timings, xyz);
  /* Multiply z Wave Numbers */
  timings[6] += -MPI_Wtime();
  grad_mult_wave_numberz<T[2]>(tmp, A_hat, N, c_comm, osize, ostart, xyz);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);
  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, tmp, tmp2, timings, xyz);

  timings[6] += -MPI_Wtime();
#pragma ivdep
  for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i)
    div_A[i] += tmp2[i];
  timings[6] += +MPI_Wtime();

  accfft_free(A_hat);
  accfft_free(tmp);
  accfft_free(tmp2);

  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
		timer[6] += timings[6];
  }
  return;
}// end of accfft_divergence_slow_t

template<typename T, typename Tp>
void accfft_divergence_t(T* div_A, T* A_x, T* A_y, T* A_z, Tp* plan,
    double* timer) {
  double self_exec_time = -MPI_Wtime();
  typedef T Tc[2];
  // int procid;
  MPI_Comm c_comm = plan->c_comm;
  // MPI_Comm_rank(c_comm, &procid);
  if (!plan->r2c_plan_baked) {
    std::cout << "Error in accfft_grad! plan is not correctly made."
      << std::endl;
    return;
  }

  double timings[7] = { 0 };

  int *N = plan->N;

  int* isize = plan->isize;

  Tc* A_hat = (Tc*)plan->Mem_mgr->operator_buffer_1;
  T* tmp2 = (T*)plan->Mem_mgr->operator_buffer_2;
  std::bitset < 3 > xyz(0);

  //MPI_Barrier(c_comm);

  /* Forward transform in x direction*/
  accfft_execute_r2c_x_t<T, Tc>(plan, A_x, A_hat, timings);
  /* Multiply x Wave Numbers */
  xyz[0] = 1;
  xyz[1] = 0;
  xyz[2] = 0;
  timings[6] += -MPI_Wtime();
  grad_mult_wave_numberx_inplace<T[2]>(A_hat, N, c_comm, plan->osize_xi,
      plan->ostart_2, xyz);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);
  /* Backward transform */
  accfft_execute_c2r_x_t<Tc, T>(plan, A_hat, div_A, timings);

  /* Forward transform in y direction*/
  accfft_execute_r2c_y_t<T, Tc>(plan, A_y, A_hat, timings);
  /* Multiply y Wave Numbers */
  xyz[0] = 0;
  xyz[1] = 1;
  xyz[2] = 0;
  timings[6] += -MPI_Wtime();
  grad_mult_wave_numbery_inplace<T[2]>(A_hat, N, c_comm, plan->osize_yi,
      plan->ostart_y, xyz);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);
  /* Backward transform */
  accfft_execute_c2r_y_t<Tc, T>(plan, A_hat, tmp2, timings);

  timings[6] += -MPI_Wtime();
  for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i)
    div_A[i] += tmp2[i];
  timings[6] += +MPI_Wtime();

  /* Forward transform in z direction*/
  xyz[0] = 0;
  xyz[1] = 0;
  xyz[2] = 1;
  accfft_execute_r2c_z_t<T, Tc>(plan, A_z, A_hat, timings);
  /* Multiply z Wave Numbers */
  timings[6] += -MPI_Wtime();
  grad_mult_wave_numberz_inplace<T[2]>(A_hat, N, c_comm, plan->osize_0,
      plan->ostart_0, xyz);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);
  /* Backward transform */
  accfft_execute_c2r_z_t<Tc, T>(plan, A_hat, tmp2, timings);

  timings[6] += -MPI_Wtime();
  for (int i = 0; i < isize[0] * isize[1] * isize[2]; ++i)
    div_A[i] += tmp2[i];
  timings[6] += +MPI_Wtime();

  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
    timer[5] += self_exec_time;
		timer[6] += timings[6];
  }
  return;
}

template<typename T, typename Tp>
void accfft_biharmonic_t(T* LA, T* A, Tp* plan, double* timer) {
  typedef T Tc[2];
  int procid;
  MPI_Comm c_comm = plan->c_comm;
  MPI_Comm_rank(c_comm, &procid);
  if (!plan->r2c_plan_baked) {
    PCOUT << "Error in accfft_grad! plan is not correctly made."
      << std::endl;
    return;
  }

  double timings[7] = { 0 };

  double self_exec_time = -MPI_Wtime();
  int *N = plan->N;

  int isize[3], osize[3], istart[3], ostart[3];
  long long int alloc_max;
  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_t<T>(N, isize, istart, osize, ostart,
      c_comm);

  Tc* A_hat = (Tc*)plan->Mem_mgr->operator_buffer_1;

  //MPI_Barrier(c_comm);

  /* Forward transform */
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A, A_hat, timings);

  /* Multiply x Wave Numbers */
  timings[6] += -MPI_Wtime();
  biharmonic_mult_wave_number_inplace<Tc>(A_hat, N, c_comm);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);

  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, A_hat, LA, timings);

  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
		timer[6] += timings[6];
  }
  return;
}

template<typename T, typename Tp>
void accfft_inv_laplace_t(T* invLA, T* A, Tp* plan, double* timer) {
  typedef T Tc[2];
  int procid;
  MPI_Comm c_comm = plan->c_comm;
  MPI_Comm_rank(c_comm, &procid);
  if (!plan->r2c_plan_baked) {
    PCOUT << "Error in accfft_grad! plan is not correctly made."
      << std::endl;
    return;
  }

  double timings[7] = { 0 };

  double self_exec_time = -MPI_Wtime();
  int *N = plan->N;

  int isize[3], osize[3], istart[3], ostart[3];
  long long int alloc_max;
  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_t<T>(N, isize, istart, osize, ostart,
      c_comm);

  Tc* A_hat = (Tc*)plan->Mem_mgr->operator_buffer_1;

  //MPI_Barrier(c_comm);

  /* Forward transform */
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A, A_hat, timings);

  /* Multiply x Wave Numbers */
  timings[6] += -MPI_Wtime();
  mult_wave_number_inv_laplace_inplace<Tc>(A_hat, N, c_comm);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);

  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, A_hat, invLA, timings);

  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
		timer[6] += timings[6];
  }
  return;
}

template<typename T, typename Tp>
void accfft_inv_biharmonic_t(T* invBA, T* A, Tp* plan, double* timer) {
  typedef T Tc[2];
  int procid;
  MPI_Comm c_comm = plan->c_comm;
  MPI_Comm_rank(c_comm, &procid);
  if (!plan->r2c_plan_baked) {
    PCOUT << "Error in accfft_grad! plan is not correctly made."
      << std::endl;
    return;
  }

  double timings[7] = { 0 };

  double self_exec_time = -MPI_Wtime();
  int *N = plan->N;

  int isize[3], osize[3], istart[3], ostart[3];
  long long int alloc_max;
  /* Get the local pencil size and the allocation size */
  alloc_max = accfft_local_size_dft_r2c_t<T>(N, isize, istart, osize, ostart,
      c_comm);

  Tc* A_hat = (Tc*)plan->Mem_mgr->operator_buffer_1;

  //MPI_Barrier(c_comm);

  /* Forward transform */
  accfft_execute_r2c_t<T, Tc, Tp>(plan, A, A_hat, timings);

  /* Multiply x Wave Numbers */
  timings[6] += -MPI_Wtime();
  mult_wave_number_inv_biharmonic_inplace<Tc>(A_hat, N, c_comm);
  timings[6] += +MPI_Wtime();
  //MPI_Barrier(c_comm);

  /* Backward transform */
  accfft_execute_c2r_t<Tc, T, Tp>(plan, A_hat, invBA, timings);


  self_exec_time += MPI_Wtime();

  if (timer == NULL) {
    //delete [] timings;
  } else {
    timer[0] += timings[0];
    timer[1] += timings[1];
    timer[2] += timings[2];
    timer[3] += timings[3];
    timer[4] += timings[4];
		timer[6] += timings[6];
  }
  return;
}

#endif
Browse the archive

https://github.com/amirgholami/accfft