/*
 * ngrid-cuda2.cu - CUDA device code (mixed cgrid/rgrid operations).
 *
 * blockDim = # of threads per block
 * gridDim  = # of blocks in the grid
 *
 */
#include <stdio.h>
#include <cuda_runtime_api.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cufft.h>
#include <cufftXt.h>
#include "cuda-math.h"
#include "cgrid_bc-cuda.h"
#include "cuda-vars.h"
#include "cuda.h"

extern void *grid_gpu_mem;
extern cudaXtDesc *grid_gpu_mem_addr;

extern "C" void cuda_error_check();
/*
 * Real to complex_re.
 *
 * dst.re = src (real); the imaginary part is zeroed.
 *
 */

__global__ void grid_cuda_real_to_complex_re_gpu(CUCOMPLEX *dst, CUREAL *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // Index for the complex grid
  idx2 = tmp * nzz + k; // Index for the (padded) real grid
  dst[idx] = CUMAKE(src[idx2], 0.0);
}
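
/*
 * Note: the kernels in this file map the CUDA x dimension to the grid's z
 * index (the fastest-varying one), y to y, and z to x. A minimal launch
 * sketch under that convention; the actual threads/blocks values come from
 * SETUP_VARIABLES (cuda-vars.h), so the 8x8x8 shape below is only an
 * assumption for illustration:
 *
 *   dim3 threads(8, 8, 8);
 *   dim3 blocks((nz + 7) / 8, (ny + 7) / 8, (nx + 7) / 8);
 *   grid_cuda_real_to_complex_re_gpu<<<blocks, threads>>>(dst, src, nx, ny, nz, nzz);
 *
 * The bounds check at the top of each kernel discards the threads that fall
 * outside the grid when the dimensions are not multiples of the block shape.
 */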
/*
 * Real to complex_re.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_real_to_complex_reW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  dst->gpu_info->subFormat = src->gpu_info->subFormat;
  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_real_to_complex_re_gpu<<<blocks1,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_real_to_complex_re_gpu<<<blocks2,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
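
/*
 * A minimal host-side usage sketch for the wrapper above. It assumes dst
 * and src have already been placed in GPU memory by libgrid's CUDA memory
 * manager; grid_to_gpu() below is a hypothetical placeholder for that
 * step, not a function defined in this library:
 *
 *   gpu_mem_block *dst = grid_to_gpu(my_complex_grid);  // hypothetical helper
 *   gpu_mem_block *src = grid_to_gpu(my_real_grid);     // hypothetical helper
 *   grid_cuda_real_to_complex_reW(dst, src, nx, ny, nz);
 *
 * The wrapper copies the source subformat to the destination, launches one
 * kernel per GPU over that GPU's slab of the data, and checks for errors.
 * The remaining ...W entry points below follow the same pattern.
 */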
/*
 * Real to complex_im.
 *
 * dst.im = src (real); the real part is zeroed.
 *
 */

__global__ void grid_cuda_real_to_complex_im_gpu(CUCOMPLEX *dst, CUREAL *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx] = CUMAKE(0.0, src[idx2]);
}
/*
 * Real to complex_im.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_real_to_complex_imW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  dst->gpu_info->subFormat = src->gpu_info->subFormat;
  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_real_to_complex_im_gpu<<<blocks1,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_real_to_complex_im_gpu<<<blocks2,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Add real to complex_re.
 *
 * dst.re = dst.re + src (real).
 *
 */

__global__ void grid_cuda_add_real_to_complex_re_gpu(CUCOMPLEX *dst, CUREAL *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx].x = dst[idx].x + src[idx2];
}
/*
 * Add real to complex_re.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_add_real_to_complex_reW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  if(src->gpu_info->subFormat != dst->gpu_info->subFormat) {
    fprintf(stderr, "libgrid(cuda): add_real_to_complex_re source/destination must have the same subformat.\n");
    abort();
  }

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_add_real_to_complex_re_gpu<<<blocks1,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_add_real_to_complex_re_gpu<<<blocks2,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Add real to complex_im.
 *
 * dst.im = dst.im + src (real).
 *
 */

__global__ void grid_cuda_add_real_to_complex_im_gpu(CUCOMPLEX *dst, CUREAL *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx].y = dst[idx].y + src[idx2];
}
/*
 * Add real to complex_im.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_add_real_to_complex_imW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  if(src->gpu_info->subFormat != dst->gpu_info->subFormat) {
    fprintf(stderr, "libgrid(cuda): add_real_to_complex_im source/destination must have the same subformat.\n");
    abort();
  }

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_add_real_to_complex_im_gpu<<<blocks1,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_add_real_to_complex_im_gpu<<<blocks2,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Product of real grid with sqnorm of complex grid:
 *
 * dst = src1 * |src2|^2
 *
 */

__global__ void grid_cuda_product_norm_gpu(CUREAL *dst, CUREAL *src1, CUCOMPLEX *src2, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx2] = src1[idx2] * CUCSQNORM(src2[idx]);
}
/*
 * Product of src1 with sqnorm of src2.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src1 = Source for operation 1 (gpu_mem_block *; input).
 * src2 = Source for operation 2 (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_product_normW(gpu_mem_block *dst, gpu_mem_block *src1, gpu_mem_block *src2, INT nx, INT ny, INT nz) {

  SETUP_VARIABLES(src2);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC1 = src1->gpu_info->descriptor, *SRC2 = src2->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  if(src1->gpu_info->subFormat != src2->gpu_info->subFormat) {
    fprintf(stderr, "libgrid(cuda): product_norm source grids must have the same subformat.\n");
    abort();
  }

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_product_norm_gpu<<<blocks1,threads>>>((CUREAL *) DST->data[i], (CUREAL *) SRC1->data[i], (CUCOMPLEX *) SRC2->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_product_norm_gpu<<<blocks2,threads>>>((CUREAL *) DST->data[i], (CUREAL *) SRC1->data[i], (CUCOMPLEX *) SRC2->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Divide real grid by sqnorm of complex grid:
 *
 * dst = src1 / (|src2|^2 + eps)
 *
 */

__global__ void grid_cuda_division_norm_gpu(CUREAL *dst, CUREAL *src1, CUCOMPLEX *src2, CUREAL eps, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx2] = src1[idx2] / (CUCSQNORM(src2[idx]) + eps);
}
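
/*
 * A short worked note on eps: it regularizes the division where |src2|^2
 * vanishes. At a node where src2 = 0 the result is dst = src1 / eps (large
 * but finite) instead of a division by zero; away from nodes the relative
 * bias is of order eps / |src2|^2. For example (values illustrative):
 *
 *   src1 = 1.0, |src2|^2 = 0.0, eps = 1e-12  =>  dst = 1e12
 *   src1 = 1.0, |src2|^2 = 1.0, eps = 1e-12  =>  dst = 1.0 - O(1e-12)
 */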
/*
 * Division of src1 by the sqnorm of src2.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src1 = Source for operation 1 (gpu_mem_block *; input).
 * src2 = Source for operation 2 (gpu_mem_block *; input).
 * eps = Epsilon for division (REAL; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_division_normW(gpu_mem_block *dst, gpu_mem_block *src1, gpu_mem_block *src2, REAL eps, INT nx, INT ny, INT nz) {

  SETUP_VARIABLES(src2);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC1 = src1->gpu_info->descriptor, *SRC2 = src2->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  if(src1->gpu_info->subFormat != src2->gpu_info->subFormat) {
    fprintf(stderr, "libgrid(cuda): division_norm source grids must have the same subformat.\n");
    abort();
  }

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_division_norm_gpu<<<blocks1,threads>>>((CUREAL *) DST->data[i], (CUREAL *) SRC1->data[i], (CUCOMPLEX *) SRC2->data[i], eps, nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_division_norm_gpu<<<blocks2,threads>>>((CUREAL *) DST->data[i], (CUREAL *) SRC1->data[i], (CUCOMPLEX *) SRC2->data[i], eps, nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Product of dst (complex) and src (real).
 *
 * dst = dst * src (real).
 *
 */

__global__ void grid_cuda_product_complex_with_real_gpu(CUCOMPLEX *dst, CUREAL *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx] = dst[idx] * src[idx2];
}
/*
 * Product of dst (complex) with src (real).
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_product_complex_with_realW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  if(src->gpu_info->subFormat != dst->gpu_info->subFormat) {
    fprintf(stderr, "libgrid(cuda): product_complex_with_real source/destination must have the same subformat.\n");
    abort();
  }

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_product_complex_with_real_gpu<<<blocks1,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_product_complex_with_real_gpu<<<blocks2,threads>>>((CUCOMPLEX *) DST->data[i], (CUREAL *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Imaginary part to real grid.
 *
 * dst (real) = src.im;
 *
 */

__global__ void grid_cuda_complex_im_to_real_gpu(CUREAL *dst, CUCOMPLEX *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx2] = CUCIMAG(src[idx]);
}
/*
 * Imaginary part of src to real dst.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_complex_im_to_realW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  dst->gpu_info->subFormat = src->gpu_info->subFormat;
  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_complex_im_to_real_gpu<<<blocks1,threads>>>((CUREAL *) DST->data[i], (CUCOMPLEX *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_complex_im_to_real_gpu<<<blocks2,threads>>>((CUREAL *) DST->data[i], (CUCOMPLEX *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Real part to real grid.
 *
 * dst (real) = src.re;
 *
 */

__global__ void grid_cuda_complex_re_to_real_gpu(CUREAL *dst, CUCOMPLEX *src, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z, idx, idx2, tmp;

  if(i >= nx || j >= ny || k >= nz) return;

  tmp = i * ny + j;
  idx = tmp * nz + k;   // complex grid index
  idx2 = tmp * nzz + k; // real grid index (padded)
  dst[idx2] = CUCREAL(src[idx]);
}
/*
 * Real part of src to real dst.
 *
 * dst = Destination for operation (gpu_mem_block *; output).
 * src = Source for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 *
 */

extern "C" void grid_cuda_complex_re_to_realW(gpu_mem_block *dst, gpu_mem_block *src, INT nx, INT ny, INT nz) {

  dst->gpu_info->subFormat = src->gpu_info->subFormat;
  SETUP_VARIABLES(dst);
  cudaXtDesc *DST = dst->gpu_info->descriptor, *SRC = src->gpu_info->descriptor;
  INT nzz = 2 * (nz / 2 + 1);

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_complex_re_to_real_gpu<<<blocks1,threads>>>((CUREAL *) DST->data[i], (CUCOMPLEX *) SRC->data[i], nnx1, nny1, nz, nzz);
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DST->GPUs[i]);
    grid_cuda_complex_re_to_real_gpu<<<blocks2,threads>>>((CUREAL *) DST->data[i], (CUCOMPLEX *) SRC->data[i], nnx2, nny2, nz, nzz);
  }

  cuda_error_check();
}
/*
 * Integrate opgrid * |dgrid|^2.
 *
 */

__global__ void grid_cuda_grid_expectation_value_gpu(CUCOMPLEX *dgrid, CUREAL *opgrid, CUREAL *blocks, INT nx, INT ny, INT nz, INT nzz) {

  INT k = blockIdx.x * blockDim.x + threadIdx.x, j = blockIdx.y * blockDim.y + threadIdx.y, i = blockIdx.z * blockDim.z + threadIdx.z;
  INT d = blockDim.x * blockDim.y * blockDim.z, idx, idx2, idxc, t;
  extern __shared__ CUREAL els[];

  /* Guard with a conditional rather than an early return: every thread of
     the block must reach the __syncthreads() calls below. */
  idx2 = (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
  els[idx2] = 0.0;  // Each thread zeroes its own shared-memory slot
  __syncthreads();

  if(i < nx && j < ny && k < nz) {
    idx = (i * ny + j) * nzz + k;  // real grid index (padded)
    idxc = (i * ny + j) * nz + k;  // complex grid index
    els[idx2] = opgrid[idx] * CUCSQNORM(dgrid[idxc]);
  }
  __syncthreads();

  if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
    idx2 = (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
    for(t = 0; t < d; t++)
      blocks[idx2] += els[t]; // reduce over the threads of this block
  }
}
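
/*
 * Note: the expectation value is accumulated in three stages (a sketch of
 * the data flow; the array extents below are illustrative):
 *
 *   els[0 .. d-1]     per-thread partial sums in shared memory (one block);
 *   blocks[0 .. B-1]  one partial sum per block, stored in grid_gpu_mem
 *                     (B = number of blocks launched on that GPU);
 *   *value            host-side sum over GPUs of blocks[0], after
 *                     rgrid_cuda_reduce_all() has folded blocks[] down to
 *                     its first element.
 *
 * In other words, value = sum over GPUs, blocks, and threads of
 * opgrid[idx] * |dgrid[idxc]|^2. Only the raw grid-point sum is formed
 * here; no volume element is applied at this level.
 */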
/*
 * Integral of opgrid * |dgrid|^2.
 *
 * dgrid = Source 1 for operation (gpu_mem_block *; input).
 * opgrid = Source 2 for operation (gpu_mem_block *; input).
 * nx = # of points along x (INT; input).
 * ny = # of points along y (INT; input).
 * nz = # of points along z (INT; input).
 * *value = Return value (CUREAL *; output).
 *
 */

extern __global__ void rgrid_cuda_block_init(CUREAL *);
extern "C" void rgrid_cuda_reduce_all(CUREAL *, INT);

extern "C" void grid_cuda_grid_expectation_valueW(gpu_mem_block *dgrid, gpu_mem_block *opgrid, INT nx, INT ny, INT nz, CUREAL *value) {

  SETUP_VARIABLES_REAL(dgrid);
  cudaXtDesc *DGRID = dgrid->gpu_info->descriptor, *OPGRID = opgrid->gpu_info->descriptor;
  CUREAL tmp;
  INT s = CUDA_THREADS_PER_BLOCK * CUDA_THREADS_PER_BLOCK * CUDA_THREADS_PER_BLOCK, b31 = blocks1.x * blocks1.y * blocks1.z, b32 = blocks2.x * blocks2.y * blocks2.z;
  extern int cuda_get_element(void *, int, size_t, size_t, void *);

  if(dgrid->gpu_info->subFormat != CUFFT_XT_FORMAT_INPLACE || opgrid->gpu_info->subFormat != CUFFT_XT_FORMAT_INPLACE) {
    fprintf(stderr, "libgrid(cuda): grid_expectation_value grids must be in real space (INPLACE).\n");
    abort();
  }

  for(i = 0; i < ngpu1; i++) {
    cudaSetDevice(DGRID->GPUs[i]);
    rgrid_cuda_block_init<<<b31 / CUDA_THREADS_PER_BLOCK, CUDA_THREADS_PER_BLOCK>>>((CUREAL *) grid_gpu_mem_addr->data[i]);
    // Blocks, threads, dynamic shared memory size
    grid_cuda_grid_expectation_value_gpu<<<blocks1,threads,s*sizeof(CUREAL)>>>((CUCOMPLEX *) DGRID->data[i], (CUREAL *) OPGRID->data[i],
       (CUREAL *) grid_gpu_mem_addr->data[i], nnx1, ny, nz, nzz);
    rgrid_cuda_reduce_all((CUREAL *) grid_gpu_mem_addr->data[i], b31); // reduce over blocks
  }

  for(i = ngpu1; i < ngpu2; i++) {
    cudaSetDevice(DGRID->GPUs[i]);
    rgrid_cuda_block_init<<<b32 / CUDA_THREADS_PER_BLOCK, CUDA_THREADS_PER_BLOCK>>>((CUREAL *) grid_gpu_mem_addr->data[i]);
    // Blocks, threads, dynamic shared memory size
    grid_cuda_grid_expectation_value_gpu<<<blocks2,threads,s*sizeof(CUREAL)>>>((CUCOMPLEX *) DGRID->data[i], (CUREAL *) OPGRID->data[i],
       (CUREAL *) grid_gpu_mem_addr->data[i], nnx2, ny, nz, nzz);
    rgrid_cuda_reduce_all((CUREAL *) grid_gpu_mem_addr->data[i], b32); // reduce over blocks
  }

  // Reduce over GPUs
  *value = 0.0;
  for(i = 0; i < ngpu2; i++) {
    cuda_get_element(grid_gpu_mem, i, 0, sizeof(CUREAL), &tmp);
    *value = *value + tmp;
  }

  cuda_error_check();
}