// HostCUDA.cu -- https://github.com/N-BodyShop/changa
// Tip revision cdd9b98293f4a8568def0446daeb81d829dfd5d7 (Lukasz Wesolowski, 18 June 2015):
// fix for synchronizing the GPU Manager device buffer table after transfers
// of remote chunk data.
#ifdef _WIN32
#define NOMINMAX
#endif
#ifdef CUDA_MEMPOOL
#define GPU_MEMPOOL
#endif
#ifdef CUDA_INSTRUMENT_WRS
#define GPU_INSTRUMENT_WRS
#endif
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <float.h>
// #include <cutil.h>
#include "CudaFunctions.h"
#ifdef HEXADECAPOLE
# include "CUDAMoments.cu"
#endif
#include "HostCUDA.h"
#include "EwaldCUDA.h"
#include "wr.h"
#define CmiMyPe() _Cmi_mype
extern int _Cmi_mype;
//extern workRequestQueue *wrQueue;
//extern void **devBuffers;
//extern cudaStream_t kernel_stream;
__device__ __constant__ EwaldReadOnlyData cachedData[1];
__device__ __constant__ EwtData ewt[NEWH];
//__constant__ constantData[88];
//
//
#ifdef CUDA_TRACE
extern "C" void traceUserBracketEvent(int e, double beginT, double endT);
extern "C" double CmiWallTimer();
#endif
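/* Allocate page-locked (pinned) host memory for staging transfers to the GPU.
 * Uses the HAPI memory pool when CUDA_MEMPOOL is defined, otherwise
 * cudaMallocHost. A nonpositive size is treated as a fatal error. */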
void allocatePinnedHostMemory(void **ptr, int size){
if(size <= 0){
*((char **)ptr) = NULL;
#ifdef CUDA_PRINT_ERRORS
printf("allocatePinnedHostMemory: 0 size!\n");
#endif
exit(-1);
}
#ifdef CUDA_MEMPOOL
*ptr = hapi_poolMalloc(size);
#else
cudaMallocHost(ptr, size);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("allocatePinnedHostMemory: %s size: %d\n", cudaGetErrorString( cudaGetLastError() ), size);
#endif
}
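/* Release pinned host memory obtained from allocatePinnedHostMemory.
 * Returns the block to the HAPI pool when CUDA_MEMPOOL is defined, otherwise
 * hands the pointer to delayedFree. A NULL pointer is treated as a fatal error. */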
void freePinnedHostMemory(void *ptr){
if(ptr == NULL){
#ifdef CUDA_PRINT_ERRORS
printf("freePinnedHostMemory: NULL ptr!\n");
#endif
exit(-1);
}
#ifdef CUDA_MEMPOOL
hapi_poolFree(ptr);
#else
delayedFree(ptr);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("freePinnedHostMemory: %s\n", cudaGetErrorString( cudaGetLastError() ));
#endif
}
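/* Enqueue a work request that stages the local tree -- multipole moments,
 * compact particle cores, and a zero-initialized array of per-particle
 * variables (accelerations and potentials) -- in pinned host buffers for
 * transfer to the GPU. This request launches no kernel. */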
#ifdef CUDA_INSTRUMENT_WRS
void DataManagerTransferLocalTree(CudaMultipoleMoments *moments, int nMoments, CompactPartData *compactParts, int nCompactParts, int mype, char phase, void *wrCallback) {
#else
void DataManagerTransferLocalTree(CudaMultipoleMoments *moments, int nMoments, CompactPartData *compactParts, int nCompactParts, int mype, void *wrCallback) {
#endif
workRequest transferKernel;
dataInfo *buf;
int size;
// operation will not invoke a kernel
transferKernel.dimGrid = dim3(0);
transferKernel.dimBlock = dim3(0);
transferKernel.smemSize = 0;
transferKernel.nBuffers = DM_TRANSFER_LOCAL_NBUFFERS;
/* schedule buffers for transfer to the GPU */
transferKernel.bufferInfo = (dataInfo *) malloc(transferKernel.nBuffers * sizeof(dataInfo));
buf = &(transferKernel.bufferInfo[LOCAL_MOMENTS_IDX]);
buf->bufferID = LOCAL_MOMENTS;
buf->freeBuffer = false;
size = (nMoments) * sizeof(CudaMultipoleMoments);
buf->size = size;
buf->transferToDevice = size > 0;
buf->transferFromDevice = false;
//double mill = 1e6;
//printf("(%d) DM local moments: %f mbytes\n", mype, 1.0*size/mill);
#ifdef CUDA_PRINT_ERRORS
printf("(%d) DMLocal 0000: %s\n", mype, cudaGetErrorString( cudaGetLastError() ));
#endif
if(size > 0){
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
buf->hostBuffer = hapi_poolMalloc(size);
#else
cudaMallocHost(&buf->hostBuffer, size);
#endif
#else
buf->hostBuffer = malloc(size);
#endif
}
else{
buf->hostBuffer = NULL;
}
#ifdef CUDA_PRINT_ERRORS
printf("(%d) DMLocal 0: %s hostBuf: 0x%x, size: %d\n", mype, cudaGetErrorString( cudaGetLastError() ), buf->hostBuffer, size );
#endif
if(size > 0){
memcpy(buf->hostBuffer, moments, size);
}
buf = &(transferKernel.bufferInfo[LOCAL_PARTICLE_CORES_IDX]);
buf->bufferID = LOCAL_PARTICLE_CORES;
buf->freeBuffer = false;
size = (nCompactParts)*sizeof(CompactPartData);
buf->size = size;
buf->transferToDevice = size > 0;
buf->transferFromDevice = false;
if(size > 0){
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
buf->hostBuffer = hapi_poolMalloc(size);
#else
cudaMallocHost(&buf->hostBuffer, size);
#endif
#else
buf->hostBuffer = malloc(size);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("(%d) DMLocal 1: %s\n", mype, cudaGetErrorString( cudaGetLastError() ) );
#endif
memcpy(buf->hostBuffer, compactParts, size);
}
else{
buf->hostBuffer = NULL;
}
VariablePartData *zeroArray;
buf = &(transferKernel.bufferInfo[LOCAL_PARTICLE_VARS_IDX]);
buf->bufferID = LOCAL_PARTICLE_VARS;
buf->freeBuffer = false;
size = (nCompactParts)*sizeof(VariablePartData);
buf->size = size;
buf->transferToDevice = size > 0;
buf->transferFromDevice = false;
if(size > 0){
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
buf->hostBuffer = hapi_poolMalloc(size);
#else
cudaMallocHost(&buf->hostBuffer, size);
#endif
#else
buf->hostBuffer = malloc(size);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("(%d) DMLocal 2: %s\n", mype, cudaGetErrorString( cudaGetLastError() ));
#endif
zeroArray = (VariablePartData *) buf->hostBuffer;
for(int i = 0; i < nCompactParts; i++){
zeroArray[i].a.x = 0.0;
zeroArray[i].a.y = 0.0;
zeroArray[i].a.z = 0.0;
zeroArray[i].potential = 0.0;
}
}
else{
buf->hostBuffer = NULL;
}
transferKernel.callbackFn = wrCallback;
transferKernel.id = DM_TRANSFER_LOCAL;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) DM LOCAL TREE moments %d (%d) partcores %d (%d) partvars %d (%d)\n",
CmiMyPe(),
transferKernel.bufferInfo[LOCAL_MOMENTS_IDX].size,
transferKernel.bufferInfo[LOCAL_MOMENTS_IDX].transferToDevice,
transferKernel.bufferInfo[LOCAL_PARTICLE_CORES_IDX].size,
transferKernel.bufferInfo[LOCAL_PARTICLE_CORES_IDX].transferToDevice,
transferKernel.bufferInfo[LOCAL_PARTICLE_VARS_IDX].size,
transferKernel.bufferInfo[LOCAL_PARTICLE_VARS_IDX].transferToDevice
);
#endif
#ifdef CUDA_INSTRUMENT_WRS
transferKernel.chareIndex = mype;
transferKernel.compType = transferKernel.id;
transferKernel.compPhase = phase;
#endif
enqueue(&transferKernel);
}
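/* Enqueue a work request that stages a remote chunk's multipole moments and
 * compact particle cores in pinned host buffers for transfer to the GPU.
 * This request launches no kernel. */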
#ifdef CUDA_INSTRUMENT_WRS
void DataManagerTransferRemoteChunk(CudaMultipoleMoments *moments, int nMoments, CompactPartData *compactParts, int nCompactParts, int mype, char phase, void *wrCallback) {
#else
void DataManagerTransferRemoteChunk(CudaMultipoleMoments *moments, int nMoments, CompactPartData *compactParts, int nCompactParts, void *wrCallback) {
#endif
workRequest transferKernel;
dataInfo *buf;
int size;
// operation will not invoke a kernel
transferKernel.dimGrid = dim3(0);
transferKernel.dimBlock = dim3(0);
transferKernel.smemSize = 0;
transferKernel.nBuffers = DM_TRANSFER_REMOTE_CHUNK_NBUFFERS;
transferKernel.bufferInfo = (dataInfo *) malloc(transferKernel.nBuffers * sizeof(dataInfo));
buf = &(transferKernel.bufferInfo[REMOTE_MOMENTS_IDX]);
buf->bufferID = REMOTE_MOMENTS;
buf->transferFromDevice = false;
buf->freeBuffer = false;
size = (nMoments) * sizeof(CudaMultipoleMoments);
buf->size = size;
buf->transferToDevice = (size > 0);
//double mill = 1e6;
//printf("DM remote moments: %f mbytes\n", 1.0*size/mill);
if(size > 0){
CUDA_MALLOC(buf->hostBuffer, size);
#ifdef CUDA_PRINT_ERRORS
printf("DMRemote 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
memcpy(buf->hostBuffer, moments, size);
}
buf = &(transferKernel.bufferInfo[REMOTE_PARTICLE_CORES_IDX]);
buf->bufferID = REMOTE_PARTICLE_CORES;
buf->transferFromDevice = false;
buf->freeBuffer = false;
size = (nCompactParts)*sizeof(CompactPartData);
buf->size = size;
buf->transferToDevice = (size > 0);
//printf("DM remote cores: %f mbytes\n", 1.0*size/mill);
if(size > 0){
CUDA_MALLOC(buf->hostBuffer, size);
#ifdef CUDA_PRINT_ERRORS
printf("DMRemote 1: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
memcpy(buf->hostBuffer, compactParts, size);
}
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) DM REMOTE CHUNK moments %d (%d) partcores %d (%d)\n",
CmiMyPe(),
transferKernel.bufferInfo[REMOTE_MOMENTS_IDX].size,
transferKernel.bufferInfo[REMOTE_MOMENTS_IDX].transferToDevice,
transferKernel.bufferInfo[REMOTE_PARTICLE_CORES_IDX].size,
transferKernel.bufferInfo[REMOTE_PARTICLE_CORES_IDX].transferToDevice
);
#endif
transferKernel.callbackFn = wrCallback;
transferKernel.id = DM_TRANSFER_REMOTE_CHUNK;
#ifdef CUDA_INSTRUMENT_WRS
transferKernel.chareIndex = mype;
transferKernel.compType = transferKernel.id;
transferKernel.compPhase = phase;
#endif
enqueue(&transferKernel);
}
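/* Enqueue the local node-interaction (cell list) gravity work request for a
 * tree piece. The interaction list and per-bucket bookkeeping arrays are
 * staged by TreePieceCellListDataTransferBasic. */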
void TreePieceCellListDataTransferLocal(CudaRequest *data){
int numBlocks = data->numBucketsPlusOne-1;
workRequest gravityKernel;
//dataInfo *buffer, *partCoreBuffer;
//ParameterStruct *pmtr;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK,PARTS_PER_BLOCK);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_GRAVITY_LOCAL_NBUFFERS;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePieceCellListDataTransferBasic(data, &gravityKernel);
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER LOCAL CELL\n", CmiMyPe());
#endif
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_GRAVITY_LOCAL;
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
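/* Enqueue the remote node-interaction (cell list) gravity work request for a
 * tree piece; buffers are staged by TreePieceCellListDataTransferBasic. */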
void TreePieceCellListDataTransferRemote(CudaRequest *data){
int numBlocks = data->numBucketsPlusOne-1;
workRequest gravityKernel;
//dataInfo *buffer, *partCoreBuffer;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK,PARTS_PER_BLOCK);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_NODE_GRAVITY_REMOTE_NBUFFERS;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePieceCellListDataTransferBasic(data, &gravityKernel);
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER REMOTE CELL\n", CmiMyPe());
#endif
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_GRAVITY_REMOTE;
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
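/* Enqueue the remote-resume node gravity work request. In addition to the
 * basic cell-list buffers, this stages the multipole moments of nodes that
 * were missed during the remote walk and had to be fetched separately. */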
void TreePieceCellListDataTransferRemoteResume(CudaRequest *data, CudaMultipoleMoments *missedMoments, int numMissedMoments){
int numBlocks = data->numBucketsPlusOne-1;
int size;
workRequest gravityKernel;
dataInfo *buffer;
bool transfer;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK,PARTS_PER_BLOCK);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_NODE_GRAVITY_REMOTE_RESUME_NBUFFERS;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePieceCellListDataTransferBasic(data, &gravityKernel);
transfer = gravityKernel.bufferInfo[ILCELL_IDX].transferToDevice;
buffer = &(gravityKernel.bufferInfo[MISSED_MOMENTS_IDX]);
buffer->bufferID = -1;
size = (numMissedMoments) * sizeof(CudaMultipoleMoments);
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
if(transfer){
CUDA_MALLOC(buffer->hostBuffer, size);
#ifdef CUDA_PRINT_ERRORS
printf("TPRR 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
memcpy(buffer->hostBuffer, missedMoments, size);
}
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER REMOTE RESUME CELL %d (%d)\n", CmiMyPe(),
buffer->size, buffer->transferToDevice);
#endif
ParameterStruct *ptr = (ParameterStruct *)gravityKernel.userData;
ptr->numEntities = numMissedMoments;
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_GRAVITY_REMOTE_RESUME;
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
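/* Fill in the buffers common to every cell-list gravity request: the
 * interaction list itself plus the per-bucket marker, start, and size
 * arrays, and attach a ParameterStruct describing the request. */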
void TreePieceCellListDataTransferBasic(CudaRequest *data, workRequest *gravityKernel){
dataInfo *buffer;
int numBucketsPlusOne = data->numBucketsPlusOne;
int numBuckets = numBucketsPlusOne-1;
int size = (data->numInteractions) * sizeof(ILCell);
bool transfer = size > 0;
buffer = &(gravityKernel->bufferInfo[ILCELL_IDX]);
buffer->bufferID = -1;
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->list;
buffer = &(gravityKernel->bufferInfo[NODE_BUCKET_MARKERS_IDX]);
buffer->bufferID = -1;
size = (numBucketsPlusOne) * sizeof(int);
buffer->size = transfer ? size : 0;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->bucketMarkers;
buffer = &(gravityKernel->bufferInfo[NODE_BUCKET_START_MARKERS_IDX]);
buffer->bufferID = -1;
size = (numBuckets) * sizeof(int);
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->bucketStarts;
buffer = &(gravityKernel->bufferInfo[NODE_BUCKET_SIZES_IDX]);
buffer->bufferID = -1;
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->bucketSizes;
ParameterStruct *ptr = (ParameterStruct *)malloc(sizeof(ParameterStruct));
ptr->numInteractions = data->numInteractions;
ptr->numBucketsPlusOne = numBucketsPlusOne;
ptr->fperiod = data->fperiod;
gravityKernel->userData = ptr;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER BASIC cells %d (%d) bucket_markers %d (%d) bucket_starts %d (%d) bucket_sizes %d (%d)\n",
CmiMyPe(),
gravityKernel->bufferInfo[ILCELL_IDX].size,
gravityKernel->bufferInfo[ILCELL_IDX].transferToDevice,
gravityKernel->bufferInfo[NODE_BUCKET_MARKERS_IDX].size,
gravityKernel->bufferInfo[NODE_BUCKET_MARKERS_IDX].transferToDevice,
gravityKernel->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].size,
gravityKernel->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].transferToDevice,
gravityKernel->bufferInfo[NODE_BUCKET_SIZES_IDX].size,
gravityKernel->bufferInfo[NODE_BUCKET_SIZES_IDX].transferToDevice
);
#endif
}
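/* Enqueue the small-phase local particle-particle gravity work request. The
 * source particles for the interactions are shipped with the request in an
 * extra buffer instead of being read from the resident local-particle array. */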
void TreePiecePartListDataTransferLocalSmallPhase(CudaRequest *data, CompactPartData *particles, int len){
int numBlocks = data->numBucketsPlusOne-1;
workRequest gravityKernel;
dataInfo *buffer;
int size;
ParameterStruct *ptr;
bool transfer;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK_PART,PARTS_PER_BLOCK_PART);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_GRAVITY_LOCAL_NBUFFERS_SMALLPHASE;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePiecePartListDataTransferBasic(data, &gravityKernel);
transfer = gravityKernel.bufferInfo[ILPART_IDX].transferToDevice;
buffer = &(gravityKernel.bufferInfo[MISSED_PARTS_IDX]);
buffer->bufferID = -1;
size = (len) * sizeof(CompactPartData);
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER LOCAL SMALL PHASE %d (%d)\n",
CmiMyPe(),
buffer->size,
buffer->transferToDevice
);
#endif
if(transfer){
CUDA_MALLOC(buffer->hostBuffer, size);
#ifdef CUDA_PRINT_ERRORS
printf("TPPartSmallPhase 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
memcpy(buffer->hostBuffer, particles, size);
}
ptr = (ParameterStruct *)gravityKernel.userData;
ptr->numMissedCores = len;
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_PART_GRAVITY_LOCAL_SMALLPHASE;
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
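/* Enqueue the local (large-phase) particle-particle gravity work request;
 * buffers are staged by TreePiecePartListDataTransferBasic. */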
void TreePiecePartListDataTransferLocal(CudaRequest *data){
int numBlocks = data->numBucketsPlusOne-1;
workRequest gravityKernel;
//dataInfo *buffer, *partCoreBuffer;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK_PART,PARTS_PER_BLOCK_PART);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_GRAVITY_LOCAL_NBUFFERS;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePiecePartListDataTransferBasic(data, &gravityKernel);
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_PART_GRAVITY_LOCAL;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER LOCAL LARGEPHASE PART\n", CmiMyPe());
#endif
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
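/* Enqueue the remote particle-particle gravity work request; buffers are
 * staged by TreePiecePartListDataTransferBasic. */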
void TreePiecePartListDataTransferRemote(CudaRequest *data){
int numBlocks = data->numBucketsPlusOne-1;
workRequest gravityKernel;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK_PART,PARTS_PER_BLOCK_PART);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_PART_GRAVITY_REMOTE_NBUFFERS;
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePiecePartListDataTransferBasic(data, &gravityKernel);
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_PART_GRAVITY_REMOTE;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER REMOTE PART\n", CmiMyPe());
#endif
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
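/* Enqueue the remote-resume particle-particle gravity work request. Stages
 * the particles that were missed during the remote walk in an extra buffer. */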
void TreePiecePartListDataTransferRemoteResume(CudaRequest *data, CompactPartData *missedParts, int numMissedParts){
int numBlocks = data->numBucketsPlusOne-1;
int size;
ParameterStruct *ptr;
bool transfer;
workRequest gravityKernel;
dataInfo *buffer;
gravityKernel.dimGrid = dim3(numBlocks);
#ifdef CUDA_2D_TB_KERNEL
gravityKernel.dimBlock = dim3(NODES_PER_BLOCK_PART,PARTS_PER_BLOCK_PART);
#else
gravityKernel.dimBlock = dim3(THREADS_PER_BLOCK);
#endif
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = TP_PART_GRAVITY_REMOTE_RESUME_NBUFFERS;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(gravityKernel.nBuffers * sizeof(dataInfo));
TreePiecePartListDataTransferBasic(data, &gravityKernel);
transfer = gravityKernel.bufferInfo[ILPART_IDX].transferToDevice;
buffer = &(gravityKernel.bufferInfo[MISSED_PARTS_IDX]);
buffer->bufferID = -1;
size = (numMissedParts) * sizeof(CompactPartData);
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER REMOTE RESUME PART %d (%d)\n",
CmiMyPe(),
buffer->size,
buffer->transferToDevice
);
#endif
if(transfer){
CUDA_MALLOC(buffer->hostBuffer, size);
#ifdef CUDA_PRINT_ERRORS
printf("TPPartRR 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
memcpy(buffer->hostBuffer, missedParts, size);
}
ptr = (ParameterStruct *)gravityKernel.userData;
ptr->numMissedCores = numMissedParts;
gravityKernel.callbackFn = data->cb;
gravityKernel.id = TP_PART_GRAVITY_REMOTE_RESUME;
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = data->tpIndex;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = data->phase;
#endif
enqueue(&gravityKernel);
}
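/* Fill in the buffers common to every particle-list gravity request: the
 * particle interaction list plus the per-bucket marker, start, and size
 * arrays, and attach a ParameterStruct describing the request. */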
void TreePiecePartListDataTransferBasic(CudaRequest *data, workRequest *gravityKernel){
dataInfo *buffer;
int numInteractions = data->numInteractions;
int numBucketsPlusOne = data->numBucketsPlusOne;
int numBuckets = numBucketsPlusOne-1;
int size;
bool transfer;
buffer = &(gravityKernel->bufferInfo[ILPART_IDX]);
buffer->bufferID = -1;
size = (numInteractions) * sizeof(ILCell);
buffer->size = size;
transfer = size > 0;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->list;
buffer = &(gravityKernel->bufferInfo[PART_BUCKET_MARKERS_IDX]);
buffer->bufferID = -1;
size = (numBucketsPlusOne) * sizeof(int);
buffer->size = transfer ? size : 0;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->bucketMarkers;
buffer = &(gravityKernel->bufferInfo[PART_BUCKET_START_MARKERS_IDX]);
buffer->bufferID = -1;
size = (numBuckets) * sizeof(int);
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->bucketStarts;
buffer = &(gravityKernel->bufferInfo[PART_BUCKET_SIZES_IDX]);
buffer->bufferID = -1;
buffer->size = size;
buffer->transferToDevice = transfer;
buffer->freeBuffer = transfer;
buffer->transferFromDevice = false;
buffer->hostBuffer = data->bucketSizes;
ParameterStruct *ptr = (ParameterStruct *)malloc(sizeof(ParameterStruct));
ptr->numInteractions = data->numInteractions;
ptr->numBucketsPlusOne = numBucketsPlusOne;
ptr->fperiod = data->fperiod;
gravityKernel->userData = ptr;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) TRANSFER BASIC PART parts %d (%d) bucket_markers %d (%d) bucket_starts %d (%d) bucket_sizes %d (%d)\n",
CmiMyPe(),
gravityKernel->bufferInfo[ILPART_IDX].size,
gravityKernel->bufferInfo[ILPART_IDX].transferToDevice,
gravityKernel->bufferInfo[PART_BUCKET_MARKERS_IDX].size,
gravityKernel->bufferInfo[PART_BUCKET_MARKERS_IDX].transferToDevice,
gravityKernel->bufferInfo[PART_BUCKET_START_MARKERS_IDX].size,
gravityKernel->bufferInfo[PART_BUCKET_START_MARKERS_IDX].transferToDevice,
gravityKernel->bufferInfo[PART_BUCKET_SIZES_IDX].size,
gravityKernel->bufferInfo[PART_BUCKET_SIZES_IDX].transferToDevice
);
#endif
}
/* kernels:
TP_GRAVITY_LOCAL,
TP_GRAVITY_REMOTE,
TP_GRAVITY_REMOTE_RESUME,
TP_PART_GRAVITY_LOCAL,
TP_PART_GRAVITY_REMOTE,
TP_PART_GRAVITY_REMOTE_RESUME
*/
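/* Enqueue a request that marks the device buffers holding the local tree
 * moments and/or particle cores for freeing, as selected by freemom and
 * freepart. Nothing is transferred and no kernel runs. */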
#ifdef CUDA_INSTRUMENT_WRS
void FreeDataManagerLocalTreeMemory(bool freemom, bool freepart, int index, char phase){
#else
void FreeDataManagerLocalTreeMemory(bool freemom, bool freepart){
#endif
workRequest gravityKernel;
dataInfo *buffer;
gravityKernel.dimGrid = dim3(0);
gravityKernel.dimBlock = dim3(0);
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = DM_TRANSFER_LOCAL_NBUFFERS-1;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc((DM_TRANSFER_LOCAL_NBUFFERS-1) * sizeof(dataInfo));
buffer = &(gravityKernel.bufferInfo[LOCAL_MOMENTS_IDX]);
buffer->bufferID = LOCAL_MOMENTS;
buffer->transferToDevice = false ;
buffer->transferFromDevice = false;
buffer->freeBuffer = freemom;
buffer->hostBuffer = 0;
buffer->size = 0;
buffer = &(gravityKernel.bufferInfo[LOCAL_PARTICLE_CORES_IDX]);
buffer->bufferID = LOCAL_PARTICLE_CORES;
buffer->transferToDevice = false ;
buffer->transferFromDevice = false;
buffer->freeBuffer = freepart;
buffer->hostBuffer = 0;
buffer->size = 0;
gravityKernel.callbackFn = 0;
gravityKernel.id = DM_TRANSFER_FREE_LOCAL;
//printf("DM TRANSFER FREE LOCAL\n");
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = index;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = phase;
#endif
enqueue(&gravityKernel);
}
// this function begins the transfer of the next
// pending chunk on dm, once we are assured that
// the previous chunk's allocated memory has been
// freed
void initiateNextChunkTransfer(void *dm);
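/* Enqueue a request that marks the remote-chunk moment and particle-core
 * device buffers for freeing, as selected by freemom and freepart. A pointer
 * to the data manager is carried in userData so that the next chunk's
 * transfer can be initiated once this chunk's memory has been freed. */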
#ifdef CUDA_INSTRUMENT_WRS
void FreeDataManagerRemoteChunkMemory(int chunk, void *dm, bool freemom, bool freepart, int index, char phase){
#else
void FreeDataManagerRemoteChunkMemory(int chunk, void *dm, bool freemom, bool freepart){
#endif
workRequest gravityKernel;
dataInfo *buffer;
gravityKernel.dimGrid = dim3(0);
gravityKernel.dimBlock = dim3(0);
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = DM_TRANSFER_REMOTE_CHUNK_NBUFFERS;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc((DM_TRANSFER_REMOTE_CHUNK_NBUFFERS) * sizeof(dataInfo));
buffer = &(gravityKernel.bufferInfo[REMOTE_MOMENTS_IDX]);
buffer->bufferID = REMOTE_MOMENTS;
buffer->transferToDevice = false;
buffer->transferFromDevice = false;
buffer->freeBuffer = freemom;
buffer->size = 0;
buffer = &(gravityKernel.bufferInfo[REMOTE_PARTICLE_CORES_IDX]);
buffer->bufferID = REMOTE_PARTICLE_CORES;
buffer->transferToDevice = false;
buffer->transferFromDevice = false;
buffer->freeBuffer = freepart;
buffer->size = 0;
gravityKernel.callbackFn = 0;
gravityKernel.id = DM_TRANSFER_FREE_REMOTE_CHUNK;
// save a pointer to the data manager so that
// the next chunk's transfer can be initiated once
// the memory for this chunk has been freed
gravityKernel.userData = dm;
//printf("DM TRANSFER FREE REMOTE CHUNK\n");
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = index;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = phase;
#endif
enqueue(&gravityKernel);
}
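/* Enqueue the request that copies the accumulated accelerations and
 * potentials (VariablePartData) back from the GPU into hostBuffer, and
 * optionally marks the local moment and particle-core device buffers for
 * freeing. */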
#ifdef CUDA_INSTRUMENT_WRS
void TransferParticleVarsBack(VariablePartData *hostBuffer, int size, void *cb, bool freemom, bool freepart, int index, char phase){
#else
void TransferParticleVarsBack(VariablePartData *hostBuffer, int size, void *cb, bool freemom, bool freepart){
#endif
workRequest gravityKernel;
dataInfo *buffer;
#ifdef CUDA_PRINT_TRANSFER_BACK_PARTICLES
printf("Enqueue kernel to transfer particles back from: 0x%x\n", devBuffers[LOCAL_PARTICLE_VARS]);
#endif
gravityKernel.dimGrid = dim3(0);
gravityKernel.dimBlock = dim3(0);
gravityKernel.smemSize = 0;
gravityKernel.nBuffers = 3;
/* schedule buffers for transfer to the GPU */
gravityKernel.bufferInfo = (dataInfo *) malloc(3 * sizeof(dataInfo));
buffer = &(gravityKernel.bufferInfo[LOCAL_PARTICLE_VARS_IDX]);
buffer->bufferID = LOCAL_PARTICLE_VARS;
buffer->transferToDevice = false;
buffer->hostBuffer = hostBuffer;
buffer->size = size;
buffer->transferFromDevice = size > 0;
buffer->freeBuffer = size > 0;
buffer = &(gravityKernel.bufferInfo[LOCAL_MOMENTS_IDX]);
buffer->bufferID = LOCAL_MOMENTS;
buffer->transferToDevice = false;
buffer->transferFromDevice = false;
buffer->freeBuffer = freemom;
buffer->hostBuffer = NULL;
buffer->size = 0;
buffer = &(gravityKernel.bufferInfo[LOCAL_PARTICLE_CORES_IDX]);
buffer->bufferID = LOCAL_PARTICLE_CORES;
buffer->transferToDevice = false ;
buffer->transferFromDevice = false;
buffer->freeBuffer = freepart;
buffer->hostBuffer = NULL;
buffer->size = 0;
gravityKernel.callbackFn = cb;
gravityKernel.id = DM_TRANSFER_BACK;
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("(%d) DM TRANSFER BACK\n", CmiMyPe());
#endif
#ifdef CUDA_INSTRUMENT_WRS
gravityKernel.chareIndex = index;
gravityKernel.compType = gravityKernel.id;
gravityKernel.compPhase = phase;
#endif
enqueue(&gravityKernel);
}
/*
void DummyKernel(void *cb){
workRequest dummy;
//dataInfo *buffer;
dummy.dimGrid = dim3(1);
dummy.dimBlock = dim3(THREADS_PER_BLOCK);
dummy.smemSize = 0;
dummy.nBuffers = 0;
dummy.bufferInfo = 0; //(dataInfo *) malloc(1 * sizeof(dataInfo));
dummy.callbackFn = cb;
dummy.id = DUMMY;
enqueue(wrQueue, &dummy);
}
*/
// Kernel selector: called by the GPU manager when a work request is
// processed. Transfer-only requests release their pinned host staging
// buffers here; gravity requests launch the node or particle gravity kernel
// on kernel_stream and then free the staged interaction-list buffers; the
// Ewald requests copy their read-only tables to constant memory before
// launching; DM_TRANSFER_FREE_REMOTE_CHUNK triggers the next chunk transfer.
void kernelSelect(workRequest *wr) {
#ifdef CUDA_TRACE
double st;
#endif
cudaStream_t kernel_stream = getKernelStream();
void** devBuffers = getdevBuffers();
ParameterStruct *ptr;
switch (wr->id) {
case DM_TRANSFER_LOCAL:
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("DM_TRANSFER_LOCAL KERNELSELECT\n");
printf("mom: 0x%x\n", devBuffers[LOCAL_MOMENTS]);
printf("cores: 0x%x\n", devBuffers[LOCAL_PARTICLE_CORES]);
printf("vars: 0x%x\n", devBuffers[LOCAL_PARTICLE_VARS]);
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
if( wr->bufferInfo[LOCAL_MOMENTS_IDX].transferToDevice ){
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[LOCAL_MOMENTS_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[LOCAL_MOMENTS_IDX].hostBuffer);
#endif
}
#ifdef CUDA_PRINT_ERRORS
printf("DM_TRANSFER_LOCAL 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
if( wr->bufferInfo[LOCAL_PARTICLE_CORES_IDX].transferToDevice ){
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[LOCAL_PARTICLE_CORES_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[LOCAL_PARTICLE_CORES_IDX].hostBuffer);
#endif
}
#ifdef CUDA_PRINT_ERRORS
printf("DM_TRANSFER_LOCAL 1: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
if( wr->bufferInfo[LOCAL_PARTICLE_VARS_IDX].transferToDevice ){
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[LOCAL_PARTICLE_VARS_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[LOCAL_PARTICLE_VARS_IDX].hostBuffer);
#endif
}
#ifdef CUDA_PRINT_ERRORS
printf("DM_TRANSFER_LOCAL 2: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
if( wr->bufferInfo[LOCAL_MOMENTS_IDX].transferToDevice ){
free(wr->bufferInfo[LOCAL_MOMENTS_IDX].hostBuffer);
}
if( wr->bufferInfo[LOCAL_PARTICLE_CORES_IDX].transferToDevice ){
free(wr->bufferInfo[LOCAL_PARTICLE_CORES_IDX].hostBuffer);
}
if( wr->bufferInfo[LOCAL_PARTICLE_VARS_IDX].transferToDevice ){
free(wr->bufferInfo[LOCAL_PARTICLE_VARS_IDX].hostBuffer);
}
#endif
break;
case DM_TRANSFER_REMOTE_CHUNK:
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("DM_TRANSFER_REMOTE_CHUNK, %d KERNELSELECT\n", wr->bufferInfo[REMOTE_MOMENTS_IDX].transferToDevice);
#endif
if( wr->bufferInfo[REMOTE_MOMENTS_IDX].transferToDevice ){
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[REMOTE_MOMENTS_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[REMOTE_MOMENTS_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("DM_TRANSFER_REMOTE_CHUNK 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free(wr->bufferInfo[REMOTE_MOMENTS_IDX].hostBuffer);
#endif
}
if( wr->bufferInfo[REMOTE_PARTICLE_CORES_IDX].transferToDevice ){
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[REMOTE_PARTICLE_CORES_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[REMOTE_PARTICLE_CORES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("DM_TRANSFER_REMOTE_CHUNK 1: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free(wr->bufferInfo[REMOTE_PARTICLE_CORES_IDX].hostBuffer);
#endif
}
break;
case DM_TRANSFER_FREE_LOCAL:
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("DM_TRANSFER_FREE_LOCAL KERNELSELECT\n");
printf("buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nlocal_moments: (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
devBuffers[LOCAL_MOMENTS]
);
#endif
break;
case DM_TRANSFER_FREE_REMOTE_CHUNK:
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("DM_TRANSFER_FREE_REMOTE_CHUNK KERNELSELECT\n");
#endif
initiateNextChunkTransfer(wr->userData);
break;
case DM_TRANSFER_BACK:
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("DM_TRANSFER_BACK: 0x%x KERNELSELECT\n", devBuffers[LOCAL_PARTICLE_VARS]);
#endif
break;
case TP_GRAVITY_LOCAL:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_GRAVITY_LOCAL KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nlocal_moments: (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
devBuffers[LOCAL_MOMENTS],
wr->bufferInfo[ILCELL_IDX].bufferID,
devBuffers[wr->bufferInfo[ILCELL_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILCELL_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
nodeGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CudaMultipoleMoments *)devBuffers[LOCAL_MOMENTS],
(ILCell *)devBuffers[wr->bufferInfo[ILCELL_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_GRAVITY_LOCAL 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_LOCAL_NODE_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TP_PART_GRAVITY_LOCAL_SMALLPHASE:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_PART_GRAVITY_LOCAL_SMALLPHASE KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
wr->bufferInfo[ILPART_IDX].bufferID,
devBuffers[wr->bufferInfo[ILPART_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILPART_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
#ifdef CUDA_2D_TB_KERNEL
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[wr->bufferInfo[MISSED_PARTS_IDX].bufferID],
(ILCell *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#else
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[wr->bufferInfo[MISSED_PARTS_IDX].bufferID],
(ILPart *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree((CompactPartData *)wr->bufferInfo[MISSED_PARTS_IDX].hostBuffer);
hapi_poolFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree((CompactPartData *)wr->bufferInfo[MISSED_PARTS_IDX].hostBuffer);
delayedFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_PART_GRAVITY_LOCAL_SMALLPHASE 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free((CompactPartData *)wr->bufferInfo[MISSED_PARTS_IDX].hostBuffer);
free(wr->bufferInfo[ILPART_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_LOCAL_PART_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TP_PART_GRAVITY_LOCAL:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_PART_GRAVITY_LOCAL KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
wr->bufferInfo[ILPART_IDX].bufferID,
devBuffers[wr->bufferInfo[ILPART_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILPART_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
#ifdef CUDA_2D_TB_KERNEL
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(ILCell *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#else
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(ILPart *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_PART_GRAVITY_LOCAL 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free(wr->bufferInfo[ILPART_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_LOCAL_PART_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TP_GRAVITY_REMOTE:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_GRAVITY_REMOTE KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nremote_moments: (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
devBuffers[REMOTE_MOMENTS],
wr->bufferInfo[ILCELL_IDX].bufferID,
devBuffers[wr->bufferInfo[ILCELL_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILCELL_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
nodeGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CudaMultipoleMoments *)devBuffers[REMOTE_MOMENTS],
(ILCell *)devBuffers[wr->bufferInfo[ILCELL_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_GRAVITY_REMOTE 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_REMOTE_NODE_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TP_PART_GRAVITY_REMOTE:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_PART_GRAVITY_REMOTE KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
wr->bufferInfo[ILPART_IDX].bufferID,
devBuffers[wr->bufferInfo[ILPART_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILPART_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
#ifdef CUDA_2D_TB_KERNEL
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[REMOTE_PARTICLE_CORES],
(ILCell *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#else
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[REMOTE_PARTICLE_CORES],
(ILPart *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_PART_GRAVITY_REMOTE 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free(wr->bufferInfo[ILPART_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_REMOTE_PART_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TP_GRAVITY_REMOTE_RESUME:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_GRAVITY_REMOTE_RESUME KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nmissed_moments: %d (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
wr->bufferInfo[MISSED_MOMENTS_IDX].bufferID,
devBuffers[wr->bufferInfo[MISSED_MOMENTS_IDX].bufferID],
wr->bufferInfo[ILCELL_IDX].bufferID,
devBuffers[wr->bufferInfo[ILCELL_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILCELL_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
nodeGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CudaMultipoleMoments *)devBuffers[wr->bufferInfo[MISSED_MOMENTS_IDX].bufferID],
(ILCell *)devBuffers[wr->bufferInfo[ILCELL_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[NODE_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree((CudaMultipoleMoments *)wr->bufferInfo[MISSED_MOMENTS_IDX].hostBuffer);
hapi_poolFree((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree((CudaMultipoleMoments *)wr->bufferInfo[MISSED_MOMENTS_IDX].hostBuffer);
delayedFree((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_GRAVITY_REMOTE_RESUME 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free((CudaMultipoleMoments *)wr->bufferInfo[MISSED_MOMENTS_IDX].hostBuffer);
free((ILCell *)wr->bufferInfo[ILCELL_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[NODE_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_REMOTE_RESUME_NODE_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TP_PART_GRAVITY_REMOTE_RESUME:
ptr = (ParameterStruct *)wr->userData;
#ifdef CUDA_NOTIFY_DATA_TRANSFER_DONE
printf("TP_PART_GRAVITY_REMOTE_RESUME KERNELSELECT buffers:\nlocal_particles: (0x%x)\nlocal_particle_vars: (0x%x)\nmissed_parts: %d (0x%x)\nil_cell: %d (0x%x)\n",
devBuffers[LOCAL_PARTICLE_CORES],
devBuffers[LOCAL_PARTICLE_VARS],
wr->bufferInfo[MISSED_PARTS_IDX].bufferID,
devBuffers[wr->bufferInfo[MISSED_PARTS_IDX].bufferID],
wr->bufferInfo[ILPART_IDX].bufferID,
devBuffers[wr->bufferInfo[ILPART_IDX].bufferID]
);
#endif
if( wr->bufferInfo[ILPART_IDX].transferToDevice ){
#ifndef CUDA_NO_KERNELS
#ifdef CUDA_2D_TB_KERNEL
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[wr->bufferInfo[MISSED_PARTS_IDX].bufferID],
(ILCell *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#else
particleGravityComputation<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
(
(CompactPartData *)devBuffers[LOCAL_PARTICLE_CORES],
(VariablePartData *)devBuffers[LOCAL_PARTICLE_VARS],
(CompactPartData *)devBuffers[wr->bufferInfo[MISSED_PARTS_IDX].bufferID],
(ILPart *)devBuffers[wr->bufferInfo[ILPART_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].bufferID],
(int *)devBuffers[wr->bufferInfo[PART_BUCKET_SIZES_IDX].bufferID],
ptr->fperiod
);
#endif
#endif
#ifdef CUDA_TRACE
st = CmiWallTimer();
#endif
#ifdef CUDA_USE_CUDAMALLOCHOST
#ifdef CUDA_MEMPOOL
hapi_poolFree((CompactPartData *)wr->bufferInfo[MISSED_PARTS_IDX].hostBuffer);
hapi_poolFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
hapi_poolFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#else
delayedFree((CompactPartData *)wr->bufferInfo[MISSED_PARTS_IDX].hostBuffer);
delayedFree(wr->bufferInfo[ILPART_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
delayedFree((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_PRINT_ERRORS
printf("TP_PART_GRAVITY_REMOTE_RESUME 0: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
#else
free((CompactPartData *)wr->bufferInfo[MISSED_PARTS_IDX].hostBuffer);
free(wr->bufferInfo[ILPART_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_START_MARKERS_IDX].hostBuffer);
free((int *)wr->bufferInfo[PART_BUCKET_SIZES_IDX].hostBuffer);
#endif
#ifdef CUDA_TRACE
traceUserBracketEvent(CUDA_REMOTE_RESUME_PART_KERNEL, st, CmiWallTimer());
#endif
}
free((ParameterStruct *)ptr);
break;
case TOP_EWALD_KERNEL:
cudaMemcpyToSymbol(cachedData, wr->bufferInfo[EWALD_READ_ONLY_DATA].hostBuffer, sizeof(EwaldReadOnlyData), 0, cudaMemcpyHostToDevice);
EwaldTopKernel<<<wr->dimGrid, wr->dimBlock, wr->smemSize, kernel_stream>>>
((GravityParticleData*)devBuffers[wr->bufferInfo[PARTICLE_TABLE].bufferID],
wr->bufferInfo[PARTICLE_TABLE].size/sizeof(GravityParticleData) - 1);
#ifdef CUDA_PRINT_ERRORS
printf("TOP_EWALD_KERNEL: %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
break;
case BOTTOM_EWALD_KERNEL:
cudaMemcpyToSymbol(ewt, wr->bufferInfo[EWALD_TABLE].hostBuffer, NEWH * sizeof(EwtData), 0, cudaMemcpyHostToDevice);
EwaldBottomKernel<<<wr->dimGrid, wr->dimBlock,
wr->smemSize, kernel_stream>>>
((GravityParticleData *)devBuffers[wr->bufferInfo[PARTICLE_TABLE].bufferID],
wr->bufferInfo[PARTICLE_TABLE].size/sizeof(GravityParticleData) - 1);
#ifdef CUDA_PRINT_ERRORS
printf("BOTTOM_EWALD_KERNEL : %s\n", cudaGetErrorString( cudaGetLastError() ) );
#endif
break;
default:
printf("error: id %d not valid\n", wr->id);
break;
}
}
/*
* Kernels
*/
#define GROUP(t) ((t)/MAX_THREADS_PER_GROUP)
#define GROUP_INDEX(t) ((t)%MAX_THREADS_PER_GROUP)
// 2d thread blocks
#ifdef CUDA_2D_TB_KERNEL
#define TRANSLATE(x,y) (y*NODES_PER_BLOCK+x)
#ifndef CUDA_2D_FLAT
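/* Node (cell) gravity kernel, 2D thread-block variant: each block handles one
 * bucket, with threadIdx.y indexing a tile of the bucket's particles and
 * threadIdx.x indexing a tile of its interaction-list nodes. Per-thread
 * partial accelerations and potentials are accumulated in shared memory and
 * reduced into particleVars by the tidx == 0 threads. */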
__global__ void nodeGravityComputation(
CompactPartData *particleCores,
VariablePartData *particleVars,
CudaMultipoleMoments *moments,
ILCell *ils,
int *ilmarks,
int *bucketStarts,
int *bucketSizes,
cudatype fperiod){
__shared__ CudaVector3D acc[THREADS_PER_BLOCK];
__shared__ cudatype pot[THREADS_PER_BLOCK];
__shared__ CudaMultipoleMoments m[NODES_PER_BLOCK];
__shared__ int offsetID[NODES_PER_BLOCK];
__shared__ CompactPartData shared_particle_cores[PARTS_PER_BLOCK];
int
start = ilmarks[blockIdx.x],
end = ilmarks[blockIdx.x+1],
bucketStart = bucketStarts[blockIdx.x];
int bucketSize = bucketSizes[blockIdx.x];
/*
__shared__ int start;
__shared__ int end;
__shared__ int bucketStart;
__shared__ int bucketSize;
*/
/*
if(threadIdx.x == 0 && threadIdx.y == 0){
start = ilmarks[blockIdx.x];
end = ilmarks[blockIdx.x+1];
bucketStart = bucketStarts[blockIdx.x];
bucketSize = bucketSizes[blockIdx.x];
}
__syncthreads();
*/
char
tidx = threadIdx.x,
tidy = threadIdx.y;
for(int ystart = 0; ystart < bucketSize; ystart += PARTS_PER_BLOCK){
int my_particle_idx = ystart + tidy;
if(tidx == 0 && my_particle_idx < bucketSize){
shared_particle_cores[tidy] = particleCores[bucketStart+my_particle_idx];
}
__syncthreads(); // wait for leader threads to finish using acc's, pot's of other threads
acc[TRANSLATE(tidx,tidy)].x = 0.0;
acc[TRANSLATE(tidx,tidy)].y = 0.0;
acc[TRANSLATE(tidx,tidy)].z = 0.0;
pot[TRANSLATE(tidx,tidy)] = 0.0;
for(int xstart = start; xstart < end; xstart += NODES_PER_BLOCK){
int my_cell_idx = xstart + tidx;
ILCell ilc;
__syncthreads(); // wait for all threads to finish using
// previous iteration's nodes before reloading
if(tidy == 0 && my_cell_idx < end){
ilc = ils[my_cell_idx];
m[tidx] = moments[ilc.index];
offsetID[tidx] = ilc.offsetID;
}
__syncthreads(); // wait for nodes to be loaded before using them
if(my_particle_idx < bucketSize && my_cell_idx < end){ // INTERACT
CudaVector3D r;
r.x = shared_particle_cores[tidy].position.x -
((((offsetID[tidx] >> 22) & 0x7)-3)*fperiod + m[tidx].cm.x);
r.y = shared_particle_cores[tidy].position.y -
((((offsetID[tidx] >> 25) & 0x7)-3)*fperiod + m[tidx].cm.y);
r.z = shared_particle_cores[tidy].position.z -
((((offsetID[tidx] >> 28) & 0x7)-3)*fperiod + m[tidx].cm.z);
cudatype rsq = r.x*r.x + r.y*r.y + r.z*r.z;
if(rsq != 0){
cudatype dir = rsqrt(rsq);
#if defined(HEXADECAPOLE)
CUDA_momEvalFmomrcm(&m[tidx], &r, dir, &acc[TRANSLATE(tidx, tidy)], &pot[TRANSLATE(tidx, tidy)]);
#else
cudatype a, b, c, d;
cudatype
twoh = m[tidx].soft + shared_particle_cores[tidy].soft;
// SPLINEQ(dir, rsq, twoh, a, b, c, d);
// expansion of function below:
cudatype u,dih;
if (rsq < twoh*twoh) {
dih = 2.0/twoh;
u = dih/dir;
if (u < 1.0) {
a = dih*(7.0/5.0 - 2.0/3.0*u*u + 3.0/10.0*u*u*u*u
- 1.0/10.0*u*u*u*u*u);
b = dih*dih*dih*(4.0/3.0 - 6.0/5.0*u*u + 1.0/2.0*u*u*u);
c = dih*dih*dih*dih*dih*(12.0/5.0 - 3.0/2.0*u);
d = 3.0/2.0*dih*dih*dih*dih*dih*dih*dir;
}
else {
a = -1.0/15.0*dir + dih*(8.0/5.0 - 4.0/3.0*u*u + u*u*u
- 3.0/10.0*u*u*u*u + 1.0/30.0*u*u*u*u*u);
b = -1.0/15.0*dir*dir*dir + dih*dih*dih*(8.0/3.0 - 3.0*u
+ 6.0/5.0*u*u - 1.0/6.0*u*u*u);
c = -1.0/5.0*dir*dir*dir*dir*dir + 3.0*dih*dih*dih*dih*dir
+ dih*dih*dih*dih*dih*(-12.0/5.0 + 1.0/2.0*u);
d = -dir*dir*dir*dir*dir*dir*dir
+ 3.0*dih*dih*dih*dih*dir*dir*dir
- 1.0/2.0*dih*dih*dih*dih*dih*dih*dir;
}
}
else {
a = dir;
b = a*a*a;
c = 3.0*b*a*a;
d = 5.0*c*a*a;
}
cudatype
qirx = m[tidx].xx*r.x + m[tidx].xy*r.y + m[tidx].xz*r.z,
qiry = m[tidx].xy*r.x + m[tidx].yy*r.y + m[tidx].yz*r.z,
qirz = m[tidx].xz*r.x + m[tidx].yz*r.y + m[tidx].zz*r.z,
qir = 0.5*(qirx*r.x + qiry*r.y + qirz*r.z),
tr = 0.5*(m[tidx].xx + m[tidx].yy + m[tidx].zz),
qir3 = b*m[tidx].totalMass + d*qir - c*tr;
pot[TRANSLATE(tidx, tidy)] -= m[tidx].totalMass * a + c*qir - b*tr;
acc[TRANSLATE(tidx, tidy)].x -= qir3*r.x - c*qirx;
acc[TRANSLATE(tidx, tidy)].y -= qir3*r.y - c*qiry;
acc[TRANSLATE(tidx, tidy)].z -= qir3*r.z - c*qirz;
#endif
}// end if rsq != 0
}// end INTERACT
}// end for each NODE group
__syncthreads(); // wait for all threads to finish before results become available
cudatype sumx, sumy, sumz, poten;
sumx = sumy = sumz = poten = 0.0;
// accumulate forces, potential in global memory data structure
if(tidx == 0 && my_particle_idx < bucketSize){
for(int i = 0; i < NODES_PER_BLOCK; i++){
sumx += acc[TRANSLATE(i,tidy)].x;
sumy += acc[TRANSLATE(i,tidy)].y;
sumz += acc[TRANSLATE(i,tidy)].z;
poten += pot[TRANSLATE(i,tidy)];
}
particleVars[bucketStart+my_particle_idx].a.x += sumx;
particleVars[bucketStart+my_particle_idx].a.y += sumy;
particleVars[bucketStart+my_particle_idx].a.z += sumz;
particleVars[bucketStart+my_particle_idx].potential += poten;
}
}// end for each PARTICLE group
}
#else
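/* Node (cell) gravity kernel, 2D "flat" variant: same work decomposition as
 * above, but the multipole moments are staged in separate per-component
 * shared-memory arrays rather than a shared CudaMultipoleMoments array. */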
__global__ void nodeGravityComputation(
CompactPartData *particleCores,
VariablePartData *particleVars,
CudaMultipoleMoments *moments,
ILCell *ils,
int *ilmarks,
int *bucketStarts,
int *bucketSizes,
cudatype fperiod){
__shared__ cudatype accx[THREADS_PER_BLOCK];
__shared__ cudatype accy[THREADS_PER_BLOCK];
__shared__ cudatype accz[THREADS_PER_BLOCK];
__shared__ cudatype pot[THREADS_PER_BLOCK];
//__shared__ cudatype mr[NODES_PER_BLOCK];
__shared__ cudatype ms[NODES_PER_BLOCK];
__shared__ cudatype mt[NODES_PER_BLOCK];
__shared__ cudatype mcmx[NODES_PER_BLOCK];
__shared__ cudatype mcmy[NODES_PER_BLOCK];
__shared__ cudatype mcmz[NODES_PER_BLOCK];
__shared__ cudatype mxx[NODES_PER_BLOCK];
__shared__ cudatype mxy[NODES_PER_BLOCK];
__shared__ cudatype mxz[NODES_PER_BLOCK];
__shared__ cudatype myy[NODES_PER_BLOCK];
__shared__ cudatype myz[NODES_PER_BLOCK];
__shared__ cudatype mzz[NODES_PER_BLOCK];
__shared__ int offsetID[NODES_PER_BLOCK];
__shared__ CompactPartData shared_particle_cores[PARTS_PER_BLOCK];
int start = ilmarks[blockIdx.x];
int end = ilmarks[blockIdx.x+1];
int bucketStart = bucketStarts[blockIdx.x];
int bucketSize = bucketSizes[blockIdx.x];
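/*
 * Work decomposition: each block processes one bucket of target particles.
 * threadIdx.y selects a particle within a PARTS_PER_BLOCK-wide group of the
 * bucket, threadIdx.x selects a node within a NODES_PER_BLOCK-wide group of
 * the bucket's interaction list [start, end); the two loops below tile over
 * both dimensions.
 */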
/*
__shared__ int start;
__shared__ int end;
__shared__ int bucketStart;
__shared__ int bucketSize;
*/
int tx, ty;
/*
if(threadIdx.x == 0 && threadIdx.y == 0){
start = ilmarks[blockIdx.x];
end = ilmarks[blockIdx.x+1];
bucketStart = bucketStarts[blockIdx.x];
bucketSize = bucketSizes[blockIdx.x];
}
__syncthreads();
*/
int xstart;
int ystart;
tx = threadIdx.x;
ty = threadIdx.y;
for(ystart = 0; ystart < bucketSize; ystart += PARTS_PER_BLOCK){
int my_particle_idx = ystart + ty;
if(tx == 0 && my_particle_idx < bucketSize){
shared_particle_cores[ty] = particleCores[bucketStart+my_particle_idx];
}
__syncthreads(); // wait for leader threads to finish using acc's, pot's of other threads
accx[TRANSLATE(tx,ty)] = 0.0;
accy[TRANSLATE(tx,ty)] = 0.0;
accz[TRANSLATE(tx,ty)] = 0.0;
pot[TRANSLATE(tx,ty)] = 0.0;
for(xstart = start; xstart < end; xstart += NODES_PER_BLOCK){
int my_cell_idx = xstart + tx;
ILCell ilc;
__syncthreads(); // wait for all threads to finish using
// previous iteration's nodes before reloading
if(ty == 0 && my_cell_idx < end){
ilc = ils[my_cell_idx];
//mr[tx] = moments[ilc.index].radius;
ms[tx] = moments[ilc.index].soft;
mt[tx] = moments[ilc.index].totalMass;
mcmx[tx] = moments[ilc.index].cm.x;
mcmy[tx] = moments[ilc.index].cm.y;
mcmz[tx] = moments[ilc.index].cm.z;
mxx[tx] = moments[ilc.index].xx;
mxy[tx] = moments[ilc.index].xy;
mxz[tx] = moments[ilc.index].xz;
myy[tx] = moments[ilc.index].yy;
myz[tx] = moments[ilc.index].yz;
mzz[tx] = moments[ilc.index].zz;
offsetID[tx] = ilc.offsetID;
}
__syncthreads(); // wait for nodes to be loaded before using them
if(my_particle_idx < bucketSize && my_cell_idx < end){ // INTERACT
CudaVector3D r;
cudatype rsq;
cudatype twoh, a, b, c, d;
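/*
 * offsetID packs the periodic replica of the interacting node: three 3-bit
 * fields at bit offsets 22, 25 and 28 hold the x, y and z image indices
 * biased by 3, so subtracting 3 and scaling by fperiod gives the spatial
 * offset of that image.
 */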
r.x = shared_particle_cores[ty].position.x -
((((offsetID[tx] >> 22) & 0x7)-3)*fperiod + mcmx[tx]);
r.y = shared_particle_cores[ty].position.y -
((((offsetID[tx] >> 25) & 0x7)-3)*fperiod + mcmy[tx]);
r.z = shared_particle_cores[ty].position.z -
((((offsetID[tx] >> 28) & 0x7)-3)*fperiod + mcmz[tx]);
rsq = r.x*r.x + r.y*r.y + r.z*r.z;
twoh = ms[tx] + shared_particle_cores[ty].soft;
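/*
 * Softened interaction coefficients (inlined expansion of SPLINEQ): inside
 * the combined softening length twoh the cubic-spline kernel is used;
 * outside it a, b, c, d reduce to the point-mass values dir, dir^3, 3*dir^5
 * and 15*dir^7, the radial factors that multiply the monopole and quadrupole
 * terms below.
 */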
if(rsq != 0){
cudatype dir = 1.0/sqrt(rsq);
// SPLINEQ(dir, rsq, twoh, a, b, c, d);
// expansion of function below:
cudatype u,dih;
if (rsq < twoh*twoh) {
dih = 2.0/twoh;
u = dih/dir;
if (u < 1.0) {
a = dih*(7.0/5.0 - 2.0/3.0*u*u + 3.0/10.0*u*u*u*u
- 1.0/10.0*u*u*u*u*u);
b = dih*dih*dih*(4.0/3.0 - 6.0/5.0*u*u + 1.0/2.0*u*u*u);
c = dih*dih*dih*dih*dih*(12.0/5.0 - 3.0/2.0*u);
d = 3.0/2.0*dih*dih*dih*dih*dih*dih*dir;
}
else {
a = -1.0/15.0*dir + dih*(8.0/5.0 - 4.0/3.0*u*u + u*u*u
- 3.0/10.0*u*u*u*u + 1.0/30.0*u*u*u*u*u);
b = -1.0/15.0*dir*dir*dir + dih*dih*dih*(8.0/3.0 - 3.0*u
+ 6.0/5.0*u*u - 1.0/6.0*u*u*u);
c = -1.0/5.0*dir*dir*dir*dir*dir + 3.0*dih*dih*dih*dih*dir
+ dih*dih*dih*dih*dih*(-12.0/5.0 + 1.0/2.0*u);
d = -dir*dir*dir*dir*dir*dir*dir
+ 3.0*dih*dih*dih*dih*dir*dir*dir
- 1.0/2.0*dih*dih*dih*dih*dih*dih*dir;
}
}
else {
a = dir;
b = a*a*a;
c = 3.0*b*a*a;
d = 5.0*c*a*a;
}
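/*
 * Quadrupole contraction: (qirx, qiry, qirz) = Q.r, qir = r.Q.r/2 and
 * tr = trace(Q)/2; these combine with the radial factors a..d to form the
 * monopole plus quadrupole potential and acceleration contributions.
 */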
cudatype qirx = mxx[tx]*r.x + mxy[tx]*r.y + mxz[tx]*r.z;
cudatype qiry = mxy[tx]*r.x + myy[tx]*r.y + myz[tx]*r.z;
cudatype qirz = mxz[tx]*r.x + myz[tx]*r.y + mzz[tx]*r.z;
cudatype qir = 0.5*(qirx*r.x + qiry*r.y + qirz*r.z);
cudatype tr = 0.5*(mxx[tx] + myy[tx] + mzz[tx]);
cudatype qir3 = b*mt[tx] + d*qir - c*tr;
pot[TRANSLATE(tx, ty)] -= mt[tx] * a + c*qir - b*tr;
accx[TRANSLATE(tx, ty)] -= qir3*r.x - c*qirx;
accy[TRANSLATE(tx, ty)] -= qir3*r.y - c*qiry;
accz[TRANSLATE(tx, ty)] -= qir3*r.z - c*qirz;
}// end if rsq != 0
}// end INTERACT
}// end for each NODE group
__syncthreads(); // wait for all threads to finish before results become available
cudatype sumx, sumy, sumz, poten;
sumx = sumy = sumz = poten = 0.0;
// accumulate forces, potential in global memory data structure
if(tx == 0 && my_particle_idx < bucketSize){
for(int i = 0; i < NODES_PER_BLOCK; i++){
sumx += accx[TRANSLATE(i,ty)];
sumy += accy[TRANSLATE(i,ty)];
sumz += accz[TRANSLATE(i,ty)];
poten += pot[TRANSLATE(i,ty)];
}
particleVars[bucketStart+my_particle_idx].a.x += sumx;
particleVars[bucketStart+my_particle_idx].a.y += sumy;
particleVars[bucketStart+my_particle_idx].a.z += sumz;
particleVars[bucketStart+my_particle_idx].potential += poten;
}
}// end for each PARTICLE group
}
#endif
#else
__global__ void nodeGravityComputation(
CompactPartData *particleCores,
VariablePartData *particleVars,
CudaMultipoleMoments *moments,
ILCell *ils,
int *ilmarks,
int *bucketStarts,
int *bucketSizes,
cudatype fperiod){
// each thread has its own storage for these
__shared__ CudaVector3D acc[THREADS_PER_BLOCK];
__shared__ cudatype pot[THREADS_PER_BLOCK];
__shared__ CudaMultipoleMoments m[THREADS_PER_BLOCK];
__shared__ CompactPartData shared_particle_core;
// each block is given a bucket to compute
// each thread in the block computes an interaction of a particle with a node
// threads must iterate through the interaction lists and sync.
// then, block leader (first rank in each block) reduces the forces and commits
// values to global memory.
int bucket = blockIdx.x;
int start = ilmarks[bucket];
int end = ilmarks[bucket+1];
int bucketSize = bucketSizes[bucket];
int bucketStart = bucketStarts[bucket];
int thread = threadIdx.x;
CudaVector3D r;
cudatype rsq;
cudatype twoh, a, b, c, d;
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_NODE_PRINTS
int tp, id;
if(thread == 0)
printf("numMissedCores: %d, bucketSize: %d, start: %d, end: %d\n", numInteractions, bucketSize, start, end);
#endif
#ifdef __DEVICE_EMULATION__
if(blockIdx.x == 0){
//printf("t: %d, blen: %d, llen: %d\n", threadIdx.x, blen, llen);
//printf("group: %d, particle: %d, ngroups: %d, groupSize: %d\n", group, particle, ngroups, groupSize);
}
#endif
for(int particle = 0; particle < bucketSize; particle++){
if(thread == 0){
// load shared_particle_core
shared_particle_core = particleCores[bucketStart+particle];
}
__syncthreads();
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_NODE_PRINTS
id = shared_particle_core.id;
tp = shared_particle_core.tp;
if(thread == 0){
printf("targetVars: 0x%x targetVars[%d,%d].a.x = %f\n",
particleVars, tp,id,
particleVars[bucketStart+particle].a.x);
}
#endif
acc[thread].x = 0;
acc[thread].y = 0;
acc[thread].z = 0;
pot[thread] = 0;
for(int node = start+thread; node < end; node+=THREADS_PER_BLOCK){
ILCell ilc = ils[node];
m[thread] = moments[ilc.index];
int offsetID = ilc.offsetID;
r.x = shared_particle_core.position.x -
((((offsetID >> 22) & 0x7)-3)*fperiod + m[thread].cm.x);
r.y = shared_particle_core.position.y -
((((offsetID >> 25) & 0x7)-3)*fperiod + m[thread].cm.y);
r.z = shared_particle_core.position.z -
((((offsetID >> 28) & 0x7)-3)*fperiod + m[thread].cm.z);
rsq = r.x*r.x + r.y*r.y + r.z*r.z;
twoh = m[thread].soft + shared_particle_core.soft;
if(rsq != 0){
cudatype dir = 1.0/sqrt(rsq);
// SPLINEQ(dir, rsq, twoh, a, b, c, d);
// expansion of function below:
cudatype u,dih;
if (rsq < twoh*twoh) {
dih = 2.0/twoh;
u = dih/dir;
if (u < 1.0) {
a = dih*(7.0/5.0 - 2.0/3.0*u*u + 3.0/10.0*u*u*u*u
- 1.0/10.0*u*u*u*u*u);
b = dih*dih*dih*(4.0/3.0 - 6.0/5.0*u*u + 1.0/2.0*u*u*u);
c = dih*dih*dih*dih*dih*(12.0/5.0 - 3.0/2.0*u);
d = 3.0/2.0*dih*dih*dih*dih*dih*dih*dir;
}
else {
a = -1.0/15.0*dir + dih*(8.0/5.0 - 4.0/3.0*u*u + u*u*u
- 3.0/10.0*u*u*u*u + 1.0/30.0*u*u*u*u*u);
b = -1.0/15.0*dir*dir*dir + dih*dih*dih*(8.0/3.0 - 3.0*u
+ 6.0/5.0*u*u - 1.0/6.0*u*u*u);
c = -1.0/5.0*dir*dir*dir*dir*dir + 3.0*dih*dih*dih*dih*dir
+ dih*dih*dih*dih*dih*(-12.0/5.0 + 1.0/2.0*u);
d = -dir*dir*dir*dir*dir*dir*dir
+ 3.0*dih*dih*dih*dih*dir*dir*dir
- 1.0/2.0*dih*dih*dih*dih*dih*dih*dir;
}
}
else {
a = dir;
b = a*a*a;
c = 3.0*b*a*a;
d = 5.0*c*a*a;
}
cudatype qirx = m[thread].xx*r.x + m[thread].xy*r.y + m[thread].xz*r.z;
cudatype qiry = m[thread].xy*r.x + m[thread].yy*r.y + m[thread].yz*r.z;
cudatype qirz = m[thread].xz*r.x + m[thread].yz*r.y + m[thread].zz*r.z;
cudatype qir = 0.5*(qirx*r.x + qiry*r.y + qirz*r.z);
cudatype tr = 0.5*(m[thread].xx + m[thread].yy + m[thread].zz);
cudatype qir3 = b*m[thread].totalMass + d*qir - c*tr;
pot[thread] -= m[thread].totalMass * a + c*qir - b*tr;
acc[thread].x -= qir3*r.x - c*qirx;
acc[thread].y -= qir3*r.y - c*qiry;
acc[thread].z -= qir3*r.z - c*qirz;
}// end if rsq != 0
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_NODE_PRINTS
if(1){
CudaMultipoleMoments mm = m[thread];
printf("NODE target particle (%d,%d) type %d with moments[%d], acc: (%f,%f,%f)\n", tp, id,
type,
ilc.index,
acc[thread].x,
acc[thread].y,
acc[thread].z);
printf("{m=%f,s=%f,r=%f}\n", mm.totalMass, mm.soft, mm.radius);
printf("{xx=%f,xy=%f,xz=%f}\n", mm.xx, mm.xy, mm.xz);
printf("{yy=%f,yz=%f,zz=%f}\n", mm.yy, mm.yz, mm.zz);
}
#endif
}// for each node in list
__syncthreads();
// at this point, the total force on particle is distributed among
// all active threads;
// reduce.
// TODO: make this a parallel reduction
cudatype sumx, sumy, sumz, poten;
sumx = sumy = sumz = poten = 0.0;
if(thread == 0){
for(int i = 0; i < THREADS_PER_BLOCK; i++){
sumx += acc[i].x;
sumy += acc[i].y;
sumz += acc[i].z;
poten += pot[i];
}
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_NODE_PRINTS
printf("aggregate acc[%d,%d] = (%f,%f,%f)\n", tp, id, sumx, sumy, sumz);
/*
printf("before (%d,%d) type: NODE %d acc: (%f,%f,%f)\n", tp, id, type,
particleVars[bucketStart+particle].a.x,
particleVars[bucketStart+particle].a.y,
particleVars[bucketStart+particle].a.z
);
*/
#endif
particleVars[bucketStart+particle].a.x += sumx;
particleVars[bucketStart+particle].a.y += sumy;
particleVars[bucketStart+particle].a.z += sumz;
particleVars[bucketStart+particle].potential += poten;
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_NODE_PRINTS
printf("after (%d,%d) type: NODE %d acc: (%f,%f,%f)\n", tp, id, type,
particleVars[bucketStart+particle].a.x,
particleVars[bucketStart+particle].a.y,
particleVars[bucketStart+particle].a.z
);
#endif
}
}// for each particle in bucket
}
#endif
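/*
 * Illustrative sketch (not called by the kernels in this file): the
 * periodic-image decoding that the gravity kernels perform inline on
 * ilc.offsetID could be factored into a helper like the one below. The name
 * decodePeriodicOffset is hypothetical and used here only for exposition.
 */
static __device__ __forceinline__ CudaVector3D
decodePeriodicOffset(int offsetID, cudatype fperiod){
  CudaVector3D off;
  // each axis stores a 3-bit image index biased by 3 (3 means "no image")
  off.x = (((offsetID >> 22) & 0x7) - 3) * fperiod;
  off.y = (((offsetID >> 25) & 0x7) - 3) * fperiod;
  off.z = (((offsetID >> 28) & 0x7) - 3) * fperiod;
  return off;
}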
#ifdef CUDA_2D_TB_KERNEL
#define TRANSLATE_PART(x,y) (y*NODES_PER_BLOCK_PART+x)
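/*
 * TRANSLATE_PART maps the 2D (node, particle) thread coordinates of a tile to
 * a flat slot in the per-block shared accumulator arrays, one row of
 * NODES_PER_BLOCK_PART entries per target particle.
 */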
__global__ void particleGravityComputation(
CompactPartData *targetCores,
VariablePartData *targetVars,
CompactPartData *sourceCores,
ILCell *ils,
int *ilmarks,
int *bucketStarts,
int *bucketSizes,
cudatype fperiod){
__shared__ CudaVector3D acc[THREADS_PER_BLOCK_PART];
__shared__ cudatype pot[THREADS_PER_BLOCK_PART];
__shared__ CompactPartData m[NODES_PER_BLOCK_PART];
__shared__ int offsetID[NODES_PER_BLOCK_PART];
__shared__ CompactPartData shared_particle_cores[PARTS_PER_BLOCK_PART];
int start = ilmarks[blockIdx.x];
int end = ilmarks[blockIdx.x+1];
int bucketStart = bucketStarts[blockIdx.x];
int bucketSize = bucketSizes[blockIdx.x];
/*
__shared__ int start;
__shared__ int end;
__shared__ int bucketStart;
__shared__ int bucketSize;
*/
int tx, ty;
/*
if(threadIdx.x == 0 && threadIdx.y == 0){
start = ilmarks[blockIdx.x];
end = ilmarks[blockIdx.x+1];
bucketStart = bucketStarts[blockIdx.x];
bucketSize = bucketSizes[blockIdx.x];
}
__syncthreads();
*/
int xstart;
int ystart;
tx = threadIdx.x;
ty = threadIdx.y;
for(ystart = 0; ystart < bucketSize; ystart += PARTS_PER_BLOCK_PART){
int my_particle_idx = ystart + ty;
if(tx == 0 && my_particle_idx < bucketSize){
shared_particle_cores[ty] = targetCores[bucketStart+my_particle_idx];
}
__syncthreads(); // wait for leader threads to finish using acc's, pot's of other threads
acc[TRANSLATE_PART(tx,ty)].x = 0.0;
acc[TRANSLATE_PART(tx,ty)].y = 0.0;
acc[TRANSLATE_PART(tx,ty)].z = 0.0;
pot[TRANSLATE_PART(tx,ty)] = 0.0;
for(xstart = start; xstart < end; xstart += NODES_PER_BLOCK_PART){
int my_cell_idx = xstart + tx;
ILCell ilc;
__syncthreads(); // wait for all threads to finish using
// previous iteration's nodes before reloading
if(ty == 0 && my_cell_idx < end){
ilc = ils[my_cell_idx];
m[tx] = sourceCores[ilc.index];
offsetID[tx] = ilc.offsetID;
}
__syncthreads(); // wait for nodes to be loaded before using them
if(my_particle_idx < bucketSize && my_cell_idx < end){ // INTERACT
CudaVector3D r;
cudatype rsq;
cudatype twoh, a, b;
r.x = (((offsetID[tx] >> 22) & 0x7)-3)*fperiod
+ m[tx].position.x
- shared_particle_cores[ty].position.x;
r.y = (((offsetID[tx] >> 25) & 0x7)-3)*fperiod
+ m[tx].position.y
- shared_particle_cores[ty].position.y;
r.z = (((offsetID[tx] >> 28) & 0x7)-3)*fperiod
+ m[tx].position.z
- shared_particle_cores[ty].position.z;
rsq = r.x*r.x + r.y*r.y + r.z*r.z;
twoh = m[tx].soft + shared_particle_cores[ty].soft;
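/*
 * Particle-particle (monopole) interaction: only the potential coefficient a
 * and the force coefficient b of the softening spline are needed; they are
 * computed below directly from r1 = |r| rather than from 1/sqrt(rsq).
 */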
if(rsq != 0){
cudatype r1, u, dih, dir;
r1 = sqrt(rsq);
if (r1 < (twoh)) {
dih = 2.0/(twoh);
u = r1*dih;
if (u < 1.0) {
a = dih*(7.0/5.0 - 2.0/3.0*u*u + 3.0/10.0*u*u*u*u
- 1.0/10.0*u*u*u*u*u);
b = dih*dih*dih*(4.0/3.0 - 6.0/5.0*u*u
+ 1.0/2.0*u*u*u);
}
else {
dir = 1.0/r1;
a = -1.0/15.0*dir + dih*(8.0/5.0 - 4.0/3.0*u*u +
u*u*u - 3.0/10.0*u*u*u*u + 1.0/30.0*u*u*u*u*u);
b = -1.0/15.0*dir*dir*dir +
dih*dih*dih*(8.0/3.0 - 3.0*u +
6.0/5.0*u*u - 1.0/6.0*u*u*u);
}
}
else {
a = 1.0/r1;
b = a*a*a;
}
pot[TRANSLATE_PART(tx, ty)] -= m[tx].mass * a;
acc[TRANSLATE_PART(tx, ty)].x += r.x*b*m[tx].mass;
acc[TRANSLATE_PART(tx, ty)].y += r.y*b*m[tx].mass;
acc[TRANSLATE_PART(tx, ty)].z += r.z*b*m[tx].mass;
}// end if rsq != 0
}// end INTERACT
}// end for each NODE group
__syncthreads(); // wait for all threads to finish before results become available
cudatype sumx, sumy, sumz, poten;
sumx = sumy = sumz = poten = 0.0;
// accumulate forces, potential in global memory data structure
if(tx == 0 && my_particle_idx < bucketSize){
for(int i = 0; i < NODES_PER_BLOCK_PART; i++){
sumx += acc[TRANSLATE_PART(i,ty)].x;
sumy += acc[TRANSLATE_PART(i,ty)].y;
sumz += acc[TRANSLATE_PART(i,ty)].z;
poten += pot[TRANSLATE_PART(i,ty)];
}
targetVars[bucketStart+my_particle_idx].a.x += sumx;
targetVars[bucketStart+my_particle_idx].a.y += sumy;
targetVars[bucketStart+my_particle_idx].a.z += sumz;
targetVars[bucketStart+my_particle_idx].potential += poten;
}
}// end for each PARTICLE group
}
#else
__global__ void particleGravityComputation(
CompactPartData *targetCores,
VariablePartData *targetVars,
CompactPartData *sourceCores,
ILPart *ils,
int *ilmarks,
int *bucketStarts,
int *bucketSizes,
cudatype fperiod){
// each thread has its own storage for these
__shared__ CudaVector3D acc[THREADS_PER_BLOCK];
__shared__ cudatype pot[THREADS_PER_BLOCK];
__shared__ CompactPartData source_cores[THREADS_PER_BLOCK];
__shared__ CompactPartData shared_target_core;
// each block is given a bucket to compute
// each thread in the block computes an interaction of a particle with a node
// threads must iterate through the interaction lists and sync.
// then, block leader (first rank in each block) reduces the forces and commits
// values to global memory.
int bucket = blockIdx.x;
int start = ilmarks[bucket];
int end = ilmarks[bucket+1];
int bucketSize = bucketSizes[bucket];
int bucketStart = bucketStarts[bucket];
int thread = threadIdx.x;
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_PART_PRINTS
int tp, id;
#endif
CudaVector3D r;
cudatype rsq;
cudatype twoh, a, b;
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_PART_PRINTS
if(thread == 0)
printf("numMissedCores: %d, bucketSize: %d, start: %d, end: %d\n", numInteractions, bucketSize, start, end);
#endif
for(int target = 0; target < bucketSize; target++){
if(thread == 0){
shared_target_core = targetCores[bucketStart+target];
}
__syncthreads();
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_PART_PRINTS
id = shared_target_core.id;
tp = shared_target_core.tp;
if(thread == 0){
printf("targetVars: 0x%x targetVars[%d,%d].a.x = %f\n",
targetVars, tp,id,
targetVars[bucketStart+target].a.x);
}
#endif
acc[thread].x = 0;
acc[thread].y = 0;
acc[thread].z = 0;
pot[thread] = 0;
for(int source = start+thread; source < end; source += THREADS_PER_BLOCK){
ILPart ilp = ils[source];
int oid = ilp.off;
int num = ilp.num;
int ilpindex = ilp.index;
for(int particle = 0; particle < num; particle++){
source_cores[thread] = sourceCores[ilpindex+particle];
r.x = (((oid >> 22) & 0x7)-3)*fperiod +
source_cores[thread].position.x -
shared_target_core.position.x;
r.y = (((oid >> 25) & 0x7)-3)*fperiod +
source_cores[thread].position.y -
shared_target_core.position.y;
r.z = (((oid >> 28) & 0x7)-3)*fperiod +
source_cores[thread].position.z -
shared_target_core.position.z;
rsq = r.x*r.x + r.y*r.y + r.z*r.z;
twoh = source_cores[thread].soft + shared_target_core.soft;
if(rsq != 0){
cudatype r1, u,dih,dir;
r1 = sqrt(rsq);
if (r1 < (twoh)) {
dih = 2.0/(twoh);
u = r1*dih;
if (u < 1.0) {
a = dih*(7.0/5.0 - 2.0/3.0*u*u + 3.0/10.0*u*u*u*u
- 1.0/10.0*u*u*u*u*u);
b = dih*dih*dih*(4.0/3.0 - 6.0/5.0*u*u
+ 1.0/2.0*u*u*u);
}
else {
dir = 1.0/r1;
a = -1.0/15.0*dir + dih*(8.0/5.0 - 4.0/3.0*u*u +
u*u*u - 3.0/10.0*u*u*u*u + 1.0/30.0*u*u*u*u*u);
b = -1.0/15.0*dir*dir*dir +
dih*dih*dih*(8.0/3.0 - 3.0*u +
6.0/5.0*u*u - 1.0/6.0*u*u*u);
}
}
else {
a = 1.0/r1;
b = a*a*a;
}
pot[thread] -= source_cores[thread].mass * a;
acc[thread].x += r.x*b*source_cores[thread].mass;
acc[thread].y += r.y*b*source_cores[thread].mass;
acc[thread].z += r.z*b*source_cores[thread].mass;
}// if rsq != 0
}// for each particle in source bucket
}// for each source bucket
__syncthreads();
cudatype sumx, sumy, sumz, poten;
sumx = sumy = sumz = poten = 0.0;
if(thread == 0){
for(int i = 0; i < THREADS_PER_BLOCK; i++){
sumx += acc[i].x;
sumy += acc[i].y;
sumz += acc[i].z;
poten += pot[i];
}
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_PART_PRINTS
//printf("before acc[%d,%d] = (%f,%f,%f)\n", tp, id, sumx, sumy, sumz);
printf("before (%d,%d) type: %d acc: (%f,%f,%f)\n", tp, id, type,
targetVars[bucketStart+target].a.x,
targetVars[bucketStart+target].a.y,
targetVars[bucketStart+target].a.z
);
#endif
targetVars[bucketStart+target].a.x += sumx;
targetVars[bucketStart+target].a.y += sumy;
targetVars[bucketStart+target].a.z += sumz;
targetVars[bucketStart+target].potential += poten;
#if defined __DEVICE_EMULATION__ && defined CUDA_EMU_KERNEL_PART_PRINTS
printf("after (%d,%d) type: %d acc: (%f,%f,%f)\n", tp, id, type,
targetVars[bucketStart+target].a.x,
targetVars[bucketStart+target].a.y,
targetVars[bucketStart+target].a.z
);
#endif
}
}// for each target part
}
#endif
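/*
 * Ewald correction for periodic boundary conditions. The sum is split across
 * two kernels: EwaldTopKernel accumulates the real-space part over nearby
 * periodic replicas of the root multipole, and EwaldBottomKernel accumulates
 * the reciprocal-space (h-loop) part from the precomputed table held in
 * constant memory (ewt).
 */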
__global__ void EwaldTopKernel(GravityParticleData *particleTable, int nPart);
__global__ void EwaldBottomKernel(GravityParticleData *particleTable, int nPart);
extern unsigned int timerHandle;
void EwaldHostMemorySetup(EwaldData *h_idata, int nParticles, int nEwhLoop) {
#ifdef CUDA_MEMPOOL
h_idata->p = (GravityParticleData *) hapi_poolMalloc((nParticles+1)*sizeof(GravityParticleData));
h_idata->ewt = (EwtData *) hapi_poolMalloc((nEwhLoop)*sizeof(EwtData));
h_idata->cachedData = (EwaldReadOnlyData *) hapi_poolMalloc(sizeof(EwaldReadOnlyData));
#else
cudaMallocHost((void**)&(h_idata->p), (nParticles+1) * sizeof(GravityParticleData));
cudaMallocHost((void**)&(h_idata->ewt), nEwhLoop * sizeof(EwtData));
cudaMallocHost((void**)&(h_idata->cachedData), sizeof(EwaldReadOnlyData));
#endif
}
void EwaldHostMemoryFree(EwaldData *h_idata) {
#ifdef CUDA_MEMPOOL
hapi_poolFree(h_idata->p);
hapi_poolFree(h_idata->ewt);
hapi_poolFree(h_idata->cachedData);
#else
cudaFreeHost(h_idata->p);
cudaFreeHost(h_idata->ewt);
cudaFreeHost(h_idata->cachedData);
#endif
}
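/*
 * EwaldHost enqueues the two Ewald work requests back to back. The top kernel
 * transfers the particle table to the GPU; the bottom kernel reuses the same
 * device buffer (same bufferID), transfers the updated particles back to the
 * host and frees the device copy. The read-only data and Ewald table buffers
 * are enqueued with all transfer flags false; the kernels read those values
 * from the __constant__ copies (cachedData, ewt).
 */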
#ifdef CUDA_INSTRUMENT_WRS
void EwaldHost(EwaldData *h_idata, void *cb, int myIndex, char phase) {
#else
void EwaldHost(EwaldData *h_idata, void *cb, int myIndex) {
#endif
int n = h_idata->cachedData->n;
int numBlocks = (int) ceilf((float)n/BLOCK_SIZE);
int nEwhLoop = h_idata->cachedData->nEwhLoop;
workRequest topKernel;
dataInfo *particleTableInfo, *cachedDataInfo, *ewtInfo;
topKernel.dimGrid = dim3(numBlocks);
topKernel.dimBlock = dim3(BLOCK_SIZE);
topKernel.smemSize = 0;
topKernel.nBuffers = 2;
/* schedule two buffers for transfer to the GPU */
topKernel.bufferInfo =
(dataInfo *) malloc(topKernel.nBuffers * sizeof(dataInfo));
particleTableInfo = &(topKernel.bufferInfo[0]);
particleTableInfo->bufferID = NUM_GRAVITY_BUFS + BUFFERS_PER_CHARE * myIndex + PARTICLE_TABLE;
particleTableInfo->transferToDevice = true;
particleTableInfo->transferFromDevice = false;
particleTableInfo->freeBuffer = false;
particleTableInfo->hostBuffer = h_idata->p;
particleTableInfo->size = (n+1) * sizeof(GravityParticleData);
cachedDataInfo = &(topKernel.bufferInfo[1]);
cachedDataInfo->bufferID = NUM_GRAVITY_BUFS + BUFFERS_PER_CHARE * myIndex + EWALD_READ_ONLY_DATA;
cachedDataInfo->transferToDevice = false;
cachedDataInfo->transferFromDevice = false;
cachedDataInfo->freeBuffer = false;
cachedDataInfo->hostBuffer = h_idata->cachedData;
cachedDataInfo->size = sizeof(EwaldReadOnlyData);
topKernel.callbackFn = NULL;
topKernel.id = TOP_EWALD_KERNEL;
#ifdef CUDA_INSTRUMENT_WRS
topKernel.chareIndex = myIndex;
topKernel.compType = topKernel.id;
topKernel.compPhase = phase;
#endif
enqueue(&topKernel);
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("[%d] in EwaldHost, enqueued TopKernel\n", myIndex);
#endif
workRequest bottomKernel;
bottomKernel.dimGrid = dim3(numBlocks);
bottomKernel.dimBlock = dim3(BLOCK_SIZE);
bottomKernel.smemSize = 0;
bottomKernel.nBuffers = 3;
bottomKernel.bufferInfo =
(dataInfo *) malloc(bottomKernel.nBuffers * sizeof(dataInfo));
particleTableInfo = &(bottomKernel.bufferInfo[0]);
particleTableInfo->bufferID = NUM_GRAVITY_BUFS + BUFFERS_PER_CHARE * myIndex + PARTICLE_TABLE;
particleTableInfo->transferToDevice = false;
particleTableInfo->transferFromDevice = true;
particleTableInfo->freeBuffer = true;
particleTableInfo->hostBuffer = h_idata->p;
particleTableInfo->size = (n+1) * sizeof(GravityParticleData);
cachedDataInfo = &(bottomKernel.bufferInfo[1]);
cachedDataInfo->bufferID = NUM_GRAVITY_BUFS + BUFFERS_PER_CHARE * myIndex + EWALD_READ_ONLY_DATA;
cachedDataInfo->transferToDevice = false;
cachedDataInfo->transferFromDevice = false;
cachedDataInfo->freeBuffer = false;
cachedDataInfo->hostBuffer = h_idata->cachedData;
cachedDataInfo->size = sizeof(EwaldReadOnlyData);
ewtInfo = &(bottomKernel.bufferInfo[2]);
ewtInfo->bufferID = NUM_GRAVITY_BUFS + BUFFERS_PER_CHARE * myIndex + EWALD_TABLE;
ewtInfo->transferToDevice = false;
ewtInfo->transferFromDevice = false;
ewtInfo->freeBuffer = false;
ewtInfo->hostBuffer = h_idata->ewt;
ewtInfo->size = nEwhLoop * sizeof(EwtData);
bottomKernel.callbackFn = cb;
bottomKernel.id = BOTTOM_EWALD_KERNEL;
#ifdef CUDA_INSTRUMENT_WRS
bottomKernel.chareIndex = myIndex;
bottomKernel.compType = bottomKernel.id;
bottomKernel.compPhase = phase;
#endif
enqueue(&bottomKernel);
#ifdef CUDA_VERBOSE_KERNEL_ENQUEUE
printf("[%d] in EwaldHost, enqueued BotKernel\n", myIndex);
#endif
}
__global__ void EwaldTopKernel(GravityParticleData *particleTable,
int nPart) {
GravityParticleData *p;
cudatype alphan;
cudatype fPot, ax, ay, az;
cudatype x, y, z, r2, dir, dir2, a;
cudatype xdif, ydif, zdif;
cudatype g0, g1, g2, g3, g4, g5;
cudatype Q2, Q2mirx, Q2miry, Q2mirz, Q2mir, Qta;
int id, ix, iy, iz, bInHole, bInHolex, bInHolexy;
#ifdef HEXADECAPOLE
MomcData *mom = &(cachedData->momcRoot);
MultipoleMomentsData *momQuad = &(cachedData->mm);
cudatype xx,xxx,xxy,xxz,yy,yyy,yyz,xyy,zz,zzz,xzz,yzz,xy,xyz,xz,yz;
cudatype Q4mirx,Q4miry,Q4mirz,Q4mir,Q4x,Q4y,Q4z;
cudatype Q4xx,Q4xy,Q4xz,Q4yy,Q4yz,Q4zz,Q4,Q3x,Q3y,Q3z;
cudatype Q3mirx,Q3miry,Q3mirz,Q3mir;
const cudatype onethird = 1.0/3.0;
#else
MultipoleMomentsData *mom;
mom = &(cachedData->mm);
#endif
#ifdef HEXADECAPOLE
Q4xx = 0.5*(mom->xxxx + mom->xxyy + mom->xxzz);
Q4xy = 0.5*(mom->xxxy + mom->xyyy + mom->xyzz);
Q4xz = 0.5*(mom->xxxz + mom->xyyz + mom->xzzz);
Q4yy = 0.5*(mom->xxyy + mom->yyyy + mom->yyzz);
Q4yz = 0.5*(mom->xxyz + mom->yyyz + mom->yzzz);
Q4zz = 0.5*(mom->xxzz + mom->yyzz + mom->zzzz);
Q4 = 0.25*(Q4xx + Q4yy + Q4zz);
Q3x = 0.5*(mom->xxx + mom->xyy + mom->xzz);
Q3y = 0.5*(mom->xxy + mom->yyy + mom->yzz);
Q3z = 0.5*(mom->xxz + mom->yyz + mom->zzz);
#endif
Q2 = 0.5 * (mom->xx + mom->yy + mom->zz);
id = blockIdx.x * BLOCK_SIZE + threadIdx.x + 1;
if (id > nPart) {
return;
}
p = &(particleTable[id]);
#ifdef DEBUG
if (blockIdx.x == 0 && threadIdx.x == 0) {
printf("Moments\n");
printf("xx %f, xy %f, xz %f, yy %f, yz %f, zz %f\n", mom->xx, mom->xy, mom->xz,
mom->yy, mom->yz, mom->zz);
}
#endif
// if (p->rung < activeRung) return; // only for multistepping
ax = 0.0f;
ay = 0.0f;
az = 0.0f;
#ifdef HEXADECAPOLE
xdif = p->position_x - momQuad->cmx;
ydif = p->position_y - momQuad->cmy;
zdif = p->position_z - momQuad->cmz;
fPot = momQuad->totalMass*cachedData->k1;
#else
xdif = p->position_x - mom->cmx;
ydif = p->position_y - mom->cmy;
zdif = p->position_z - mom->cmz;
fPot = mom->totalMass*cachedData->k1;
#endif
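/*
 * Real-space sum over periodic replicas out to nEwReps images per axis.
 * Replicas within nReps lie in the "hole" already covered by the normal
 * force calculation, so for those the erf() form removes the interaction
 * already accounted for; outside the hole the complementary erfc() form is
 * used, and images beyond fEwCut2 are skipped.
 */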
for (ix=-(cachedData->nEwReps);ix<=(cachedData->nEwReps);++ix) {
bInHolex = (ix >= -cachedData->nReps && ix <= cachedData->nReps);
x = xdif + ix * cachedData->L;
for(iy=-(cachedData->nEwReps);iy<=(cachedData->nEwReps);++iy) {
bInHolexy = (bInHolex && iy >= -cachedData->nReps && iy <= cachedData->nReps);
y = ydif + iy*cachedData->L;
for(iz=-(cachedData->nEwReps);iz<=(cachedData->nEwReps);++iz) {
bInHole = (bInHolexy && iz >= -cachedData->nReps && iz <= cachedData->nReps);
z = zdif + iz*cachedData->L;
r2 = x*x + y*y + z*z;
if (r2 > cachedData->fEwCut2 && !bInHole) continue;
if (r2 < cachedData->fInner2) {
/*
* For small r, series expand about
* the origin to avoid errors caused
* by cancellation of large terms.
*/
alphan = cachedData->ka;
r2 *= cachedData->alpha2;
g0 = alphan*((1.0/3.0)*r2 - 1.0);
alphan *= 2*cachedData->alpha2;
g1 = alphan*((1.0/5.0)*r2 - (1.0/3.0));
alphan *= 2*cachedData->alpha2;
g2 = alphan*((1.0/7.0)*r2 - (1.0/5.0));
alphan *= 2*cachedData->alpha2;
g3 = alphan*((1.0/9.0)*r2 - (1.0/7.0));
alphan *= 2*cachedData->alpha2;
g4 = alphan*((1.0/11.0)*r2 - (1.0/9.0));
alphan *= 2*cachedData->alpha2;
g5 = alphan*((1.0/13.0)*r2 - (1.0/11.0));
}
else {
dir = 1/sqrtf(r2);
dir2 = dir*dir;
a = expf(-r2*cachedData->alpha2);
a *= cachedData->ka*dir2;
if (bInHole) g0 = -erff(cachedData->alpha/dir);
else g0 = erfcf(cachedData->alpha/dir);
g0 *= dir;
g1 = g0*dir2 + a;
alphan = 2*cachedData->alpha2;
g2 = 3*g1*dir2 + alphan*a;
alphan *= 2*cachedData->alpha2;
g3 = 5*g2*dir2 + alphan*a;
alphan *= 2*cachedData->alpha2;
g4 = 7*g3*dir2 + alphan*a;
alphan *= 2*cachedData->alpha2;
g5 = 9*g4*dir2 + alphan*a;
}
#ifdef HEXADECAPOLE
xx = 0.5*x*x;
xxx = onethird*xx*x;
xxy = xx*y;
xxz = xx*z;
yy = 0.5*y*y;
yyy = onethird*yy*y;
xyy = yy*x;
yyz = yy*z;
zz = 0.5*z*z;
zzz = onethird*zz*z;
xzz = zz*x;
yzz = zz*y;
xy = x*y;
xyz = xy*z;
xz = x*z;
yz = y*z;
Q2mirx = mom->xx*x + mom->xy*y + mom->xz*z;
Q2miry = mom->xy*x + mom->yy*y + mom->yz*z;
Q2mirz = mom->xz*x + mom->yz*y + mom->zz*z;
Q3mirx = mom->xxx*xx + mom->xxy*xy + mom->xxz*xz + mom->xyy*yy + mom->xyz*yz + mom->xzz*zz;
Q3miry = mom->xxy*xx + mom->xyy*xy + mom->xyz*xz + mom->yyy*yy + mom->yyz*yz + mom->yzz*zz;
Q3mirz = mom->xxz*xx + mom->xyz*xy + mom->xzz*xz + mom->yyz*yy + mom->yzz*yz + mom->zzz*zz;
Q4mirx = mom->xxxx*xxx + mom->xxxy*xxy + mom->xxxz*xxz + mom->xxyy*xyy + mom->xxyz*xyz +
mom->xxzz*xzz + mom->xyyy*yyy + mom->xyyz*yyz + mom->xyzz*yzz + mom->xzzz*zzz;
Q4miry = mom->xxxy*xxx + mom->xxyy*xxy + mom->xxyz*xxz + mom->xyyy*xyy + mom->xyyz*xyz +
mom->xyzz*xzz + mom->yyyy*yyy + mom->yyyz*yyz + mom->yyzz*yzz + mom->yzzz*zzz;
Q4mirz = mom->xxxz*xxx + mom->xxyz*xxy + mom->xxzz*xxz + mom->xyyz*xyy + mom->xyzz*xyz +
mom->xzzz*xzz + mom->yyyz*yyy + mom->yyzz*yyz + mom->yzzz*yzz + mom->zzzz*zzz;
Q4x = Q4xx*x + Q4xy*y + Q4xz*z;
Q4y = Q4xy*x + Q4yy*y + Q4yz*z;
Q4z = Q4xz*x + Q4yz*y + Q4zz*z;
Q2mir = 0.5*(Q2mirx*x + Q2miry*y + Q2mirz*z) - (Q3x*x + Q3y*y + Q3z*z) + Q4;
Q3mir = onethird*(Q3mirx*x + Q3miry*y + Q3mirz*z) - 0.5*(Q4x*x + Q4y*y + Q4z*z);
Q4mir = 0.25*(Q4mirx*x + Q4miry*y + Q4mirz*z);
Qta = g1*mom->m - g2*Q2 + g3*Q2mir + g4*Q3mir + g5*Q4mir;
fPot -= g0*mom->m - g1*Q2 + g2*Q2mir + g3*Q3mir + g4*Q4mir;
ax += g2*(Q2mirx - Q3x) + g3*(Q3mirx - Q4x) + g4*Q4mirx - x*Qta;
ay += g2*(Q2miry - Q3y) + g3*(Q3miry - Q4y) + g4*Q4miry - y*Qta;
az += g2*(Q2mirz - Q3z) + g3*(Q3mirz - Q4z) + g4*Q4mirz - z*Qta;
#else
Q2mirx = mom->xx*x + mom->xy*y + mom->xz*z;
Q2miry = mom->xy*x + mom->yy*y + mom->yz*z;
Q2mirz = mom->xz*x + mom->yz*y + mom->zz*z;
Q2mir = 0.5*(Q2mirx*x + Q2miry*y + Q2mirz*z);
Qta = g1*mom->totalMass - g2*Q2 + g3*Q2mir;
fPot -= g0*mom->totalMass - g1*Q2 + g2*Q2mir;
ax += g2*(Q2mirx) - x*Qta;
ay += g2*(Q2miry) - y*Qta;
az += g2*(Q2mirz) - z*Qta;
#endif
}
}
}
p->potential += fPot;
p->acceleration_x += ax;
p->acceleration_y += ay;
p->acceleration_z += az;
}
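/*
 * Reciprocal-space part of the Ewald sum: each particle accumulates the
 * contribution of every precomputed h-vector term in the constant-memory
 * table ewt, one cos/sin pair per term.
 */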
__global__ void EwaldBottomKernel(GravityParticleData *particleTable, int nPart) {
int i, id;
cudatype hdotx, c, s, fPot;
cudatype ax, ay, az;
cudatype tempEwt;
cudatype xdif, ydif, zdif;
GravityParticleData *p;
MultipoleMomentsData *mom;
mom = &(cachedData->mm);
id = blockIdx.x * BLOCK_SIZE + threadIdx.x + 1;
if (id > nPart) {
return;
}
p = &(particleTable[id]);
/*
** Scoring for the h-loop (+,*)
** Without trig = (10,14)
** Trig est. = 2*(6,11) same as 1/sqrt scoring.
** Total = (22,36)
** = 58
*/
fPot=0.0f;
ax=0.0f;
ay=0.0f;
az=0.0f;
xdif = p->position_x - mom->cmx;
ydif = p->position_y - mom->cmy;
zdif = p->position_z - mom->cmz;
for (i=0;i<cachedData->nEwhLoop;++i) {
hdotx = ewt[i].hx * xdif + ewt[i].hy * ydif + ewt[i].hz * zdif;
c = cosf(hdotx);
s = sinf(hdotx);
fPot += ewt[i].hCfac*c + ewt[i].hSfac*s;
tempEwt = ewt[i].hCfac*s - ewt[i].hSfac*c;
ax += ewt[i].hx * tempEwt;
ay += ewt[i].hy * tempEwt;
az += ewt[i].hz * tempEwt;
}
p->potential += fPot;
p->acceleration_x += ax;
p->acceleration_y += ay;
p->acceleration_z += az;
}