https://github.com/geodynamics/citcoms
Tip revision: 0473ed5e226f2a1bc702fb1c6f1210ee721ddc33 authored by Rene Gassmoeller on 03 November 2022, 16:37:30 UTC
Merge pull request #10 from ljhwang/patch-1
multigrid_kernel.cu
/* -*- C -*- */
/* vim:set ft=c: */
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 130)
/* for double-precision floating-point */
#error This code requires compute capability 1.3 or higher; try giving "-arch sm_13".
#endif
#include "global_defs.h"
#include "element_definitions.h"
#include <assert.h>
#include <stdio.h>
enum {
CAPS_PER_PROC = 1,
M = 1, /* cap # */
NSD = 3, /* Spatial extent: 3d */
MAX_EQN = NSD*14,
};
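/* A trimmed-down, device-friendly mirror of 'struct All_variables': only the
mesh sizes, equation tables and work arrays needed by the kernels below.
'term' is a lookup table built by collect_terms(); for each equation number it
lists the (node, local index) pairs whose matrix entries contribute to that
equation. */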
struct Some_variables {
int num_zero_resid;
int *zero_resid;
struct /*MESH_DATA*/ {
int NEQ;
int NNO;
} lmesh;
struct ID *ID;
higher_precision *Eqn_k[NSD+1];
int *Node_map;
double *BI;
double *temp;
unsigned int *NODE;
int2 *term;
};
/*------------------------------------------------------------------------*/
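/* construct_E() builds a deep copy of the parts of 'E' that the kernels need:
every array is cudaMalloc'd and filled with cudaMemcpy, the host-side shadow
's_E' collects the device pointers, and finally the shadow itself is copied to
the device as *d_E.  destroy_E() releases all of those allocations.

Note that none of the CUDA calls below check their return codes.  A minimal
error-checking wrapper -- an illustrative sketch only, not part of the
original code -- could look like:

    #define CUDA_CHECK(call)                                          \
        do {                                                          \
            cudaError_t err_ = (call);                                \
            if (err_ != cudaSuccess) {                                \
                fprintf(stderr, "CUDA error: %s (%s:%d)\n",           \
                        cudaGetErrorString(err_), __FILE__, __LINE__);\
                assert(0);                                            \
            }                                                         \
        } while (0)

and each call would then become e.g. CUDA_CHECK(cudaMalloc(...)). */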
static void construct_E(
struct Some_variables **d_E,
struct Some_variables *s_E, /* host's shadow copy of d_E */
struct Some_variables *E
)
{
/* construct a copy of 'E' in device memory */
int neq = E->lmesh.NEQ;
int nno = E->lmesh.NNO;
s_E->lmesh.NEQ = E->lmesh.NEQ;
s_E->lmesh.NNO = E->lmesh.NNO;
/* ID -- cf. allocate_common_vars()*/
cudaMalloc((void **)&s_E->ID, (nno+1)*sizeof(struct ID));
cudaMemcpy(s_E->ID, E->ID, (nno+1)*sizeof(struct ID), cudaMemcpyHostToDevice);
/* Eqn_k, Node_map -- cf. construct_node_maps() */
size_t matrix = MAX_EQN * nno;
s_E->Eqn_k[0] = 0;
cudaMalloc((void **)&s_E->Eqn_k[1], 3*matrix*sizeof(higher_precision));
s_E->Eqn_k[2] = s_E->Eqn_k[1] + matrix;
s_E->Eqn_k[3] = s_E->Eqn_k[2] + matrix;
cudaMemcpy(s_E->Eqn_k[1], E->Eqn_k[1], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->Eqn_k[2], E->Eqn_k[2], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->Eqn_k[3], E->Eqn_k[3], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
cudaMalloc((void **)&s_E->Node_map, matrix*sizeof(int));
cudaMemcpy(s_E->Node_map, E->Node_map, matrix*sizeof(int), cudaMemcpyHostToDevice);
/* BI -- cf. allocate_velocity_vars() */
cudaMalloc((void **)&s_E->BI, neq*sizeof(double));
cudaMemcpy(s_E->BI, E->BI, neq*sizeof(double), cudaMemcpyHostToDevice);
/* temp -- cf. allocate_velocity_vars() */
cudaMalloc((void **)&s_E->temp, (neq+1)*sizeof(double));
cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);
/* NODE -- cf. allocate_common_vars() */
cudaMalloc((void **)&s_E->NODE, (nno+1)*sizeof(unsigned int));
cudaMemcpy(s_E->NODE, E->NODE, (nno+1)*sizeof(unsigned int), cudaMemcpyHostToDevice);
/* term */
cudaMalloc((void **)&s_E->term, (neq+1) * MAX_EQN * sizeof(int2));
cudaMemcpy(s_E->term, E->term, (neq+1) * MAX_EQN * sizeof(int2), cudaMemcpyHostToDevice);
/* E */
cudaMalloc((void **)d_E, sizeof(struct Some_variables));
cudaMemcpy(*d_E, s_E, sizeof(struct Some_variables), cudaMemcpyHostToDevice);
return;
}
static void destroy_E(
struct Some_variables *d_E,
struct Some_variables *s_E
)
{
cudaFree(s_E->ID);
cudaFree(s_E->Eqn_k[1]);
cudaFree(s_E->Node_map);
cudaFree(s_E->BI);
cudaFree(s_E->temp);
cudaFree(s_E->NODE);
cudaFree(s_E->term);
cudaFree(d_E);
}
/*------------------------------------------------------------------------*/
/* from Element_calculations.c */
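/* n_assemble_del2_u() applies the assembled operator to u, as in
n_assemble_del2_u() in Element_calculations.c.  The launch uses one thread
block per (node n, direction doff), i.e. per equation eqn = E->ID[n].doff[doff],
with MAX_EQN threads per block; each thread computes one product term.
Part I gathers the "row" terms B[i]*u[C[i]] (the i >= 3 loop of the serial
code); Part II gathers the transposed terms through the precomputed E->term
table, which lists every (e, i) with Node_map(e)[i] == eqn.  A shared-memory
reduction then sums the MAX_EQN partial products and thread 0 writes Au[eqn]. */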
__global__ void n_assemble_del2_u(
struct Some_variables *E,
double *u, double *Au,
int strip_bcs
)
{
int n = blockIdx.x + 1; /* 1 <= n <= E->lmesh.NNO */
int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */
/* Each block writes one element of Au in global memory: Au[eqn]. */
int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */
if (n == 1 && doff == 1 && tid == 0) {
Au[E->lmesh.NEQ] = 0.0;
}
if (strip_bcs) {
/* See get_bcs_id_for_residual(). */
unsigned int flags = E->NODE[n];
unsigned int vb;
switch (doff) {
case 1: vb = VBX; break; /* 0x2 */
case 2: vb = VBY; break; /* 0x8 */
case 3: vb = VBZ; break; /* 0x4 */
}
if (flags & vb) {
/* no-op: Au[eqn] is zero */
if (tid == 0) {
Au[eqn] = 0.0;
}
/* XXX: Hundreds of blocks exit here (E->num_zero_resid).
Does it matter? */
return;
}
}
/* The partial sum computed by this thread. */
double acc;
/* Part I: The terms here are easily derived from the block and
thread indices. */
{
int e = n; /* 1 <= e <= E->lmesh.NNO */
int i = (int)tid; /* 0 <= i < MAX_EQN */
if (i < 3) {
acc = 0.0;
} else {
int *C = E->Node_map + (e-1)*MAX_EQN;
higher_precision *B = E->Eqn_k[doff]+(e-1)*MAX_EQN;
double UU = u[C[i]];
acc = B[i]*UU;
}
}
/* Part II: These terms are more complicated. */
{
int2 *term = E->term + eqn*MAX_EQN;
int2 pair = term[tid];
int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
int i = pair.y; /* 0 <= i < MAX_EQN */
if (i != -1) {
/* XXX: Compute these values? */
int eqn1 = E->ID[e].doff[1];
int eqn2 = E->ID[e].doff[2];
int eqn3 = E->ID[e].doff[3];
double U1 = u[eqn1];
double U2 = u[eqn2];
double U3 = u[eqn3];
higher_precision *B1, *B2, *B3;
B1 = E->Eqn_k[1]+(e-1)*MAX_EQN;
B2 = E->Eqn_k[2]+(e-1)*MAX_EQN;
B3 = E->Eqn_k[3]+(e-1)*MAX_EQN;
acc += B1[i]*U1 +
B2[i]*U2 +
B3[i]*U3;
} else {
/* XXX: A considerable number of threads idle here. */
}
}
/* Reduce the partial sums for this block.
Based on reduce2() in the CUDA SDK. */
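/* With MAX_EQN = 42 threads, the first step folds threads 32..41 into
entries 0..9; the strided loop (s = 16, 8, 4, 2, 1) then combines the
remaining 32 entries, leaving the block total in sum[0]. */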
__shared__ double sum[MAX_EQN];
sum[tid] = acc;
__syncthreads();
if (tid >= 32) {
sum[tid-32] += sum[tid];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (tid < s) {
sum[tid] += sum[tid + s];
}
/* XXX: not always necessary */
__syncthreads();
}
/* Each block writes one element of Au in global memory. */
if (tid == 0) {
Au[eqn] = sum[0];
}
return;
}
/*------------------------------------------------------------------------*/
/* These are based on the function from General_matrix_functions.c. */
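/* The Gauss-Seidel relaxation is split into several kernels, driven by
do_gauss_seidel() below:
  gauss_seidel_0   zeroes d0 and Ad (used only when no initial guess is given);
  gauss_seidel_1   initialises E->temp: (F - Ad)*BI for OFFSIDE nodes, zero elsewhere;
  gauss_seidel_2a  (and the alternative gauss_seidel_2b) performs the forward
                   sweep, walking the nodes sequentially inside a single block
                   so that later nodes see the updated E->temp values;
  gauss_seidel_3   accumulates the corrections into Ad (one block per equation,
                   via the E->term table) and adds E->temp into d0. */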
__global__ void gauss_seidel_0(
int neq,
double *d0,
double *Ad
)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < neq) {
d0[i] = Ad[i] = 0.0;
}
}
__global__ void gauss_seidel_1(
struct Some_variables *E,
int neq, int nno,
double *F, double *Ad
)
{
const double zeroo = 0.0;
int i = blockIdx.x * blockDim.x + threadIdx.x + 1;
if (i <= nno) {
/* 1 <= i <= E->lmesh.NNO */
int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
int eqn = E->ID[i].doff[doff];
if (E->NODE[i] & OFFSIDE) {
E->temp[eqn] = (F[eqn] - Ad[eqn])*E->BI[eqn];
} else {
E->temp[eqn] = zeroo;
}
if (i == 1 && doff == 1) {
E->temp[neq] = zeroo;
Ad[neq] = zeroo;
}
}
}
__global__ void gauss_seidel_2a(
struct Some_variables *E,
double *F, double *Ad
)
{
const int nno = E->lmesh.NNO;
int j = threadIdx.x + 3; /* 3 <= j < MAX_EQN */
int doff = threadIdx.y + 1; /* 1 <= doff <= NSD */
for (int i = 1; i <= nno; i++) {
int eqn = E->ID[i].doff[doff];
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B = E->Eqn_k[doff] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation, but
no communications are needed yet, because boundary Ad will
not be used for the G-S iterations for interior nodes */
__shared__ double UU[MAX_EQN-3];
if (threadIdx.y == 0) {
UU[threadIdx.x] = E->temp[C[j]];
}
__syncthreads();
__shared__ double Ad_eqn[NSD];
if (threadIdx.x == 0) {
Ad_eqn[threadIdx.y] = Ad[eqn];
}
__shared__ double sum[MAX_EQN-3][3];
/*for (int j = 3; j < MAX_EQN; j++)*/ {
sum[threadIdx.x][threadIdx.y] = B[j]*UU[threadIdx.x];
}
__syncthreads();
/* Reduce. */
if (threadIdx.x >= 32) {
sum[threadIdx.x-32][threadIdx.y] += sum[threadIdx.x][threadIdx.y];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (threadIdx.x < s) {
sum[threadIdx.x][threadIdx.y] += sum[threadIdx.x + s][threadIdx.y];
}
/* XXX: not always necessary */
__syncthreads();
}
if (threadIdx.x == 0) {
Ad_eqn[threadIdx.y] += sum[0][threadIdx.y];
Ad[eqn] = Ad_eqn[threadIdx.y];
if (!(E->NODE[i] & OFFSIDE)) {
E->temp[eqn] = (F[eqn] - Ad_eqn[threadIdx.y])*E->BI[eqn];
}
}
__syncthreads();
}
}
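/* gauss_seidel_2b is an alternative formulation of the same sweep: a
one-dimensional block of MAX_EQN-3 threads with three separate shared-memory
sums, one per direction.  do_gauss_seidel() currently launches gauss_seidel_2a
(see the if (1) / if (0) switches there). */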
__global__ void gauss_seidel_2b(
struct Some_variables *E,
double *F, double *Ad
)
{
const int nno = E->lmesh.NNO;
unsigned int tid = threadIdx.x;
for (int i = 1; i <= nno; i++) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = eqn1 + 1;
int eqn3 = eqn2 + 1;
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation, but
no communications are needed yet, because boundary Ad will
not be used for the G-S iterations for interior nodes */
double Ad_eqn1, Ad_eqn2, Ad_eqn3;
if (tid == 0) {
Ad_eqn1 = Ad[eqn1];
Ad_eqn2 = Ad[eqn2];
Ad_eqn3 = Ad[eqn3];
}
__shared__ double sum1[MAX_EQN-3];
__shared__ double sum2[MAX_EQN-3];
__shared__ double sum3[MAX_EQN-3];
int j = tid + 3; /* 3 <= j < MAX_EQN */
/*for (int j = 3; j < MAX_EQN; j++)*/ {
double UU = E->temp[C[j]];
sum1[tid] = B1[j]*UU;
sum2[tid] = B2[j]*UU;
sum3[tid] = B3[j]*UU;
}
__syncthreads();
/* Reduce. */
if (tid >= 32) {
sum1[tid-32] += sum1[tid];
sum2[tid-32] += sum2[tid];
sum3[tid-32] += sum3[tid];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (tid < s) {
sum1[tid] += sum1[tid + s];
sum2[tid] += sum2[tid + s];
sum3[tid] += sum3[tid + s];
}
/* XXX: not always necessary */
__syncthreads();
}
if (tid == 0) {
Ad_eqn1 += sum1[0];
Ad_eqn2 += sum2[0];
Ad_eqn3 += sum3[0];
Ad[eqn1] = Ad_eqn1;
Ad[eqn2] = Ad_eqn2;
Ad[eqn3] = Ad_eqn3;
if (!(E->NODE[i] & OFFSIDE)) {
E->temp[eqn1] = (F[eqn1] - Ad_eqn1)*E->BI[eqn1];
E->temp[eqn2] = (F[eqn2] - Ad_eqn2)*E->BI[eqn2];
E->temp[eqn3] = (F[eqn3] - Ad_eqn3)*E->BI[eqn3];
}
}
__syncthreads();
}
}
__global__ void gauss_seidel_3(
struct Some_variables *E,
double *d0,
double *Ad
)
{
int n = blockIdx.x + 1; /* 1 <= n <= E->lmesh.NNO */
int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */
/* Each block writes one element of Ad and d0 in global memory:
Ad[eqn], d0[eqn]. */
int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */
__shared__ double sum[MAX_EQN];
int2 *term = E->term + eqn*MAX_EQN;
int2 pair = term[tid];
int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
int i = pair.y; /* 0 <= i < MAX_EQN */
if (i != -1) {
/* XXX: Compute these values? */
int eqn1 = E->ID[e].doff[1];
int eqn2 = E->ID[e].doff[2];
int eqn3 = E->ID[e].doff[3];
higher_precision *B1, *B2, *B3;
B1 = E->Eqn_k[1]+(e-1)*MAX_EQN;
B2 = E->Eqn_k[2]+(e-1)*MAX_EQN;
B3 = E->Eqn_k[3]+(e-1)*MAX_EQN;
sum[tid] = B1[i]*E->temp[eqn1] +
B2[i]*E->temp[eqn2] +
B3[i]*E->temp[eqn3];
} else {
/* XXX: A considerable number of threads idle here. */
sum[tid] = 0.0;
}
__syncthreads();
/* Reduce the partial sums for this block.
Based on reduce2() in the CUDA SDK. */
if (tid >= 32) {
sum[tid-32] += sum[tid];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (tid < s) {
sum[tid] += sum[tid + s];
}
/* XXX: not always necessary */
__syncthreads();
}
if (tid == 0) {
/* Each block writes one element of Ad... */
Ad[eqn] += sum[0];
/* ..and one element of d0. */
d0[eqn] += E->temp[eqn];
}
}
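/* Forward declarations of the host (CPU) fallback implementations defined at
the bottom of this file; do_gauss_seidel() can route individual steps to them
through its if (0) / if (1) switches for debugging. */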
void host_n_assemble_del2_u(
struct Some_variables *E,
double *u, double *Au,
int strip_bcs
);
void host_gauss_seidel_0(
struct Some_variables *E,
double *d0,
double *Ad
);
void host_gauss_seidel_1(
struct Some_variables *E,
struct Some_variables *s_E,
double *F, double *Ad
);
void host_gauss_seidel_2(
struct Some_variables *E,
struct Some_variables *s_E,
double *F, double *Ad
);
void host_gauss_seidel_3(
struct Some_variables *E,
struct Some_variables *s_E,
double *d0,
double *Ad
);
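/* do_gauss_seidel() is the device-side driver: it builds the device copy of
'E', allocates d0, F and Ad on the GPU, forms the initial Ad (via
n_assemble_del2_u() when an initial guess is supplied, zeros otherwise), runs
*cycles relaxation steps of gauss_seidel_1 / _2a / _3, and finally copies Ad
and d0 back to the host. */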
void do_gauss_seidel(
struct Some_variables *E,
double *d0,
double *F, double *Ad,
double acc,
int *cycles,
int guess
)
{
int count, steps;
steps=*cycles;
/* pointers to device memory */
struct Some_variables *d_E = 0;
double *d_d0 = 0, *d_F = 0, *d_Ad = 0;
/* construct 'E' on the device */
struct Some_variables s_E;
construct_E(&d_E, &s_E, E);
int neq = E->lmesh.NEQ;
/* allocate memory on the device */
cudaMalloc((void**)&d_d0, (1+neq)*sizeof(double));
cudaMalloc((void**)&d_F, neq*sizeof(double));
cudaMalloc((void**)&d_Ad, (1+neq)*sizeof(double));
/* copy input to the device */
cudaMemcpy(d_F, F, neq*sizeof(double), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
static float totalElapsedTime = 0.0;
if (guess) {
/* copy more input to the device */
d0[E->lmesh.NEQ] = 0.0; /* normally done by n_assemble_del2_u() */
cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
dim3 block(MAX_EQN, 1, 1);
dim3 grid(E->lmesh.NNO, NSD, 1);
if (1) n_assemble_del2_u<<< grid, block >>>(d_E, d_d0, d_Ad, 1);
else host_n_assemble_del2_u(E, d_d0, d_Ad, 1);
} else {
const int blockSize = 64;
const int dim = E->lmesh.NEQ;
int gridSize = dim / blockSize;
if (dim % blockSize)
++gridSize;
dim3 block(blockSize, 1, 1);
dim3 grid(gridSize, 1, 1);
gauss_seidel_0<<< grid, block >>>(E->lmesh.NEQ, d_d0, d_Ad);
}
for (count = 0; count < steps; ++count) {
{
const int blockSize = 64;
const int dim = E->lmesh.NNO;
int gridSize = dim / blockSize;
if (dim % blockSize)
++gridSize;
dim3 block(blockSize, 1, 1);
dim3 grid(gridSize, NSD, 1);
cudaEventRecord(start, 0);
gauss_seidel_1<<< grid, block >>>(d_E, E->lmesh.NEQ, E->lmesh.NNO, d_F, d_Ad);
cudaEventRecord(stop, 0);
if (0) {
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
totalElapsedTime += elapsedTime;
if (count == steps - 1) {
printf("gauss_seidel_0 %d\t%f\t%f\n", count, elapsedTime, totalElapsedTime);
}
}
}
if (1) {
dim3 block(MAX_EQN - 3, NSD, 1);
dim3 grid(1, 1, 1);
gauss_seidel_2a<<< grid, block >>>(d_E, d_F, d_Ad);
}
if (0) {
dim3 block(MAX_EQN - 3, 1, 1);
dim3 grid(1, 1, 1);
gauss_seidel_2b<<< grid, block >>>(d_E, d_F, d_Ad);
}
if (0) host_gauss_seidel_2(E, &s_E, d_F, d_Ad);
/* Ad on boundaries differs after the following operation */
{
dim3 block(MAX_EQN, 1, 1);
dim3 grid(E->lmesh.NNO, NSD, 1);
if (1) gauss_seidel_3<<< grid, block >>>(d_E, d_d0, d_Ad);
else host_gauss_seidel_3(E, &s_E, d_d0, d_Ad);
}
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
/* wait for completion */
if (cudaThreadSynchronize() != cudaSuccess) {
assert(0 && "something went wrong");
}
/* copy output from device */
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(d0, d_d0, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
/* free device memory */
cudaFree(d_d0);
cudaFree(d_F);
cudaFree(d_Ad);
destroy_E(d_E, &s_E);
*cycles=count;
return;
}
/*------------------------------------------------------------------------*/
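/* assert_assumptions() spells out the conditions under which this CUDA path
is valid: the multigrid solver, a single cap per process, a 3-D mesh, and a
single MPI process. */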
static void assert_assumptions(struct All_variables *E, int level) {
assert(E->control.NMULTIGRID);
assert(E->sphere.caps_per_proc == CAPS_PER_PROC);
assert(E->mesh.nsd == NSD);
assert(E->parallel.nproc == 1);
}
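/* collect_terms() builds the inverse of Node_map: for every equation q it
records the (node e, local index i) pairs with Node_map(e)[i] == q, padding
unused slots with (-1, -1).  The kernels use this table to turn the serial
scatter "Au[C[i]] += ..." into a per-equation gather.  For example (with
hypothetical numbers): if node 7's Node_map row has C[5] == 123, the pair
(7, 5) is appended to the list for equation 123. */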
static int2 *collect_terms(
struct Some_variables *E
)
{
/* Map out how to parallelize "Au[C[i]] += ..." and "Ad[C[j]] += ...". */
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
/* Note: E->term is not yet set when collect_terms() runs (gauss_seidel()
assigns it from our return value), so the table is built in the freshly
allocated 'terms' buffer rather than through E->term. */
int2 *terms = (int2 *)malloc((neq+1) * MAX_EQN * sizeof(int2));
for (int e = 0; e <= neq; e++) {
int2 *term = terms + e*MAX_EQN;
for (int j = 0; j < MAX_EQN; j++) {
term[j].x = -1;
term[j].y = -1;
}
}
for (int e = 1; e <= nno; e++) {
int *C = E->Node_map + (e-1)*MAX_EQN;
for (int i = 0; i < MAX_EQN; i++) {
int2 *term = terms + C[i]*MAX_EQN;
int j;
for (j = 0; j < MAX_EQN; j++) {
if (term[j].x == -1) {
term[j].x = e;
term[j].y = i;
break;
}
}
assert(C[i] == neq || j < MAX_EQN);
}
}
return terms;
}
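/* gauss_seidel() is the entry point called from the C solver.  It packs the
level-specific fields of All_variables into a Some_variables, caches the term
table once per multigrid level, and hands off to do_gauss_seidel(). */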
extern "C" void gauss_seidel(
struct All_variables *E,
double **d0,
double **F, double **Ad,
double acc,
int *cycles,
int level,
int guess
)
{
struct Some_variables kE;
assert_assumptions(E, level);
/* initialize 'Some_variables' with 'All_variables' */
kE.num_zero_resid = E->num_zero_resid[level][M];
kE.zero_resid = E->zero_resid[level][M];
kE.lmesh.NEQ = E->lmesh.NEQ[level];
kE.lmesh.NNO = E->lmesh.NNO[level];
kE.ID = E->ID[level][M];
kE.Eqn_k[0] = 0;
kE.Eqn_k[1] = E->Eqn_k1[level][M];
kE.Eqn_k[2] = E->Eqn_k2[level][M];
kE.Eqn_k[3] = E->Eqn_k3[level][M];
kE.Node_map = E->Node_map[level][M];
kE.BI = E->BI[level][M];
kE.temp = E->temp[M];
kE.NODE = E->NODE[level][M];
static int2 *term[MAX_LEVELS];
if (!term[level])
term[level] = collect_terms(&kE);
kE.term = term[level];
do_gauss_seidel(
&kE,
d0[M],
F[M], Ad[M],
acc,
cycles,
guess
);
}
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
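/* Host reference implementations.  Each host_* step synchronises the device,
pulls the buffers it needs back to the host where necessary, recomputes the
step with the serial code the kernels were derived from, and uploads the
result again, so any kernel above can be swapped out for debugging without
changing the rest of the pipeline. */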
void host_strip_bcs_from_residual(
struct Some_variables *E,
double *Res
)
{
for(int i = 1; i <= E->num_zero_resid; i++)
Res[E->zero_resid[i]] = 0.0;
}
void host_n_assemble_del2_u(
struct Some_variables *E,
double *d_u, double *d_Au,
int strip_bcs
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *u = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(u, d_u, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
double *Au = (double *)malloc((1+neq)*sizeof(double));
for (int e = 0; e <= neq; e++) {
Au[e] = 0.0;
}
u[neq] = 0.0;
for (int e = 1; e <= nno; e++) {
int eqn1 = E->ID[e].doff[1];
int eqn2 = E->ID[e].doff[2];
int eqn3 = E->ID[e].doff[3];
double U1 = u[eqn1];
double U2 = u[eqn2];
double U3 = u[eqn3];
int *C = E->Node_map + (e-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (e-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (e-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (e-1)*MAX_EQN;
for (int i = 3; i < MAX_EQN; i++) {
double UU = u[C[i]];
Au[eqn1] += B1[i]*UU;
Au[eqn2] += B2[i]*UU;
Au[eqn3] += B3[i]*UU;
}
for (int i = 0; i < MAX_EQN; i++) {
Au[C[i]] += B1[i]*U1 +
B2[i]*U2 +
B3[i]*U3;
}
}
if (strip_bcs)
host_strip_bcs_from_residual(E,Au);
cudaMemcpy(d_Au, Au, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
free(u);
free(Au);
return;
}
void host_gauss_seidel_0(
struct Some_variables *E,
double *d_d0,
double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
double *d0 = (double *)malloc((1+neq)*sizeof(double));
double *Ad = (double *)malloc((1+neq)*sizeof(double));
const double zeroo = 0.0;
for (int i = 0; i < neq; i++) {
d0[i] = Ad[i] = zeroo;
}
cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
free(d0);
free(Ad);
}
void host_gauss_seidel_1(
struct Some_variables *E,
struct Some_variables *s_E,
double *d_F, double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *F = (double *)malloc(neq*sizeof(double));
cudaMemcpy(F, d_F, neq*sizeof(double), cudaMemcpyDeviceToHost);
double *Ad = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
const double zeroo = 0.0;
for (int j = 0; j <= neq; j++) {
E->temp[j] = zeroo;
}
Ad[neq] = zeroo;
for (int i = 1; i <= nno; i++) {
if (E->NODE[i] & OFFSIDE) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = E->ID[i].doff[2];
int eqn3 = E->ID[i].doff[3];
E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
}
}
/* Ad[neq] */
cudaMemcpy(d_Ad + neq, Ad + neq, sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);
free(F);
free(Ad);
}
void host_gauss_seidel_2(
struct Some_variables *E,
struct Some_variables *s_E,
double *d_F, double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *F = (double *)malloc(neq*sizeof(double));
cudaMemcpy(F, d_F, neq*sizeof(double), cudaMemcpyDeviceToHost);
double *Ad = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(E->temp, s_E->temp, (neq+1)*sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 1; i <= nno; i++) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = E->ID[i].doff[2];
int eqn3 = E->ID[i].doff[3];
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation, but
no communications are needed yet, because boundary Ad will
not be used for the G-S iterations for interior nodes */
for (int j = 3; j < MAX_EQN; j++) {
double UU = E->temp[C[j]];
Ad[eqn1] += B1[j]*UU;
Ad[eqn2] += B2[j]*UU;
Ad[eqn3] += B3[j]*UU;
}
if (!(E->NODE[i] & OFFSIDE)) {
E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
}
}
cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);
free(F);
free(Ad);
}
void host_gauss_seidel_3(
struct Some_variables *E,
struct Some_variables *s_E,
double *d_d0,
double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *d0 = (double *)malloc((1+neq)*sizeof(double));
double *Ad = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(d0, d_d0, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(E->temp, s_E->temp, (neq+1)*sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 1; i <= nno; i++) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = E->ID[i].doff[2];
int eqn3 = E->ID[i].doff[3];
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation */
for (int j = 0; j < MAX_EQN; j++) {
Ad[C[j]] += B1[j]*E->temp[eqn1] +
B2[j]*E->temp[eqn2] +
B3[j]*E->temp[eqn3];
}
d0[eqn1] += E->temp[eqn1];
d0[eqn2] += E->temp[eqn2];
d0[eqn3] += E->temp[eqn3];
}
cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
free(d0);
free(Ad);
}