https://github.com/geodynamics/citcoms
Revision df821c06952128812763ec5b7a944cd178ca5ed3 (tip), authored and committed by Thorsten Becker on 08 December 2010, 18:43:58 UTC
Added definition of TRUE (1) and FALSE (0) in case undefined (was defined for GMT/ggrd compile before).
1 parent cf7495e
multigrid_kernel.cu
/* -*- C -*- */
/* vim:set ft=c: */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 130
/* for double-precision floating-point */
#error This code requires compute capability 1.3 or higher; try giving "-arch sm_13".
#endif
#include "global_defs.h"
#include "element_definitions.h"
#include <assert.h>
#include <stdio.h>
enum {
CAPS_PER_PROC = 1,
M = 1, /* cap # */
NSD = 3, /* Spatial extent: 3d */
MAX_EQN = NSD*14,
};
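/* Node_map and Eqn_k hold MAX_EQN = 42 entries per node: presumably the
   NSD = 3 velocity components of up to 14 coupled nodes, cf.
   construct_node_maps() in the CPU code. */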
struct Some_variables {
int num_zero_resid;
int *zero_resid;
struct /*MESH_DATA*/ {
int NEQ;
int NNO;
} lmesh;
struct ID *ID;
higher_precision *Eqn_k[NSD+1];
int *Node_map;
double *BI;
double *temp;
unsigned int *NODE;
int2 *term;
};
/*------------------------------------------------------------------------*/
static void construct_E(
struct Some_variables **d_E,
struct Some_variables *s_E, /* host's shadow copy of d_E */
struct Some_variables *E
)
{
/* construct a copy of 'E' in device memory */
int neq = E->lmesh.NEQ;
int nno = E->lmesh.NNO;
s_E->lmesh.NEQ = E->lmesh.NEQ;
s_E->lmesh.NNO = E->lmesh.NNO;
/* ID -- cf. allocate_common_vars()*/
cudaMalloc((void **)&s_E->ID, (nno+1)*sizeof(struct ID));
cudaMemcpy(s_E->ID, E->ID, (nno+1)*sizeof(struct ID), cudaMemcpyHostToDevice);
/* Eqn_k, Node_map -- cf. construct_node_maps() */
size_t matrix = MAX_EQN * nno;
s_E->Eqn_k[0] = 0;
cudaMalloc((void **)&s_E->Eqn_k[1], 3*matrix*sizeof(higher_precision));
s_E->Eqn_k[2] = s_E->Eqn_k[1] + matrix;
s_E->Eqn_k[3] = s_E->Eqn_k[2] + matrix;
cudaMemcpy(s_E->Eqn_k[1], E->Eqn_k[1], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->Eqn_k[2], E->Eqn_k[2], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->Eqn_k[3], E->Eqn_k[3], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
cudaMalloc((void **)&s_E->Node_map, matrix*sizeof(int));
cudaMemcpy(s_E->Node_map, E->Node_map, matrix*sizeof(int), cudaMemcpyHostToDevice);
/* BI -- cf. allocate_velocity_vars() */
cudaMalloc((void **)&s_E->BI, neq*sizeof(double));
cudaMemcpy(s_E->BI, E->BI, neq*sizeof(double), cudaMemcpyHostToDevice);
/* temp -- cf. allocate_velocity_vars() */
cudaMalloc((void **)&s_E->temp, (neq+1)*sizeof(double));
cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);
/* NODE -- cf. allocate_common_vars() */
cudaMalloc((void **)&s_E->NODE, (nno+1)*sizeof(unsigned int));
cudaMemcpy(s_E->NODE, E->NODE, (nno+1)*sizeof(unsigned int), cudaMemcpyHostToDevice);
/* term */
cudaMalloc((void **)&s_E->term, (neq+1) * MAX_EQN * sizeof(int2));
cudaMemcpy(s_E->term, E->term, (neq+1) * MAX_EQN * sizeof(int2), cudaMemcpyHostToDevice);
/* E */
cudaMalloc((void **)d_E, sizeof(struct Some_variables));
cudaMemcpy(*d_E, s_E, sizeof(struct Some_variables), cudaMemcpyHostToDevice);
return;
}
static void destroy_E(
struct Some_variables *d_E,
struct Some_variables *s_E
)
{
cudaFree(s_E->ID);
cudaFree(s_E->Eqn_k[1]);
cudaFree(s_E->Node_map);
cudaFree(s_E->BI);
cudaFree(s_E->temp);
cudaFree(s_E->NODE);
cudaFree(s_E->term);
cudaFree(d_E);
}
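/* None of the CUDA API calls in construct_E()/destroy_E() are checked
   for errors.  A minimal sketch of a checking wrapper (hypothetical;
   not used anywhere in this file) could look like this: */
#if 0
#include <stdlib.h>
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "%s:%d: CUDA error: %s\n",                  \
                    __FILE__, __LINE__, cudaGetErrorString(err_));      \
            exit(EXIT_FAILURE);                                         \
        }                                                               \
    } while (0)
/* usage: CUDA_CHECK(cudaMalloc((void **)&s_E->BI, neq*sizeof(double))); */
#endif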
/*------------------------------------------------------------------------*/
/* from Element_calculations.c */
__global__ void n_assemble_del2_u(
struct Some_variables *E,
double *u, double *Au,
int strip_bcs
)
{
int n = blockIdx.x + 1; /* 1 <= n <= E->lmesh.NNO */
int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */
/* Each block writes one element of Au in global memory: Au[eqn]. */
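/* host_n_assemble_del2_u() below forms Au with both a gather
   (Au[eqn] += B[i]*u[C[i]]) and a scatter (Au[C[i]] += B1[i]*U1 + ...).
   Concurrent blocks cannot scatter safely, so the scatter is recast as
   a gather through the precomputed inverse map E->term (see
   collect_terms()): Part I accumulates the gather terms for this eqn,
   Part II the former scatter terms that target it. */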
int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */
if (n == 1 && doff == 1 && tid == 0) {
Au[E->lmesh.NEQ] = 0.0;
}
if (strip_bcs) {
/* See get_bcs_id_for_residual(). */
unsigned int flags = E->NODE[n];
unsigned int vb;
switch (doff) {
case 1: vb = VBX; break; /* 0x2 */
case 2: vb = VBY; break; /* 0x8 */
case 3: vb = VBZ; break; /* 0x4 */
}
if (flags & vb) {
/* no-op: Au[eqn] is zero */
if (tid == 0) {
Au[eqn] = 0.0;
}
/* XXX: Hundreds of blocks exit here (E->num_zero_resid).
Does it matter? */
return;
}
}
/* The partial sum computed by this thread. */
double acc;
/* Part I: The terms here are easily derived from the block and
thread indices. */
{
int e = n; /* 1 <= e <= E->lmesh.NNO */
int i = (int)tid; /* 0 <= i < MAX_EQN */
if (i < 3) {
acc = 0.0;
} else {
int *C = E->Node_map + (e-1)*MAX_EQN;
higher_precision *B = E->Eqn_k[doff]+(e-1)*MAX_EQN;
double UU = u[C[i]];
acc = B[i]*UU;
}
}
/* Part II: These terms are more complicated. */
{
int2 *term = E->term + eqn*MAX_EQN;
int2 pair = term[tid];
int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
int i = pair.y; /* 0 <= i < MAX_EQN */
if (i != -1) {
/* XXX: Compute these values? */
int eqn1 = E->ID[e].doff[1];
int eqn2 = E->ID[e].doff[2];
int eqn3 = E->ID[e].doff[3];
double U1 = u[eqn1];
double U2 = u[eqn2];
double U3 = u[eqn3];
higher_precision *B1, *B2, *B3;
B1 = E->Eqn_k[1]+(e-1)*MAX_EQN;
B2 = E->Eqn_k[2]+(e-1)*MAX_EQN;
B3 = E->Eqn_k[3]+(e-1)*MAX_EQN;
acc += B1[i]*U1 +
B2[i]*U2 +
B3[i]*U3;
} else {
/* XXX: A considerable number of threads idle here. */
}
}
/* Reduce the partial sums for this block.
Based on reduce2() in the CUDA SDK. */
__shared__ double sum[MAX_EQN];
sum[tid] = acc;
__syncthreads();
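/* MAX_EQN = 42 is not a power of two: fold entries 32..41 onto 0..9
   first, then run a conventional power-of-two tree over the remaining
   32 entries. */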
if (tid >= 32) {
sum[tid-32] += sum[tid];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (tid < s) {
sum[tid] += sum[tid + s];
}
/* XXX: not always necessary */
__syncthreads();
}
/* Each block writes one element of Au in global memory. */
if (tid == 0) {
Au[eqn] = sum[0];
}
return;
}
/*------------------------------------------------------------------------*/
/* These are based on the function from General_matrix_functions.c. */
__global__ void gauss_seidel_0(
int neq,
double *d0,
double *Ad
)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < neq) {
d0[i] = Ad[i] = 0.0;
}
}
__global__ void gauss_seidel_1(
struct Some_variables *E,
int neq, int nno,
double *F, double *Ad
)
{
const double zeroo = 0.0;
int i = blockIdx.x * blockDim.x + threadIdx.x + 1;
if (i <= nno) {
/* 1 <= i <= E->lmesh.NNO */
int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
int eqn = E->ID[i].doff[doff];
if (E->NODE[i] & OFFSIDE) {
E->temp[eqn] = (F[eqn] - Ad[eqn])*E->BI[eqn];
} else {
E->temp[eqn] = zeroo;
}
if (i == 1 && doff == 1) {
E->temp[neq] = zeroo;
Ad[neq] = zeroo;
}
}
}
__global__ void gauss_seidel_2a(
struct Some_variables *E,
double *F, double *Ad
)
{
const int nno = E->lmesh.NNO;
int j = threadIdx.x + 3; /* 3 <= j < MAX_EQN */
int doff = threadIdx.y + 1; /* 1 <= doff <= NSD */
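/* The loop over nodes stays inside this single-block kernel because the
   sweep is sequential: updating node i reads E->temp values written
   while processing earlier nodes (Gauss-Seidel, not Jacobi). */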
for (int i = 1; i <= nno; i++) {
int eqn = E->ID[i].doff[doff];
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B = E->Eqn_k[doff] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation, but
no communications are needed yet, because boundary Ad will
not be used for the G-S iterations for interior nodes */
__shared__ double UU[MAX_EQN-3];
if (threadIdx.y == 0) {
UU[threadIdx.x] = E->temp[C[j]];
}
__syncthreads();
__shared__ double Ad_eqn[NSD];
if (threadIdx.x == 0) {
Ad_eqn[threadIdx.y] = Ad[eqn];
}
__shared__ double sum[MAX_EQN-3][3];
/*for (int j = 3; j < MAX_EQN; j++)*/ {
sum[threadIdx.x][threadIdx.y] = B[j]*UU[threadIdx.x];
}
__syncthreads();
/* Reduce. */
if (threadIdx.x >= 32) {
sum[threadIdx.x-32][threadIdx.y] += sum[threadIdx.x][threadIdx.y];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (threadIdx.x < s) {
sum[threadIdx.x][threadIdx.y] += sum[threadIdx.x + s][threadIdx.y];
}
/* XXX: not always necessary */
__syncthreads();
}
if (threadIdx.x == 0) {
Ad_eqn[threadIdx.y] += sum[0][threadIdx.y];
Ad[eqn] = Ad_eqn[threadIdx.y];
if (!(E->NODE[i] & OFFSIDE)) {
E->temp[eqn] = (F[eqn] - Ad_eqn[threadIdx.y])*E->BI[eqn];
}
}
__syncthreads();
}
}
__global__ void gauss_seidel_2b(
struct Some_variables *E,
double *F, double *Ad
)
{
const int nno = E->lmesh.NNO;
unsigned int tid = threadIdx.x;
for (int i = 1; i <= nno; i++) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = eqn1 + 1;
int eqn3 = eqn2 + 1;
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation, but
no communications are needed yet, because boundary Ad will
not be used for the G-S iterations for interior nodes */
double Ad_eqn1, Ad_eqn2, Ad_eqn3;
if (tid == 0) {
Ad_eqn1 = Ad[eqn1];
Ad_eqn2 = Ad[eqn2];
Ad_eqn3 = Ad[eqn3];
}
__shared__ double sum1[MAX_EQN-3];
__shared__ double sum2[MAX_EQN-3];
__shared__ double sum3[MAX_EQN-3];
int j = tid + 3; /* 3 <= j < MAX_EQN */
/*for (int j = 3; j < MAX_EQN; j++)*/ {
double UU = E->temp[C[j]];
sum1[tid] = B1[j]*UU;
sum2[tid] = B2[j]*UU;
sum3[tid] = B3[j]*UU;
}
__syncthreads();
/* Reduce. */
if (tid >= 32) {
sum1[tid-32] += sum1[tid];
sum2[tid-32] += sum2[tid];
sum3[tid-32] += sum3[tid];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (tid < s) {
sum1[tid] += sum1[tid + s];
sum2[tid] += sum2[tid + s];
sum3[tid] += sum3[tid + s];
}
/* XXX: not always necessary */
__syncthreads();
}
if (tid == 0) {
Ad_eqn1 += sum1[0];
Ad_eqn2 += sum2[0];
Ad_eqn3 += sum3[0];
Ad[eqn1] = Ad_eqn1;
Ad[eqn2] = Ad_eqn2;
Ad[eqn3] = Ad_eqn3;
if (!(E->NODE[i] & OFFSIDE)) {
E->temp[eqn1] = (F[eqn1] - Ad_eqn1)*E->BI[eqn1];
E->temp[eqn2] = (F[eqn2] - Ad_eqn2)*E->BI[eqn2];
E->temp[eqn3] = (F[eqn3] - Ad_eqn3)*E->BI[eqn3];
}
}
__syncthreads();
}
}
__global__ void gauss_seidel_3(
struct Some_variables *E,
double *d0,
double *Ad
)
{
int n = blockIdx.x + 1; /* 1 <= n <= E->lmesh.NNO */
int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */
/* Each block writes one element of Ad and d0 in global memory:
Ad[eqn], d0[eqn]. */
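/* Same inverse-map gather as Part II of n_assemble_del2_u(). */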
int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */
__shared__ double sum[MAX_EQN];
int2 *term = E->term + eqn*MAX_EQN;
int2 pair = term[tid];
int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
int i = pair.y; /* 0 <= i < MAX_EQN */
if (i != -1) {
/* XXX: Compute these values? */
int eqn1 = E->ID[e].doff[1];
int eqn2 = E->ID[e].doff[2];
int eqn3 = E->ID[e].doff[3];
higher_precision *B1, *B2, *B3;
B1 = E->Eqn_k[1]+(e-1)*MAX_EQN;
B2 = E->Eqn_k[2]+(e-1)*MAX_EQN;
B3 = E->Eqn_k[3]+(e-1)*MAX_EQN;
sum[tid] = B1[i]*E->temp[eqn1] +
B2[i]*E->temp[eqn2] +
B3[i]*E->temp[eqn3];
} else {
/* XXX: A considerable number of threads idle here. */
sum[tid] = 0.0;
}
__syncthreads();
/* Reduce the partial sums for this block.
Based on reduce2() in the CUDA SDK. */
if (tid >= 32) {
sum[tid-32] += sum[tid];
}
__syncthreads();
for (unsigned int s = 16; s > 0; s >>= 1) {
if (tid < s) {
sum[tid] += sum[tid + s];
}
/* XXX: not always necessary */
__syncthreads();
}
if (tid == 0) {
/* Each block writes one element of Ad... */
Ad[eqn] += sum[0];
/* ..and one element of d0. */
d0[eqn] += E->temp[eqn];
}
}
void host_n_assemble_del2_u(
struct Some_variables *E,
double *u, double *Au,
int strip_bcs
);
void host_gauss_seidel_0(
struct Some_variables *E,
double *d0,
double *Ad
);
void host_gauss_seidel_1(
struct Some_variables *E,
struct Some_variables *s_E,
double *F, double *Ad
);
void host_gauss_seidel_2(
struct Some_variables *E,
struct Some_variables *s_E,
double *F, double *Ad
);
void host_gauss_seidel_3(
struct Some_variables *E,
struct Some_variables *s_E,
double *d0,
double *Ad
);
void do_gauss_seidel(
struct Some_variables *E,
double *d0,
double *F, double *Ad,
double acc,
int *cycles,
int guess
)
{
int count, steps;
steps=*cycles;
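/* Note: 'acc' is accepted for interface compatibility but never used;
   this version always runs a fixed number of cycles. */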
/* pointers to device memory */
struct Some_variables *d_E = 0;
double *d_d0 = 0, *d_F = 0, *d_Ad = 0;
/* construct 'E' on the device */
struct Some_variables s_E;
construct_E(&d_E, &s_E, E);
int neq = E->lmesh.NEQ;
/* allocate memory on the device */
cudaMalloc((void**)&d_d0, (1+neq)*sizeof(double));
cudaMalloc((void**)&d_F, neq*sizeof(double));
cudaMalloc((void**)&d_Ad, (1+neq)*sizeof(double));
/* copy input to the device */
cudaMemcpy(d_F, F, neq*sizeof(double), cudaMemcpyHostToDevice);
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
static float totalElapsedTime = 0.0;
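/* The if(0)/if(1) constants below toggle between the CUDA kernels and
   the host_* reference implementations, presumably for debugging. */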
if (guess) {
/* copy more input to the device */
d0[E->lmesh.NEQ] = 0.0; /* normally done by n_assemble_del2_u() */
cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
dim3 block(MAX_EQN, 1, 1);
dim3 grid(E->lmesh.NNO, NSD, 1);
if (1) n_assemble_del2_u<<< grid, block >>>(d_E, d_d0, d_Ad, 1);
else host_n_assemble_del2_u(E, d_d0, d_Ad, 1);
} else {
const int blockSize = 64;
const int dim = E->lmesh.NEQ;
int gridSize = dim / blockSize;
if (dim % blockSize)
++gridSize;
dim3 block(blockSize, 1, 1);
dim3 grid(gridSize, 1, 1);
gauss_seidel_0<<< grid, block >>>(E->lmesh.NEQ, d_d0, d_Ad);
}
for (count = 0; count < steps; ++count) {
{
const int blockSize = 64;
const int dim = E->lmesh.NNO;
int gridSize = dim / blockSize;
if (dim % blockSize)
++gridSize;
dim3 block(blockSize, 1, 1);
dim3 grid(gridSize, NSD, 1);
cudaEventRecord(start, 0);
gauss_seidel_1<<< grid, block >>>(d_E, E->lmesh.NEQ, E->lmesh.NNO, d_F, d_Ad);
cudaEventRecord(stop, 0);
if (0) {
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
totalElapsedTime += elapsedTime;
if (count == steps - 1) {
printf("gauss_seidel_0 %d\t%f\t%f\n", count, elapsedTime, totalElapsedTime);
}
}
}
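/* Three interchangeable implementations of the same sweep:
   gauss_seidel_2a (a (MAX_EQN-3) x NSD thread block), gauss_seidel_2b
   (a one-dimensional block), and the host reference.  Exactly one of
   the three switches below should be enabled. */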
if (1) {
dim3 block(MAX_EQN - 3, NSD, 1);
dim3 grid(1, 1, 1);
gauss_seidel_2a<<< grid, block >>>(d_E, d_F, d_Ad);
}
if (0) {
dim3 block(MAX_EQN - 3, 1, 1);
dim3 grid(1, 1, 1);
gauss_seidel_2b<<< grid, block >>>(d_E, d_F, d_Ad);
}
if (0) host_gauss_seidel_2(E, &s_E, d_F, d_Ad);
/* Ad on boundaries differs after the following operation */
{
dim3 block(MAX_EQN, 1, 1);
dim3 grid(E->lmesh.NNO, NSD, 1);
if (1) gauss_seidel_3<<< grid, block >>>(d_E, d_d0, d_Ad);
else host_gauss_seidel_3(E, &s_E, d_d0, d_Ad);
}
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
/* wait for completion */
if (cudaThreadSynchronize() != cudaSuccess) {
assert(0 && "something went wrong");
}
/* copy output from device */
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(d0, d_d0, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
/* free device memory */
cudaFree(d_d0);
cudaFree(d_F);
cudaFree(d_Ad);
destroy_E(d_E, &s_E);
*cycles=count;
return;
}
/*------------------------------------------------------------------------*/
static void assert_assumptions(struct All_variables *E, int level) {
assert(E->control.NMULTIGRID);
assert(E->sphere.caps_per_proc == CAPS_PER_PROC);
assert(E->mesh.nsd == NSD);
assert(E->parallel.nproc == 1);
}
static int2 *collect_terms(
struct Some_variables *E
)
{
/* Map out how to parallelize "Au[C[i]] += ..." and "Ad[C[j]] += ...". */
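/* For each equation number eqn, term[eqn*MAX_EQN + j] lists the
   (node e, slot i) pairs whose Node_map entry C[i] equals eqn,
   terminated by x == -1.  With this inverse map a kernel can gather
   every contribution that the host code scatters into Au[eqn]. */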
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
int2 *term = (int2 *)malloc((neq+1) * MAX_EQN * sizeof(int2));
for (int e = 0; e <= neq; e++) {
int2 *term = E->term + e*MAX_EQN;
for (int j = 0; j < MAX_EQN; j++) {
term[j].x = -1;
term[j].y = -1;
}
}
for (int e = 1; e <= nno; e++) {
int *C = E->Node_map + (e-1)*MAX_EQN;
for (int i = 0; i < MAX_EQN; i++) {
int2 *term = E->term + C[i]*MAX_EQN;
int j;
for (j = 0; j < MAX_EQN; j++) {
if (term[j].x == -1) {
term[j].x = e;
term[j].y = i;
break;
}
}
assert(C[i] == neq || j < MAX_EQN);
}
}
return term;
}
extern "C" void gauss_seidel(
struct All_variables *E,
double **d0,
double **F, double **Ad,
double acc,
int *cycles,
int level,
int guess
)
{
struct Some_variables kE;
assert_assumptions(E, level);
/* initialize 'Some_variables' with 'All_variables' */
kE.num_zero_resid = E->num_zero_resid[level][M];
kE.zero_resid = E->zero_resid[level][M];
kE.lmesh.NEQ = E->lmesh.NEQ[level];
kE.lmesh.NNO = E->lmesh.NNO[level];
kE.ID = E->ID[level][M];
kE.Eqn_k[0] = 0;
kE.Eqn_k[1] = E->Eqn_k1[level][M];
kE.Eqn_k[2] = E->Eqn_k2[level][M];
kE.Eqn_k[3] = E->Eqn_k3[level][M];
kE.Node_map = E->Node_map[level][M];
kE.BI = E->BI[level][M];
kE.temp = E->temp[M];
kE.NODE = E->NODE[level][M];
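/* The term table depends only on the mesh connectivity at this level,
   so build it once and cache it for the lifetime of the process. */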
static int2 *term[MAX_LEVELS];
if (!term[level])
term[level] = collect_terms(&kE);
kE.term = term[level];
do_gauss_seidel(
&kE,
d0[M],
F[M], Ad[M],
acc,
cycles,
guess
);
}
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
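/* Host reference implementations of the kernels above.  Each one
   mirrors its device counterpart, copying the device buffers to the
   host and back on every call; they can be swapped in via the
   if(0)/if(1) switches in do_gauss_seidel(). */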
void host_strip_bcs_from_residual(
struct Some_variables *E,
double *Res
)
{
for(int i = 1; i <= E->num_zero_resid; i++)
Res[E->zero_resid[i]] = 0.0;
}
void host_n_assemble_del2_u(
struct Some_variables *E,
double *d_u, double *d_Au,
int strip_bcs
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *u = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(u, d_u, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
double *Au = (double *)malloc((1+neq)*sizeof(double));
for (int e = 0; e <= neq; e++) {
Au[e] = 0.0;
}
u[neq] = 0.0;
for (int e = 1; e <= nno; e++) {
int eqn1 = E->ID[e].doff[1];
int eqn2 = E->ID[e].doff[2];
int eqn3 = E->ID[e].doff[3];
double U1 = u[eqn1];
double U2 = u[eqn2];
double U3 = u[eqn3];
int *C = E->Node_map + (e-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (e-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (e-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (e-1)*MAX_EQN;
for (int i = 3; i < MAX_EQN; i++) {
double UU = u[C[i]];
Au[eqn1] += B1[i]*UU;
Au[eqn2] += B2[i]*UU;
Au[eqn3] += B3[i]*UU;
}
for (int i = 0; i < MAX_EQN; i++) {
Au[C[i]] += B1[i]*U1 +
B2[i]*U2 +
B3[i]*U3;
}
}
if (strip_bcs)
host_strip_bcs_from_residual(E,Au);
cudaMemcpy(d_Au, Au, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
free(u);
free(Au);
return;
}
void host_gauss_seidel_0(
struct Some_variables *E,
double *d_d0,
double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
double *d0 = (double *)malloc((1+neq)*sizeof(double));
double *Ad = (double *)malloc((1+neq)*sizeof(double));
const double zeroo = 0.0;
for (int i = 0; i < neq; i++) {
d0[i] = Ad[i] = zeroo;
}
cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
free(d0);
free(Ad);
}
void host_gauss_seidel_1(
struct Some_variables *E,
struct Some_variables *s_E,
double *d_F, double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *F = (double *)malloc(neq*sizeof(double));
cudaMemcpy(F, d_F, neq*sizeof(double), cudaMemcpyDeviceToHost);
double *Ad = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
const double zeroo = 0.0;
for (int j = 0; j <= neq; j++) {
E->temp[j] = zeroo;
}
Ad[neq] = zeroo;
for (int i = 1; i <= nno; i++) {
if (E->NODE[i] & OFFSIDE) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = E->ID[i].doff[2];
int eqn3 = E->ID[i].doff[3];
E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
}
}
/* Ad[neq] */
cudaMemcpy(d_Ad + neq, Ad + neq, sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);
free(F);
free(Ad);
}
void host_gauss_seidel_2(
struct Some_variables *E,
struct Some_variables *s_E,
double *d_F, double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *F = (double *)malloc(neq*sizeof(double));
cudaMemcpy(F, d_F, neq*sizeof(double), cudaMemcpyDeviceToHost);
double *Ad = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(E->temp, s_E->temp, (neq+1)*sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 1; i <= nno; i++) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = E->ID[i].doff[2];
int eqn3 = E->ID[i].doff[3];
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation, but
no communications are needed yet, because boundary Ad will
not be used for the G-S iterations for interior nodes */
for (int j = 3; j < MAX_EQN; j++) {
double UU = E->temp[C[j]];
Ad[eqn1] += B1[j]*UU;
Ad[eqn2] += B2[j]*UU;
Ad[eqn3] += B3[j]*UU;
}
if (!(E->NODE[i] & OFFSIDE)) {
E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
}
}
cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);
free(F);
free(Ad);
}
void host_gauss_seidel_3(
struct Some_variables *E,
struct Some_variables *s_E,
double *d_d0,
double *d_Ad
)
{
cudaThreadSynchronize();
const int neq = E->lmesh.NEQ;
const int nno = E->lmesh.NNO;
double *d0 = (double *)malloc((1+neq)*sizeof(double));
double *Ad = (double *)malloc((1+neq)*sizeof(double));
cudaMemcpy(d0, d_d0, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(E->temp, s_E->temp, (neq+1)*sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 1; i <= nno; i++) {
int eqn1 = E->ID[i].doff[1];
int eqn2 = E->ID[i].doff[2];
int eqn3 = E->ID[i].doff[3];
int *C = E->Node_map + (i-1)*MAX_EQN;
higher_precision *B1,*B2,*B3;
B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;
/* Ad on boundaries differs after the following operation */
for (int j = 0; j < MAX_EQN; j++) {
Ad[C[j]] += B1[j]*E->temp[eqn1] +
B2[j]*E->temp[eqn2] +
B3[j]*E->temp[eqn3];
}
d0[eqn1] += E->temp[eqn1];
d0[eqn2] += E->temp[eqn2];
d0[eqn3] += E->temp[eqn3];
}
cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
free(d0);
free(Ad);
}