https://github.com/geodynamics/citcoms
Revision df821c06952128812763ec5b7a944cd178ca5ed3 authored by Thorsten Becker on 08 December 2010, 18:43:58 UTC, committed by Thorsten Becker on 08 December 2010, 18:43:58 UTC
GMT/ggrd compile before).
1 parent cf7495e
Tip revision: df821c06952128812763ec5b7a944cd178ca5ed3 authored by Thorsten Becker on 08 December 2010, 18:43:58 UTC
Added definition of TRUE (1) and FALSE (0) in case undefined (was defined for
Added definition of TRUE (1) and FALSE (0) in case undefined (was defined for
Tip revision: df821c0
cgrad_kernel.cu
/* -*- C -*- */
/* vim:set ft=c: */
#if __CUDA_ARCH__ < 130
/* for double-precision floating-point */
#error This code requires compute capability 1.3 or higher; try giving "-arch sm_13".
#endif
#include <math.h>
#include "global_defs.h"
enum {
LEVEL = 0,
CAPS_PER_PROC = 1,
M = 1, /* cap # */
NSD = 3, /* Spatial extent: 3d */
MAX_EQN = NSD*14,
ENODES = 8, /* enodes[NSD] */
LOC_MAT_SIZE = 24, /* loc_mat_size[NSD] */
};
struct octoterm {
/* an octoterm is 8 (ENODES) * 3 terms */
int e;
int a;
int offset;
};
struct matrix_mult {
int n; /* number of octoterms: 1, 2, 4, or 8 */
int ot_i; /* index of octoterm struct in 'ot' array */
int zero_res; /* boolean */
};
struct Some_variables {
int num_zero_resid;
int *zero_resid;
struct /*MESH_DATA*/ {
int NEQ;
int NNO;
int NEL;
} lmesh;
struct IEN *IEN;
struct ID *ID;
struct EK *elt_k;
higher_precision *Eqn_k1, *Eqn_k2, *Eqn_k3;
int *Node_map;
double *BI;
/* temporary malloc'd memory */
double *memory;
int memoryDim;
struct matrix_mult *mm; /* dim is 'neq' */
int n_octoterms;
struct octoterm *ot;
};
/*------------------------------------------------------------------------*/
/* from BC_util.c */
void strip_bcs_from_residual(
struct Some_variables *E,
double *Res
)
{
int i;
for(i=1;i<=E->num_zero_resid;i++)
Res[E->zero_resid[i]] = 0.0;
return;
}
/*------------------------------------------------------------------------*/
static int e_tally(
struct Some_variables *E
)
{
int e,i,a,a1,a2,a3,ii,nTotal;
const int nel=E->lmesh.NEL;
const int neq=E->lmesh.NEQ;
nTotal = 0;
for(i=0;i<neq;i++) {
E->mm[i].n = 0;
E->mm[i].zero_res = 0;
}
for(e=1;e<=nel;e++) {
for(a=1;a<=ENODES;a++) {
ii = E->IEN[e].node[a];
a1 = E->ID[ii].doff[1];
a2 = E->ID[ii].doff[2];
a3 = E->ID[ii].doff[3];
++E->mm[a1].n;
++E->mm[a2].n;
++E->mm[a3].n;
nTotal += 3;
} /* end for loop a */
} /* end for e */
/* strip_bcs_from_residual */
for(i=1;i<=E->num_zero_resid;i++)
E->mm[E->zero_resid[i]].zero_res = 1;
/* return the total number of octoterms */
return nTotal;
}
static void e_map(
struct Some_variables *E
)
{
int e,i,a,a1,a2,a3,o1,o2,o3,ii;
struct octoterm *ot;
const int nel=E->lmesh.NEL;
const int neq=E->lmesh.NEQ;
ot = E->ot;
for(i=0;i<neq;i++) {
E->mm[i].ot_i = ot - E->ot;
ot += E->mm[i].n;
E->mm[i].n = 0;
}
for(e=1;e<=nel;e++) {
for(a=1;a<=ENODES;a++) {
ii = E->IEN[e].node[a];
a1 = E->ID[ii].doff[1];
a2 = E->ID[ii].doff[2];
a3 = E->ID[ii].doff[3];
o1 = E->mm[a1].n++;
o2 = E->mm[a2].n++;
o3 = E->mm[a3].n++;
ot = E->ot + E->mm[a1].ot_i + o1;
ot->e = e;
ot->a = a;
ot->offset = 0;
ot = E->ot + E->mm[a2].ot_i + o2;
ot->e = e;
ot->a = a;
ot->offset = LOC_MAT_SIZE;
ot = E->ot + E->mm[a3].ot_i + o3;
ot->e = e;
ot->a = a;
ot->offset = LOC_MAT_SIZE + LOC_MAT_SIZE;
} /* end for loop a */
} /* end for e */
return;
}
/* based on the function from Element_calculations.c */
__global__ void e_assemble_del2_u(
struct Some_variables *E,
double *u, double *Au,
int strip_bcs
)
{
int e,i,a,b,o,ii,nodeb,offset;
struct octoterm *ot;
/* ENODES*ENODES = 8*8 = 64 threads (2 warps) per block */
__shared__ double sum[ENODES*ENODES];
unsigned int tid = ENODES*threadIdx.y + threadIdx.x; /* 0 <= tid < 64 */
do {
i = blockIdx.x; /* 0 <= i < E->lmesh.NEQ */
/***
* This is block 'i'.
*/
if (strip_bcs && E->mm[i].zero_res) {
/* no-op: Au[i] is zero */
sum[tid] = 0.0; /* but only sum[0] is used */
/* XXX: How many blocks are wasted here? */
} else {
o = threadIdx.x; /* 0 <= o < ENODES */
if (o < E->mm[i].n) {
/****
* This is thread group 'o' in block 'i'.
* Each group of 8 threads handles one 'octoterm'.
*/
ot = E->ot + E->mm[i].ot_i + o;
e = ot->e;
a = ot->a;
offset = ot->offset;
do {
b = threadIdx.y + 1; /* 1 <= b <= ENODES */
/****
* This is thread '(o, b)' in block 'i'.
* Each thread computes three terms.
*/
nodeb = E->IEN[e].node[b];
ii = (a*LOC_MAT_SIZE+b)*NSD-(NSD*LOC_MAT_SIZE+NSD);
sum[tid] =
E->elt_k[e].k[ii+offset] *
u[E->ID[nodeb].doff[1]]
+ E->elt_k[e].k[ii+offset+1] *
u[E->ID[nodeb].doff[2]]
+ E->elt_k[e].k[ii+offset+2] *
u[E->ID[nodeb].doff[3]];
} while (0);
} else {
/* XXX: 8-n wasted threads per block go through here */
sum[tid] = 0.0;
}
__syncthreads();
/* Reduce the partial sums for this block.
Based on reduce2() in the CUDA SDK. */
for (unsigned int s = ENODES*ENODES / 2; s > 0; s >>= 1)
{
if (tid < s) {
sum[tid] += sum[tid + s];
}
/* XXX: This is unnecessary, since only a single
"warp" is involved. */
__syncthreads();
}
}
/* XXX: ??? */
__syncthreads();
/* each block writes one element, Au[i], in global memory */
if (tid == 0) {
Au[i] = sum[0];
}
} while (0);
return;
}
/*------------------------------------------------------------------------*/
/* from Global_operations.c */
double global_vdot(
struct Some_variables *E,
double *A, double *B
)
{
int i,neq;
double prod;
prod = 0.0;
neq=E->lmesh.NEQ;
for (i=0;i<neq;i++)
prod += A[i]*B[i];
return prod;
}
/*------------------------------------------------------------------------*/
static void construct_E(
struct Some_variables **d_E,
struct Some_variables *s_E, /* host's shadow copy of d_E */
struct Some_variables *E
)
{
/* construct a copy of 'E' in device memory */
int neq = E->lmesh.NEQ;
int nno = E->lmesh.NNO;
int nel = E->lmesh.NEL;
memset(s_E, 0, sizeof(struct Some_variables));
/* mm, ot */
cudaMalloc((void**)&s_E->mm, neq * sizeof(struct matrix_mult));
cudaMalloc((void**)&s_E->ot, E->n_octoterms * sizeof(struct octoterm));
cudaMemcpy(s_E->mm, E->mm, neq * sizeof(struct matrix_mult), cudaMemcpyHostToDevice);
cudaMemcpy(s_E->ot, E->ot, E->n_octoterms * sizeof(struct octoterm), cudaMemcpyHostToDevice);
/* IEN -- cf. allocate_common_vars() */
cudaMalloc((void**)&s_E->IEN, (nel+2)*sizeof(struct IEN));
cudaMemcpy(s_E->IEN, E->IEN, (nel+2)*sizeof(struct IEN), cudaMemcpyHostToDevice);
/* ID -- cf. allocate_common_vars()*/
cudaMalloc((void **)&s_E->ID, (nno+1)*sizeof(struct ID));
cudaMemcpy(s_E->ID, E->ID, (nno+1)*sizeof(struct ID), cudaMemcpyHostToDevice);
/* elt_k -- cf. general_stokes_solver_setup() */
cudaMalloc((void **)&s_E->elt_k, (nel+1)*sizeof(struct EK));
cudaMemcpy(s_E->elt_k, E->elt_k, (nel+1)*sizeof(struct EK), cudaMemcpyHostToDevice);
/* E */
cudaMalloc((void**)d_E, sizeof(Some_variables));
cudaMemcpy(*d_E, s_E, sizeof(Some_variables), cudaMemcpyHostToDevice);
return;
}
static void destroy_E(
struct Some_variables *d_E,
struct Some_variables *s_E
)
{
cudaFree(s_E->mm);
cudaFree(s_E->ot);
cudaFree(s_E->IEN);
cudaFree(s_E->ID);
cudaFree(s_E->elt_k);
cudaFree(d_E);
}
/*------------------------------------------------------------------------*/
/* from General_matix_functions.c */
double do_conj_grad(
struct Some_variables *E,
double *d0,
double *F,
double acc,
int *cycles
)
{
double *r0,*r1,*r2;
double *z0,*z1;
double *p1,*p2;
double *Ap;
double *shuffle;
int count,i,steps;
double residual;
double alpha,beta,dotprod,dotr1z1,dotr0z0;
double *memory = E->memory;
const int neq = E->lmesh.NEQ;
steps = *cycles;
r0 = memory; memory += neq;
r1 = memory; memory += neq;
r2 = memory; memory += neq;
z0 = memory; memory += neq;
z1 = memory; memory += neq;
p1 = memory; memory += (1+neq);
p2 = memory; memory += (1+neq);
Ap = memory; memory += (1+neq);
assert(memory == E->memory + E->memoryDim);
for(i=0;i<neq;i++) {
r1[i] = F[i];
d0[i] = 0.0;
}
residual = sqrt(global_vdot(E,r1,r1));
assert(residual != 0.0 /* initial residual for CG = 0.0 */);
count = 0;
/* pointers to device memory */
struct Some_variables *d_E = 0;
double *d_p2 = 0, *d_Ap = 0;
/* construct 'E' on the device */
struct Some_variables s_E;
construct_E(&d_E, &s_E, E);
/* allocate memory on the device */
cudaMalloc((void**)&d_p2, (1+neq)*sizeof(double));
cudaMalloc((void**)&d_Ap, (1+neq)*sizeof(double));
while (((residual > acc) && (count < steps)) || count == 0) {
for(i=0;i<neq;i++)
z1[i] = E->BI[i] * r1[i];
dotr1z1 = global_vdot(E,r1,z1);
if (0==count)
for(i=0;i<neq;i++)
p2[i] = z1[i];
else {
assert(dotr0z0 != 0.0 /* in head of conj_grad */);
beta = dotr1z1/dotr0z0;
for(i=0;i<neq;i++)
p2[i] = z1[i] + beta * p1[i];
}
dotr0z0 = dotr1z1;
/********************************************/
/* launch e_assemble_del2_u() on the device */
/* copy input to the device */
cudaMemcpy(d_p2, p2, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
/* launch */
dim3 dimBlock(ENODES, ENODES, 1);
dim3 dimGrid(neq, 1, 1);
e_assemble_del2_u<<< dimGrid, dimBlock >>>(d_E, d_p2, d_Ap, 1);
/* wait for completion */
cudaThreadSynchronize();
/* copy output from device */
cudaMemcpy(Ap, d_Ap, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
/********************************************/
dotprod=global_vdot(E,p2,Ap);
if(0.0==dotprod)
alpha=1.0e-3;
else
alpha = dotr1z1/dotprod;
for(i=0;i<neq;i++) {
d0[i] += alpha * p2[i];
r2[i] = r1[i] - alpha * Ap[i];
}
residual = sqrt(global_vdot(E,r2,r2));
shuffle = r0; r0 = r1; r1 = r2; r2 = shuffle;
shuffle = z0; z0 = z1; z1 = shuffle;
shuffle = p1; p1 = p2; p2 = shuffle;
count++;
/* end of while-loop */
}
/* free device memory */
cudaFree(d_p2);
cudaFree(d_Ap);
destroy_E(d_E, &s_E);
*cycles=count;
strip_bcs_from_residual(E,d0);
return residual;
}
/*------------------------------------------------------------------------*/
static void assert_assumptions(struct All_variables *E, int high_lev) {
assert(!E->control.NMULTIGRID);
assert(!E->control.NASSEMBLE);
assert(E->sphere.caps_per_proc == CAPS_PER_PROC);
assert(E->mesh.nsd == NSD);
assert(E->mesh.levmax == LEVEL);
assert(high_lev == LEVEL);
assert(E->parallel.nproc == 1);
assert(E->parallel.TNUM_PASS[LEVEL][M] == 0);
assert(E->parallel.Skip_neq[LEVEL][M] == 0);
}
extern "C" double conj_grad(
struct All_variables *E,
double **d0,
double **F,
double acc,
int *cycles,
int level
)
{
struct Some_variables kE;
double residual;
assert_assumptions(E, level);
/* initialize 'Some_variables' with 'All_variables' */
kE.num_zero_resid = E->num_zero_resid[LEVEL][M];
kE.zero_resid = E->zero_resid[LEVEL][M];
kE.lmesh.NEQ = E->lmesh.NEQ[LEVEL];
kE.lmesh.NNO = E->lmesh.NNO[LEVEL];
kE.lmesh.NEL = E->lmesh.NEL[LEVEL];
kE.IEN = E->IEN[LEVEL][M];
kE.ID = E->ID[LEVEL][M];
kE.elt_k = E->elt_k[LEVEL][M];
kE.Eqn_k1 = E->Eqn_k1[LEVEL][M];
kE.Eqn_k2 = E->Eqn_k2[LEVEL][M];
kE.Eqn_k3 = E->Eqn_k3[LEVEL][M];
kE.Node_map = E->Node_map[LEVEL][M];
kE.BI = E->BI[LEVEL][M];
/* allocate temporary memory */
kE.memoryDim = 0;
kE.memoryDim += 5 * E->lmesh.NEQ[LEVEL] + /* r0,r1,r2,z0,z1 */
3 * (1+E->lmesh.NEQ[LEVEL]) /* p1,p2,Ap */
;
kE.memory = (double *)malloc(kE.memoryDim*sizeof(double));
kE.mm = (struct matrix_mult *)malloc(kE.lmesh.NEQ * sizeof(struct matrix_mult));
kE.n_octoterms = e_tally(&kE);
kE.ot = (struct octoterm *)malloc(kE.n_octoterms * sizeof(struct octoterm));
e_map(&kE);
/* solve */
residual = do_conj_grad(&kE, d0[M], F[M], acc, cycles);
free(kE.memory);
free(kE.mm);
free(kE.ot);
return residual;
}
Computing file changes ...