/* -*- C -*- */ /* vim:set ft=c: */

/*
 * CUDA conjugate-gradient velocity solver.
 *
 * Host side: a preconditioned conjugate-gradient iteration over the
 * assembled finite-element system (one "cap", one processor, level 0 only --
 * see assert_assumptions()).  Device side: the matrix-vector product
 * Au = A*u (e_assemble_del2_u), which dominates the CG cost.
 *
 * NOTE(review): this file was recovered from a copy in which every
 * "<...>" span had been stripped; the reconstructed spans are marked
 * below where the original text could not be fully grounded.
 */

/* __CUDA_ARCH__ is only defined during the device-code trajectory; guard it
   so the host pass does not trip the #error (a bare "__CUDA_ARCH__ < 130"
   evaluates as "0 < 130" when the macro is undefined). */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 130 /* for double-precision floating-point */
#error This code requires compute capability 1.3 or higher; try giving "-arch sm_13".
#endif

#include <assert.h>   /* assert */
#include <stdlib.h>   /* malloc, free */
#include <string.h>   /* memset */
#include <math.h>     /* sqrt */

#include "global_defs.h"

enum {
    LEVEL = 0,
    CAPS_PER_PROC = 1,
    M = 1,              /* cap # */
    NSD = 3,            /* Spatial extent: 3d */
    MAX_EQN = NSD*14,
    ENODES = 8,         /* enodes[NSD] */
    LOC_MAT_SIZE = 24   /* loc_mat_size[NSD] */
};

struct octoterm {
    /* an octoterm is 8 (ENODES) * 3 terms */
    int e;          /* element number */
    int a;          /* local node number within element 'e' */
    int offset;     /* 0, LOC_MAT_SIZE, or 2*LOC_MAT_SIZE: selects the
                       x/y/z row block of the element stiffness matrix */
};

struct matrix_mult {
    int n;          /* number of octoterms: 1, 2, 4, or 8 */
    int ot_i;       /* index of octoterm struct in 'ot' array */
    int zero_res;   /* boolean: equation is a stripped boundary condition */
};

/* Minimal mirror of CitcomS's All_variables, holding only what the
   solver and the kernel actually touch. */
struct Some_variables {
    int num_zero_resid;
    int *zero_resid;

    struct /*MESH_DATA*/ {
        int NEQ;
        int NNO;
        int NEL;
    } lmesh;

    struct IEN *IEN;
    struct ID *ID;
    struct EK *elt_k;

    higher_precision *Eqn_k1, *Eqn_k2, *Eqn_k3;
    int *Node_map;

    double *BI;     /* diagonal preconditioner */

    /* temporary malloc'd memory */
    double *memory;
    int memoryDim;

    struct matrix_mult *mm;   /* dim is 'neq' */
    int n_octoterms;
    struct octoterm *ot;
};


/*------------------------------------------------------------------------*/
/* from BC_util.c */

/* Zero the residual entries that correspond to essential boundary
   conditions, so the CG update never moves those degrees of freedom. */
void strip_bcs_from_residual(
    struct Some_variables *E,
    double *Res
    )
{
    int i;

    for(i=1;i<=E->num_zero_resid;i++)
        Res[E->zero_resid[i]] = 0.0;

    return;
}


/*------------------------------------------------------------------------*/

/* First pass over the mesh: count, per global equation, how many
   (element, local-node) contributions ("octoterms") it receives, and
   flag boundary-condition equations.  Returns the total octoterm count
   so the caller can size the 'ot' array. */
static int e_tally(
    struct Some_variables *E
    )
{
    int e,i,a,a1,a2,a3,ii,nTotal;

    const int nel=E->lmesh.NEL;
    const int neq=E->lmesh.NEQ;

    nTotal = 0;
    for(i=0;i<neq;i++) {
        E->mm[i].n = 0;
        E->mm[i].zero_res = 0;
    }

    for(e=1;e<=nel;e++) {
        for(a=1;a<=ENODES;a++) {
            ii = E->IEN[e].node[a];
            a1 = E->ID[ii].doff[1];
            a2 = E->ID[ii].doff[2];
            a3 = E->ID[ii].doff[3];
            ++E->mm[a1].n;
            ++E->mm[a2].n;
            ++E->mm[a3].n;
            nTotal += 3;
        } /* end for loop a */
    } /* end for e */

    /* strip_bcs_from_residual */
    for(i=1;i<=E->num_zero_resid;i++)
        E->mm[E->zero_resid[i]].zero_res = 1;

    /* return the total number of octoterms */
    return nTotal;
}

/* Second pass: lay the octoterms out contiguously per equation.
   After this, equation i's contributions are
   ot[mm[i].ot_i] .. ot[mm[i].ot_i + mm[i].n - 1]. */
static void e_map(
    struct Some_variables *E
    )
{
    int e,i,a,a1,a2,a3,o1,o2,o3,ii;
    struct octoterm *ot;

    const int nel=E->lmesh.NEL;
    const int neq=E->lmesh.NEQ;

    /* carve the 'ot' array into per-equation segments; reset the counts
       so they can be reused as insertion cursors below */
    ot = E->ot;
    for(i=0;i<neq;i++) {
        E->mm[i].ot_i = ot - E->ot;
        ot += E->mm[i].n;
        E->mm[i].n = 0;
    }

    for(e=1;e<=nel;e++) {
        for(a=1;a<=ENODES;a++) {
            ii = E->IEN[e].node[a];
            a1 = E->ID[ii].doff[1];
            a2 = E->ID[ii].doff[2];
            a3 = E->ID[ii].doff[3];
            o1 = E->mm[a1].n++;
            o2 = E->mm[a2].n++;
            o3 = E->mm[a3].n++;

            ot = E->ot + E->mm[a1].ot_i + o1;
            ot->e = e;
            ot->a = a;
            ot->offset = 0;

            ot = E->ot + E->mm[a2].ot_i + o2;
            ot->e = e;
            ot->a = a;
            ot->offset = LOC_MAT_SIZE;

            ot = E->ot + E->mm[a3].ot_i + o3;
            ot->e = e;
            ot->a = a;
            ot->offset = LOC_MAT_SIZE + LOC_MAT_SIZE;
        } /* end for loop a */
    } /* end for e */

    return;
}

/* based on the function from Element_calculations.c */

/*
 * Au = A * u, one global equation per block.
 *
 * Launch configuration (see do_conj_grad): grid = (neq) blocks,
 * block = (ENODES, ENODES) = 64 threads.  Thread group threadIdx.x
 * handles one octoterm; within a group, threadIdx.y picks the local
 * node 'b' of the element row.  Partial products are reduced in
 * shared memory and thread 0 writes Au[blockIdx.x].
 */
__global__ void e_assemble_del2_u(
    struct Some_variables *E,
    double *u, double *Au,
    int strip_bcs
    )
{
    int e,i,a,b,o,ii,nodeb,offset;
    struct octoterm *ot;

    /* ENODES*ENODES = 8*8 = 64 threads (2 warps) per block */
    __shared__ double sum[ENODES*ENODES];

    unsigned int tid = ENODES*threadIdx.y + threadIdx.x; /* 0 <= tid < 64 */

    do {
        i = blockIdx.x; /* 0 <= i < E->lmesh.NEQ */

        /***
         * This is block 'i'.
         */

        /* This condition is uniform across the block (depends only on
           blockIdx.x), so the divergence here is block-wide and the
           __syncthreads() below remains safe. */
        if (strip_bcs && E->mm[i].zero_res) {
            /* no-op: Au[i] is zero */
            sum[tid] = 0.0; /* but only sum[0] is used */
            /* XXX: How many blocks are wasted here? */
        } else {
            o = threadIdx.x; /* 0 <= o < ENODES */
            if (o < E->mm[i].n) {
                /****
                 * This is thread group 'o' in block 'i'.
                 * Each group of 8 threads handles one 'octoterm'.
                 */
                ot = E->ot + E->mm[i].ot_i + o;
                e = ot->e;
                a = ot->a;
                offset = ot->offset;
                do {
                    b = threadIdx.y + 1; /* 1 <= b <= ENODES */
                    /****
                     * This is thread '(o, b)' in block 'i'.
                     * Each thread computes three terms.
                     */
                    nodeb = E->IEN[e].node[b];
                    ii = (a*LOC_MAT_SIZE+b)*NSD-(NSD*LOC_MAT_SIZE+NSD);
                    sum[tid] =
                        E->elt_k[e].k[ii+offset]   * u[E->ID[nodeb].doff[1]] +
                        E->elt_k[e].k[ii+offset+1] * u[E->ID[nodeb].doff[2]] +
                        E->elt_k[e].k[ii+offset+2] * u[E->ID[nodeb].doff[3]];
                } while (0);
            } else {
                /* XXX: 8-n wasted threads per block go through here */
                sum[tid] = 0.0;
            }

            __syncthreads();

            /* Reduce the partial sums for this block.
               Based on reduce2() in the CUDA SDK. */
            for (unsigned int s = ENODES*ENODES / 2; s > 0; s >>= 1) {
                if (tid < s) {
                    sum[tid] += sum[tid + s];
                }
                /* XXX: This is unnecessary, since only a single
                   "warp" is involved. */
                __syncthreads();
            }
        }

        /* XXX: ??? */
        __syncthreads();

        /* each block writes one element, Au[i], in global memory */
        if (tid == 0) {
            Au[i] = sum[0];
        }

    } while (0);

    return;
}


/*------------------------------------------------------------------------*/
/* from Global_operations.c */

/* Dot product of two length-neq vectors on the host. */
double global_vdot(
    struct Some_variables *E,
    double *A, double *B
    )
{
    int i,neq;
    double prod;

    prod = 0.0;
    neq=E->lmesh.NEQ;
    for (i=0;i<neq;i++)
        prod += A[i]*B[i];

    return prod;
}


/*------------------------------------------------------------------------*/

/* Build the device-resident copy of 'E'.  's_E' is a host-side shadow
   whose pointer members point at device memory; '*d_E' receives the
   device address of the struct itself.  Members not copied here are
   zeroed and must not be dereferenced in the kernel. */
static void construct_E(
    struct Some_variables **d_E,
    struct Some_variables *s_E,
    struct Some_variables *E
    )
{
    int neq = E->lmesh.NEQ;
    int nno = E->lmesh.NNO;
    int nel = E->lmesh.NEL;

    memset(s_E, 0, sizeof(struct Some_variables));

    /* mm, ot */
    cudaMalloc((void**)&s_E->mm, neq * sizeof(struct matrix_mult));
    cudaMalloc((void**)&s_E->ot, E->n_octoterms * sizeof(struct octoterm));
    cudaMemcpy(s_E->mm, E->mm, neq * sizeof(struct matrix_mult), cudaMemcpyHostToDevice);
    cudaMemcpy(s_E->ot, E->ot, E->n_octoterms * sizeof(struct octoterm), cudaMemcpyHostToDevice);

    /* IEN -- cf. allocate_common_vars() */
    cudaMalloc((void**)&s_E->IEN, (nel+2)*sizeof(struct IEN));
    cudaMemcpy(s_E->IEN, E->IEN, (nel+2)*sizeof(struct IEN), cudaMemcpyHostToDevice);

    /* ID -- cf. allocate_common_vars()*/
    cudaMalloc((void **)&s_E->ID, (nno+1)*sizeof(struct ID));
    cudaMemcpy(s_E->ID, E->ID, (nno+1)*sizeof(struct ID), cudaMemcpyHostToDevice);

    /* elt_k -- cf. general_stokes_solver_setup() */
    cudaMalloc((void **)&s_E->elt_k, (nel+1)*sizeof(struct EK));
    cudaMemcpy(s_E->elt_k, E->elt_k, (nel+1)*sizeof(struct EK), cudaMemcpyHostToDevice);

    /* E */
    cudaMalloc((void**)d_E, sizeof(Some_variables));
    cudaMemcpy(*d_E, s_E, sizeof(Some_variables), cudaMemcpyHostToDevice);

    return;
}

/* Release everything allocated by construct_E(). */
static void destroy_E(
    struct Some_variables *d_E,
    struct Some_variables *s_E
    )
{
    cudaFree(s_E->mm);
    cudaFree(s_E->ot);
    cudaFree(s_E->IEN);
    cudaFree(s_E->ID);
    cudaFree(s_E->elt_k);
    cudaFree(d_E);
}


/*------------------------------------------------------------------------*/
/* from General_matix_functions.c */

/*
 * Preconditioned conjugate-gradient iteration.
 *
 * Solves A*d0 = F to tolerance 'acc' or at most '*cycles' iterations;
 * on return '*cycles' holds the iteration count and the residual norm
 * is returned.  The only device work is the mat-vec Au = A*p2, which
 * runs as e_assemble_del2_u with one block per equation.
 *
 * Scratch vectors live in E->memory (sized by conj_grad below).
 */
double do_conj_grad(
    struct Some_variables *E,
    double *d0,
    double *F,
    double acc,
    int *cycles
    )
{
    double *r0,*r1,*r2;
    double *z0,*z1;
    double *p1,*p2;
    double *Ap;
    double *shuffle;
    int count,i,steps;
    double residual;
    double alpha,beta,dotprod,dotr1z1,dotr0z0;

    /* device-side state for the mat-vec */
    struct Some_variables *d_E;
    struct Some_variables s_E;
    double *d_p2, *d_Ap;

    double *memory = E->memory;
    const int neq = E->lmesh.NEQ;

    steps = *cycles;

    /* carve the scratch arena into the CG work vectors */
    r0 = memory; memory += neq;
    r1 = memory; memory += neq;
    r2 = memory; memory += neq;
    z0 = memory; memory += neq;
    z1 = memory; memory += neq;
    p1 = memory; memory += (1+neq);
    p2 = memory; memory += (1+neq);
    Ap = memory; memory += (1+neq);
    assert(memory == E->memory + E->memoryDim);

    /* r1 = F; d0 = 0 */
    for(i=0;i<neq;i++) {
        r1[i] = F[i];
        d0[i] = 0.0;
    }
    residual = sqrt(global_vdot(E,r1,r1));

    /* set up the device copies once; p2/Ap travel each iteration.
       NOTE(review): reconstructed -- the original placement of this
       setup was lost, but it must precede the first kernel launch. */
    construct_E(&d_E, &s_E, E);
    cudaMalloc((void **)&d_p2, (1+neq)*sizeof(double));
    cudaMalloc((void **)&d_Ap, (1+neq)*sizeof(double));

    dotr0z0 = 0.0; /* defensively initialized; first read is at count==1 */

    count = 0;
    while ( ((residual > acc) && (count < steps)) || count == 0 ) {

        /* z1 = M^-1 r1 (diagonal preconditioner) */
        for(i=0;i<neq;i++)
            z1[i] = E->BI[i] * r1[i];

        dotr1z1 = global_vdot(E,r1,z1);

        /* p2 = z1 + beta*p1 (beta = 0 on the first pass) */
        if (0==count)
            for(i=0;i<neq;i++)
                p2[i] = z1[i];
        else {
            beta = dotr1z1/dotr0z0;
            for(i=0;i<neq;i++)
                p2[i] = z1[i] + beta*p1[i];
        }

        dotr0z0 = dotr1z1;

        /* Ap = A * p2, on the device; cf. assemble_del2_u() */
        {
            dim3 block(ENODES, ENODES);  /* 64 threads (2 warps) */
            dim3 grid(neq);              /* one block per equation */

            /* copy input to device */
            cudaMemcpy(d_p2, p2, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);

            e_assemble_del2_u<<<grid, block>>>(d_E, d_p2, d_Ap, 1);

            /* wait for completion */
            cudaThreadSynchronize();

            /* copy output from device */
            cudaMemcpy(Ap, d_Ap, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
        }
        /********************************************/

        dotprod=global_vdot(E,p2,Ap);

        if(0.0==dotprod)
            alpha=1.0e-3;
        else
            alpha = dotr1z1/dotprod;

        /* d0 += alpha*p2 ; r2 = r1 - alpha*Ap */
        for(i=0;i<neq;i++) {
            d0[i] += alpha * p2[i];
            r2[i]  = r1[i] - alpha * Ap[i];
        }

        residual = sqrt(global_vdot(E,r2,r2));

        /* rotate the work vectors instead of copying them */
        shuffle = r0; r0 = r1; r1 = r2; r2 = shuffle;
        shuffle = z0; z0 = z1; z1 = shuffle;
        shuffle = p1; p1 = p2; p2 = shuffle;

        count++;
    }

    *cycles = count;
    strip_bcs_from_residual(E,d0);

    /* release device state */
    cudaFree(d_p2);
    cudaFree(d_Ap);
    destroy_E(d_E, &s_E);

    return residual;
}


/*------------------------------------------------------------------------*/

/* Abort unless the run matches the restricted configuration this CUDA
   port supports: single processor, single cap, no multigrid, level 0. */
static void assert_assumptions(
    struct All_variables *E,
    int high_lev
    )
{
    assert(!E->control.NMULTIGRID);
    assert(!E->control.NASSEMBLE);

    assert(E->sphere.caps_per_proc == CAPS_PER_PROC);

    assert(E->mesh.nsd == NSD);
    assert(E->mesh.levmax == LEVEL);
    assert(high_lev == LEVEL);

    assert(E->parallel.nproc == 1);
    assert(E->parallel.TNUM_PASS[LEVEL][M] == 0);
    assert(E->parallel.Skip_neq[LEVEL][M] == 0);
}

/* Entry point called from the C side of CitcomS: package the pieces of
   All_variables this solver needs, build the octoterm tables, run CG,
   and clean up. */
extern "C" double conj_grad(
    struct All_variables *E,
    double **d0,
    double **F,
    double acc,
    int *cycles,
    int level
    )
{
    struct Some_variables kE;
    double residual;

    assert_assumptions(E, level);

    /* initialize 'Some_variables' with 'All_variables' */

    kE.num_zero_resid = E->num_zero_resid[LEVEL][M];
    kE.zero_resid = E->zero_resid[LEVEL][M];

    kE.lmesh.NEQ = E->lmesh.NEQ[LEVEL];
    kE.lmesh.NNO = E->lmesh.NNO[LEVEL];
    kE.lmesh.NEL = E->lmesh.NEL[LEVEL];

    kE.IEN   = E->IEN[LEVEL][M];
    kE.ID    = E->ID[LEVEL][M];
    kE.elt_k = E->elt_k[LEVEL][M];

    kE.Eqn_k1 = E->Eqn_k1[LEVEL][M];
    kE.Eqn_k2 = E->Eqn_k2[LEVEL][M];
    kE.Eqn_k3 = E->Eqn_k3[LEVEL][M];
    kE.Node_map = E->Node_map[LEVEL][M];

    kE.BI = E->BI[LEVEL][M];

    /* allocate temporary memory */
    kE.memoryDim = 0;
    kE.memoryDim += 5 * E->lmesh.NEQ[LEVEL] +     /* r0,r1,r2,z0,z1 */
                    3 * (1+E->lmesh.NEQ[LEVEL])   /* p1,p2,Ap */
                    ;
    kE.memory = (double *)malloc(kE.memoryDim*sizeof(double));

    kE.mm = (struct matrix_mult *)malloc(kE.lmesh.NEQ * sizeof(struct matrix_mult));
    kE.n_octoterms = e_tally(&kE);
    kE.ot = (struct octoterm *)malloc(kE.n_octoterms * sizeof(struct octoterm));
    e_map(&kE);

    /* solve */
    residual = do_conj_grad(&kE, d0[M], F[M], acc, cycles);

    free(kE.memory);
    free(kE.mm);
    free(kE.ot);

    return residual;
}