/* -*- C -*- */
/* vim:set ft=c: */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 130 /* for double-precision floating point */
#error This code requires compute capability 1.3 or higher; try giving "-arch sm_13".
#endif

#include "global_defs.h"
#include "element_definitions.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

enum {
    CAPS_PER_PROC = 1,
    M = 1,           /* cap # */
    NSD = 3,         /* spatial extent: 3D */
    MAX_EQN = NSD*14,
};

struct Some_variables {
    int num_zero_resid;
    int *zero_resid;

    struct /*MESH_DATA*/ {
        int NEQ;
        int NNO;
    } lmesh;

    struct ID *ID;

    higher_precision *Eqn_k[NSD+1];
    int *Node_map;

    double *BI;
    double *temp;
    unsigned int *NODE;

    int2 *term;
};

/*------------------------------------------------------------------------*/

static void construct_E(
    struct Some_variables **d_E,
    struct Some_variables *s_E, /* host's shadow copy of d_E */
    struct Some_variables *E
    )
{
    /* Construct a copy of 'E' in device memory. */

    int neq = E->lmesh.NEQ;
    int nno = E->lmesh.NNO;

    s_E->lmesh.NEQ = E->lmesh.NEQ;
    s_E->lmesh.NNO = E->lmesh.NNO;

    /* ID -- cf. allocate_common_vars() */
    cudaMalloc((void **)&s_E->ID, (nno+1)*sizeof(struct ID));
    cudaMemcpy(s_E->ID, E->ID, (nno+1)*sizeof(struct ID), cudaMemcpyHostToDevice);

    /* Eqn_k, Node_map -- cf. construct_node_maps() */
    size_t matrix = MAX_EQN * nno;

    s_E->Eqn_k[0] = 0;
    cudaMalloc((void **)&s_E->Eqn_k[1], 3*matrix*sizeof(higher_precision));
    s_E->Eqn_k[2] = s_E->Eqn_k[1] + matrix;
    s_E->Eqn_k[3] = s_E->Eqn_k[2] + matrix;
    cudaMemcpy(s_E->Eqn_k[1], E->Eqn_k[1], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
    cudaMemcpy(s_E->Eqn_k[2], E->Eqn_k[2], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);
    cudaMemcpy(s_E->Eqn_k[3], E->Eqn_k[3], matrix*sizeof(higher_precision), cudaMemcpyHostToDevice);

    cudaMalloc((void **)&s_E->Node_map, matrix*sizeof(int));
    cudaMemcpy(s_E->Node_map, E->Node_map, matrix*sizeof(int), cudaMemcpyHostToDevice);

    /* BI -- cf. allocate_velocity_vars() */
    cudaMalloc((void **)&s_E->BI, neq*sizeof(double));
    cudaMemcpy(s_E->BI, E->BI, neq*sizeof(double), cudaMemcpyHostToDevice);

    /* temp -- cf. allocate_velocity_vars() */
    cudaMalloc((void **)&s_E->temp, (neq+1)*sizeof(double));
    cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);

    /* NODE -- cf. allocate_common_vars() */
    cudaMalloc((void **)&s_E->NODE, (nno+1)*sizeof(unsigned int));
    cudaMemcpy(s_E->NODE, E->NODE, (nno+1)*sizeof(unsigned int), cudaMemcpyHostToDevice);

    /* term */
    cudaMalloc((void **)&s_E->term, (neq+1)*MAX_EQN*sizeof(int2));
    cudaMemcpy(s_E->term, E->term, (neq+1)*MAX_EQN*sizeof(int2), cudaMemcpyHostToDevice);

    /* E itself */
    cudaMalloc((void **)d_E, sizeof(struct Some_variables));
    cudaMemcpy(*d_E, s_E, sizeof(struct Some_variables), cudaMemcpyHostToDevice);

    return;
}

static void destroy_E(
    struct Some_variables *d_E,
    struct Some_variables *s_E
    )
{
    cudaFree(s_E->ID);
    cudaFree(s_E->Eqn_k[1]);
    cudaFree(s_E->Node_map);
    cudaFree(s_E->BI);
    cudaFree(s_E->temp);
    cudaFree(s_E->NODE);
    cudaFree(s_E->term);
    cudaFree(d_E);
}
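/* None of the cudaMalloc()/cudaMemcpy() calls above check their return
   status, so an allocation or copy failure would only surface later as a
   wrong result.  A minimal sketch of a checking wrapper -- the name
   CUDA_SAFE is ours, nothing in this file uses it -- might look like: */

#define CUDA_SAFE(call)                                                 \
    do {                                                                \
        cudaError_t err_ = (call);                                      \
        if (err_ != cudaSuccess) {                                      \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,          \
                    cudaGetErrorString(err_));                          \
            abort();                                                    \
        }                                                               \
    } while (0)

/* e.g.: CUDA_SAFE(cudaMalloc((void **)&s_E->BI, neq*sizeof(double))); */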
/*------------------------------------------------------------------------*/
/* from Element_calculations.c */

__global__ void n_assemble_del2_u(
    struct Some_variables *E,
    double *u, double *Au,
    int strip_bcs
    )
{
    int n = blockIdx.x + 1;         /* 1 <= n <= E->lmesh.NNO */
    int doff = blockIdx.y + 1;      /* 1 <= doff <= NSD */
    unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */

    /* Each block writes one element of Au in global memory: Au[eqn]. */
    int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */

    if (n == 1 && doff == 1 && tid == 0) {
        Au[E->lmesh.NEQ] = 0.0;
    }

    if (strip_bcs) {
        /* See get_bcs_id_for_residual(). */
        unsigned int flags = E->NODE[n];
        unsigned int vb;
        switch (doff) {
        case 1: vb = VBX; break; /* 0x2 */
        case 2: vb = VBY; break; /* 0x8 */
        case 3: vb = VBZ; break; /* 0x4 */
        }
        if (flags & vb) {
            /* no-op: Au[eqn] is zero */
            if (tid == 0) {
                Au[eqn] = 0.0;
            }
            /* XXX: Hundreds of blocks exit here (E->num_zero_resid).
               Does it matter? */
            return;
        }
    }

    /* The partial sum computed by this thread. */
    double acc;

    /* Part I: The terms here are easily derived from the block and
       thread indices. */
    {
        int e = n;        /* 1 <= e <= E->lmesh.NNO */
        int i = (int)tid; /* 0 <= i < MAX_EQN */
        if (i < 3) {
            acc = 0.0;
        } else {
            int *C = E->Node_map + (e-1)*MAX_EQN;
            higher_precision *B = E->Eqn_k[doff] + (e-1)*MAX_EQN;
            double UU = u[C[i]];
            acc = B[i]*UU;
        }
    }

    /* Part II: These terms are more complicated. */
    {
        int2 *term = E->term + eqn*MAX_EQN;
        int2 pair = term[tid];
        int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
        int i = pair.y; /* 0 <= i < MAX_EQN */
        if (i != -1) {
            /* XXX: Compute these values? */
            int eqn1 = E->ID[e].doff[1];
            int eqn2 = E->ID[e].doff[2];
            int eqn3 = E->ID[e].doff[3];

            double U1 = u[eqn1];
            double U2 = u[eqn2];
            double U3 = u[eqn3];

            higher_precision *B1, *B2, *B3;
            B1 = E->Eqn_k[1] + (e-1)*MAX_EQN;
            B2 = E->Eqn_k[2] + (e-1)*MAX_EQN;
            B3 = E->Eqn_k[3] + (e-1)*MAX_EQN;

            acc += B1[i]*U1 + B2[i]*U2 + B3[i]*U3;
        } else {
            /* XXX: A considerable number of threads idle here. */
        }
    }

    /* Reduce the partial sums for this block.
       Based on reduce2() in the CUDA SDK.
       MAX_EQN is 42, so first fold threads 32..41 into 0..9, then run
       a binary tree reduction over the remaining 32. */
    __shared__ double sum[MAX_EQN];
    sum[tid] = acc;
    __syncthreads();

    if (tid >= 32) {
        sum[tid-32] += sum[tid];
    }
    __syncthreads();
    for (unsigned int s = 16; s > 0; s >>= 1) {
        if (tid < s) {
            sum[tid] += sum[tid + s];
        }
        /* XXX: not always necessary */
        __syncthreads();
    }

    /* Each block writes one element of Au in global memory. */
    if (tid == 0) {
        Au[eqn] = sum[0];
    }

    return;
}

/*------------------------------------------------------------------------*/
/* These are based on the function from General_matrix_functions.c. */

__global__ void gauss_seidel_0(
    int neq,
    double *d0, double *Ad
    )
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < neq) {
        d0[i] = Ad[i] = 0.0;
    }
}

__global__ void gauss_seidel_1(
    struct Some_variables *E,
    int neq, int nno,
    double *F, double *Ad
    )
{
    const double zeroo = 0.0;

    int i = blockIdx.x * blockDim.x + threadIdx.x + 1;
    if (i <= nno) { /* 1 <= i <= E->lmesh.NNO */
        int doff = blockIdx.y + 1; /* 1 <= doff <= NSD */
        int eqn = E->ID[i].doff[doff];
        if (E->NODE[i] & OFFSIDE) {
            E->temp[eqn] = (F[eqn] - Ad[eqn])*E->BI[eqn];
        } else {
            E->temp[eqn] = zeroo;
        }
        if (i == 1 && doff == 1) {
            E->temp[neq] = zeroo;
            Ad[neq] = zeroo;
        }
    }
}
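/* One Gauss-Seidel pass is split into stages: gauss_seidel_1 seeds
   E->temp for the OFFSIDE nodes and zeroes the rest, gauss_seidel_2a
   (or the alternative gauss_seidel_2b) sweeps the nodes in order to
   finish E->temp and fold the first group of terms into Ad, and
   gauss_seidel_3 folds the remaining terms into Ad and accumulates d0.
   The sweep in 2a/2b is inherently sequential in the node index i --
   E->temp[C[j]] may read values written while visiting earlier nodes --
   so both variants run as a single block looping over all nodes,
   parallelizing only across the MAX_EQN-3 terms of each node (and, in
   2a, the NSD directions). */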
__global__ void gauss_seidel_2a(
    struct Some_variables *E,
    double *F, double *Ad
    )
{
    const int nno = E->lmesh.NNO;

    int j = threadIdx.x + 3;    /* 3 <= j < MAX_EQN */
    int doff = threadIdx.y + 1; /* 1 <= doff <= NSD */

    for (int i = 1; i <= nno; i++) {
        int eqn = E->ID[i].doff[doff];
        int *C = E->Node_map + (i-1)*MAX_EQN;
        higher_precision *B = E->Eqn_k[doff] + (i-1)*MAX_EQN;

        /* Ad on boundaries differs after the following operation, but
           no communications are needed yet, because boundary Ad will
           not be used for the G-S iterations for interior nodes. */

        __shared__ double UU[MAX_EQN-3];
        if (threadIdx.y == 0) {
            UU[threadIdx.x] = E->temp[C[j]];
        }
        __syncthreads();

        __shared__ double Ad_eqn[NSD];
        if (threadIdx.x == 0) {
            Ad_eqn[threadIdx.y] = Ad[eqn];
        }

        __shared__ double sum[MAX_EQN-3][3];
        /* for (int j = 3; j < MAX_EQN; j++) */ {
            sum[threadIdx.x][threadIdx.y] = B[j]*UU[threadIdx.x];
        }
        __syncthreads();

        /* Reduce. */
        if (threadIdx.x >= 32) {
            sum[threadIdx.x-32][threadIdx.y] += sum[threadIdx.x][threadIdx.y];
        }
        __syncthreads();
        for (unsigned int s = 16; s > 0; s >>= 1) {
            if (threadIdx.x < s) {
                sum[threadIdx.x][threadIdx.y] += sum[threadIdx.x + s][threadIdx.y];
            }
            /* XXX: not always necessary */
            __syncthreads();
        }

        if (threadIdx.x == 0) {
            Ad_eqn[threadIdx.y] += sum[0][threadIdx.y];
            Ad[eqn] = Ad_eqn[threadIdx.y];
            if (!(E->NODE[i] & OFFSIDE)) {
                E->temp[eqn] = (F[eqn] - Ad_eqn[threadIdx.y])*E->BI[eqn];
            }
        }
        __syncthreads();
    }
}

__global__ void gauss_seidel_2b(
    struct Some_variables *E,
    double *F, double *Ad
    )
{
    const int nno = E->lmesh.NNO;
    unsigned int tid = threadIdx.x;

    for (int i = 1; i <= nno; i++) {
        int eqn1 = E->ID[i].doff[1];
        int eqn2 = eqn1 + 1;
        int eqn3 = eqn2 + 1;
        int *C = E->Node_map + (i-1)*MAX_EQN;
        higher_precision *B1, *B2, *B3;
        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;

        /* Ad on boundaries differs after the following operation, but
           no communications are needed yet, because boundary Ad will
           not be used for the G-S iterations for interior nodes. */

        double Ad_eqn1, Ad_eqn2, Ad_eqn3;
        if (tid == 0) {
            Ad_eqn1 = Ad[eqn1];
            Ad_eqn2 = Ad[eqn2];
            Ad_eqn3 = Ad[eqn3];
        }

        __shared__ double sum1[MAX_EQN-3];
        __shared__ double sum2[MAX_EQN-3];
        __shared__ double sum3[MAX_EQN-3];

        int j = tid + 3; /* 3 <= j < MAX_EQN */
        /* for (int j = 3; j < MAX_EQN; j++) */ {
            double UU = E->temp[C[j]];
            sum1[tid] = B1[j]*UU;
            sum2[tid] = B2[j]*UU;
            sum3[tid] = B3[j]*UU;
        }
        __syncthreads();

        /* Reduce. */
        if (tid >= 32) {
            sum1[tid-32] += sum1[tid];
            sum2[tid-32] += sum2[tid];
            sum3[tid-32] += sum3[tid];
        }
        __syncthreads();
        for (unsigned int s = 16; s > 0; s >>= 1) {
            if (tid < s) {
                sum1[tid] += sum1[tid + s];
                sum2[tid] += sum2[tid + s];
                sum3[tid] += sum3[tid + s];
            }
            /* XXX: not always necessary */
            __syncthreads();
        }

        if (tid == 0) {
            Ad_eqn1 += sum1[0];
            Ad_eqn2 += sum2[0];
            Ad_eqn3 += sum3[0];
            Ad[eqn1] = Ad_eqn1;
            Ad[eqn2] = Ad_eqn2;
            Ad[eqn3] = Ad_eqn3;
            if (!(E->NODE[i] & OFFSIDE)) {
                E->temp[eqn1] = (F[eqn1] - Ad_eqn1)*E->BI[eqn1];
                E->temp[eqn2] = (F[eqn2] - Ad_eqn2)*E->BI[eqn2];
                E->temp[eqn3] = (F[eqn3] - Ad_eqn3)*E->BI[eqn3];
            }
        }
        __syncthreads();
    }
}

__global__ void gauss_seidel_3(
    struct Some_variables *E,
    double *d0, double *Ad
    )
{
    int n = blockIdx.x + 1;         /* 1 <= n <= E->lmesh.NNO */
    int doff = blockIdx.y + 1;      /* 1 <= doff <= NSD */
    unsigned int tid = threadIdx.x; /* 0 <= tid < MAX_EQN */

    /* Each block writes one element of Ad and d0 in global memory:
       Ad[eqn], d0[eqn]. */
    int eqn = E->ID[n].doff[doff]; /* XXX: Compute this value? */

    __shared__ double sum[MAX_EQN];

    int2 *term = E->term + eqn*MAX_EQN;
    int2 pair = term[tid];
    int e = pair.x; /* 1 <= e <= E->lmesh.NNO */
    int i = pair.y; /* 0 <= i < MAX_EQN */
    if (i != -1) {
        /* XXX: Compute these values? */
        int eqn1 = E->ID[e].doff[1];
        int eqn2 = E->ID[e].doff[2];
        int eqn3 = E->ID[e].doff[3];

        higher_precision *B1, *B2, *B3;
        B1 = E->Eqn_k[1] + (e-1)*MAX_EQN;
        B2 = E->Eqn_k[2] + (e-1)*MAX_EQN;
        B3 = E->Eqn_k[3] + (e-1)*MAX_EQN;

        sum[tid] = B1[i]*E->temp[eqn1]
                 + B2[i]*E->temp[eqn2]
                 + B3[i]*E->temp[eqn3];
    } else {
        /* XXX: A considerable number of threads idle here. */
        sum[tid] = 0.0;
    }
    __syncthreads();

    /* Reduce the partial sums for this block.
       Based on reduce2() in the CUDA SDK. */
    if (tid >= 32) {
        sum[tid-32] += sum[tid];
    }
    __syncthreads();
    for (unsigned int s = 16; s > 0; s >>= 1) {
        if (tid < s) {
            sum[tid] += sum[tid + s];
        }
        /* XXX: not always necessary */
        __syncthreads();
    }

    if (tid == 0) {
        /* Each block writes one element of Ad... */
        Ad[eqn] += sum[0];
        /* ...and one element of d0. */
        d0[eqn] += E->temp[eqn];
    }
}
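/* E->term exists to turn the scatters in the original CPU loops --
   statements of the form "Au[C[i]] += ..." executed once per node --
   into gathers.  For each destination equation eqn, the row
   E->term[eqn*MAX_EQN .. eqn*MAX_EQN + MAX_EQN-1] lists the
   (node e, index i) pairs whose contributions land in eqn, padded out
   with (-1, -1).  A block assigned to eqn can then sum its own
   contributions without atomic operations; collect_terms() below
   builds the table on the host by inverting Node_map. */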
void host_n_assemble_del2_u(
    struct Some_variables *E,
    double *u, double *Au,
    int strip_bcs
    );

void host_gauss_seidel_0(
    struct Some_variables *E,
    double *d0, double *Ad
    );

void host_gauss_seidel_1(
    struct Some_variables *E,
    struct Some_variables *s_E,
    double *F, double *Ad
    );

void host_gauss_seidel_2(
    struct Some_variables *E,
    struct Some_variables *s_E,
    double *F, double *Ad
    );

void host_gauss_seidel_3(
    struct Some_variables *E,
    struct Some_variables *s_E,
    double *d0, double *Ad
    );

void do_gauss_seidel(
    struct Some_variables *E,
    double *d0,
    double *F, double *Ad,
    double acc,
    int *cycles,
    int guess
    )
{
    int count, steps;

    steps = *cycles;

    /* pointers to device memory */
    struct Some_variables *d_E = 0;
    double *d_d0 = 0, *d_F = 0, *d_Ad = 0;

    /* construct 'E' on the device */
    struct Some_variables s_E;
    construct_E(&d_E, &s_E, E);

    int neq = E->lmesh.NEQ;

    /* allocate memory on the device */
    cudaMalloc((void **)&d_d0, (1+neq)*sizeof(double));
    cudaMalloc((void **)&d_F, neq*sizeof(double));
    cudaMalloc((void **)&d_Ad, (1+neq)*sizeof(double));

    /* copy input to the device */
    cudaMemcpy(d_F, F, neq*sizeof(double), cudaMemcpyHostToDevice);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    static float totalElapsedTime = 0.0;

    if (guess) {
        /* copy more input to the device */
        d0[E->lmesh.NEQ] = 0.0; /* normally done by n_assemble_del2_u() */
        cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);

        dim3 block(MAX_EQN, 1, 1);
        dim3 grid(E->lmesh.NNO, NSD, 1);
        if (1)
            n_assemble_del2_u<<< grid, block >>>(d_E, d_d0, d_Ad, 1);
        else
            host_n_assemble_del2_u(E, d_d0, d_Ad, 1);
    } else {
        const int blockSize = 64;
        const int dim = E->lmesh.NEQ;
        int gridSize = dim / blockSize;
        if (dim % blockSize)
            ++gridSize;
        dim3 block(blockSize, 1, 1);
        dim3 grid(gridSize, 1, 1);
        gauss_seidel_0<<< grid, block >>>(E->lmesh.NEQ, d_d0, d_Ad);
    }

    for (count = 0; count < steps; ++count) {

        {
            const int blockSize = 64;
            const int dim = E->lmesh.NNO;
            int gridSize = dim / blockSize;
            if (dim % blockSize)
                ++gridSize;
            dim3 block(blockSize, 1, 1);
            dim3 grid(gridSize, NSD, 1);

            cudaEventRecord(start, 0);
            gauss_seidel_1<<< grid, block >>>(d_E, E->lmesh.NEQ, E->lmesh.NNO, d_F, d_Ad);
            cudaEventRecord(stop, 0);

            if (0) {
                cudaEventSynchronize(stop);
                float elapsedTime;
                cudaEventElapsedTime(&elapsedTime, start, stop);
                totalElapsedTime += elapsedTime;
                if (count == steps - 1) {
                    printf("gauss_seidel_1 %d\t%f\t%f\n", count, elapsedTime, totalElapsedTime);
                }
            }
        }

        if (1) {
            dim3 block(MAX_EQN - 3, NSD, 1);
            dim3 grid(1, 1, 1);
            gauss_seidel_2a<<< grid, block >>>(d_E, d_F, d_Ad);
        }
        if (0) {
            dim3 block(MAX_EQN - 3, 1, 1);
            dim3 grid(1, 1, 1);
            gauss_seidel_2b<<< grid, block >>>(d_E, d_F, d_Ad);
        }
        if (0)
            host_gauss_seidel_2(E, &s_E, d_F, d_Ad);

        /* Ad on boundaries differs after the following operation */
        {
            dim3 block(MAX_EQN, 1, 1);
            dim3 grid(E->lmesh.NNO, NSD, 1);
            if (1)
                gauss_seidel_3<<< grid, block >>>(d_E, d_d0, d_Ad);
            else
                host_gauss_seidel_3(E, &s_E, d_d0, d_Ad);
        }
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    /* wait for completion */
    if (cudaThreadSynchronize() != cudaSuccess) {
        assert(0 && "something went wrong");
    }

    /* copy output from device */
    cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(d0, d_d0, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);

    /* free device memory */
    cudaFree(d_d0);
    cudaFree(d_F);
    cudaFree(d_Ad);
    destroy_E(d_E, &s_E);

    *cycles = count;

    return;
}
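/* Note the (1+neq)-sized allocations for d0 and Ad: index neq is a
   spare slot.  Node_map entries for equations eliminated by boundary
   conditions point at index neq, so their contributions accumulate
   harmlessly there instead of corrupting a live equation.  That is why
   n_assemble_del2_u() zeroes Au[E->lmesh.NEQ] up front and why
   collect_terms() below exempts C[i] == neq from its capacity
   assertion. */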
/*------------------------------------------------------------------------*/

static void assert_assumptions(struct All_variables *E, int level) {
    assert(E->control.NMULTIGRID);
    assert(E->sphere.caps_per_proc == CAPS_PER_PROC);
    assert(E->mesh.nsd == NSD);
    assert(E->parallel.nproc == 1);
}

static int2 *collect_terms(
    struct Some_variables *E
    )
{
    /* Map out how to parallelize "Au[C[i]] += ..." and "Ad[C[j]] += ...". */

    const int neq = E->lmesh.NEQ;
    const int nno = E->lmesh.NNO;

    int2 *term = (int2 *)malloc((neq+1) * MAX_EQN * sizeof(int2));

    for (int e = 0; e <= neq; e++) {
        int2 *t = term + e*MAX_EQN;
        for (int j = 0; j < MAX_EQN; j++) {
            t[j].x = -1;
            t[j].y = -1;
        }
    }

    for (int e = 1; e <= nno; e++) {
        int *C = E->Node_map + (e-1)*MAX_EQN;
        for (int i = 0; i < MAX_EQN; i++) {
            int2 *t = term + C[i]*MAX_EQN;
            int j;
            for (j = 0; j < MAX_EQN; j++) {
                if (t[j].x == -1) {
                    t[j].x = e;
                    t[j].y = i;
                    break;
                }
            }
            /* Only the spare equation 'neq' may receive more than
               MAX_EQN contributions. */
            assert(C[i] == neq || j < MAX_EQN);
        }
    }

    return term;
}

extern "C" void gauss_seidel(
    struct All_variables *E,
    double **d0,
    double **F, double **Ad,
    double acc,
    int *cycles,
    int level,
    int guess
    )
{
    struct Some_variables kE;

    assert_assumptions(E, level);

    /* initialize 'Some_variables' with 'All_variables' */

    kE.num_zero_resid = E->num_zero_resid[level][M];
    kE.zero_resid = E->zero_resid[level][M];

    kE.lmesh.NEQ = E->lmesh.NEQ[level];
    kE.lmesh.NNO = E->lmesh.NNO[level];

    kE.ID = E->ID[level][M];

    kE.Eqn_k[0] = 0;
    kE.Eqn_k[1] = E->Eqn_k1[level][M];
    kE.Eqn_k[2] = E->Eqn_k2[level][M];
    kE.Eqn_k[3] = E->Eqn_k3[level][M];
    kE.Node_map = E->Node_map[level][M];

    kE.BI = E->BI[level][M];
    kE.temp = E->temp[M];

    kE.NODE = E->NODE[level][M];

    /* The term table depends only on the mesh, so build it once per
       multigrid level and cache it. */
    static int2 *term[MAX_LEVELS];
    if (!term[level])
        term[level] = collect_terms(&kE);
    kE.term = term[level];

    do_gauss_seidel(
        &kE,
        d0[M], F[M], Ad[M],
        acc, cycles,
        guess
        );
}

/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
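/* Host reference implementations of the kernels above, selectable via
   the if (0)/if (1) switches in do_gauss_seidel().  Each one waits for
   the device, copies its operands back to the host, repeats the
   computation serially, and pushes the results back to device memory,
   so the kernels can be validated (or substituted) one stage at a
   time. */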
void host_strip_bcs_from_residual(
    struct Some_variables *E,
    double *Res
    )
{
    for (int i = 1; i <= E->num_zero_resid; i++)
        Res[E->zero_resid[i]] = 0.0;
}

void host_n_assemble_del2_u(
    struct Some_variables *E,
    double *d_u, double *d_Au,
    int strip_bcs
    )
{
    cudaThreadSynchronize();

    const int neq = E->lmesh.NEQ;
    const int nno = E->lmesh.NNO;

    double *u = (double *)malloc((1+neq)*sizeof(double));
    cudaMemcpy(u, d_u, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);

    double *Au = (double *)malloc((1+neq)*sizeof(double));

    for (int e = 0; e <= neq; e++) {
        Au[e] = 0.0;
    }
    u[neq] = 0.0;

    for (int e = 1; e <= nno; e++) {
        int eqn1 = E->ID[e].doff[1];
        int eqn2 = E->ID[e].doff[2];
        int eqn3 = E->ID[e].doff[3];

        double U1 = u[eqn1];
        double U2 = u[eqn2];
        double U3 = u[eqn3];

        int *C = E->Node_map + (e-1)*MAX_EQN;
        higher_precision *B1, *B2, *B3;
        B1 = E->Eqn_k[1] + (e-1)*MAX_EQN;
        B2 = E->Eqn_k[2] + (e-1)*MAX_EQN;
        B3 = E->Eqn_k[3] + (e-1)*MAX_EQN;

        for (int i = 3; i < MAX_EQN; i++) {
            double UU = u[C[i]];
            Au[eqn1] += B1[i]*UU;
            Au[eqn2] += B2[i]*UU;
            Au[eqn3] += B3[i]*UU;
        }
        for (int i = 0; i < MAX_EQN; i++) {
            Au[C[i]] += B1[i]*U1 + B2[i]*U2 + B3[i]*U3;
        }
    }

    if (strip_bcs)
        host_strip_bcs_from_residual(E, Au);

    cudaMemcpy(d_Au, Au, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);

    free(u);
    free(Au);

    return;
}

void host_gauss_seidel_0(
    struct Some_variables *E,
    double *d_d0, double *d_Ad
    )
{
    cudaThreadSynchronize();

    const int neq = E->lmesh.NEQ;

    double *d0 = (double *)malloc((1+neq)*sizeof(double));
    double *Ad = (double *)malloc((1+neq)*sizeof(double));

    const double zeroo = 0.0;
    for (int i = 0; i < neq; i++) {
        d0[i] = Ad[i] = zeroo;
    }
    /* Also clear the spare slot at index neq, so that the following
       copies don't push uninitialized bytes to the device. */
    d0[neq] = Ad[neq] = zeroo;

    cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);

    free(d0);
    free(Ad);
}

void host_gauss_seidel_1(
    struct Some_variables *E,
    struct Some_variables *s_E,
    double *d_F, double *d_Ad
    )
{
    cudaThreadSynchronize();

    const int neq = E->lmesh.NEQ;
    const int nno = E->lmesh.NNO;

    double *F = (double *)malloc(neq*sizeof(double));
    cudaMemcpy(F, d_F, neq*sizeof(double), cudaMemcpyDeviceToHost);

    double *Ad = (double *)malloc((1+neq)*sizeof(double));
    cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);

    const double zeroo = 0.0;

    for (int j = 0; j <= neq; j++) {
        E->temp[j] = zeroo;
    }
    Ad[neq] = zeroo;

    for (int i = 1; i <= nno; i++) {
        if (E->NODE[i] & OFFSIDE) {
            int eqn1 = E->ID[i].doff[1];
            int eqn2 = E->ID[i].doff[2];
            int eqn3 = E->ID[i].doff[3];
            E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
            E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
            E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
        }
    }

    /* Ad[neq] */
    cudaMemcpy(d_Ad + neq, Ad + neq, sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);

    free(F);
    free(Ad);
}

void host_gauss_seidel_2(
    struct Some_variables *E,
    struct Some_variables *s_E,
    double *d_F, double *d_Ad
    )
{
    cudaThreadSynchronize();

    const int neq = E->lmesh.NEQ;
    const int nno = E->lmesh.NNO;

    double *F = (double *)malloc(neq*sizeof(double));
    cudaMemcpy(F, d_F, neq*sizeof(double), cudaMemcpyDeviceToHost);

    double *Ad = (double *)malloc((1+neq)*sizeof(double));
    cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);

    cudaMemcpy(E->temp, s_E->temp, (neq+1)*sizeof(double), cudaMemcpyDeviceToHost);

    for (int i = 1; i <= nno; i++) {
        int eqn1 = E->ID[i].doff[1];
        int eqn2 = E->ID[i].doff[2];
        int eqn3 = E->ID[i].doff[3];
        int *C = E->Node_map + (i-1)*MAX_EQN;
        higher_precision *B1, *B2, *B3;
        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;

        /* Ad on boundaries differs after the following operation, but
           no communications are needed yet, because boundary Ad will
           not be used for the G-S iterations for interior nodes. */
        for (int j = 3; j < MAX_EQN; j++) {
            double UU = E->temp[C[j]];
            Ad[eqn1] += B1[j]*UU;
            Ad[eqn2] += B2[j]*UU;
            Ad[eqn3] += B3[j]*UU;
        }

        if (!(E->NODE[i] & OFFSIDE)) {
            E->temp[eqn1] = (F[eqn1] - Ad[eqn1])*E->BI[eqn1];
            E->temp[eqn2] = (F[eqn2] - Ad[eqn2])*E->BI[eqn2];
            E->temp[eqn3] = (F[eqn3] - Ad[eqn3])*E->BI[eqn3];
        }
    }

    cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(s_E->temp, E->temp, (neq+1)*sizeof(double), cudaMemcpyHostToDevice);

    free(F);
    free(Ad);
}
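/* host_gauss_seidel_3 below keeps the original scatter form
   "Ad[C[j]] += ...", updating up to MAX_EQN different equations per
   node; the gauss_seidel_3 kernel computes the same sums as gathers
   through E->term, one block per destination equation. */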
void host_gauss_seidel_3(
    struct Some_variables *E,
    struct Some_variables *s_E,
    double *d_d0, double *d_Ad
    )
{
    cudaThreadSynchronize();

    const int neq = E->lmesh.NEQ;
    const int nno = E->lmesh.NNO;

    double *d0 = (double *)malloc((1+neq)*sizeof(double));
    double *Ad = (double *)malloc((1+neq)*sizeof(double));
    cudaMemcpy(d0, d_d0, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);
    cudaMemcpy(Ad, d_Ad, (1+neq)*sizeof(double), cudaMemcpyDeviceToHost);

    cudaMemcpy(E->temp, s_E->temp, (neq+1)*sizeof(double), cudaMemcpyDeviceToHost);

    for (int i = 1; i <= nno; i++) {
        int eqn1 = E->ID[i].doff[1];
        int eqn2 = E->ID[i].doff[2];
        int eqn3 = E->ID[i].doff[3];
        int *C = E->Node_map + (i-1)*MAX_EQN;
        higher_precision *B1, *B2, *B3;
        B1 = E->Eqn_k[1] + (i-1)*MAX_EQN;
        B2 = E->Eqn_k[2] + (i-1)*MAX_EQN;
        B3 = E->Eqn_k[3] + (i-1)*MAX_EQN;

        /* Ad on boundaries differs after the following operation */
        for (int j = 0; j < MAX_EQN; j++) {
            Ad[C[j]] += B1[j]*E->temp[eqn1]
                      + B2[j]*E->temp[eqn2]
                      + B3[j]*E->temp[eqn3];
        }

        d0[eqn1] += E->temp[eqn1];
        d0[eqn2] += E->temp[eqn2];
        d0[eqn3] += E->temp[eqn3];
    }

    cudaMemcpy(d_d0, d0, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_Ad, Ad, (1+neq)*sizeof(double), cudaMemcpyHostToDevice);

    free(d0);
    free(Ad);
}