""" Metaprogramming subroutines (using reikna, pyopencl, pycuda) ======================================================== """ from __future__ import absolute_import # Python2 compatibility # from . import cSelect def create_kernel_sets(API): """ Create the kernel from the kernel sets. Note that in some tests (Benoit's and my tests) CUDA shows some degraded accuracy. This loss of accuracy was due to undefined shared memory behavior, which I don't fully understand. This has been fixed in 2019.2.0 as the operations are moved to global memory. """ kernel_sets = ( cMultiplyScalar() + cCopy() + cTensorCopy() + cTensorMultiply() + cMultiplyVec() + cHypot() + cAddScalar() + cSelect() + cAddVec() + cMultiplyVecInplace() + cMultiplyConjVecInplace() + cMultiplyRealInplace() + cMultiplyConjVec() + cDiff() + cSqrt() + cAnisoShrink() + cSpmv() + cSpmvh() + cHadamard()) # if 'cuda' is API: # print('Select cuda interface') # kernel_sets = atomic_add.cuda_add + kernel_sets # elif 'ocl' is API: # print("Selecting opencl interface") # kernel_sets = atomic_add.ocl_add + kernel_sets kernel_sets = atomic_add(API) + kernel_sets return kernel_sets def cMultiplyScalar(): """ Return the kernel source for cMultiplyScalar. """ code_text =""" KERNEL void cMultiplyScalar( const float2 CA, GLOBAL_MEM float2 *CX) { // Scale CX by CA: CX=CA*CX // CA: scaling factor(float2) //*CX: input, output array(float2) unsigned long gid = get_global_id(0); CX[gid].x=CA.x*CX[gid].x-CA.y*CX[gid].y; CX[gid].y=CA.x*CX[gid].y+CA.y*CX[gid].x; }; """ return code_text def cCopy(): """ Return the kernel source for cCopy """ code_text = """ KERNEL void cCopy( GLOBAL_MEM const float2 *CX, GLOBAL_MEM float2 *CY) { // Copy x to y: y = x; //CX: input array (float2) // CY output array (float2) unsigned long gid=get_global_id(0); CY[gid]=CX[gid]; }; """ return code_text def cTensorCopy(): """ Return the kernel source for cTensorCopy. 
""" code_text = """ KERNEL void cTensorCopy( const unsigned int batch, const unsigned int dim, GLOBAL_MEM const unsigned int *Nd_elements, GLOBAL_MEM const unsigned int *Kd_elements, GLOBAL_MEM const float *invNd, GLOBAL_MEM const float2 *indata, GLOBAL_MEM float2 *outdata, const int direction) { const unsigned int gid=get_global_id(0); unsigned int curr_res = gid; unsigned int new_idx = 0; unsigned int group; for (unsigned int dimid =0; dimid < dim; dimid ++){ group = (float)curr_res*invNd[dimid]; new_idx += group * Kd_elements[dimid]; curr_res = curr_res - group * Nd_elements[dimid]; }; if (direction == 1) { for (unsigned int bat=0; bat < batch; bat ++ ) { outdata[new_idx*batch+bat]= indata[gid*batch+bat]; }; }; if (direction == -1) { for (unsigned int bat=0; bat < batch; bat ++ ) { outdata[gid*batch+bat]= indata[new_idx*batch+bat]; }; }; }; """ return code_text def cTensorMultiply(): """ Return the kernel source for cTensorMultiply """ code_text = """ KERNEL void cTensorMultiply( const unsigned int batch, // batch const unsigned int dim, // dimensions GLOBAL_MEM const unsigned int *Nd, // In batch mode, Nd*batch but is calculated outside the kernel GLOBAL_MEM const unsigned int *Nd_elements, // Number of elements to move along the dimension = strides / itemsize GLOBAL_MEM const float *invNd_elements, // float: inverse of the Nd_elements, which aims for fast division // batch mode: Nd_elements / batch GLOBAL_MEM const float *vec, // Real, vector, length sum Nd[dimid] GLOBAL_MEM float2 *outdata, const unsigned int div) { const unsigned int gid=get_global_id(0); const unsigned int pid = (float)gid / (float)batch; // const unsigned int bat = gid - pid * batch; unsigned int group; unsigned int Nd_indx_shift = 0; float mul = 1.0; unsigned int res = pid; for (unsigned int dimid = 0; dimid < dim; dimid ++){ group = (float)res * invNd_elements[dimid]; // The index along the axis res = res - group * Nd_elements[dimid]; const unsigned int N = Nd[dimid]; mul = mul * vec[group + Nd_indx_shift]; Nd_indx_shift = Nd_indx_shift + N; } if (div == 1){ // for (unsigned int bat = 0; bat < batch; bat ++ ) // { float2 tmp = outdata[gid]; tmp.x = tmp.x / mul; tmp.y = tmp.y / mul; outdata[gid] = tmp; // }; }; if (div == 0){ // for (unsigned int bat = 0; bat < batch; bat ++ ) // { float2 tmp = outdata[gid]; tmp.x = tmp.x * mul; tmp.y = tmp.y * mul; outdata[gid] = tmp; // }; }; }; """ return code_text def atomic_add(API): """ Return the atomic_add for the given API. Overcome the missing atomic_add_float for OpenCL-1.2. Note: will be checked if OpenCL 2.0 provided by all GPU vendors. """ ocl_add = """ // #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable KERNEL void atomic_add_float( GLOBAL_MEM float *ptr, const float temp) { // The work-around of AtomicAdd for float // lockless add *source += operand // Caution!!!!!!! Use with care! You have been warned! 
def atomic_add(API):
    """
    Return the atomic_add source for the given API.

    Works around the missing atomic add for float in OpenCL 1.2.
    Note: to be revisited once OpenCL 2.0 atomics are provided by all GPU vendors.
    """
    ocl_add = """
    // #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
    KERNEL void atomic_add_float(
        GLOBAL_MEM float *ptr,
        const float temp)
    {
        // The work-around of AtomicAdd for float
        // lockless add: *ptr += temp
        // Caution!!!!!!! Use with care! You have been warned!
        // http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html
        // Source: https://github.com/clMathLibraries/clSPARSE/blob/master/src/library/kernels/csrmv_adaptive.cl
        union {
            unsigned int intVal;
            float floatVal;
        } newVal;
        union {
            unsigned int intVal;
            float floatVal;
        } prevVal;
        do {
            prevVal.floatVal = *ptr;
            newVal.floatVal = prevVal.floatVal + temp;
        } while (atomic_cmpxchg((__global volatile unsigned int *)ptr,
                                prevVal.intVal, newVal.intVal) != prevVal.intVal);
    };

    KERNEL void atomic_add_float2_old(
        GLOBAL_MEM float2 *ptr,
        const float2 temp)
    {
        atomic_add_float((GLOBAL_MEM float*)ptr,   temp.x);
        atomic_add_float((GLOBAL_MEM float*)ptr+1, temp.y);
    };

    KERNEL void atomic_add_float2(
        GLOBAL_MEM float2 *ptr,
        const float2 temp,
        GLOBAL_MEM float2 *res)
    {
        // Atomic two-sum for the real/imaginary parts
        // The *res array stores the rounding error
        union {
            unsigned int intVal;
            float floatVal;
        } newVal;
        union {
            unsigned int intVal;
            float floatVal;
        } prevVal;
        float x;
        float val;
        float newacc;
        float r;
        __global const float *realptr = (__global float *)ptr;
        __global const float *imagptr = (__global float *)ptr + 1;
        __global const float *realres = (__global float *)res;
        __global const float *imagres = (__global float *)res + 1;

        do {  // atomic add of the real part
            prevVal.floatVal = *realptr;
            newVal.floatVal = prevVal.floatVal + temp.x;
        } while (atomic_cmpxchg((__global volatile unsigned int *)realptr,
                                prevVal.intVal, newVal.intVal) != prevVal.intVal);

        x = prevVal.floatVal;
        val = temp.x;
        if (fabs(x) < fabs(val)) {
            const float swap = x;
            x = val;
            val = swap;
        }
        newacc = x + val;
        r = val - (newacc - x);  // Recover rounding error using Fast2Sum, assumes |oldacc| >= |val|

        do {  // atomic_add(realres, r)
            prevVal.floatVal = *realres;
            newVal.floatVal = prevVal.floatVal + r;
        } while (atomic_cmpxchg((__global volatile unsigned int *)realres,
                                prevVal.intVal, newVal.intVal) != prevVal.intVal);
        // End of the real part

        // Now the imaginary part
        do {
            prevVal.floatVal = *imagptr;
            newVal.floatVal = prevVal.floatVal + temp.y;
        } while (atomic_cmpxchg((__global volatile unsigned int *)imagptr,
                                prevVal.intVal, newVal.intVal) != prevVal.intVal);

        x = prevVal.floatVal;  // copy the old value to x
        val = temp.y;          // copy the imaginary part to val
        if (fabs(x) < fabs(val)) {
            const float swap = x;
            x = val;
            val = swap;
        }
        newacc = x + val;
        r = val - (newacc - x);  // Recover rounding error using Fast2Sum, assumes |oldacc| >= |val|

        do {
            prevVal.floatVal = *imagres;
            newVal.floatVal = prevVal.floatVal + r;
        } while (atomic_cmpxchg((__global volatile unsigned int *)imagres,
                                prevVal.intVal, newVal.intVal) != prevVal.intVal);
    };
    """

    cuda_add = """
    __device__ void atomic_add_float_twosum(
        GLOBAL_MEM float *address,
        float val,
        GLOBAL_MEM float *residue)
    {
        // https://devtalk.nvidia.com/default/topic/817899/atomicadd-kahan-summation/
        // By Sylvain Collange
        // Also adds the clSPARSE branch
        float x = atomicAdd(address, val);
        if (fabs(x) < fabs(val)) {
            const float swap = x;
            x = val;
            val = swap;
        }
        float newacc = x + val;
        float r = val - (newacc - x);  // Recover rounding error using Fast2Sum, assumes |oldacc| >= |val|
        atomicAdd(residue, r);         // Accumulate the error in residue
    }

    __device__ void atomic_add_float(
        GLOBAL_MEM float *ptr,
        const float temp)
    {
        // Wrapper around CUDA atomicAdd()
        atomicAdd(ptr, temp);
    };

    __device__ void atomic_add_float2(
        GLOBAL_MEM float2 *ptr,
        const float2 temp,
        GLOBAL_MEM float2 *res)
    {
        // Wrapper around CUDA atomicAdd()
        // atomicAdd((float*)ptr,   temp.x);
        // atomicAdd((float*)ptr+1, temp.y);
        atomic_add_float_twosum((float*)ptr,   temp.x, (float*)res);
        atomic_add_float_twosum((float*)ptr+1, temp.y, (float*)res+1);
    };
    """
    if API == 'cuda':
        code_text = cuda_add
    elif API == 'ocl':
        code_text = ocl_add
    return code_text
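# Illustrative sketch (host-side, not device code): the compensated update that
# atomic_add_float2 (OpenCL) and atomic_add_float_twosum (CUDA) perform. The
# accumulator is updated and the rounding error of the float32 addition is
# recovered with Fast2Sum and pushed into a residue value.
def _example_two_sum(acc, val, residue):
    """Hedged sketch of the Fast2Sum residue update used by the atomic adds."""
    import numpy as np
    acc, val, residue = np.float32(acc), np.float32(val), np.float32(residue)
    if abs(acc) < abs(val):               # order so that |acc| >= |val|
        acc, val = val, acc
    newacc = np.float32(acc + val)        # rounded float32 sum
    r = np.float32(val - (newacc - acc))  # exact rounding error (Fast2Sum)
    return newacc, np.float32(residue + r)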
def cMultiplyVec():
    """
    Return the kernel source of cMultiplyVec.
    """
    code_text = """
    KERNEL void cMultiplyVec(
        GLOBAL_MEM float2 *a,
        GLOBAL_MEM float2 *b,
        GLOBAL_MEM float2 *dest)
    {
        // dest[i] = a[i] * b[i] (complex multiply)
        const unsigned int i = get_global_id(0);
        dest[i].x = a[i].x * b[i].x - a[i].y * b[i].y;
        dest[i].y = a[i].x * b[i].y + a[i].y * b[i].x;
        // barrier(CLK_GLOBAL_MEM_FENCE);
    };
    """
    return code_text


def cAddScalar():
    """
    Return the kernel source for cAddScalar.
    """
    code_text = """
    KERNEL void cAddScalar(
        const float2 CA,
        GLOBAL_MEM float2 *CX)
    {
        // (single complex) add CA to x: x = x + CA
        // CA: add factor
        // CX: input/output array (float2)
        int gid = get_global_id(0);
        CX[gid].x += CA.x;
        CX[gid].y += CA.y;
    };
    """
    return code_text


def cHadamard():
    """
    Return the kernel sources for the Hadamard (elementwise) operations.
    """
    R = """
    // Batched copying of indata[order1] to outdata[order2]
    // Superseded by cTensorCopy(), but left here as a general function.
    KERNEL void cSelect2(
        const unsigned int Reps,
        GLOBAL_MEM const unsigned int *order1,
        GLOBAL_MEM const unsigned int *order2,
        GLOBAL_MEM const float2 *indata,
        GLOBAL_MEM float2 *outdata)
    {
        const unsigned int gid = get_global_id(0);
        const unsigned int t = ((float)gid / (float)Reps);  // indptr
        const unsigned int r = gid - t * Reps;               // residue
        const unsigned int index2 = order2[t] * Reps + r;
        const unsigned int index1 = order1[t] * Reps + r;
        outdata[index2] = indata[index1];
    };

    KERNEL void cDistribute(
        const unsigned int Reps,
        const unsigned int prodNd,
        GLOBAL_MEM const float2 *arr_large,  // sensitivity and scaling array, prodNd*Reps
        GLOBAL_MEM const float2 *arr_image,  // image array, prodNd
        GLOBAL_MEM float2 *arr_out)          // output array, prodNd*Reps
    {
        const unsigned int t = get_global_id(0);
        // const unsigned int nd = t / Reps;
        const unsigned int nd = ((float)t / (float)Reps);
        if (nd < prodNd){
            const float2 u = arr_large[t];
            const float2 v = arr_image[nd];
            float2 w;
            w.x = u.x * v.x - u.y * v.y;
            w.y = u.x * v.y + u.y * v.x;
            arr_out[t] = w;
        }
    };  // End of cDistribute

    KERNEL void cMerge(
        const unsigned int Reps,
        const unsigned int prodNd,
        GLOBAL_MEM const float2 *arr_large,  // sensitivity and scaling array, prodNd*Reps
        GLOBAL_MEM const float2 *arr_image,  // image array, prodNd*Reps
        GLOBAL_MEM float2 *arr_out)          // reduced output array, prodNd
    {
        const unsigned int t = get_local_id(0);
        // const float float_reps = (float)Reps;
        const unsigned int vecWidth = ${LL};
        // Thread ID within wavefront
        const unsigned int id = t & (vecWidth-1);
        // One row per wavefront
        unsigned int vecsPerBlock = get_local_size(0) / vecWidth;
        unsigned int myRow = (get_group_id(0) * vecsPerBlock) + (t / vecWidth);
        LOCAL_MEM float2 partialSums[${LL}];
        float2 zero;
        zero.x = 0.0;
        zero.y = 0.0;
        partialSums[t] = zero;
        // float2 y = zero;

        if (myRow < prodNd)
        {
            const unsigned int vecStart = myRow * Reps;
            const unsigned int vecEnd = vecStart + Reps;
            for (unsigned int j = vecStart + id; j < vecEnd; j += vecWidth)
            {
                // conj(sensitivity) * coil image, accumulated per thread
                const float2 u = arr_large[j];
                const float2 v = arr_image[j];
                float2 w;
                w.x = u.x * v.x + u.y * v.y;
                w.y = u.x * v.y - u.y * v.x;
                partialSums[t] = partialSums[t] + w;
            }
            LOCAL_BARRIER;
            // Reduce partial sums
            unsigned int bar = vecWidth / 2;
            while (bar > 0)
            {
                if (id < bar)
                    partialSums[t] = partialSums[t] + partialSums[t+bar];
                // barrier(CLK_LOCAL_MEM_FENCE);
                // __syncthreads();
                LOCAL_BARRIER;
                bar = bar / 2;
            }
            // Write result
            if (id == 0)
            {
                arr_out[myRow] = partialSums[t];
            }
        }
    };  // End of cMerge

    KERNEL void cAggregate(
        const unsigned int Reps,
        const unsigned int prodNd,
        GLOBAL_MEM const float2 *arr_image,  // image array, prodNd*Reps
        GLOBAL_MEM float2 *arr_out)          // reduced output array, prodNd
    {
        const unsigned int t = get_local_id(0);
        // const float float_reps = (float)Reps;
        const unsigned int vecWidth = ${LL};
        // Thread ID within wavefront
        const unsigned int id = t & (vecWidth-1);
        // One row per wavefront
        unsigned int vecsPerBlock = get_local_size(0) / vecWidth;
        unsigned int myRow = (get_group_id(0) * vecsPerBlock) + (t / vecWidth);
        LOCAL_MEM float2 partialSums[${LL}];
        float2 zero;
        zero.x = 0.0;
        zero.y = 0.0;
        partialSums[t] = zero;
        // float2 y = zero;

        if (myRow < prodNd)
        {
            const unsigned int vecStart = myRow * Reps;
            const unsigned int vecEnd = vecStart + Reps;
            for (unsigned int j = vecStart + id; j < vecEnd; j += vecWidth)
            {
                partialSums[t] = partialSums[t] + arr_image[j];
            }
            LOCAL_BARRIER;
            // Reduce partial sums
            unsigned int bar = vecWidth / 2;
            while (bar > 0)
            {
                if (id < bar)
                    partialSums[t] = partialSums[t] + partialSums[t+bar];
                // barrier(CLK_LOCAL_MEM_FENCE);
                // __syncthreads();
                LOCAL_BARRIER;
                bar = bar / 2;
            }
            // Write result
            if (id == 0)
            {
                arr_out[myRow] = partialSums[t];
            }
        }
        LOCAL_BARRIER;
    };  // End of cAggregate

    KERNEL void cPopulate(
        const unsigned int Reps,
        const unsigned int prodNd,
        GLOBAL_MEM const float2 *arr_image,  // image array, prodNd
        GLOBAL_MEM float2 *arr_out)          // output array, prodNd*Reps
    {
        const unsigned int t = get_global_id(0);
        // const unsigned int nd = t / Reps;
        const unsigned int nd = ((float)t / (float)Reps);
        if (nd < prodNd){
            const float2 v = arr_image[nd];
            arr_out[t] = v;
        }
        LOCAL_BARRIER;
    };  // End of cPopulate
    """
    return R
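# Illustrative NumPy sketch of the intended cDistribute/cMerge/cAggregate
# semantics (assumed from the kernel comments, not extracted from the device
# code): cDistribute broadcasts one image into Reps coil images weighted by
# the sensitivity array; cMerge applies the adjoint, a conjugate-weighted sum
# over the coil axis; cAggregate sums over the coil axis without weighting.
def _example_distribute_merge(sens, image):
    """Hedged sketch: sens has shape (prodNd, Reps), image has shape (prodNd,)."""
    import numpy as np
    coil_images = sens * image[:, None]                     # cDistribute
    merged = np.sum(np.conj(sens) * coil_images, axis=1)    # cMerge (adjoint)
    aggregated = np.sum(coil_images, axis=1)                # cAggregate
    return coil_images, merged, aggregated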
def cSpmvh():
    """
    Return the kernel source for the Spmvh (adjoint interpolation) operations.

    Only pELL_spmvh_mCoil is provided for Spmvh. NUFFT_hsa_legacy reuses the
    cCSR_spmv() function, which doubles the storage.
    """
    R = """
    KERNEL void pELL_spmvh_mCoil_old(
        const unsigned int Reps,    // number of coils
        const unsigned int nRow,    // number of rows
        const unsigned int prodJd,  // product of Jd
        const unsigned int sumJd,   // sum of Jd
        const unsigned int dim,     // dimensionality
        GLOBAL_MEM const unsigned int *Jd,         // Jd
        // GLOBAL_MEM const unsigned int *curr_sumJd,
        GLOBAL_MEM const unsigned int *meshindex,  // meshindex, prodJd * dim
        GLOBAL_MEM const unsigned int *kindx,      // unmixed column indexes of all dimensions
        GLOBAL_MEM const float2 *udata,            // interpolation data before Kronecker product
        GLOBAL_MEM float *kx,
        GLOBAL_MEM float *ky,
        GLOBAL_MEM const float2 *input)            // y
    {
        const unsigned int t = get_local_id(0);
        const unsigned int d = get_local_size(0);
        const unsigned int g = get_group_id(0);
        const unsigned int a = g / prodJd;
        const unsigned int b = g - (prodJd * a);
        const unsigned int myRow = a * d + t;
        const unsigned int j = b;
        unsigned int m = myRow / Reps;
        unsigned int nc = myRow - m * Reps;
        // unsigned int myRow = get_global_id(0);
        float2 zero;
        zero.x = 0.0;
        zero.y = 0.0;

        if (m < nRow)
        {if (nc < Reps)
        {
            float2 u = zero;
            // for (unsigned int j = 0; j < prodJd; j ++)
            if (j < prodJd)
            {
                // now doing the first dimension
                unsigned int index_shift = m * sumJd;
                unsigned int J = Jd[0];
                unsigned int index = index_shift + meshindex[dim*j + 0];
                unsigned int col = kindx[index];
                float2 spdata = udata[index];
                index_shift += J;
                for (unsigned int dimid = 1; dimid < dim; dimid ++)
                {
                    J = Jd[dimid];
                    index = index_shift + meshindex[dim*j + dimid];  // index of the partial ELL arrays *kindx and *udata
                    col += kindx[index];                             // the column index of the current j
                    float tmp_x = spdata.x;
                    float2 tmp_udata = udata[index];
                    spdata.x = tmp_x * tmp_udata.x - spdata.y * tmp_udata.y;  // the spdata of the current j
                    spdata.y = tmp_x * tmp_udata.y + spdata.y * tmp_udata.x;
                    index_shift += J;
                };  // Iterate over dimensions 1 -> Nd - 1

                float2 ydata = input[myRow];  // kout[col];
                u.x =   spdata.x * ydata.x + spdata.y * ydata.y;
                u.y = - spdata.y * ydata.x + spdata.x * ydata.y;
                atomic_add_float(kx + col*Reps + nc, u.x);
                atomic_add_float(ky + col*Reps + nc, u.y);
            };  // Iterate for (unsigned int j = 0; j < prodJd; j ++)
        };  // if (nc < Reps)
        };  // if (m < nRow)
    };  // End of pELL_spmvh_mCoil_old

    KERNEL void pELL_spmvh_mCoil(
        const unsigned int Reps,    // number of coils
        const unsigned int nRow,    // number of rows
        const unsigned int prodJd,  // product of Jd
        const unsigned int sumJd,   // sum of Jd
        const unsigned int dim,     // dimensionality
        GLOBAL_MEM const unsigned int *Jd,         // Jd
        // GLOBAL_MEM const unsigned int *curr_sumJd,
        GLOBAL_MEM const unsigned int *meshindex,  // meshindex, prodJd * dim
        GLOBAL_MEM const unsigned int *kindx,      // unmixed column indexes of all dimensions
        GLOBAL_MEM const float2 *udata,            // interpolation data before Kronecker product
        GLOBAL_MEM float2 *k,
        GLOBAL_MEM float2 *res,
        GLOBAL_MEM const float2 *input)            // y
    {
        unsigned int myRow0 = get_global_id(0);
        unsigned int myRow = myRow0 / (float)Reps;
        unsigned int nc = myRow0 - myRow * Reps;
        float2 zero;
        zero.x = 0.0;
        zero.y = 0.0;

        if (myRow < nRow){
            for (unsigned int j = 0; j < prodJd; j ++){
                float2 u = zero;

                // now doing the first dimension
                unsigned int index_shift = myRow * sumJd;
                // unsigned int tmp_sumJd = 0;
                unsigned int J = Jd[0];
                unsigned int index = index_shift + meshindex[dim*j + 0];
                unsigned int col = kindx[index];
                float2 spdata = udata[index];
                index_shift += J;
                for (unsigned int dimid = 1; dimid < dim; dimid ++ ){
                    J = Jd[dimid];
                    index = index_shift + meshindex[dim*j + dimid];  // the index of the partial ELL arrays *kindx and *udata
                    col += kindx[index];  // + 1 ;  the column index of the current j
                    float tmp_x = spdata.x;
                    float2 tmp_udata = udata[index];
                    spdata.x = tmp_x * tmp_udata.x - spdata.y * tmp_udata.y;  // the spdata of the current j
                    spdata.y = tmp_x * tmp_udata.y + spdata.y * tmp_udata.x;
                    index_shift += J;
                };  // Iterate over dimensions 1 -> Nd - 1

                float2 ydata = input[myRow*Reps + nc];  // kout[col];
                u.x =   spdata.x * ydata.x + spdata.y * ydata.y;
                u.y = - spdata.y * ydata.x + spdata.x * ydata.y;
                atomic_add_float2(k + col*Reps + nc, u, res + col*Reps + nc);
            };  // Iterate for (unsigned int j = 0; j < prodJd; j ++)
        };  // if (m < nRow)
    };  // End of pELL_spmvh_mCoil
    """
    return R
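# Illustrative sketch of the partial-ELL (pELL) indexing shared by
# pELL_spmvh_mCoil above and pELL_spmv_mCoil below: entry j of row m is built
# by walking the dim per-axis tables, summing the per-axis column offsets
# (kindx) and multiplying the per-axis complex interpolation weights (udata).
# A pure-Python mirror of the kernel loop; the flattened argument layout
# follows the kernel comments.
def _example_pell_entry(m, j, dim, Jd, sumJd, meshindex, kindx, udata):
    """Hedged sketch: return (column index, complex weight) of entry j in row m."""
    index_shift = m * sumJd
    index = index_shift + meshindex[dim * j + 0]
    col = kindx[index]
    weight = complex(udata[index])
    index_shift += Jd[0]
    for dimid in range(1, dim):
        index = index_shift + meshindex[dim * j + dimid]
        col += kindx[index]                 # per-axis column offsets add up
        weight *= complex(udata[index])     # Kronecker product of the weights
        index_shift += Jd[dimid]
    return col, weight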
def cHypot():
    """
    Return the kernel source for cHypot, which computes sqrt(x*x + y*y)
    without intermediate overflow.
    """
    R = """
    KERNEL void cHypot(
        GLOBAL_MEM float2 *x,
        GLOBAL_MEM const float2 *y)
    {
        const unsigned int gid = get_global_id(0);
        float2 tmp_x;
        float2 tmp_y;
        tmp_x = x[gid];
        tmp_y = y[gid];
        tmp_x.x = hypot(tmp_x.x, tmp_x.y);  // sqrt(tmp_x.x*tmp_x.x + tmp_x.y*tmp_x.y);
        tmp_y.x = hypot(tmp_y.x, tmp_y.y);  // sqrt(tmp_y.x*tmp_y.x + tmp_y.y*tmp_y.y);
        x[gid].x = hypot(tmp_x.x, tmp_y.x);
        x[gid].y = 0.0;
    };
    """
    return R
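# Illustrative sketch of why cHypot calls hypot() rather than the naive
# sqrt(x*x + y*y): for large float32 inputs the squares overflow to inf, while
# hypot rescales internally and stays finite.
def _example_hypot_overflow():
    """Hedged sketch: compare the naive magnitude with numpy.hypot in float32."""
    import numpy as np
    x = np.float32(2e38)
    naive = np.sqrt(x * x + x * x)    # x*x overflows float32 -> inf
    safe = np.hypot(x, x)             # about 2.83e38, still a finite float32
    return naive, safe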
""" R = """ KERNEL void cCSR_spmv_vector( const unsigned int Reps, // Number of coils const unsigned int numRow, GLOBAL_MEM const unsigned int *rowDelimiters, GLOBAL_MEM const unsigned int *cols, GLOBAL_MEM const float2 *val, GLOBAL_MEM const float2 *vec, GLOBAL_MEM float2 *out) { const unsigned int t = get_local_id(0); const unsigned int vecWidth=${LL}; // Thread ID within wavefront const unsigned int id = t & (vecWidth-1); // One row per wavefront unsigned int vecsPerBlock=get_local_size(0)/vecWidth; unsigned int myRow=(get_group_id(0)*vecsPerBlock) + (t/ vecWidth); unsigned int m = myRow / Reps; unsigned int nc = myRow - m * Reps; LOCAL_MEM float2 partialSums[${LL}]; float2 zero; zero.x = 0.0; zero.y = 0.0; partialSums[t] = zero; float2 y= zero; if (myRow < numRow * Reps) { const unsigned int vecStart = rowDelimiters[m]; const unsigned int vecEnd = rowDelimiters[m+1]; for (unsigned int j = vecStart+id; j 0) { if (id < bar) partialSums[t] = partialSums[t] + partialSums[t+bar]; //barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads(); LOCAL_BARRIER; bar = bar / 2; } // Write result if (id == 0) { out[myRow]=partialSums[t]; } } }; // End of cCSR_spmv_vector KERNEL void pELL_spmv_mCoil( const unsigned int Reps, // Number of coils const unsigned int nRow, // number of rows const unsigned int prodJd, // product of Jd const unsigned int sumJd, // sum of Jd const unsigned int dim, // dimensionality GLOBAL_MEM const unsigned int *Jd, // Jd, length = dim //GLOBAL_MEM const unsigned int *curr_sumJd, // summation of Jd[0:dimid] GLOBAL_MEM const unsigned int *meshindex, // meshindex, prodJd * dim GLOBAL_MEM const unsigned int *kindx, // unmixed column indexes of all dimensions GLOBAL_MEM const float2 *udata,// interpolation data before Kronecker product GLOBAL_MEM const float2 *vec, // multi-channel kspace data, prodKd * Reps GLOBAL_MEM float2 *out) // multi-channel output, nRow * Reps { const unsigned int t = get_local_id(0); const unsigned int vecWidth=${LL}; // Thread ID within wavefront const unsigned int id = t & (vecWidth-1); // One row per wavefront unsigned int vecsPerBlock=get_local_size(0)/vecWidth; unsigned int myRow=(get_group_id(0)*vecsPerBlock) + (t/ vecWidth); // the myRow-th non-Cartesian sample unsigned int m = myRow / Reps; unsigned int nc = myRow - m * Reps; LOCAL_MEM float2 partialSums[${LL}]; float2 zero; zero.x = 0.0; zero.y = 0.0; partialSums[t] = zero; if (myRow < nRow * Reps) { const unsigned int vecStart = 0; const unsigned int vecEnd =prodJd; float2 y;//=zero; for (unsigned int j = vecStart+id; j 0) { if (id < bar) partialSums[t]= partialSums[t]+partialSums[t+bar]; //barrier(CLK_LOCAL_MEM_FENCE); //__syncthreads(); LOCAL_BARRIER; bar = bar / 2; } // Write result if (id == 0) { out[myRow]=partialSums[t]; } } }; // End of pELL_spmv_mCoil """ return R def cMultiplyConjVecInplace(): """ Return the kernel source of cMultiplyConjVecInplace """ R=""" KERNEL void cMultiplyConjVecInplace( const unsigned int batch, GLOBAL_MEM float2 *a, GLOBAL_MEM float2 *outb) { const unsigned int gid = get_global_id(0); const unsigned int voxel_id = (float)gid / (float)batch; float2 mul = a[voxel_id]; // taking the conjugate float2 orig = outb[gid]; float2 tmp; tmp.x = orig.x * mul.x + orig.y * mul.y; tmp.y = - orig.x * mul.y + orig.y * mul.x; outb[gid] = tmp; }; """ return R def cMultiplyConjVec(): """ Return the kernel source of cMultiplyConjVec. 
""" R=""" KERNEL void cMultiplyConjVec( GLOBAL_MEM float2 *a, GLOBAL_MEM float2 *b, GLOBAL_MEM float2 *dest) {// dest[i]=conj(a[i]) * b[i] const unsigned int i=get_global_id(0); dest[i].x=a[i].x*b[i].x+a[i].y*b[i].y; dest[i].y=a[i].x*b[i].y-a[i].y*b[i].x; }; """ return R def cMultiplyRealInplace(): """ Return the kernel source of cMultiplyRealInplace. """ R=""" KERNEL void cMultiplyRealInplace( const unsigned int batch, GLOBAL_MEM const float *a, GLOBAL_MEM float2 *outb) { const unsigned int gid = get_global_id(0); // const unsigned int voxel_id = gid / batch; const unsigned int voxel_id = (float)gid / (float)batch; float mul = a[voxel_id]; float2 orig = outb[gid]; orig.x=orig.x*mul; orig.y=orig.y*mul; outb[gid]=orig; }; """ return R def cMultiplyVecInplace(): """ Return the kernel source of cMultiplyVecInplace. """ R=""" KERNEL void cMultiplyVecInplace( const unsigned int batch, GLOBAL_MEM const float2 *a, GLOBAL_MEM float2 *outb) { const unsigned int gid = get_global_id(0); // const unsigned int voxel_id = gid / batch; const unsigned int voxel_id = (float)gid / (float)batch; float2 mul = a[voxel_id]; float2 orig = outb[gid]; float2 tmp; tmp.x=orig.x*mul.x-orig.y*mul.y; tmp.y=orig.x*mul.y+orig.y*mul.x; outb[gid]=tmp; }; """ return R def cAddVec(): """ Return the kernel source for cAddVec. """ R=""" KERNEL void cAddVec( GLOBAL_MEM float2 *a, GLOBAL_MEM float2 *b, GLOBAL_MEM float2 *dest) {const int i = get_global_id(0); dest[i]= a[i]+b[i]; };""" return R def cDiff(): """ Return the kernel source of cDiff. """ R=""" KERNEL void cDiff( GLOBAL_MEM const int *order2, GLOBAL_MEM const float2 *indata, GLOBAL_MEM float2 *outdata) { const unsigned int gid = get_global_id(0); const unsigned int ind = order2[gid]; outdata[gid]=indata[ind]- indata[gid]; }; """ return R def cAnisoShrink(): """ Return the kernel source of cAnisoShrink """ R=""" KERNEL void cAnisoShrink(const float2 threshold, GLOBAL_MEM const float2 *indata, GLOBAL_MEM float2 *outdata) { const unsigned int gid = get_global_id(0); float2 tmp; // temporay register tmp = indata[gid]; //float zero = 0.0; //tmp.x=sign(tmp.x)*max(fabs(tmp.x)-threshold.x, zero); //tmp.y=sign(tmp.y)*max(fabs(tmp.y)-threshold.y, zero); tmp.x = (tmp.x > threshold.x)*(tmp.x - threshold.x) ;//+ (tmp.x < - threshold.x)*(tmp.x + threshold.x); tmp.y = (tmp.y > threshold.x)*(tmp.y - threshold.x) ;//+ (tmp.y < - threshold.x)*(tmp.y + threshold.x); outdata[gid]=tmp; }; """ return R def cSqrt(): """ Return the kernel source of cSqrt. """ R=""" KERNEL void cSqrt( GLOBAL_MEM float2 *CX) { // Copy x to y: y = x; //CX: input output array (float2) int gid=get_global_id(0); CX[gid].x=sqrt(CX[gid].x); }; """ return R def cSelect(): """ Return the kernel source of cSelect. """ R=""" KERNEL void cSelect( GLOBAL_MEM const int *order1, GLOBAL_MEM const int *order2, GLOBAL_MEM const float2 *indata, GLOBAL_MEM float2 *outdata) { const unsigned int gid=get_global_id(0); outdata[order2[gid]]= indata[order1[gid]]; }; """ return R