swh:1:snp:a9d01202ad630f8a750d9bf34ca651272e4b534f
Tip revision: 505b5ef808e2d357b192a6ec1c4d5b4c45606cc9 authored by Jyh-Miin Lin on 14 February 2020, 19:27:23 UTC
commit message
commit message
Tip revision: 505b5ef
re_subroutine.py
"""
Metaprogramming subroutines (using reikna, pyopencl, pycuda)
========================================================
"""
from __future__ import absolute_import # Python2 compatibility
# from . import cSelect
def create_kernel_sets(API):
"""
Create the kernel from the kernel sets.
"""
kernel_sets = ( cMultiplyScalar() +
cCopy() +
cTensorCopy() +
cTensorMultiply() +
cMultiplyVec() +
cHypot() +
cAddScalar() +
cSelect() +
# cMultiplyConjVec() +
cAddVec() +
cMultiplyVecInplace() +
cMultiplyConjVecInplace() +
cMultiplyRealInplace() +
cMultiplyConjVec() +
cDiff() +
cSqrt() +
cAnisoShrink() +
cSpmv() +
cSpmvh() +
cHadamard())
# if 'cuda' is API:
# print('Select cuda interface')
# kernel_sets = atomic_add.cuda_add + kernel_sets
# elif 'ocl' is API:
# print("Selecting opencl interface")
# kernel_sets = atomic_add.ocl_add + kernel_sets
kernel_sets = atomic_add(API) + kernel_sets
return kernel_sets
def cMultiplyScalar():
"""
Return the kernel source for cMultiplyScalar.
"""
code_text ="""
KERNEL void cMultiplyScalar(
const float2 CA,
GLOBAL_MEM float2 *CX)
{
// Scale CX by CA: CX=CA*CX
// CA: scaling factor(float2)
//*CX: input, output array(float2)
int gid = get_global_id(0);
CX[gid].x=CA.x*CX[gid].x-CA.y*CX[gid].y;
CX[gid].y=CA.x*CX[gid].y+CA.y*CX[gid].x;
};
"""
return code_text
def cCopy():
"""
Return the kernel source for cCopy
"""
code_text = """
KERNEL void cCopy(
GLOBAL_MEM const float2 *CX,
GLOBAL_MEM float2 *CY)
{
// Copy x to y: y = x;
//CX: input array (float2)
// CY output array (float2)
int gid=get_global_id(0);
CY[gid]=CX[gid];
};
"""
return code_text
def cTensorCopy():
"""
Return the kernel source for cTensorCopy.
"""
code_text = """
KERNEL void cTensorCopy(
const unsigned int batch,
const unsigned int dim,
GLOBAL_MEM const unsigned int *Nd_elements,
GLOBAL_MEM const unsigned int *Kd_elements,
GLOBAL_MEM const float *invNd,
GLOBAL_MEM const float2 *indata,
GLOBAL_MEM float2 *outdata,
const int direction)
{
const unsigned int gid=get_global_id(0);
unsigned int curr_res = gid;
unsigned int new_idx = 0;
unsigned int group;
for (unsigned int dimid =0; dimid < dim; dimid ++){
group = (float)curr_res*invNd[dimid];
new_idx += group * Kd_elements[dimid];
curr_res = curr_res - group * Nd_elements[dimid];
};
if (direction == 1) {
for (unsigned int bat=0; bat < batch; bat ++ )
{
outdata[new_idx*batch+bat]= indata[gid*batch+bat];
};
};
if (direction == -1) {
for (unsigned int bat=0; bat < batch; bat ++ )
{
outdata[gid*batch+bat]= indata[new_idx*batch+bat];
};
};
};
"""
return code_text
def cTensorMultiply():
"""
Return the kernel source for cTensorMultiply
"""
code_text = """
KERNEL void cTensorMultiply(
const unsigned int batch, // batch
const unsigned int dim, // dimensions
GLOBAL_MEM const unsigned int *Nd, // In batch mode, Nd*batch but is calculated outside the kernel
GLOBAL_MEM const unsigned int *Nd_elements, // Number of elements to move along the dimension = strides / itemsize
GLOBAL_MEM const float *invNd_elements, // float: inverse of the Nd_elements, which aims for fast division // batch mode: Nd_elements / batch
GLOBAL_MEM const float *vec, // Real, vector, length sum Nd[dimid]
GLOBAL_MEM float2 *outdata,
const unsigned int div)
{
const unsigned int gid=get_global_id(0);
const unsigned int pid = (float)gid / (float)batch;
// const unsigned int bat = gid - pid * batch;
unsigned int group;
unsigned int Nd_indx_shift = 0;
float mul = 1.0;
unsigned int res = pid;
for (unsigned int dimid = 0; dimid < dim; dimid ++){
group = (float)res * invNd_elements[dimid]; // The index along the axis
res = res - group * Nd_elements[dimid];
const unsigned int N = Nd[dimid];
mul = mul * vec[group + Nd_indx_shift];
Nd_indx_shift = Nd_indx_shift + N;
}
if (div == 1){
// for (unsigned int bat = 0; bat < batch; bat ++ )
// {
float2 tmp = outdata[gid];
tmp.x = tmp.x / mul;
tmp.y = tmp.y / mul;
outdata[gid] = tmp;
// };
};
if (div == 0){
// for (unsigned int bat = 0; bat < batch; bat ++ )
// {
float2 tmp = outdata[gid];
tmp.x = tmp.x * mul;
tmp.y = tmp.y * mul;
outdata[gid] = tmp;
// };
};
};
"""
return code_text
def atomic_add(API):
"""
Return the atomic_add for the given API.
Overcome the missing atomic_add_float for OpenCL-1.2.
"""
ocl_add = """
// #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
KERNEL void atomic_add_float(
GLOBAL_MEM float *ptr,
const float temp)
{
// The work-around of AtomicAdd for float
// lockless add *source += operand
// Caution!!!!!!! Use with care! You have been warned!
// http://simpleopencl.blogspot.com/2013/05/atomic-operations-and-floats-in-opencl.html
// Source: https://github.com/clMathLibraries/clSPARSE/blob/master/src/library/kernels/csrmv_adaptive.cl
union {
unsigned int intVal;
float floatVal;
} newVal;
union {
unsigned int intVal;
float floatVal;
} prevVal;
do {
prevVal.floatVal = *ptr;
newVal.floatVal = prevVal.floatVal + temp;
} while (atomic_cmpxchg((volatile GLOBAL_MEM unsigned int *)ptr, prevVal.intVal, newVal.intVal) != prevVal.intVal);
};
"""
cuda_add = """
__device__ void atomic_add_float(
GLOBAL_MEM float *ptr,
const float temp)
{ // Wrapper around CUDA atomicAdd();
atomicAdd(ptr, temp);
};
"""
if API == 'cuda':
code_text = cuda_add
elif API == 'ocl':
code_text = ocl_add
return code_text
def cMultiplyVec():
"""
Return the kernel source of cMultiplyVec
"""
code_text="""
KERNEL void cMultiplyVec( GLOBAL_MEM float2 *a,
GLOBAL_MEM float2 *b,
GLOBAL_MEM float2 *dest)
{ const unsigned int i = get_global_id(0);
dest[i].x = a[i].x*b[i].x-a[i].y*b[i].y;
dest[i].y = a[i].x*b[i].y+a[i].y*b[i].x;
//barrier(CLK_GLOBAL_MEM_FENCE);
};
"""
return code_text
def cAddScalar():
"""
Return the kernel source for cAddScalar.
"""
code_text ="""
KERNEL void cAddScalar(const float2 CA,
GLOBAL_MEM float2 *CX)
{
// (single complex) scale x by a: x = x + ca;
// CA: add factor
// CX: input and output array (float2)
int gid = get_global_id(0);
CX[gid].x += CA.x;
CX[gid].y += CA.y;
};
"""
return code_text
def cHadamard():
"""
Return the Hardamard operations related kernel sources.
"""
R="""
KERNEL void cSelect2(
const unsigned int Reps,
GLOBAL_MEM const unsigned int *order1,
GLOBAL_MEM const unsigned int *order2,
GLOBAL_MEM const float2 *indata,
GLOBAL_MEM float2 *outdata)
{
const unsigned int gid=get_global_id(0);
const unsigned int t = ((float)gid / (float)Reps); // indptr
const unsigned int r = gid - t*Reps; // residue
const unsigned int index2 = order2[t]*Reps + r;
const unsigned int index1 = order1[t]*Reps + r;
outdata[index2]=indata[index1];
};
KERNEL void cDistribute(
const unsigned int Reps,
const unsigned int prodNd,
GLOBAL_MEM const float2 *arr_large, // sensitivity and scaling array, prodNd*Reps
GLOBAL_MEM const float2 *arr_image, // image array, prodNd
GLOBAL_MEM float2 *arr_out // output array, prodNd * Reps
)
{
const unsigned int t = get_global_id(0);
//const unsigned int nd = t/Reps;
const unsigned int nd = ((float)t / (float)Reps);
if (nd < prodNd){
const float2 u = arr_large[t];
const float2 v = arr_image[nd];
float2 w;
w.x = u.x * v.x - u.y * v.y;
w.y = u.x * v.y + u.y * v.x;
arr_out[t] = w;
}
}; // End of cDistribute
KERNEL void cMerge(
const unsigned int Reps,
const unsigned int prodNd,
GLOBAL_MEM const float2 *arr_large, // sensitivity and scaling array, prodNd*Reps
GLOBAL_MEM const float2 *arr_image, // image array, prodNd*Reps
GLOBAL_MEM float2 *arr_out // reduced output array, prodNd
)
{
const unsigned int t = get_local_id(0);
//const float float_reps = (float)Reps;
const unsigned int vecWidth=${LL};
// Thread ID within wavefront
const unsigned int id = t & (vecWidth-1);
// One row per wavefront
unsigned int vecsPerBlock=get_local_size(0)/vecWidth;
unsigned int myRow=(get_group_id(0)*vecsPerBlock) + (t/ vecWidth);
LOCAL_MEM float2 partialSums[${LL}];
float2 zero;
zero.x = 0.0;
zero.y = 0.0;
partialSums[t] = zero;
// float2 y= zero;
if (myRow < prodNd)
{
const unsigned int vecStart = myRow * Reps;
const unsigned int vecEnd = vecStart + Reps;
for (unsigned int j = vecStart+id; j<vecEnd; j += vecWidth)
{
const float2 u = arr_large[j]; // sensitivities and scaling, complex
const float2 v = arr_image[j];
float2 w;
w.x = u.x * v.x + u.y * v.y; // conjugate of u
w.y = u.x * v.y - u.y * v.x;
w.x = w.x/(float)Reps;
w.y = w.y/(float)Reps;
partialSums[t] = w; //partialSums[t] + y;
}
LOCAL_BARRIER;
//__syncthreads();
//barrier(CLK_LOCAL_MEM_FENCE);
// Reduce partial sums
unsigned int bar = vecWidth / 2;
while(bar > 0)
{
if (id < bar)
partialSums[t] = partialSums[t] + partialSums[t+bar];
//barrier(CLK_LOCAL_MEM_FENCE);
//__syncthreads();
LOCAL_BARRIER;
bar = bar / 2;
}
// Write result
if (id == 0)
{
arr_out[myRow]=partialSums[t];
}
}
}; // End of cMerge
KERNEL void cAggregate(
const unsigned int Reps,
const unsigned int prodNd,
GLOBAL_MEM const float2 *arr_image, // image array, prodNd*Reps
GLOBAL_MEM float2 *arr_out // reduced output array, prodNd
)
{
const unsigned int t = get_local_id(0);
//const float float_reps = (float)Reps;
const unsigned int vecWidth=${LL};
// Thread ID within wavefront
const unsigned int id = t & (vecWidth-1);
// One row per wavefront
unsigned int vecsPerBlock=get_local_size(0)/vecWidth;
unsigned int myRow=(get_group_id(0)*vecsPerBlock) + (t/ vecWidth);
LOCAL_MEM float2 partialSums[${LL}];
float2 zero;
zero.x = 0.0;
zero.y = 0.0;
partialSums[t] = zero;
// float2 y= zero;
if (myRow < prodNd)
{
const unsigned int vecStart = myRow * Reps;
const unsigned int vecEnd = vecStart + Reps;
for (unsigned int j = vecStart+id; j<vecEnd; j += vecWidth)
{
float2 v = arr_image[j];
v.x = v.x/(float)Reps;
v.y = v.y/(float)Reps;
partialSums[t] =partialSums[t] + v; //partialSums[t] + y;
}
LOCAL_BARRIER;
//__syncthreads();
//barrier(CLK_LOCAL_MEM_FENCE);
// Reduce partial sums
unsigned int bar = vecWidth / 2;
while(bar > 0)
{
if (id < bar)
partialSums[t] = partialSums[t] + partialSums[t+bar];
//barrier(CLK_LOCAL_MEM_FENCE);
//__syncthreads();
LOCAL_BARRIER;
bar = bar / 2;
}
// Write result
if (id == 0)
{
arr_out[myRow]=partialSums[t];
}
}
LOCAL_BARRIER;
}; // End of cAggregate
KERNEL void cPopulate(
const unsigned int Reps,
const unsigned int prodNd,
GLOBAL_MEM const float2 *arr_image, // image array, prodNd
GLOBAL_MEM float2 *arr_out // output array, prodNd * Reps
)
{
const unsigned int t = get_global_id(0);
//const unsigned int nd = t/Reps;
const unsigned int nd = ((float)t / (float)Reps);
if (nd < prodNd){
const float2 v = arr_image[nd];
arr_out[t] = v;
}
LOCAL_BARRIER;
}; // End of cPopulate
"""
return R
def cSpmvh():
"""
Return the cSpmvh related kernel source. Only pELL_spmvh_mCoil is provided for Spmvh.
"""
R="""
KERNEL void pELL_spmvh_mCoil(
const unsigned int Reps, // number of coils
const unsigned int nRow, // number of rows
const unsigned int prodJd, // product of Jd
const unsigned int sumJd, // sum of Jd
const unsigned int dim, // dimensionality
GLOBAL_MEM const unsigned int *Jd, // Jd
// GLOBAL_MEM const unsigned int *curr_sumJd, //
GLOBAL_MEM const unsigned int *meshindex, // meshindex, prodJd * dim
GLOBAL_MEM const unsigned int *kindx, // unmixed column indexes of all dimensions
GLOBAL_MEM const float2 *udata, // interpolation data before Kronecker product
GLOBAL_MEM float *kx,
GLOBAL_MEM float *ky,
GLOBAL_MEM const float2 *input) // y
{
const unsigned int t = get_local_id(0);
const unsigned int d = get_local_size(0);
const unsigned int g = get_group_id(0);
const unsigned int a = g/prodJd;
const unsigned int b = g - (prodJd*a);
const unsigned int myRow = a*d + t;
const unsigned int j = b;
unsigned int m = myRow / Reps;
unsigned int nc = myRow - m * Reps;
// unsigned int myRow= get_global_id(0);
float2 zero;
zero.x = 0.0;
zero.y = 0.0;
if (m< nRow)
{ if (nc < Reps)
{
float2 u = zero;
// for (unsigned int j = 0; j < prodJd; j ++)
if (j<prodJd)
{
// now doing the first dimension
unsigned int index_shift = m * sumJd;
// unsigned int tmp_sumJd = 0;
unsigned int J = Jd[0];
unsigned int index = index_shift + meshindex[dim*j + 0];
unsigned int col = kindx[index] ;
float2 spdata = udata[index];
index_shift += J;
for (unsigned int dimid = 1; dimid < dim; dimid ++ )
{
J = Jd[dimid];
index = index_shift + meshindex[dim*j + dimid]; // the index of the partial ELL arrays *kindx and *udata
col += kindx[index] + 1 ; // the column index of the current j
float tmp_x = spdata.x;
float2 tmp_udata = udata[index];
spdata.x = spdata.x * tmp_udata.x - spdata.y * tmp_udata.y; // the spdata of the current j
spdata.y = tmp_x * tmp_udata.y + spdata.y * tmp_udata.x;
index_shift += J;
}; // Iterate over dimensions 1 -> Nd - 1
float2 ydata=input[myRow]; // kout[col];
u.x = spdata.x*ydata.x + spdata.y*ydata.y;
u.y = - spdata.y*ydata.x + spdata.x*ydata.y;
atomic_add_float(kx + col*Reps + nc, u.x);
atomic_add_float(ky + col*Reps + nc, u.y);
}; // Iterate for (unsigned int j = 0; j < prodJd; j ++)
}; // if (nc < Reps)
}; // if (m < nRow)
}; // End of pELL_spmv_mCoil
"""
return R
def cHypot():
"""
Return the kernel code for hypot, which computes the sqrt(x*x + y*y) without intermediate overflow.
"""
R="""
KERNEL void cHypot(GLOBAL_MEM float2 *x,
GLOBAL_MEM const float2 *y)
{
const unsigned int gid = get_global_id(0);
float2 tmp_x;
float2 tmp_y;
tmp_x = x[gid];
tmp_y = y[gid];
tmp_x.x = hypot( tmp_x.x, tmp_x.y); // sqrt( tmp_x.x*tmp_x.x + tmp_x.y*tmp_x.y);
tmp_y.x = hypot( tmp_y.x, tmp_y.y); // sqrt( tmp_y.x*tmp_y.x + tmp_y.y*tmp_y.y);
x[gid].x = hypot(tmp_x.x, tmp_y.x);
x[gid].y = 0.0;
};
"""
return R
def cSpmv():
"""
Return the kernel sources for cSpmv related operations.
Providing cCSR_spmv_vector and cpELL_spmv_mCoil.
"""
R = """
KERNEL void cCSR_spmv_vector(
const unsigned int numRow,
GLOBAL_MEM const unsigned int *rowDelimiters,
GLOBAL_MEM const unsigned int *cols,
GLOBAL_MEM const float2 *val,
GLOBAL_MEM const float2 *vec,
GLOBAL_MEM float2 *out)
{
const unsigned int t = get_local_id(0);
const unsigned int vecWidth=${LL};
// Thread ID within wavefront
const unsigned int id = t & (vecWidth-1);
// One row per wavefront
unsigned int vecsPerBlock=get_local_size(0)/vecWidth;
unsigned int myRow=(get_group_id(0)*vecsPerBlock) + (t/ vecWidth);
LOCAL_MEM float2 partialSums[${LL}];
float2 zero;
zero.x = 0.0;
zero.y = 0.0;
partialSums[t] = zero;
float2 y= zero;
if (myRow < numRow)
{
const unsigned int vecStart = rowDelimiters[myRow];
const unsigned int vecEnd = rowDelimiters[myRow+1];
for (unsigned int j = vecStart+id; j<vecEnd; j += vecWidth)
{
const unsigned int col = cols[j];
const float2 spdata=val[j];
const float2 vecdata=vec[col];
y.x=spdata.x*vecdata.x - spdata.y*vecdata.y;
y.y=spdata.y*vecdata.x + spdata.x*vecdata.y;
partialSums[t] = partialSums[t] + y;
}
LOCAL_BARRIER;
//__syncthreads();
//barrier(CLK_LOCAL_MEM_FENCE);
// Reduce partial sums
unsigned int bar = vecWidth / 2;
while(bar > 0)
{
if (id < bar)
partialSums[t] = partialSums[t] + partialSums[t+bar];
//barrier(CLK_LOCAL_MEM_FENCE);
//__syncthreads();
LOCAL_BARRIER;
bar = bar / 2;
}
// Write result
if (id == 0)
{
out[myRow]=partialSums[t];
}
}
}; // End of cCSR_spmv_vector
KERNEL void pELL_spmv_mCoil(
const unsigned int Reps, // Number of coils
const unsigned int nRow, // number of rows
const unsigned int prodJd, // product of Jd
const unsigned int sumJd, // sum of Jd
const unsigned int dim, // dimensionality
GLOBAL_MEM const unsigned int *Jd, // Jd, length = dim
//GLOBAL_MEM const unsigned int *curr_sumJd, // summation of Jd[0:dimid]
GLOBAL_MEM const unsigned int *meshindex, // meshindex, prodJd * dim
GLOBAL_MEM const unsigned int *kindx, // unmixed column indexes of all dimensions
GLOBAL_MEM const float2 *udata,// interpolation data before Kronecker product
GLOBAL_MEM const float2 *vec, // multi-channel kspace data, prodKd * Reps
GLOBAL_MEM float2 *out) // multi-channel output, nRow * Reps
{
const unsigned int t = get_local_id(0);
const unsigned int vecWidth=${LL};
// Thread ID within wavefront
const unsigned int id = t & (vecWidth-1);
// One row per wavefront
unsigned int vecsPerBlock=get_local_size(0)/vecWidth;
unsigned int myRow=(get_group_id(0)*vecsPerBlock) + (t/ vecWidth); // the myRow-th non-Cartesian sample
unsigned int m = myRow / Reps;
unsigned int nc = myRow - m * Reps;
LOCAL_MEM float2 partialSums[${LL}];
float2 zero;
zero.x = 0.0;
zero.y = 0.0;
partialSums[t] = zero;
if (myRow < nRow * Reps)
{
const unsigned int vecStart = 0;
const unsigned int vecEnd =prodJd;
float2 y;//=zero;
for (unsigned int j = vecStart+id; j<vecEnd; j += vecWidth)
{ // now doing the first dimension
unsigned int J = Jd[0];
unsigned int index_shift = m * sumJd ;
unsigned int index = index_shift + meshindex[dim*j + 0];
unsigned int col = kindx[index] ;
float2 spdata = udata[index];
index_shift += J;
for (unsigned int dimid = 1; dimid < dim; dimid ++ )
{
unsigned int J = Jd[dimid];
unsigned int index = index_shift + meshindex[dim*j + dimid]; // the index of the partial ELL arrays *kindx and *udata
col += kindx[index] + 1; // the column index of the current j
float tmp_x= spdata.x;
float2 tmp_udata = udata[index];
spdata.x = spdata.x * tmp_udata.x - spdata.y * tmp_udata.y; // the spdata of the current j
spdata.y = tmp_x * tmp_udata.y + spdata.y * tmp_udata.x;
index_shift += J;
}
float2 vecdata=vec[col * Reps + nc];
y.x = spdata.x*vecdata.x - spdata.y*vecdata.y;
y.y = spdata.y*vecdata.x + spdata.x*vecdata.y;
partialSums[t] = y + partialSums[t];
}
LOCAL_BARRIER;
// Reduce partial sums
unsigned int bar = vecWidth / 2;
while(bar > 0)
{
if (id < bar)
partialSums[t]= partialSums[t]+partialSums[t+bar];
//barrier(CLK_LOCAL_MEM_FENCE);
//__syncthreads();
LOCAL_BARRIER;
bar = bar / 2;
}
// Write result
if (id == 0)
{
out[myRow]=partialSums[t];
}
}
}; // End of pELL_spmv_mCoil
"""
return R
def cMultiplyConjVecInplace():
"""
Return the kernel source of cMultiplyConjVecInplace
"""
R="""
KERNEL void cMultiplyConjVecInplace(
const unsigned int batch,
GLOBAL_MEM float2 *a,
GLOBAL_MEM float2 *outb)
{
const unsigned int gid = get_global_id(0);
const unsigned int voxel_id = (float)gid / (float)batch;
float2 mul = a[voxel_id]; // taking the conjugate
float2 orig = outb[gid];
float2 tmp;
tmp.x = orig.x * mul.x + orig.y * mul.y;
tmp.y = - orig.x * mul.y + orig.y * mul.x;
outb[gid] = tmp;
};
"""
return R
def cMultiplyConjVec():
"""
Return the kernel source of cMultiplyConjVec.
"""
R="""
KERNEL void cMultiplyConjVec(
GLOBAL_MEM float2 *a,
GLOBAL_MEM float2 *b,
GLOBAL_MEM float2 *dest)
{// dest[i]=conj(a[i]) * b[i]
const unsigned int i=get_global_id(0);
dest[i].x=a[i].x*b[i].x+a[i].y*b[i].y;
dest[i].y=a[i].x*b[i].y-a[i].y*b[i].x;
};
"""
return R
def cMultiplyRealInplace():
"""
Return the kernel source of cMultiplyRealInplace.
"""
R="""
KERNEL void cMultiplyRealInplace(
const unsigned int batch,
GLOBAL_MEM const float *a,
GLOBAL_MEM float2 *outb)
{
const unsigned int gid = get_global_id(0);
// const unsigned int voxel_id = gid / batch;
const unsigned int voxel_id = (float)gid / (float)batch;
float mul = a[voxel_id];
float2 orig = outb[gid];
orig.x=orig.x*mul;
orig.y=orig.y*mul;
outb[gid]=orig;
};
"""
return R
def cMultiplyVecInplace():
"""
Return the kernel source of cMultiplyVecInplace.
"""
R="""
KERNEL void cMultiplyVecInplace(
const unsigned int batch,
GLOBAL_MEM const float2 *a,
GLOBAL_MEM float2 *outb)
{
const unsigned int gid = get_global_id(0);
// const unsigned int voxel_id = gid / batch;
const unsigned int voxel_id = (float)gid / (float)batch;
float2 mul = a[voxel_id];
float2 orig = outb[gid];
float2 tmp;
tmp.x=orig.x*mul.x-orig.y*mul.y;
tmp.y=orig.x*mul.y+orig.y*mul.x;
outb[gid]=tmp;
};
"""
return R
def cAddVec():
"""
Return the kernel source for cAddVec.
"""
R="""
KERNEL void cAddVec(
GLOBAL_MEM float2 *a,
GLOBAL_MEM float2 *b,
GLOBAL_MEM float2 *dest)
{const int i = get_global_id(0);
dest[i]= a[i]+b[i];
};"""
return R
def cDiff():
"""
Return the kernel source of cDiff.
"""
R="""
KERNEL void cDiff( GLOBAL_MEM const int *order2,
GLOBAL_MEM const float2 *indata,
GLOBAL_MEM float2 *outdata)
{
const unsigned int gid = get_global_id(0);
const unsigned int ind = order2[gid];
outdata[gid]=indata[ind]- indata[gid];
};
"""
return R
def cAnisoShrink():
"""
Return the kernel source of cAnisoShrink
"""
R="""
KERNEL void cAnisoShrink(const float2 threshold,
GLOBAL_MEM const float2 *indata,
GLOBAL_MEM float2 *outdata)
{
const unsigned int gid = get_global_id(0);
float2 tmp; // temporay register
tmp = indata[gid];
//float zero = 0.0;
//tmp.x=sign(tmp.x)*max(fabs(tmp.x)-threshold.x, zero);
//tmp.y=sign(tmp.y)*max(fabs(tmp.y)-threshold.y, zero);
tmp.x = (tmp.x > threshold.x)*(tmp.x - threshold.x) ;//+ (tmp.x < - threshold.x)*(tmp.x + threshold.x);
tmp.y = (tmp.y > threshold.x)*(tmp.y - threshold.x) ;//+ (tmp.y < - threshold.x)*(tmp.y + threshold.x);
outdata[gid]=tmp;
};
"""
return R
def cSqrt():
"""
Return the kernel source of cSqrt.
"""
R="""
KERNEL void cSqrt(
GLOBAL_MEM float2 *CX)
{
// Copy x to y: y = x;
//CX: input output array (float2)
int gid=get_global_id(0);
CX[gid].x=sqrt(CX[gid].x);
};
"""
return R
def cSelect():
"""
Return the kernel source of cSelect.
"""
R="""
KERNEL void cSelect(
GLOBAL_MEM const int *order1,
GLOBAL_MEM const int *order2,
GLOBAL_MEM const float2 *indata,
GLOBAL_MEM float2 *outdata)
{
const unsigned int gid=get_global_id(0);
outdata[order2[gid]]=
indata[order1[gid]];
};
"""
return R