#################################### # HSA code #################################### import numpy # import reikna from functools import wraps as _wraps from ..src._helper import helper#, helper1 def push_cuda_context(hsa_method): """ Decorator: Push cuda context to the top of the stack before calling the context Add @push_cuda_context before the methods of NUFFT_device() """ @_wraps(hsa_method) def wrapper(*args, **kwargs): try: args[0].thr._context.push() except: pass return hsa_method(*args, **kwargs) return wrapper def _init__device(self, device_indx=None): """ Constructor. :param API: The API for the heterogeneous system. API='cuda' or API='ocl' :param platform_number: The number of the platform found by the API. :param device_number: The number of the device found on the platform. :param verbosity: Defines the verbosity level, default value is 0 :type API: string :type platform_number: integer :type device_number: integer :type verbosity: integer :returns: 0 :rtype: int, float :Example: >>> from pynufft import NUFFT >>> NufftObj = NUFFT_device(API='cuda', platform_number=0, device_number=0, verbosity=0) """ self.dtype = numpy.complex64 self.verbosity = 0#verbosity from reikna import cluda import reikna.transformations from reikna.cluda import functions, dtypes # try: # try to create api/platform/device using the given parameters API = device_indx[0] self.API = API # if 'cuda' == API: # api = cluda.cuda_api() # elif 'ocl' == API: # api = cluda.ocl_api() # api = device_indx[0] platform_number = device_indx[1] device_number = device_indx[2] # NUFFT_hsa.__init__(self, API, platform_number, device_number) platform = device_indx[3] device = device_indx[4] # self.thr = api.Thread(device) self.device = device self.thr = device_indx[5] # api = device_indx[6] # self.thr = api.Thread(device) # pyopencl.create_some_context() # self.device = device # : device name # """ # Wavefront: as warp in cuda. Can control the width in a workgroup # Wavefront is required in spmv_vector as it improves data coalescence. # see cCSR_spmv and zSparseMatVec # """ self.wavefront = device_indx[6] # if self.verbosity > 0: # print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront) # print('API = ', API) # print('thr = ', self.thr # ) from ..src import re_subroutine # import create_kernel_sets kernel_sets = re_subroutine.create_kernel_sets(self.API) prg = self.thr.compile(kernel_sets, render_kwds=dict(LL=str(self.wavefront)), fast_math=False) self.prg = prg self.processor = 'hsa' def _plan_device(self, om, Nd, Kd, Jd, ft_axes=None, radix=None): """ Design the multi-coil or single-coil memory reduced interpolator. :param om: The M off-grid locations in the frequency domain. Normalized between [-pi, pi] :param Nd: The matrix size of equispaced image. Example: Nd=(256, 256) for a 2D image; Nd = (128, 128, 128) for a 3D image :param Kd: The matrix size of the oversampled frequency grid. Example: Kd=(512,512) for 2D image; Kd = (256,256,256) for a 3D image :param Jd: The interpolator size. Example: Jd=(6,6) for 2D image; Jd = (6,6,6) for a 3D image :param ft_axes: The dimensions to be transformed by FFT. Example: ft_axes = (0, 1) for 2D, ft_axes = (0, 1, 2) for 3D; ft_axes = None for all dimensions. :param radix: expert mode. If provided, the shape is Nd. The last axis is the number of parallel coils. :type om: numpy.float array, matrix size = (M, ndims) :type Nd: tuple, ndims integer elements. :type Kd: tuple, ndims integer elements. :type Jd: tuple, ndims integer elements. :type ft_axes: tuple, selected axes to be transformed. :returns: 0 :rtype: int, float :Example: >>> import pynufft >>> device=pynufft.helper.device_list()[0] >>> NufftObj = pynufft.NUFFT(device) >>> NufftObj.plan(om, Nd, Kd, Jd) """ self.ndims = len(Nd) # dimension if ft_axes is None: ft_axes = range(0, self.ndims) self.ft_axes = ft_axes self.st = helper.plan(om, Nd, Kd, Jd, ft_axes=ft_axes, format='pELL', radix=radix) # if batch is None: self.parallel_flag = 0 # else: # self.parallel_flag = 1 # if batch is None: self.batch = numpy.uint32(1) # else: # self.batch = numpy.uint32(batch) self.Nd = self.st['Nd'] # backup self.Kd = self.st['Kd'] # self.sn = numpy.asarray(self.st['sn'].astype(self.dtype), # order='C')# backup # if self.batch == 1: # self.multi_Nd = self.Nd # self.multi_Kd = self.Kd # self.multi_M = (self.st['M'], ) # # Broadcasting the sense and scaling factor (Roll-off) # # self.sense2 = self.sense*numpy.reshape(self.sn, self.Nd + (1, )) # else: # self.batch is 0: # self.multi_Nd = self.Nd + (self.batch, ) # self.multi_Kd = self.Kd + (self.batch, ) # self.multi_M = (self.st['M'], ) + (self.batch, ) self.Kdprod = numpy.uint32(numpy.prod(self.st['Kd'])) self.Jdprod = numpy.uint32(numpy.prod(self.st['Jd'])) self.Ndprod = numpy.uint32(numpy.prod(self.st['Nd'])) self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize( self.st['Nd']) # only return the Kd_elements self.Kd_elements = helper.strides_divide_itemsize(self.st['Kd'])[0] self._offload_device() return 0 @push_cuda_context def _offload_device(self): # API, platform_number=0, device_number=0): """ self.offload(): Off-load NUFFT to the opencl or cuda device(s) :param API: define the device type, which can be 'cuda' or 'ocl' :param platform_number: define which platform to be used. The default platform_number = 0. :param device_number: define which device to be used. The default device_number = 0. :type API: string :type platform_number: int :type device_number: int :return: self: instance """ self.pELL = {} # dictionary self.pELL['nRow'] = numpy.uint32(self.st['pELL'].nRow) self.pELL['prodJd'] = numpy.uint32(self.st['pELL'].prodJd) self.pELL['sumJd'] = numpy.uint32(self.st['pELL'].sumJd) self.pELL['dim'] = numpy.uint32(self.st['pELL'].dim) self.pELL['Jd'] = self.thr.to_device( self.st['pELL'].Jd.astype(numpy.uint32)) self.pELL['meshindex'] = self.thr.to_device( self.st['pELL'].meshindex.astype(numpy.uint32)) self.pELL['kindx'] = self.thr.to_device( self.st['pELL'].kindx.astype(numpy.uint32)) self.pELL['udata'] = self.thr.to_device( self.st['pELL'].udata.astype(self.dtype)) self.volume = {} self.volume['Nd_elements'] = self.thr.to_device( numpy.asarray(self.Nd_elements, dtype=numpy.uint32)) self.volume['Kd_elements'] = self.thr.to_device( numpy.asarray(self.Kd_elements, dtype=numpy.uint32)) self.volume['invNd_elements'] = self.thr.to_device( self.invNd_elements.astype(numpy.float32)) self.volume['Nd'] = self.thr.to_device(numpy.asarray( self.st['Nd'], dtype=numpy.uint32)) # self.volume['gpu_coil_profile'] = self.thr.array( # self.multi_Nd, dtype=self.dtype).fill(1.0) Nd = self.st['Nd'] self.tSN = {} self.tSN['Td_elements'] = self.thr.to_device( numpy.asarray(self.st['tSN'].Td_elements, dtype=numpy.uint32)) self.tSN['invTd_elements'] = self.thr.to_device( self.st['tSN'].invTd_elements.astype(numpy.float32)) self.tSN['Td'] = self.thr.to_device( numpy.asarray(self.st['tSN'].Td, dtype=numpy.uint32)) self.tSN['Tdims'] = self.st['tSN'].Tdims self.tSN['tensor_sn'] = self.thr.to_device( self.st['tSN'].tensor_sn.astype(numpy.float32)) self.Ndprod = numpy.int32(numpy.prod(self.st['Nd'])) self.Kdprod = numpy.int32(numpy.prod(self.st['Kd'])) self.M = numpy.int32(self.st['M']) import reikna.fft self.fft = reikna.fft.FFT( numpy.empty(self.Kd, dtype=self.dtype), self.ft_axes).compile(self.thr, fast_math=False) self.zero_scalar = self.dtype(0.0+0.0j) del self.st['pELL'] if self.verbosity > 0: print('End of offload') @push_cuda_context def _set_wavefront_device(self, wf): # try: self.wavefront = int(wf)#api.DeviceParameters(device).warp_size if self.verbosity > 0: print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront) from ..src import re_subroutine # import create_kernel_sets kernel_sets = re_subroutine.create_kernel_sets(self.API) prg = self.thr.compile(kernel_sets, render_kwds=dict(LL=str(self.wavefront)), fast_math=False) self.prg = prg # @push_cuda_context # def _reset_sense_device_deprecated(self): # self.volume['gpu_coil_profile'].fill(1.0) # # @push_cuda_context # def _set_sense_device_deprecated(self, coil_profile_device): # self.volume['gpu_coil_profile'] = coil_profile_device # if coil_profile.shape == self.Nd + (self.batch, ): @push_cuda_context def to_device(self, x, shape=None): gx = self.thr.to_device(x.copy().astype(self.dtype)) # g_image = self.thr.array(image.shape, dtype=self.dtype) # self.thr.to_device(image.astype(self.dtype), dest=g_image) return gx @push_cuda_context def to_host(self, data): return data.get() # @push_cuda_context # def _s2x_device(self, s): # x = self.thr.array(self.multi_Nd, dtype=self.dtype) # # self.prg.cPopulate( # self.batch, # self.Ndprod, # s, # x, # local_size=None, # global_size=int(self.batch * self.Ndprod)) # # self.prg.cMultiplyVecInplace( # numpy.uint32(1), # self.volume['gpu_coil_profile'], # x, # local_size=None, # global_size=int(self.batch*self.Ndprod)) # # return x @push_cuda_context def _x2xx_device(self, x): xx = self.thr.array(x.shape, dtype=self.dtype) self.thr.copy_array(x, dest=xx, ) self.prg.cTensorMultiply(numpy.uint32(self.batch), numpy.uint32(self.tSN['Tdims']), self.tSN['Td'], self.tSN['Td_elements'], self.tSN['invTd_elements'], self.tSN['tensor_sn'], xx, numpy.uint32(0), local_size=None, global_size=int(self.batch*self.Ndprod)) # self.thr.synchronize() return xx @push_cuda_context def _xx2k_device(self, xx): """ Private: oversampled FFT on the heterogeneous device First, zeroing the self.k_Kd array Second, copy self.x_Nd array to self.k_Kd array by cSelect Third, inplace FFT """ k = self.thr.array(self.Kd, dtype=self.dtype) k.fill(0) self.prg.cTensorCopy( self.batch, numpy.uint32(self.ndims), self.volume['Nd_elements'], self.volume['Kd_elements'], self.volume['invNd_elements'], xx, k, numpy.int32(1), # Directions: Nd -> Kd, 1; Kd -> Nd, -1 local_size=None, global_size=int(self.Ndprod)) self.fft(k, k, inverse=False) # self.thr.synchronize() return k @push_cuda_context def _k2y_device(self, k): """ Private: interpolation by the Sparse Matrix-Vector Multiplication """ y = self.thr.array(self.M, dtype=self.dtype).fill(0) self.prg.pELL_spmv_mCoil( self.batch, self.pELL['nRow'], self.pELL['prodJd'], self.pELL['sumJd'], self.pELL['dim'], self.pELL['Jd'], # self.pELL_currsumJd, self.pELL['meshindex'], self.pELL['kindx'], self.pELL['udata'], k, y, local_size=int(self.wavefront), global_size=int(self.pELL['nRow'] * self.batch * self.wavefront) ) # self.thr.synchronize() return y @push_cuda_context def _y2k_device(self, y): """ Private: gridding by the Sparse Matrix-Vector Multiplication Atomic_twosum together provide better accuracy than generic atomic_add. See: ocl_add and cuda_add code-strings in atomic_add(), inside the re_subroutine.py. """ # kx = self.thr.array(self.multi_Kd, dtype=numpy.float32).fill(0.0) # ky = self.thr.array(self.multi_Kd, dtype=numpy.float32).fill(0.0) k = self.thr.array(self.Kd, dtype=numpy.complex64).fill(0.0) # res = self.thr.array(self.Kd, dtype=numpy.complex64).fill(0.0) # array which saves the residue of two sum self.prg.pELL_spmvh_mCoil( self.batch, self.pELL['nRow'], self.pELL['prodJd'], self.pELL['sumJd'], self.pELL['dim'], self.pELL['Jd'], self.pELL['meshindex'], self.pELL['kindx'], self.pELL['udata'], # kx, ky, k, # res, y, local_size=int(self.wavefront), global_size=int(self.pELL['nRow'] *self.wavefront)#* # int(self.pELL['prodJd']) * int(self.batch)) ) return k# + res @push_cuda_context def _k2xx_device(self, k): """ Private: the inverse FFT and image cropping (which is the reverse of _xx2k() method) """ self.fft(k, k, inverse=True) xx = self.thr.array(self.Nd, dtype=self.dtype) xx.fill(0) self.prg.cTensorCopy( self.batch, numpy.uint32(self.ndims), self.volume['Nd_elements'], self.volume['Kd_elements'], self.volume['invNd_elements'], k, xx, numpy.int32(-1), local_size=None, global_size=int(self.Ndprod)) return xx @push_cuda_context def _xx2x_device(self, xx): x = self.thr.array(xx.shape, dtype=self.dtype) self.thr.copy_array(xx, dest=x, ) self.prg.cTensorMultiply(numpy.uint32(self.batch), numpy.uint32(self.tSN['Tdims']), self.tSN['Td'], self.tSN['Td_elements'], self.tSN['invTd_elements'], self.tSN['tensor_sn'], x, numpy.uint32(0), local_size=None, global_size=int(self.batch * self.Ndprod)) return x # @push_cuda_context # def _x2s_device(self, x): # s = self.thr.array(self.st['Nd'], dtype=self.dtype) # # self.prg.cMultiplyConjVecInplace( # numpy.uint32(1), # self.volume['gpu_coil_profile'], # x, # local_size=None, # global_size=int(self.batch*self.Ndprod)) # # self.prg.cAggregate( # self.batch, # self.Ndprod, # x, # s, # local_size=int(self.wavefront), # global_size=int(self.batch*self.Ndprod*self.wavefront)) # # return s # @push_cuda_context # def _selfadjoint_one2many2one_device_deprecated(self, gx): # """ # selfadjoint_one2many2one NUFFT on the heterogeneous device # # :param gx: The input gpu array, with size=Nd # :type gx: reikna gpu array with dtype =numpy.complex64 # :return: gx: The output gpu array, with size=Nd # :rtype: reikna gpu array with dtype =numpy.complex64 # """ # # gy = self._forward_one2many_device(gx) # gx2 = self._adjoint_many2one_device(gy) # del gy # return gx2 # # @push_cuda_context # def _selfadjoint_one2many2one_legacy_deprecated(self, gx): # """ # selfadjoint_one2many2one NUFFT (Teplitz) on the heterogeneous device # # :param gx: The input gpu array, with size=Nd # :type gx: reikna gpu array with dtype =numpy.complex64 # :return: gx: The output gpu array, with size=Nd # :rtype: reikna gpu array with dtype =numpy.complex64 # """ # # gy = self._forward_one2many_legacy(gx) # gx2 = self._adjoint_many2one_legacy(gy) # del gy # return gx2 @push_cuda_context def _selfadjoint_device(self, gx): """ selfadjoint NUFFT on the heterogeneous device :param gx: The input gpu array, with size=Nd :type gx: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ gy = self._forward_device(gx) gx2 = self._adjoint_device(gy) del gy return gx2 @push_cuda_context def _selfadjoint_legacy(self, gx): """ selfadjoint NUFFT on the heterogeneous device :param gx: The input gpu array, with size=Nd :type gx: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ gy = self._forward_legacy(gx) gx2 = self._adjoint_legacy(gy) del gy return gx2 @push_cuda_context def _forward_device(self, gx): """ Forward NUFFT on the heterogeneous device :param gx: The input gpu array, with size = Nd :type gx: reikna gpu array with dtype = numpy.complex64 :return: gy: The output gpu array, with size = (M,) :rtype: reikna gpu array with dtype = numpy.complex64 """ xx = self._x2xx_device(gx) k = self._xx2k_device(xx) del xx gy = self._k2y_device(k) del k return gy @push_cuda_context def _forward_legacy(self, gx): """ Forward NUFFT on the heterogeneous device :param gx: The input gpu array, with size = Nd :type gx: reikna gpu array with dtype = numpy.complex64 :return: gy: The output gpu array, with size = (M,) :rtype: reikna gpu array with dtype = numpy.complex64 """ xx = self._x2xx_device(gx) k = self._xx2k_device(xx) del xx gy = self._k2y_legacy(k) del k return gy @push_cuda_context def _forward_one2many_device_deprecated(self, s): x = self._s2x_device(s) y = self._forward_device(x) return y @push_cuda_context def _adjoint_many2one_device_deprecated(self, y): x = self._adjoint_device(y) s = self._x2s_device(x) return s @push_cuda_context def _forward_one2many_legacy_deprecated(self, s): x = self._s2x_device(s) y = self._forward_legacy(x) return y @push_cuda_context def _adjoint_many2one_legacy_deprecated(self, y): x = self._adjoint_legacy(y) s = self._x2s_device(x) return s @push_cuda_context def _adjoint_device(self, gy): """ Adjoint NUFFT on the heterogeneous device :param gy: The input gpu array, with size=(M,) :type: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ k = self._y2k_device(gy) xx = self._k2xx_device(k) del k gx = self._xx2x_device(xx) del xx return gx @push_cuda_context def _adjoint_legacy(self, gy): """ Adjoint NUFFT on the heterogeneous device :param gy: The input gpu array, with size=(M,) :type: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ k = self._y2k_legacy(gy) xx = self._k2xx_device(k) del k gx = self._xx2x_device(xx) del xx return gx @push_cuda_context def release(self): try: del self.volume except: pass try: del self.tSN except: pass try: del self.prg except: pass try: del self.pELL except: pass try: del self.csr del self.csrH except: pass try: self.thr.release() except: pass try: del self.thr except: pass @push_cuda_context def _solve_device(self, gy, solver=None, *args, **kwargs): """ The solver of NUFFT :param gy: data, reikna array, (M,) size :param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD' :param maxiter: the number of iterations :type gy: reikna array, dtype = numpy.complex64 :type solver: string :type maxiter: int :return: reikna array with size Nd """ from ..linalg.solve_device import solve return solve(self, gy, solver, *args, **kwargs) @push_cuda_context def _solve_legacy(self, gy, solver=None, *args, **kwargs): """ The solver of NUFFT :param gy: data, reikna array, (M,) size :param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD' :param maxiter: the number of iterations :type gy: reikna array, dtype = numpy.complex64 :type solver: string :type maxiter: int :return: reikna array with size Nd """ from ..linalg.solve_legacy import solve as solve2 return solve2(self, gy, solver, *args, **kwargs) def _plan_legacy(self, om, Nd, Kd, Jd, ft_axes = None): """ Design the min-max interpolator. :param om: The M off-grid locations in the frequency domain. Normalized between [-pi, pi] :param Nd: The matrix size of equispaced image. Example: Nd=(256,256) for a 2D image; Nd = (128,128,128) for a 3D image :param Kd: The matrix size of the oversampled frequency grid. Example: Kd=(512,512) for 2D image; Kd = (256,256,256) for a 3D image :param Jd: The interpolator size. Example: Jd=(6,6) for 2D image; Jd = (6,6,6) for a 3D image :type om: numpy.float array, matrix size = M * ndims :type Nd: tuple, ndims integer elements. :type Kd: tuple, ndims integer elements. :type Jd: tuple, ndims integer elements. :returns: 0 :rtype: int, float :Example: >>> import pynufft >>> NufftObj = pynufft.NUFFT_cpu() >>> NufftObj.plan(om, Nd, Kd, Jd) """ self.ndims = len(Nd) # dimension if ft_axes is None: ft_axes = range(0, self.ndims) self.ft_axes = ft_axes self.st = helper.plan(om, Nd, Kd, Jd, ft_axes=ft_axes, format='CSR') # if batch is None: self.parallel_flag = 0 # else: # self.parallel_flag = 1 # if batch is None: self.batch = numpy.uint32(1) # else: # self.batch = numpy.uint32(batch) self.Nd = self.st['Nd'] # backup self.Kd = self.st['Kd'] # self.sn = numpy.asarray(self.st['sn'].astype(self.dtype), # order='C')# backup # if self.batch == 1: # self.multi_Nd = self.Nd # self.multi_Kd = self.Kd # self.multi_M = (self.st['M'], ) # Broadcasting the sense and scaling factor (Roll-off) # self.sense2 = self.sense*numpy.reshape(self.sn, self.Nd + (1, )) # else: # self.batch is 0: # self.multi_Nd = self.Nd + (self.batch, ) # self.multi_Kd = self.Kd + (self.batch, ) # self.multi_M = (self.st['M'], ) + (self.batch, ) self.Kdprod = numpy.uint32(numpy.prod(self.st['Kd'])) self.Jdprod = numpy.uint32(numpy.prod(self.st['Jd'])) self.Ndprod = numpy.uint32(numpy.prod(self.st['Nd'])) self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize( self.st['Nd']) # only return the Kd_elements self.Kd_elements = helper.strides_divide_itemsize(self.st['Kd'])[0] self.sp = self.st['p'].copy().tocsr() self.spH = (self.st['p'].getH().copy()).tocsr() self._offload_legacy() return 0 @push_cuda_context def _offload_legacy(self): """ self.offload(): Off-load NUFFT to the opencl or cuda device(s) :param API: define the device type, which can be 'cuda' or 'ocl' :param platform_number: define which platform to be used. The default platform_number = 0. :param device_number: define which device to be used. The default device_number = 0. :type API: string :type platform_number: int :type device_number: int :return: self: instance """ # self.pELL = {} # dictionary # # self.pELL['nRow'] = numpy.uint32(self.st['pELL'].nRow) # self.pELL['prodJd'] = numpy.uint32(self.st['pELL'].prodJd) # self.pELL['sumJd'] = numpy.uint32(self.st['pELL'].sumJd) # self.pELL['dim'] = numpy.uint32(self.st['pELL'].dim) # self.pELL['Jd'] = self.thr.to_device( # self.st['pELL'].Jd.astype(numpy.uint32)) # self.pELL['meshindex'] = self.thr.to_device( # self.st['pELL'].meshindex.astype(numpy.uint32)) # self.pELL['kindx'] = self.thr.to_device( # self.st['pELL'].kindx.astype(numpy.uint32)) # self.pELL['udata'] = self.thr.to_device( # self.st['pELL'].udata.astype(self.dtype)) self.volume = {} self.volume['Nd_elements'] = self.thr.to_device( numpy.asarray(self.Nd_elements, dtype=numpy.uint32)) self.volume['Kd_elements'] = self.thr.to_device( numpy.asarray(self.Kd_elements, dtype=numpy.uint32)) self.volume['invNd_elements'] = self.thr.to_device( self.invNd_elements.astype(numpy.float32)) self.volume['Nd'] = self.thr.to_device(numpy.asarray( self.st['Nd'], dtype=numpy.uint32)) # self.volume['gpu_coil_profile'] = self.thr.array( # self.multi_Nd, dtype=self.dtype).fill(1.0) Nd = self.st['Nd'] self.tSN = {} self.tSN['Td_elements'] = self.thr.to_device( numpy.asarray(self.st['tSN'].Td_elements, dtype=numpy.uint32)) self.tSN['invTd_elements'] = self.thr.to_device( self.st['tSN'].invTd_elements.astype(numpy.float32)) self.tSN['Td'] = self.thr.to_device( numpy.asarray(self.st['tSN'].Td, dtype=numpy.uint32)) self.tSN['Tdims'] = self.st['tSN'].Tdims self.tSN['tensor_sn'] = self.thr.to_device( self.st['tSN'].tensor_sn.astype(numpy.float32)) self.Ndprod = numpy.int32(numpy.prod(self.st['Nd'])) self.Kdprod = numpy.int32(numpy.prod(self.st['Kd'])) self.M = numpy.int32(self.st['M']) import reikna.fft self.fft = reikna.fft.FFT( numpy.empty(self.Kd, dtype=self.dtype), self.ft_axes).compile(self.thr, fast_math=False) self.zero_scalar = self.dtype(0.0+0.0j) # del self.st['pELL'] # if self.verbosity > 0: # print('End of offload') # self.SnGPUArray = self.thr.to_device( self.sn) self.csr = {} self.csrH = {} self.csr['data'] = self.thr.to_device( self.sp.data.astype(self.dtype)) self.csr['indices'] = self.thr.to_device( self.sp.indices.astype(numpy.uint32)) self.csr['indptr'] = self.thr.to_device( self.sp.indptr.astype(numpy.uint32)) self.csr['numrow'] = self.M self.csr['numcol'] = self.Kdprod del self.sp self.csrH['data'] = self.thr.to_device( self.spH.data.astype(self.dtype)) self.csrH['indices'] = self.thr.to_device( self.spH.indices.astype(numpy.uint32)) self.csrH['indptr'] = self.thr.to_device( self.spH.indptr.astype(numpy.uint32)) self.csrH['numrow'] = self.Kdprod del self.spH # import reikna.fft # # self.fft = reikna.fft.FFT(numpy.empty(self.st['Kd'], dtype=self.dtype), self.ft_axes).compile(self.thr, fast_math=False) # # self.zero_scalar=self.dtype(0.0+0.0j) @push_cuda_context def _k2y_legacy(self, k): """ Private: interpolation by the Sparse Matrix-Vector Multiplication """ y =self.thr.array( self.st['M'], dtype=self.dtype).fill(0) self.prg.cCSR_spmv_vector( self.batch, self.csr['numrow'], self.csr['indptr'], self.csr['indices'], self.csr['data'], k, y, local_size=int(self.wavefront), global_size=int(self.csr['numrow']*self.wavefront*self.batch) ) # self.thr.synchronize() return y @push_cuda_context def _y2k_legacy(self, y): """ Private: gridding by the Sparse Matrix-Vector Multiplication """ k = self.thr.array(self.Kd, dtype = self.dtype) self.prg.cCSR_spmv_vector( self.batch, self.csrH['numrow'], self.csrH['indptr'], self.csrH['indices'], self.csrH['data'], y, k, local_size=int(self.wavefront), global_size=int(self.csrH['numrow']*self.wavefront) )#,g_times_l=int(csrnumrow)) self.thr.synchronize() return k