""" NUFFT HSA legacy classes (deprecated) ======================================= """ from __future__ import absolute_import import numpy import scipy.sparse import numpy.fft import scipy.signal import scipy.linalg import scipy.special from functools import wraps as _wraps from ..src._helper import helper, helper1 class hypercube: def __init__(self, shape, steps, invsteps, nelements, batch, dtype): self.shape = shape self.steps = steps self.invsteps = invsteps self.nelements = nelements self.batch = batch self.dtype = dtype def push_cuda_context(hsa_method): """ Decorator: Push cude context to the top of the stack for current use Add @push_cuda_context before the methods of NUFFT_hsa() """ @_wraps(hsa_method) def wrapper(*args, **kwargs): try: args[0].thr._context.push() except: pass return hsa_method(*args, **kwargs) return wrapper class NUFFT_hsa_legacy: """ (deprecated) Classical precomputed NUFFT_hsa for heterogeneous systems. Naive implementation of multi-dimensional NUFFT on GPU. Will be removed in future releases due to large memory size of 3D interpolator. .. deprecated:: 0.4. Use :class:`pynufft.NUFFT_hsa` instead. """ def __init__(self, API = None, platform_number=None, device_number=None): """ Constructor. :param API: The API for the heterogeneous system. API='cuda' or API='ocl' :param platform_number: The number of the platform found by the API. :param device_number: The number of the device found on the platform. :type API: string :type platform_number: integer :type device_number: integer :returns: 0 :rtype: int, float :Example: >>> import pynufft >>> NufftObj = pynufft.NUFFT_hsa(API='cuda', 0, 0) """ # pass self.dtype = numpy.complex64 # NUFFT_cpu.__init__(self) import reikna.cluda as cluda print('API = ', API) self.cuda_flag, self.ocl_flag = helper.diagnose() if None is API: if self.cuda_flag is 1: API = 'cuda' elif self.ocl_flag is 1: API = 'ocl' else: print('No accelerator is available.') else: api = API print('now using API = ', API) if platform_number is None: platform_number = 0 if device_number is None: device_number = 0 from reikna import cluda import reikna.transformations from reikna.cluda import functions, dtypes try: # try to create api/platform/device using the given parameters if 'cuda' == API: api = cluda.cuda_api() elif 'ocl' == API: api = cluda.ocl_api() platform = api.get_platforms()[platform_number] device = platform.get_devices()[device_number] except: # if failed, find out what's going wrong? print('No accelerator is detected.') # return 1 # Create context from device self.thr = api.Thread(device) #pyopencl.create_some_context() print('Using opencl or cuda = ', self.thr.api) # print('Using opencl? ', self.thr.api is reikna.cluda.ocl) # """ # Wavefront: as warp in cuda. Can control the width in a workgroup # Wavefront is required in spmv_vector as it improves data coalescence. # see cCSR_spmv and zSparseMatVec # """ self.wavefront = api.DeviceParameters(device).warp_size print('wavefront of OpenCL (as warp in CUDA) = ',self.wavefront) from ..src.re_subroutine import create_kernel_sets kernel_sets = create_kernel_sets(API) prg = self.thr.compile(kernel_sets, render_kwds=dict(LL = str(self.wavefront)), fast_math=False) self.prg = prg print("Note: In the future the api will change!") print("You have been warned!") def plan(self, om, Nd, Kd, Jd, ft_axes = None, batch = None): """ Design the min-max interpolator. :param om: The M off-grid locations in the frequency domain. Normalized between [-pi, pi] :param Nd: The matrix size of equispaced image. Example: Nd=(256,256) for a 2D image; Nd = (128,128,128) for a 3D image :param Kd: The matrix size of the oversampled frequency grid. Example: Kd=(512,512) for 2D image; Kd = (256,256,256) for a 3D image :param Jd: The interpolator size. Example: Jd=(6,6) for 2D image; Jd = (6,6,6) for a 3D image :type om: numpy.float array, matrix size = M * ndims :type Nd: tuple, ndims integer elements. :type Kd: tuple, ndims integer elements. :type Jd: tuple, ndims integer elements. :returns: 0 :rtype: int, float :Example: >>> import pynufft >>> NufftObj = pynufft.NUFFT_cpu() >>> NufftObj.plan(om, Nd, Kd, Jd) """ # n_shift = tuple(0*x for x in Nd) self.ndims = len(Nd) # dimension self.scale_gamma = numpy.prod(Kd)/numpy.prod(Nd) if ft_axes is None: ft_axes = range(0, self.ndims) self.ft_axes = ft_axes # self.st = helper.plan(om, Nd, Kd, Jd, ft_axes = ft_axes, format = 'CSR') self.Nd = self.st['Nd'] # backup self.Kd = self.st['Kd'] self.sn = numpy.asarray(self.st['sn'].astype(self.dtype) ,order='C')# backup if batch is None: self.parallel_flag = 0 else: self.parallel_flag = 1 if batch is None: self.batch = numpy.uint32(1) # self.nobatch = 1 else: self.batch = numpy.uint32(batch) self.Nd = self.st['Nd'] # backup self.Kd = self.st['Kd'] self.sn = numpy.asarray(self.st['sn'].astype(self.dtype) ,order='C')# backup if self.batch == 1 and (self.parallel_flag == 0): self.multi_Nd = self.Nd self.multi_Kd = self.Kd self.multi_M = (self.st['M'], ) # self.sense2 = self.sense*numpy.reshape(self.sn, self.Nd + (1, )) # broadcasting the sense and scaling factor (Roll-off) else: #self.batch is 0: self.multi_Nd = self.Nd + (self.batch, ) self.multi_Kd = self.Kd + (self.batch, ) self.multi_M = (self.st['M'], )+ (self.batch, ) # Calculate the density compensation function self.sp = self.st['p'].copy().tocsr() self.spH = (self.st['p'].getH().copy()).tocsr() self.Kdprod = numpy.int32(numpy.prod(self.st['Kd'])) self.Jdprod = numpy.int32(numpy.prod(self.st['Jd'])) del self.st['p'], self.st['sn'] # self._precompute_sp() # del self.st['p0'] self.NdCPUorder, self.KdCPUorder, self.nelem = helper.preindex_copy(self.st['Nd'], self.st['Kd']) # self.Nd_elements, self.Kd_elements, self.invNd_elements = helper.preindex_copy2(self.st['Nd'], self.st['Kd']) self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize(self.st['Nd']) self.Kd_elements = helper.strides_divide_itemsize( self.st['Kd'])[0] # only return the Kd_elements self.offload() return 0 @push_cuda_context def offload(self): """ self.offload(): Off-load NUFFT to the opencl or cuda device(s) :param API: define the device type, which can be 'cuda' or 'ocl' :param platform_number: define which platform to be used. The default platform_number = 0. :param device_number: define which device to be used. The default device_number = 0. :type API: string :type platform_number: int :type device_number: int :return: self: instance """ self.volume = {} self.volume['NdGPUorder'] = self.thr.to_device( self.NdCPUorder) self.volume['KdGPUorder'] = self.thr.to_device( self.KdCPUorder) self.volume['SnGPUArray'] = self.thr.to_device( self.sn) # self.NdGPUorder = self.thr.to_device( self.NdCPUorder) # self.KdGPUorder = self.thr.to_device( self.KdCPUorder) self.Ndprod = numpy.int32(numpy.prod(self.st['Nd'])) self.Kdprod = numpy.int32(numpy.prod(self.st['Kd'])) self.M = numpy.int32( self.st['M']) # self.SnGPUArray = self.thr.to_device( self.sn) self.csr = {} self.csrH = {} self.csr['data'] = self.thr.to_device( self.sp.data.astype(self.dtype)) self.csr['indices'] = self.thr.to_device( self.sp.indices.astype(numpy.uint32)) self.csr['indptr'] = self.thr.to_device( self.sp.indptr.astype(numpy.uint32)) self.csr['numrow'] = self.M self.csr['numcol'] = self.Kdprod del self.sp self.csrH['data'] = self.thr.to_device( self.spH.data.astype(self.dtype)) self.csrH['indices'] = self.thr.to_device( self.spH.indices.astype(numpy.uint32)) self.csrH['indptr'] = self.thr.to_device( self.spH.indptr.astype(numpy.uint32)) self.csrH['numrow'] = self.Kdprod del self.spH import reikna.fft self.fft = reikna.fft.FFT(numpy.empty(self.st['Kd'], dtype=self.dtype), self.ft_axes).compile(self.thr, fast_math=False) self.zero_scalar=self.dtype(0.0+0.0j) @push_cuda_context def release(self): del self.volume del self.csr del self.csrH del self.prg self.thr.release() del self.thr @push_cuda_context def solve(self,gy, solver=None, *args, **kwargs): """ The solver of NUFFT_hsa :param gy: data, reikna array, (M,) size :param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD' :param maxiter: the number of iterations :type gy: reikna array, dtype = numpy.complex64 :type solver: string :type maxiter: int :return: reikna array with size Nd """ from ..linalg.solve_hsa import solve self._precompute_sp() try: return solve(self, gy, solver, *args, **kwargs) except: try: print('The input array may not be a GPUarray.') print('Automatically moving the input array to gpu, which is throttled by PCIe.') print('You have been warned!') py = self.thr.to_device(numpy.asarray(gy.astype(self.dtype), order = 'C' )) return solve(self, py, solver, *args, **kwargs) except: if numpy.ndarray==type(gy): print("input gy must be a reikna array with dtype = numpy.complex64") raise #TypeError else: print("wrong") raise #TypeError @push_cuda_context def forward(self, gx): """ Forward NUFFT on the heterogeneous device :param gx: The input gpu array, with the size of Nd or Nd + (batch, ) :type: reikna gpu array with the dtype of numpy.complex64 :return: gy: The output gpu array, with size of (M,) or (M, batch) :rtype: reikna gpu array with the dtype of numpy.complex64 """ try: xx = self.x2xx(gx) except: # gx is not a gpu array try: print('The input array may not be a GPUarray.') print('Automatically moving the input array to gpu, which is throttled by PCIe.') print('You have been warned!') px = self.thr.to_device(numpy.asarray(gx.astype(self.dtype), order = 'C' )) xx = self.x2xx(px) except: if gx.shape != self.Nd: print('shape of the input = ', gx.shape, ', but it should be ', self.Nd) raise k = self.xx2k(xx) del xx gy = self.k2y(k) del k return gy @push_cuda_context def adjoint(self, gy): """ Adjoint NUFFT on the heterogeneous device :param gy: The input gpu array, with size=(M,) :type: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ try: k = self.y2k(gy) except: # gx is not a gpu array try: print('The input array may not be a GPUarray.') print('Automatically moving the input array to gpu, which is throttled by PCIe.') print('You have been warned!') py = self.thr.to_device(numpy.asarray(gy.astype(self.dtype), order = 'C' )) k = self.y2k(py) except: print('Failed at self.adjont! Please check the gy shape, type, stride.') raise # k = self.y2k(gy) xx = self.k2xx(k) del k gx = self.xx2x(xx) del xx return gx @push_cuda_context def selfadjoint2(self, gx): """ The Toeplitz on the heterogeneous device using the diaognal matrix to compute the convolution. It is an approximation to the fully selfadjoint but in some cases the accuracy is high enough. :param gx: The input gpu array, with size=Nd :type: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ gk = self.xx2k(gx) # try: self.prg.cMultiplyVecInplace(self.batch, self.W, gk, local_size=None, global_size=int(self.Kdprod)) # except: # self._precompute_sp() # self.prg.cMultiplyVecInplace(self.W, gk, local_size=None, global_size=int(self.Kdprod)) gx2 = self.k2xx(gk) del gk return gx2 @push_cuda_context def selfadjoint(self, gx): """ selfadjoint. :param gx: The input gpu array, with size=Nd :type: reikna gpu array with dtype =numpy.complex64 :return: gx: The output gpu array, with size=Nd :rtype: reikna gpu array with dtype =numpy.complex64 """ gy = self.forward(gx) gx2 = self.adjoint(gy) del gy return gx2 @push_cuda_context def x2z(self, x): z = self.thr.array(self.multi_Nd, dtype=self.dtype) self.thr.copy_array(x, z, )#size = int(x.nbytes/x.dtype.itemsize))#src_offset, dest_offset, size) return z @push_cuda_context def z2xx(self, x): """ Private: Scaling on the heterogeneous device Inplace multiplication of self.x_Nd by the scaling factor self.SnGPUArray. """ xx = self.thr.array(self.multi_Nd, dtype=self.dtype) self.thr.copy_array(x, xx, )#size = int(x.nbytes/x.dtype.itemsize))#src_offset, dest_offset, size) self.prg.cMultiplyVecInplace(self.batch, self.volume['SnGPUArray'], xx, local_size=None, global_size=int(self.Ndprod)) # self.thr.synchronize() return xx @push_cuda_context def xx2k(self, xx): """ Private: oversampled FFT on the heterogeneous device Firstly, zeroing the self.k_Kd array Second, copy self.x_Nd array to self.k_Kd array by cSelect Third: inplace FFT """ k = self.thr.array(self.multi_Kd, dtype = self.dtype) k.fill(0) # self.prg.cMultiplyScalar(self.zero_scalar, k, local_size=None, global_size=int(self.Kdprod)) self.prg.cSelect(self.volume['NdGPUorder'], self.volume['KdGPUorder'], xx, k, local_size=None, global_size=int(self.Ndprod)) self.fft( k, k,inverse=False) self.thr.synchronize() return k @push_cuda_context def k2y(self, k): """ Private: interpolation by the Sparse Matrix-Vector Multiplication """ y =self.thr.array( self.multi_M, dtype=self.dtype).fill(0) self.prg.cCSR_spmv_vector( self.csr['numrow'], self.csr['indptr'], self.csr['indices'], self.csr['data'], k, y, local_size=int(self.wavefront), global_size=int(self.csr['numrow']*self.wavefront) ) self.thr.synchronize() return y @push_cuda_context def y2k(self, y): """ Private: gridding by the Sparse Matrix-Vector Multiplication """ k = self.thr.array(self.multi_Kd, dtype = self.dtype) self.prg.cCSR_spmv_vector( self.csrH['numrow'], self.csrH['indptr'], self.csrH['indices'], self.csrH['data'], y, k, local_size=int(self.wavefront), global_size=int(self.csrH['numrow']*self.wavefront) )#,g_times_l=int(csrnumrow)) self.thr.synchronize() return k @push_cuda_context def k2xx(self, k): """ Private: the inverse FFT and image cropping (which is the reverse of _xx2k() method) """ xx = self.thr.array(self.multi_Nd, dtype = self.dtype) self.fft( k, k, inverse=True) self.thr.synchronize() # self.x_Nd._zero_fill() # self.prg.cMultiplyScalar(self.zero_scalar, xx, local_size=None, global_size=int(self.Ndprod )) xx.fill(0) # self.prg.cSelect(self.queue, (self.Ndprod,), None, self.KdGPUorder.data, self.NdGPUorder.data, self.k_Kd2.data, self.x_Nd.data ) self.prg.cSelect( self.volume['KdGPUorder'], self.volume['NdGPUorder'], k, xx, local_size=None, global_size=int(self.Ndprod )) return xx @push_cuda_context def x2xx(self, x): z = self.x2z(x) xx = self.z2xx(z) del z return xx @push_cuda_context def xx2x(self, xx): """ Private: rescaling, which is identical to the _x2xx() method """ z = self.xx2z(xx) x = self.z2x(z) del z return x @push_cuda_context def xx2z(self, xx): z = self.thr.array(self.multi_Nd, dtype=self.dtype) self.thr.copy_array(xx, z, )#size = int(x.nbytes/x.dtype.itemsize))#src_offset, dest_offset, size) self.prg.cMultiplyConjVecInplace(self.batch, self.volume['SnGPUArray'], z, local_size=None, global_size=int(self.Ndprod)) self.thr.synchronize() return z def z2x(self, z): return self.x2z(z) @push_cuda_context def _precompute_sp(self): y =self.thr.array( self.multi_M, dtype=self.dtype).fill(1.0) M = self.adjoint(y) W = self.xx2k(M) del M self.W = self.thr.array( self.multi_Kd, dtype=self.dtype) self.thr.copy_array(W, self.W, ) self.prg.cMultiplyConjVecInplace(numpy.uint32(1), W, self.W, local_size=None, global_size = int(self.batch * self.Kdprod)) del W self.prg.cSqrt(self.W, local_size=None, global_size = int( self.Kdprod)) @push_cuda_context def to_device(self, image, shape = None): g_image = self.thr.array(image.shape, dtype = self.dtype) self.thr.to_device(image.astype(self.dtype), dest = g_image) return g_image