# nufft_hsa_legacy.py
"""
NUFFT HSA legacy classes (deprecated)
=======================================
"""
from __future__ import absolute_import
import numpy
import scipy.sparse
import numpy.fft
# import scipy.signal
import scipy.linalg
import scipy.special
from functools import wraps as _wraps
from ..src._helper import helper, helper1
class hypercube:
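    """
    A plain data container for the geometry of a device array:
    shape, steps, invsteps, nelements, batch and dtype.
    """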
def __init__(self, shape, steps, invsteps, nelements, batch, dtype):
self.shape = shape
self.steps = steps
self.invsteps = invsteps
self.nelements = nelements
self.batch = batch
self.dtype = dtype
def push_cuda_context(hsa_method):
"""
    Decorator: push the CUDA context to the top of the stack for current use.
    Add @push_cuda_context before the methods of NUFFT_hsa_legacy().
"""
@_wraps(hsa_method)
def wrapper(*args, **kwargs):
try:
args[0].thr._context.push()
        except:  # ignore when the backend (e.g. OpenCL) has no context stack to push
            pass
return hsa_method(*args, **kwargs)
return wrapper
class NUFFT_hsa_legacy:
"""
    (deprecated) Classical precomputed NUFFT_hsa for heterogeneous systems:
    a naive implementation of the multi-dimensional NUFFT on the GPU.
    It will be removed in a future release because of the large memory
    footprint of the 3D interpolator.

    .. deprecated:: 0.4
        Use :class:`pynufft.NUFFT_hsa` instead.
"""
def __init__(self, API = None, platform_number=None, device_number=None):
"""
Constructor.
:param API: The API for the heterogeneous system. API='cuda' or API='ocl'
:param platform_number: The number of the platform found by the API.
:param device_number: The number of the device found on the platform.
:type API: string
:type platform_number: integer
:type device_number: integer
        :returns: None
        :rtype: None
        :Example:

        >>> import pynufft
        >>> NufftObj = pynufft.NUFFT_hsa_legacy(API='cuda', platform_number=0, device_number=0)
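
        A minimal sketch for probing which backends are available before
        construction (``helper.diagnose()`` is the same probe that the
        constructor calls internally):

        >>> from pynufft.src._helper import helper
        >>> cuda_flag, ocl_flag = helper.diagnose()
        >>> API = 'cuda' if cuda_flag == 1 else 'ocl'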
"""
# pass
self.dtype = numpy.complex64
# NUFFT_cpu.__init__(self)
import reikna.cluda as cluda
print('API = ', API)
self.cuda_flag, self.ocl_flag = helper.diagnose()
        if API is None:
            if self.cuda_flag == 1:
                API = 'cuda'
            elif self.ocl_flag == 1:
                API = 'ocl'
            else:
                print('No accelerator is available.')
        print('now using API = ', API)
if platform_number is None:
platform_number = 0
if device_number is None:
device_number = 0
from reikna import cluda
import reikna.transformations
from reikna.cluda import functions, dtypes
try: # try to create api/platform/device using the given parameters
if 'cuda' == API:
api = cluda.cuda_api()
elif 'ocl' == API:
api = cluda.ocl_api()
platform = api.get_platforms()[platform_number]
device = platform.get_devices()[device_number]
        except:  # if it fails, report and re-raise instead of continuing with undefined names
            print('No accelerator is detected.')
            raise
        # Create the context from the device
        self.thr = api.Thread(device)  # analogous to pyopencl.create_some_context()
        print('Using opencl or cuda = ', self.thr.api)
# print('Using opencl? ', self.thr.api is reikna.cluda.ocl)
# """
# Wavefront: as warp in cuda. Can control the width in a workgroup
# Wavefront is required in spmv_vector as it improves data coalescence.
# see cCSR_spmv and zSparseMatVec
# """
self.wavefront = api.DeviceParameters(device).warp_size
print('wavefront of OpenCL (as warp in CUDA) = ',self.wavefront)
from ..src.re_subroutine import create_kernel_sets
kernel_sets = create_kernel_sets(API)
prg = self.thr.compile(kernel_sets,
render_kwds=dict(LL = str(self.wavefront)),
fast_math=False)
self.prg = prg
print("Note: In the future the api will change!")
print("You have been warned!")
def plan(self, om, Nd, Kd, Jd, ft_axes = None, batch = None):
"""
        Design the min-max interpolator and plan the transform geometry.

        :param om: The M off-grid locations in the frequency domain,
            normalized to [-pi, pi].
        :param Nd: The matrix size of the equispaced image.
            Example: Nd=(256, 256) for a 2D image;
            Nd=(128, 128, 128) for a 3D image.
        :param Kd: The matrix size of the oversampled frequency grid.
            Example: Kd=(512, 512) for a 2D image;
            Kd=(256, 256, 256) for a 3D image.
        :param Jd: The interpolator size.
            Example: Jd=(6, 6) for a 2D image; Jd=(6, 6, 6) for a 3D image.
        :param ft_axes: The axes to be transformed. Default: all axes.
        :param batch: The batch size. Default: None (no batch dimension).
        :type om: numpy.float array, matrix size = M * ndims
        :type Nd: tuple, ndims integer elements
        :type Kd: tuple, ndims integer elements
        :type Jd: tuple, ndims integer elements
        :type ft_axes: tuple of ints, or None
        :type batch: int, or None
        :returns: 0
        :rtype: int
        :Example:

        >>> import pynufft
        >>> NufftObj = pynufft.NUFFT_hsa_legacy()
>>> NufftObj.plan(om, Nd, Kd, Jd)
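
        A minimal sketch of constructing the planning inputs (the ``om``
        below is random, purely for illustration):

        >>> import numpy
        >>> M = 1024
        >>> om = numpy.random.uniform(-numpy.pi, numpy.pi, (M, 2))
        >>> Nd, Kd, Jd = (256, 256), (512, 512), (6, 6)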
"""
# n_shift = tuple(0*x for x in Nd)
self.ndims = len(Nd) # dimension
self.scale_gamma = numpy.prod(Kd)/numpy.prod(Nd)
        if ft_axes is None:
            ft_axes = tuple(range(0, self.ndims))
self.ft_axes = ft_axes
#
self.st = helper.plan(om, Nd, Kd, Jd, ft_axes = ft_axes, format = 'CSR')
self.Nd = self.st['Nd'] # backup
self.Kd = self.st['Kd']
self.sn = numpy.asarray(self.st['sn'].astype(self.dtype) ,order='C')# backup
        if batch is None:
            self.parallel_flag = 0
            self.batch = numpy.uint32(1)
        else:
            self.parallel_flag = 1
            self.batch = numpy.uint32(batch)
        if self.batch == 1 and (self.parallel_flag == 0):
            self.multi_Nd = self.Nd
            self.multi_Kd = self.Kd
            self.multi_M = (self.st['M'], )
        else:  # batched transforms carry a trailing batch dimension
            self.multi_Nd = self.Nd + (self.batch, )
            self.multi_Kd = self.Kd + (self.batch, )
            self.multi_M = (self.st['M'], ) + (self.batch, )
        # Keep CSR copies of the interpolation matrix and of its conjugate transpose
self.sp = self.st['p'].copy().tocsr()
self.spH = (self.st['p'].getH().copy()).tocsr()
self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
self.Jdprod = numpy.int32(numpy.prod(self.st['Jd']))
del self.st['p'], self.st['sn']
# self._precompute_sp()
# del self.st['p0']
self.NdCPUorder, self.KdCPUorder, self.nelem = helper.preindex_copy(self.st['Nd'], self.st['Kd'])
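        # NdCPUorder/KdCPUorder are flat index tables that map every voxel of
        # the Nd image into the oversampled Kd grid; they drive the cSelect
        # copy kernels in xx2k() and k2xx().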
# self.Nd_elements, self.Kd_elements, self.invNd_elements = helper.preindex_copy2(self.st['Nd'], self.st['Kd'])
self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize(self.st['Nd'])
self.Kd_elements = helper.strides_divide_itemsize( self.st['Kd'])[0] # only return the Kd_elements
self.offload()
return 0
@push_cuda_context
def offload(self):
"""
        self.offload():
        Off-load the NUFFT to the OpenCL or CUDA device:
        transfer the scaling array, the index tables and the CSR
        interpolators to device memory, and compile the FFT plan.
        This method takes no arguments; the device was selected in the
        constructor.

        :return: None
"""
self.volume = {}
self.volume['NdGPUorder'] = self.thr.to_device( self.NdCPUorder)
self.volume['KdGPUorder'] = self.thr.to_device( self.KdCPUorder)
self.volume['SnGPUArray'] = self.thr.to_device( self.sn)
# self.NdGPUorder = self.thr.to_device( self.NdCPUorder)
# self.KdGPUorder = self.thr.to_device( self.KdCPUorder)
self.Ndprod = numpy.int32(numpy.prod(self.st['Nd']))
self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
self.M = numpy.int32( self.st['M'])
# self.SnGPUArray = self.thr.to_device( self.sn)
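        # Device-side CSR representations of the interpolation matrix p
        # (self.csr, used by k2y) and of its conjugate transpose p.H
        # (self.csrH, used by y2k).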
self.csr = {}
self.csrH = {}
self.csr['data'] = self.thr.to_device( self.sp.data.astype(self.dtype))
self.csr['indices'] = self.thr.to_device( self.sp.indices.astype(numpy.uint32))
self.csr['indptr'] = self.thr.to_device( self.sp.indptr.astype(numpy.uint32))
self.csr['numrow'] = self.M
self.csr['numcol'] = self.Kdprod
del self.sp
self.csrH['data'] = self.thr.to_device( self.spH.data.astype(self.dtype))
self.csrH['indices'] = self.thr.to_device( self.spH.indices.astype(numpy.uint32))
self.csrH['indptr'] = self.thr.to_device( self.spH.indptr.astype(numpy.uint32))
self.csrH['numrow'] = self.Kdprod
del self.spH
import reikna.fft
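        # Compile an FFT plan over the oversampled Kd grid, restricted to
        # the transform axes selected in plan().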
self.fft = reikna.fft.FFT(numpy.empty(self.st['Kd'], dtype=self.dtype), self.ft_axes).compile(self.thr, fast_math=False)
self.zero_scalar=self.dtype(0.0+0.0j)
@push_cuda_context
def release(self):
del self.volume
del self.csr
del self.csrH
del self.prg
self.thr.release()
del self.thr
@push_cuda_context
def solve(self,gy, solver=None, *args, **kwargs):
"""
        The solver of NUFFT_hsa_legacy.

        :param gy: data, reikna array, (M,) size
        :param solver: can be 'cg', 'L1TVOLS' or 'L1TVLAD'
        :param maxiter: the number of iterations
        :type gy: reikna array, dtype = numpy.complex64
        :type solver: string
        :type maxiter: int
        :return: reikna array with size Nd
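
        A hedged usage sketch (``NufftObj`` is assumed to be planned, and
        ``gy`` to be a device array produced by :meth:`forward`):

        >>> gx = NufftObj.solve(gy, solver='cg', maxiter=30)
        >>> image = gx.get()  # copy the restored image back to the host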
"""
from ..linalg.solve_hsa import solve
self._precompute_sp()
try:
return solve(self, gy, solver, *args, **kwargs)
except:
try:
print('The input array may not be a GPUarray.')
print('Automatically moving the input array to gpu, which is throttled by PCIe.')
print('You have been warned!')
py = self.thr.to_device(numpy.asarray(gy.astype(self.dtype), order = 'C' ))
return solve(self, py, solver, *args, **kwargs)
except:
                if numpy.ndarray == type(gy):
                    print("input gy must be a reikna array with dtype = numpy.complex64")
                else:
                    print("wrong input: gy must be a reikna gpu array; check its shape, dtype and strides")
                raise  # TypeError
@push_cuda_context
def forward(self, gx):
"""
Forward NUFFT on the heterogeneous device
        :param gx: The input gpu array, with the size of Nd or Nd + (batch, )
        :type gx: reikna gpu array with the dtype of numpy.complex64
        :return: gy: The output gpu array, with the size of (M,) or (M, batch)
        :rtype: reikna gpu array with the dtype of numpy.complex64
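
        A hedged usage sketch (``x`` is assumed to be a host-side numpy
        array of shape Nd):

        >>> gx = NufftObj.to_device(x)
        >>> gy = NufftObj.forward(gx)
        >>> y = gy.get()  # non-uniform samples back on the host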
"""
try:
xx = self.x2xx(gx)
except: # gx is not a gpu array
try:
print('The input array may not be a GPUarray.')
print('Automatically moving the input array to gpu, which is throttled by PCIe.')
print('You have been warned!')
px = self.thr.to_device(numpy.asarray(gx.astype(self.dtype), order = 'C' ))
xx = self.x2xx(px)
except:
if gx.shape != self.Nd:
print('shape of the input = ', gx.shape, ', but it should be ', self.Nd)
raise
k = self.xx2k(xx)
del xx
gy = self.k2y(k)
del k
return gy
@push_cuda_context
def adjoint(self, gy):
"""
Adjoint NUFFT on the heterogeneous device
        :param gy: The input gpu array, with size = (M,)
        :type gy: reikna gpu array with dtype = numpy.complex64
        :return: gx: The output gpu array, with size = Nd
        :rtype: reikna gpu array with dtype = numpy.complex64
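
        A hedged usage sketch (``gy`` as produced by :meth:`forward`):

        >>> gx = NufftObj.adjoint(gy)
        >>> x2 = gx.get()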
"""
try:
k = self.y2k(gy)
except: # gx is not a gpu array
try:
print('The input array may not be a GPUarray.')
print('Automatically moving the input array to gpu, which is throttled by PCIe.')
print('You have been warned!')
py = self.thr.to_device(numpy.asarray(gy.astype(self.dtype), order = 'C' ))
k = self.y2k(py)
except:
                print('Failed at self.adjoint! Please check the shape, dtype and strides of gy.')
raise
# k = self.y2k(gy)
xx = self.k2xx(k)
del k
gx = self.xx2x(xx)
del xx
return gx
@push_cuda_context
def selfadjoint2(self, gx):
"""
        The Toeplitz method on the heterogeneous device: a diagonal matrix
        in k-space approximates the convolution. It is an approximation to
        the full selfadjoint(), but in many cases its accuracy is high enough.

        :param gx: The input gpu array, with size = Nd
        :type gx: reikna gpu array with dtype = numpy.complex64
        :return: gx: The output gpu array, with size = Nd
        :rtype: reikna gpu array with dtype = numpy.complex64
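
        A hedged usage sketch (the diagonal kernel self.W must exist, so
        call :meth:`_precompute_sp` once beforehand):

        >>> NufftObj._precompute_sp()
        >>> gx2 = NufftObj.selfadjoint2(gx)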
"""
gk = self.xx2k(gx)
# try:
self.prg.cMultiplyVecInplace(self.batch, self.W, gk, local_size=None, global_size=int(self.Kdprod))
# except:
# self._precompute_sp()
# self.prg.cMultiplyVecInplace(self.W, gk, local_size=None, global_size=int(self.Kdprod))
gx2 = self.k2xx(gk)
del gk
return gx2
@push_cuda_context
def selfadjoint(self, gx):
"""
        The selfadjoint operator, i.e. adjoint(forward(gx)).

        :param gx: The input gpu array, with size = Nd
        :type gx: reikna gpu array with dtype = numpy.complex64
        :return: gx: The output gpu array, with size = Nd
        :rtype: reikna gpu array with dtype = numpy.complex64
"""
gy = self.forward(gx)
gx2 = self.adjoint(gy)
del gy
return gx2
@push_cuda_context
def x2z(self, x):
        z = self.thr.array(self.multi_Nd, dtype=self.dtype)
        self.thr.copy_array(x, z)
return z
@push_cuda_context
def z2xx(self, x):
"""
        Private: scaling on the heterogeneous device.
        Multiply the input image by the scaling array
        self.volume['SnGPUArray'] (the roll-off correction).
        """
        xx = self.thr.array(self.multi_Nd, dtype=self.dtype)
        self.thr.copy_array(x, xx)
self.prg.cMultiplyVecInplace(self.batch, self.volume['SnGPUArray'], xx, local_size=None, global_size=int(self.Ndprod))
# self.thr.synchronize()
return xx
@push_cuda_context
def xx2k(self, xx):
"""
        Private: oversampled FFT on the heterogeneous device.
        First, zero the k-space array;
        second, copy the Nd input array into the Kd array via cSelect;
        third, perform the in-place FFT.
"""
k = self.thr.array(self.multi_Kd, dtype = self.dtype)
k.fill(0)
# self.prg.cMultiplyScalar(self.zero_scalar, k, local_size=None, global_size=int(self.Kdprod))
self.prg.cSelect(self.volume['NdGPUorder'], self.volume['KdGPUorder'], xx, k, local_size=None, global_size=int(self.Ndprod))
self.fft( k, k,inverse=False)
self.thr.synchronize()
return k
@push_cuda_context
def k2y(self, k):
"""
Private: interpolation by the Sparse Matrix-Vector Multiplication
"""
        y = self.thr.array(self.multi_M, dtype=self.dtype).fill(0)
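        # Vectorized SpMV: one wavefront (warp) per CSR row, so local_size is
        # the wavefront width and global_size is numrow * wavefront.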
self.prg.cCSR_spmv_vector(
self.csr['numrow'],
self.csr['indptr'],
self.csr['indices'],
self.csr['data'],
k,
y,
local_size=int(self.wavefront),
global_size=int(self.csr['numrow']*self.wavefront)
)
self.thr.synchronize()
return y
@push_cuda_context
def y2k(self, y):
"""
Private: gridding by the Sparse Matrix-Vector Multiplication
"""
k = self.thr.array(self.multi_Kd, dtype = self.dtype)
self.prg.cCSR_spmv_vector(
self.csrH['numrow'],
self.csrH['indptr'],
self.csrH['indices'],
self.csrH['data'],
y,
k,
local_size=int(self.wavefront),
global_size=int(self.csrH['numrow']*self.wavefront)
        )
self.thr.synchronize()
return k
@push_cuda_context
def k2xx(self, k):
"""
        Private: the inverse FFT and image cropping (the reverse of the xx2k() method).
"""
xx = self.thr.array(self.multi_Nd, dtype = self.dtype)
self.fft( k, k, inverse=True)
self.thr.synchronize()
# self.x_Nd._zero_fill()
# self.prg.cMultiplyScalar(self.zero_scalar, xx, local_size=None, global_size=int(self.Ndprod ))
xx.fill(0)
# self.prg.cSelect(self.queue, (self.Ndprod,), None, self.KdGPUorder.data, self.NdGPUorder.data, self.k_Kd2.data, self.x_Nd.data )
self.prg.cSelect( self.volume['KdGPUorder'], self.volume['NdGPUorder'], k, xx, local_size=None, global_size=int(self.Ndprod ))
return xx
@push_cuda_context
    def x2xx(self, x):
        """Private: copy the image and apply the scaling array (x2z, then z2xx)."""
        z = self.x2z(x)
xx = self.z2xx(z)
del z
return xx
@push_cuda_context
def xx2x(self, xx):
"""
        Private: rescaling with the conjugate scaling array (the adjoint of the x2xx() method).
"""
z = self.xx2z(xx)
x = self.z2x(z)
del z
return x
@push_cuda_context
def xx2z(self, xx):
        z = self.thr.array(self.multi_Nd, dtype=self.dtype)
        self.thr.copy_array(xx, z)
self.prg.cMultiplyConjVecInplace(self.batch, self.volume['SnGPUArray'], z, local_size=None, global_size=int(self.Ndprod))
self.thr.synchronize()
return z
    def z2x(self, z):
        # z2x is a plain device-to-device copy, so it simply reuses x2z()
        return self.x2z(z)
@push_cuda_context
def _precompute_sp(self):
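        # Precompute the diagonal k-space kernel W used by selfadjoint2():
        # push an all-ones vector through the adjoint, transform it back to
        # the oversampled grid, then keep the magnitude sqrt(conj(W) * W).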
        y = self.thr.array(self.multi_M, dtype=self.dtype).fill(1.0)
M = self.adjoint(y)
W = self.xx2k(M)
del M
self.W = self.thr.array( self.multi_Kd, dtype=self.dtype)
        self.thr.copy_array(W, self.W)
self.prg.cMultiplyConjVecInplace(numpy.uint32(1), W, self.W, local_size=None, global_size = int(self.batch * self.Kdprod))
del W
self.prg.cSqrt(self.W, local_size=None, global_size = int( self.Kdprod))
@push_cuda_context
    def to_device(self, image, shape=None):
        """
        Transfer a host numpy array to the device as a complex64 reikna array.
        The ``shape`` argument is unused and kept for backward compatibility.
        """
        g_image = self.thr.array(image.shape, dtype=self.dtype)
        self.thr.to_device(image.astype(self.dtype), dest=g_image)
        return g_image