nufft_hsa.py
"""
NUFFT HSA classes (deprecated)
=======================================
"""
from __future__ import absolute_import
import numpy
import warnings
import scipy.sparse
import numpy.fft
#import scipy.signal
import scipy.linalg
import scipy.special
from functools import wraps as _wraps
from ..src._helper import helper, helper1
class hypercube:
def __init__(self, shape, steps, invsteps, nelements, batch, dtype):
self.shape = shape
self.steps = steps
self.invsteps = invsteps
self.nelements = nelements
self.batch = batch
self.dtype = dtype
def push_cuda_context(hsa_method):
"""
Decorator to push up CUDA context to the top of the stack for current use
Add @push_cuda_context before the methods of NUFFT_hsa()
"""
@_wraps(hsa_method)
def wrapper(*args, **kwargs):
try:
args[0].thr._context.push()
except:
pass
return hsa_method(*args, **kwargs)
return wrapper
class NUFFT_hsa:
"""
NUFFT_hsa class.
Multi-coil or single-coil memory reduced NUFFT.
"""
def __init__(self, API=None, platform_number=None, device_number=None,
verbosity=0):
"""
Constructor.
:param API: The API for the heterogeneous system. API='cuda'
or API='ocl'
:param platform_number: The number of the platform found by the API.
:param device_number: The number of the device found on the platform.
:param verbosity: Defines the verbosity level, default value is 0
:type API: string
:type platform_number: integer
:type device_number: integer
:type verbosity: integer
:returns: 0
:rtype: int, float
:Example:
>>> from pynufft import NUFFT_hsa
>>> NufftObj = NUFFT_hsa(API='cuda', platform_number=0,
device_number=0, verbosity=0)
"""
warnings.warn('In the future NUFFT_hsa and NUFFT_cpu api will'
' be merged', FutureWarning)
self.dtype = numpy.complex64
self.verbosity = verbosity
import reikna.cluda as cluda
if self.verbosity > 0:
print('The choosen API by the user is ', API)
self.cuda_flag, self.ocl_flag = helper.diagnose(
verbosity=self.verbosity)
if None is API:
if self.cuda_flag is 1:
API = 'cuda'
elif self.ocl_flag is 1:
API = 'ocl'
else:
warnings.warn('No parallelization will be made since no GPU '
'device has been detected.', UserWarning)
else:
api = API
if self.verbosity > 0:
print('The used API will be ', API)
if platform_number is None:
platform_number = 0
if device_number is None:
device_number = 0
from reikna import cluda
import reikna.transformations
from reikna.cluda import functions, dtypes
try: # try to create api/platform/device using the given parameters
if 'cuda' == API:
api = cluda.cuda_api()
elif 'ocl' == API:
api = cluda.ocl_api()
platform = api.get_platforms()[platform_number]
device = platform.get_devices()[device_number]
except: # if failed, find out what's going wrong?
warnings.warn('No parallelization will be made since no GPU '
'device has been detected.', UserWarning)
# return 1
# Create context from device
self.thr = api.Thread(device) # pyopencl.create_some_context()
self.device = device # : device name
if self.verbosity > 0:
print('Using opencl or cuda = ', self.thr.api)
# """
# Wavefront: as warp in cuda. Can control the width in a workgroup
# Wavefront is required in spmv_vector as it improves data coalescence.
# see cCSR_spmv and zSparseMatVec
# """
self.wavefront = api.DeviceParameters(device).warp_size
if self.verbosity > 0:
print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront)
from ..src import re_subroutine # import create_kernel_sets
kernel_sets = re_subroutine.create_kernel_sets(API)
prg = self.thr.compile(kernel_sets,
render_kwds=dict(LL=str(self.wavefront)),
fast_math=False)
self.prg = prg
def set_wavefront(self, wf):
self.wavefront = int(wt)#api.DeviceParameters(device).warp_size
if self.verbosity > 0:
print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront)
from ..src import re_subroutine # import create_kernel_sets
kernel_sets = re_subroutine.create_kernel_sets(API)
prg = self.thr.compile(kernel_sets,
render_kwds=dict(LL=str(self.wavefront)),
fast_math=False)
self.prg = prg
def plan(self, om, Nd, Kd, Jd, ft_axes=None, batch=None, radix=None):
"""
Design the multi-coil or single-coil memory reduced interpolator.
:param om: The M off-grid locations in the frequency domain.
Normalized between [-pi, pi]
:param Nd: The matrix size of equispaced image.
Example: Nd=(256, 256) for a 2D image;
Nd = (128, 128, 128) for a 3D image
:param Kd: The matrix size of the oversampled frequency grid.
Example: Kd=(512,512) for 2D image;
Kd = (256,256,256) for a 3D image
:param Jd: The interpolator size.
Example: Jd=(6,6) for 2D image;
Jd = (6,6,6) for a 3D image
:param ft_axes: The dimensions to be transformed by FFT.
Example: ft_axes = (0, 1) for 2D,
ft_axes = (0, 1, 2) for 3D;
ft_axes = None for all dimensions.
:param batch: Batch NUFFT.
If provided, the shape is Nd + (batch, ).
The last axis is the number of parallel coils.
batch = None for single coil.
:param radix: ????.
If provided, the shape is Nd + (batch, ).
The last axis is the number of parallel coils.
batch = None for single coil.
:type om: numpy.float array, matrix size = (M, ndims)
:type Nd: tuple, ndims integer elements.
:type Kd: tuple, ndims integer elements.
:type Jd: tuple, ndims integer elements.
:type ft_axes: tuple, selected axes to be transformed.
:type batch: int or None
:returns: 0
:rtype: int, float
:Example:
>>> from pynufft import NUFFT_hsa
>>> NufftObj = NUFFT_hsa()
>>> NufftObj.plan(om, Nd, Kd, Jd)
"""
self.ndims = len(Nd) # dimension
if ft_axes is None:
ft_axes = range(0, self.ndims)
self.ft_axes = ft_axes
self.st = helper.plan(om, Nd, Kd, Jd, ft_axes=ft_axes,
format='pELL', radix=radix)
if batch is None:
self.parallel_flag = 0
else:
self.parallel_flag = 1
if batch is None:
self.batch = numpy.uint32(1)
else:
self.batch = numpy.uint32(batch)
self.Nd = self.st['Nd'] # backup
self.Kd = self.st['Kd']
# self.sn = numpy.asarray(self.st['sn'].astype(self.dtype),
# order='C')# backup
if self.batch == 1 and (self.parallel_flag == 0):
self.multi_Nd = self.Nd
self.multi_Kd = self.Kd
self.multi_M = (self.st['M'], )
# Broadcasting the sense and scaling factor (Roll-off)
# self.sense2 = self.sense*numpy.reshape(self.sn, self.Nd + (1, ))
else: # self.batch is 0:
self.multi_Nd = self.Nd + (self.batch, )
self.multi_Kd = self.Kd + (self.batch, )
self.multi_M = (self.st['M'], ) + (self.batch, )
self.invbatch = 1.0 / self.batch
self.Kdprod = numpy.uint32(numpy.prod(self.st['Kd']))
self.Jdprod = numpy.uint32(numpy.prod(self.st['Jd']))
self.Ndprod = numpy.uint32(numpy.prod(self.st['Nd']))
self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize(
self.st['Nd'])
# only return the Kd_elements
self.Kd_elements = helper.strides_divide_itemsize(self.st['Kd'])[0]
self.NdCPUorder, self.KdCPUorder, self.nelem = helper.preindex_copy(
self.st['Nd'],
self.st['Kd'])
self.offload()
return 0
@push_cuda_context
def offload(self): # API, platform_number=0, device_number=0):
"""
self.offload():
Off-load NUFFT to the opencl or cuda device(s)
:param API: define the device type, which can be 'cuda' or 'ocl'
:param platform_number: define which platform to be used.
The default platform_number = 0.
:param device_number: define which device to be used.
The default device_number = 0.
:type API: string
:type platform_number: int
:type device_number: int
:return: self: instance
"""
self.pELL = {} # dictionary
self.pELL['nRow'] = numpy.uint32(self.st['pELL'].nRow)
self.pELL['prodJd'] = numpy.uint32(self.st['pELL'].prodJd)
self.pELL['sumJd'] = numpy.uint32(self.st['pELL'].sumJd)
self.pELL['dim'] = numpy.uint32(self.st['pELL'].dim)
self.pELL['Jd'] = self.thr.to_device(
self.st['pELL'].Jd.astype(numpy.uint32))
self.pELL['meshindex'] = self.thr.to_device(
self.st['pELL'].meshindex.astype(numpy.uint32))
self.pELL['kindx'] = self.thr.to_device(
self.st['pELL'].kindx.astype(numpy.uint32))
self.pELL['udata'] = self.thr.to_device(
self.st['pELL'].udata.astype(self.dtype))
self.volume = {}
self.volume['Nd_elements'] = self.thr.to_device(
numpy.asarray(self.Nd_elements, dtype=numpy.uint32))
self.volume['Kd_elements'] = self.thr.to_device(
numpy.asarray(self.Kd_elements, dtype=numpy.uint32))
self.volume['invNd_elements'] = self.thr.to_device(
self.invNd_elements.astype(numpy.float32))
self.volume['Nd'] = self.thr.to_device(numpy.asarray(
self.st['Nd'], dtype=numpy.uint32))
self.volume['NdGPUorder'] = self.thr.to_device(self.NdCPUorder)
self.volume['KdGPUorder'] = self.thr.to_device(self.KdCPUorder)
self.volume['gpu_coil_profile'] = self.thr.array(
self.multi_Nd, dtype=self.dtype).fill(1.0)
Nd = self.st['Nd']
# tensor_sn = numpy.empty((numpy.sum(Nd), ), dtype=numpy.float32)
#
# shift = 0
# for dimid in range(0, len(Nd)):
#
# tensor_sn[shift :shift + Nd[dimid]] = \
# self.st['tensor_sn'][dimid][:, 0].real
# shift = shift + Nd[dimid]
# self.volume['tensor_sn'] = self.thr.to_device(
# self.st['tensor_sn'].astype(numpy.float32))
self.tSN = {}
self.tSN['Td_elements'] = self.thr.to_device(
numpy.asarray(self.st['tSN'].Td_elements, dtype=numpy.uint32))
self.tSN['invTd_elements'] = self.thr.to_device(
self.st['tSN'].invTd_elements.astype(numpy.float32))
self.tSN['Td'] = self.thr.to_device(
numpy.asarray(self.st['tSN'].Td, dtype=numpy.uint32))
self.tSN['Tdims'] = self.st['tSN'].Tdims
self.tSN['tensor_sn'] = self.thr.to_device(
self.st['tSN'].tensor_sn.astype(numpy.float32))
self.Ndprod = numpy.int32(numpy.prod(self.st['Nd']))
self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
self.M = numpy.int32(self.st['M'])
import reikna.fft
if self.batch > 1: # batch mode
self.fft = reikna.fft.FFT(
numpy.empty(self.st['Kd']+(self.batch, ), dtype=self.dtype),
self.ft_axes).compile(self.thr, fast_math=False)
else: # elf.Reps == 1 Batch mode is wrong for
self.fft = reikna.fft.FFT(
numpy.empty(self.st['Kd'], dtype=self.dtype),
self.ft_axes).compile(self.thr, fast_math=False)
self.zero_scalar = self.dtype(0.0+0.0j)
del self.st['pELL']
if self.verbosity > 0:
print('End of offload')
@push_cuda_context
def reset_sense(self):
self.volume['gpu_coil_profile'].fill(1.0)
@push_cuda_context
def set_sense(self, coil_profile):
if coil_profile.shape != self.multi_Nd:
print('The shape of coil_profile is ', coil_profile.shape)
print('But it should be', self.Nd + (self.batch, ))
raise ValueError
else:
self.volume['gpu_coil_profile'] = self.thr.to_device(
coil_profile.astype(self.dtype))
if self.verbosity > 0:
print('Successfully loading coil sensitivities!')
# if coil_profile.shape == self.Nd + (self.batch, ):
@push_cuda_context
def to_device(self, image, shape=None):
g_image = self.thr.array(image.shape, dtype=self.dtype)
self.thr.to_device(image.astype(self.dtype), dest=g_image)
return g_image
@push_cuda_context
def s2x(self, s):
x = self.thr.array(self.multi_Nd, dtype=self.dtype)
# print("Now populate the array to multi-coil")
self.prg.cPopulate(
self.batch,
self.Ndprod,
s,
x,
local_size=None,
global_size=int(self.batch * self.Ndprod))
# x2 = x * self.volume['gpu_coil_profile']
# try:
# x2 = x * self.volume['gpu_coil_profile']
# except:
# x2 = x
self.prg.cMultiplyVecInplace(
numpy.uint32(1),
self.volume['gpu_coil_profile'],
x,
local_size=None,
global_size=int(self.batch*self.Ndprod))
# self.prg.cDistribute(
# self.batch,
# self.Ndprod,
# self.volume['gpu_coil_profile'],
# s,
# x,
# local_size=None,
# global_size=int(self.batch*self.Ndprod))
return x
@push_cuda_context
def x2xx(self, x):
# xx = self.thr.array(xx.shape, dtype = self.dtype)
# self.thr.copy_array(z, dest=xx, )
# size = int(xx.nbytes/xx.dtype.itemsize))
# Hack of error in cuda backends; 8 is the byte of numpy.complex64
# size = int(xx.nbytes/8)
xx = self.thr.array(x.shape, dtype=self.dtype)
self.thr.copy_array(x, dest=xx, )
# size = int(xx.nbytes/xx.dtype.itemsize))
# Hack of error in cuda backends; 8 is the byte of numpy.complex64:
# size = int(xx.nbytes/8)
# self.prg.cMultiplyRealInplace(
# self.batch,
# self.volume['SnGPUArray'],
# xx,
# local_size=None,
# global_size=int(self.Ndprod*self.batch))
# self.prg.cTensorMultiply(numpy.uint32(self.batch),
# numpy.uint32(self.ndims),
# self.volume['Nd'],
# self.volume['Nd_elements'],
# self.volume['invNd_elements'],
# self.volume['tensor_sn'],
# xx,
# numpy.uint32(0),
# local_size=None,
# global_size=int(self.batch*self.Ndprod))
self.prg.cTensorMultiply(numpy.uint32(self.batch),
numpy.uint32(self.tSN['Tdims']),
self.tSN['Td'],
self.tSN['Td_elements'],
self.tSN['invTd_elements'],
self.tSN['tensor_sn'],
xx,
numpy.uint32(0),
local_size=None,
global_size=int(self.batch*self.Ndprod))
# self.thr.synchronize()
return xx
@push_cuda_context
def xx2k(self, xx):
"""
Private: oversampled FFT on the heterogeneous device
First, zeroing the self.k_Kd array
Second, copy self.x_Nd array to self.k_Kd array by cSelect
Third, inplace FFT
"""
k = self.thr.array(self.multi_Kd, dtype=self.dtype)
# k = self.thr.array(self.multi_Kd, dtype=self.dtype).fill(0.0 + 0.0j)
k.fill(0)
# self.prg.cMultiplyScalar(self.zero_scalar,
# k,
# local_size=None,
# global_size=int(self.Kdprod))
# # self.prg.cSelect(self.NdGPUorder,
# self.KdGPUorder,
# xx,
# k,
# local_size=None,
# global_size=int(self.Ndprod))
# self.prg.cSelect2(self.batch,
# self.volume['NdGPUorder'],
# self.volume['KdGPUorder'],
# xx,
# k,
# local_size=None,
# global_size=int(self.Ndprod*self.batch))
self.prg.cTensorCopy(
self.batch,
numpy.uint32(self.ndims),
self.volume['Nd_elements'],
self.volume['Kd_elements'],
self.volume['invNd_elements'],
xx,
k,
numpy.int32(1), # Directions: Nd -> Kd, 1; Kd -> Nd, -1
local_size=None,
global_size=int(self.Ndprod))
self.fft(k, k, inverse=False)
# self.thr.synchronize()
return k
@push_cuda_context
def k2y(self, k):
"""
Private: interpolation by the Sparse Matrix-Vector Multiplication
"""
# if self.parallel_flag is 1:
# y =self.thr.array((self.st['M'], self.batch),
# dtype=self.dtype).fill(0)
# else:
# y =self.thr.array( (self.st['M'], ), dtype=self.dtype).fill(0)
y = self.thr.array(self.multi_M, dtype=self.dtype).fill(0)
self.prg.pELL_spmv_mCoil(
self.batch,
self.pELL['nRow'],
self.pELL['prodJd'],
self.pELL['sumJd'],
self.pELL['dim'],
self.pELL['Jd'],
# self.pELL_currsumJd,
self.pELL['meshindex'],
self.pELL['kindx'],
self.pELL['udata'],
k,
y,
local_size=int(self.wavefront),
global_size=int(self.pELL['nRow'] *
self.batch * self.wavefront)
)
# self.thr.synchronize()
return y
@push_cuda_context
def y2k(self, y):
"""
Private: gridding by the Sparse Matrix-Vector Multiplication
However, serial atomic add is far too slow and inaccurate.
"""
# kx = self.thr.array(self.multi_Kd, dtype=numpy.float32).fill(0.0)
# ky = self.thr.array(self.multi_Kd, dtype=numpy.float32).fill(0.0)
k = self.thr.array(self.multi_Kd, dtype=numpy.complex64).fill(0.0)
res = self.thr.array(self.multi_Kd, dtype=numpy.complex64).fill(0.0)
# array which saves the residue of two sum
self.prg.pELL_spmvh_mCoil(
self.batch,
self.pELL['nRow'],
self.pELL['prodJd'],
self.pELL['sumJd'],
self.pELL['dim'],
self.pELL['Jd'],
self.pELL['meshindex'],
self.pELL['kindx'],
self.pELL['udata'],
# kx, ky,
k,
res,
y,
local_size=None,
global_size=int(self.pELL['nRow']*self.batch )#*
# int(self.pELL['prodJd']) * int(self.batch))
)
return k + res
@push_cuda_context
def k2xx(self, k):
"""
Private: the inverse FFT and image cropping (which is the reverse of
_xx2k() method)
"""
self.fft(k, k, inverse=True)
# self.thr.synchronize()
# self.x_Nd._zero_fill()
# self.prg.cMultiplyScalar(self.zero_scalar,
# xx,
# local_size=None,
# global_size=int(self.Ndprod))
# if self.parallel_flag is 1:
# xx = self.thr.array(self.st['Nd']+(self.batch, ),
# dtype = self.dtype)
# else:
# xx = self.thr.array(self.st['Nd'], dtype = self.dtype)
xx = self.thr.array(self.multi_Nd, dtype=self.dtype)
xx.fill(0)
# self.prg.cSelect(self.queue,
# (self.Ndprod,),
# None,
# self.volume['KdGPUorder'].data,
# self.NdGPUorder.data,
# self.k_Kd2.data,
# self.x_Nd.data)
# self.prg.cSelect2(self.batch,
# self.volume['KdGPUorder'],
# self.volume['NdGPUorder'],
# k,
# xx,
# local_size=None,
# global_size=int(self.Ndprod*self.batch))
self.prg.cTensorCopy(
self.batch,
numpy.uint32(self.ndims),
self.volume['Nd_elements'],
self.volume['Kd_elements'],
self.volume['invNd_elements'],
k,
xx,
numpy.int32(-1),
local_size=None,
global_size=int(self.Ndprod))
return xx
@push_cuda_context
def xx2x(self, xx):
x = self.thr.array(xx.shape, dtype=self.dtype)
self.thr.copy_array(xx, dest=x, )
# size = int(xx.nbytes/xx.dtype.itemsize))
# Hack of error in cuda backends; 8 is the byte of numpy.complex64
# size = int(xx.nbytes/8)
# self.prg.cMultiplyRealInplace(self.batch,
# self.volume['SnGPUArray'],
# z,
# local_size=None,
# global_size=int(self.batch*self.Ndprod))
# self.prg.cTensorMultiply(numpy.uint32(self.batch),
# numpy.uint32(self.ndims),
# self.volume['Nd'],
# self.volume['Nd_elements'],
# self.volume['invNd_elements'],
# self.volume['tensor_sn'],
# x,
# numpy.uint32(0),
# local_size=None,
# global_size=int(self.batch*self.Ndprod))
self.prg.cTensorMultiply(numpy.uint32(self.batch),
numpy.uint32(self.tSN['Tdims']),
self.tSN['Td'],
self.tSN['Td_elements'],
self.tSN['invTd_elements'],
self.tSN['tensor_sn'],
x,
numpy.uint32(0),
local_size=None,
global_size=int(self.batch *
self.Ndprod))
# self.thr.synchronize()
return x
@push_cuda_context
def x2s(self, x):
s = self.thr.array(self.st['Nd'], dtype=self.dtype)
# try:
self.prg.cMultiplyConjVecInplace(
numpy.uint32(1),
self.volume['gpu_coil_profile'],
x,
local_size=None,
global_size=int(self.batch*self.Ndprod))
# x2 = x * self.volume['gpu_coil_profile'].conj()
# except:
# x2 = x
self.prg.cAggregate(
self.batch,
self.Ndprod,
x,
s,
local_size=int(self.wavefront),
global_size=int(self.batch*self.Ndprod*self.wavefront))
# self.prg.cMerge(self.batch,
# self.Ndprod,
# self.volume['gpu_coil_profile'],
# x,
# s,
# local_size=int(self.wavefront),
# global_size = int(self.batch*self.Ndprod*
# self.wavefront))
return s
@push_cuda_context
def selfadjoint_one2many2one(self, gx):
"""
selfadjoint_one2many2one NUFFT on the heterogeneous device
:param gx: The input gpu array, with size=Nd
:type gx: reikna gpu array with dtype =numpy.complex64
:return: gx: The output gpu array, with size=Nd
:rtype: reikna gpu array with dtype =numpy.complex64
"""
gy = self.forward_one2many(gx)
gx2 = self.adjoint_many2one(gy)
del gy
return gx2
def selfadjoint(self, gx):
"""
selfadjoint NUFFT on the heterogeneous device
:param gx: The input gpu array, with size=Nd
:type gx: reikna gpu array with dtype =numpy.complex64
:return: gx: The output gpu array, with size=Nd
:rtype: reikna gpu array with dtype =numpy.complex64
"""
gy = self.forward(gx)
gx2 = self.adjoint(gy)
del gy
return gx2
@push_cuda_context
def forward(self, gx):
"""
Forward NUFFT on the heterogeneous device
:param gx: The input gpu array, with size = Nd
:type gx: reikna gpu array with dtype = numpy.complex64
:return: gy: The output gpu array, with size = (M,)
:rtype: reikna gpu array with dtype = numpy.complex64
"""
try:
xx = self.x2xx(gx)
except: # gx is not a gpu array
try:
warnings.warn('The input array may not be a GPUarray '
'Automatically moving the input array to gpu, '
'which is throttled by PCIe.', UserWarning)
px = self.to_device(gx, )
# pz = self.thr.to_device(numpy.asarray(gz.astype(self.dtype),
# order = 'C' ))
xx = self.x2xx(px)
except:
if gx.shape != self.Nd + (self.batch, ):
warnings.warn('Shape of the input is ' + str(gx.shape) +
' while it should be ' +
str(self.Nd+(self.batch, )), UserWarning)
raise
k = self.xx2k(xx)
del xx
gy = self.k2y(k)
del k
return gy
@push_cuda_context
def forward_one2many(self, s):
try:
x = self.s2x(s)
except: # gx is not a gpu array
try:
warnings.warn('In s2x(): The input array may not be '
'a GPUarray. Automatically moving the input'
' array to gpu, which is throttled by PCIe.',
UserWarning)
ps = self.to_device(s, )
# px = self.thr.to_device(numpy.asarray(x.astype(self.dtype),
# order = 'C' ))
x = self.s2x(ps)
except:
if s.shape != self.Nd:
warnings.warn('Shape of the input is ' + str(x.shape) +
' while it should be ' +
str(self.Nd), UserWarning)
raise
y = self.forward(x)
return y
@push_cuda_context
def adjoint_many2one(self, y):
try:
x = self.adjoint(y)
except: # gx is not a gpu array
try:
if self.verbosity > 0:
print('y.shape = ', y.shape)
warnings.warn('In adjoint_many2one(): The input array may not '
'be a GPUarray. Automatically moving the input'
' array to gpu, which is throttled by PCIe.',
UserWarning)
py = self.to_device(y, )
# py = self.thr.to_device(numpy.asarray(y.astype(self.dtype),
# order = 'C' ))
x = self.adjoint(py)
except:
print('Failed at self.adjoint_many2one! Please check the gy'
' shape, type and stride.')
raise
# z = self.adjoint(y)
s = self.x2s(x)
return s
@push_cuda_context
def adjoint(self, gy):
"""
Adjoint NUFFT on the heterogeneous device
:param gy: The input gpu array, with size=(M,)
:type: reikna gpu array with dtype =numpy.complex64
:return: gx: The output gpu array, with size=Nd
:rtype: reikna gpu array with dtype =numpy.complex64
"""
try:
k = self.y2k(gy)
except: # gx is not a gpu array
try:
warnings.warn('In adjoint(): The input array may not '
'be a GPUarray. Automatically moving the input'
' array to gpu, which is throttled by PCIe.',
UserWarning)
py = self.to_device(gy, )
# py = self.thr.to_device(numpy.asarray(gy.astype(self.dtype),
# order = 'C' ))
k = self.y2k(py)
except:
print('Failed at self.adjont! Please check the gy shape, '
'type, stride.')
raise
# k = self.y2k(gy)
xx = self.k2xx(k)
del k
gx = self.xx2x(xx)
del xx
return gx
@push_cuda_context
def release(self):
del self.volume
del self.prg
del self.pELL
self.thr.release()
del self.thr
@push_cuda_context
def solve(self, gy, solver=None, *args, **kwargs):
"""
The solver of NUFFT_hsa
:param gy: data, reikna array, (M,) size
:param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD'
:param maxiter: the number of iterations
:type gy: reikna array, dtype = numpy.complex64
:type solver: string
:type maxiter: int
:return: reikna array with size Nd
"""
from ..linalg.solve_hsa import solve
try:
return solve(self, gy, solver, *args, **kwargs)
except:
try:
warnings.warn('In solve(): The input array may not '
'be a GPUarray. Automatically moving the input'
' array to gpu, which is throttled by PCIe.',
UserWarning)
py = self.to_device(gy, )
return solve(self, py, solver, *args, **kwargs)
except:
if numpy.ndarray == type(gy):
print("Input gy must be a reikna array with dtype ="
" numpy.complex64")
raise # TypeError
else:
print("wrong")
raise # TypeError