Raw File
_nufft_class_methods_device.py
####################################
#     HSA code
####################################

import numpy
# import reikna
from functools import wraps as _wraps
from ..src._helper import helper#, helper1

def push_cuda_context(hsa_method):
    """
    Decorator: Push cuda context to the top of the stack before calling the context
    Add @push_cuda_context before the methods of NUFFT_device()
    """
    @_wraps(hsa_method)
    def wrapper(*args, **kwargs):
        try:
            args[0].thr._context.push()
        except:
            pass
        return hsa_method(*args, **kwargs)
    return wrapper


def _init__device(self, device_indx=None):
    """
    Constructor.

    :param API: The API for the heterogeneous system. API='cuda'
                or API='ocl'
    :param platform_number: The number of the platform found by the API.
    :param device_number: The number of the device found on the platform.
    :param verbosity: Defines the verbosity level, default value is 0
    :type API: string
    :type platform_number: integer
    :type device_number: integer
    :type verbosity: integer
    :returns: 0
    :rtype: int, float

    :Example:

    >>> from pynufft import NUFFT
    >>> NufftObj = NUFFT_device(API='cuda', platform_number=0,
                                     device_number=0, verbosity=0)
    """

    self.dtype = numpy.complex64
    self.verbosity = 0#verbosity

    from reikna import cluda
    import reikna.transformations
    from reikna.cluda import functions, dtypes
#         try:  # try to create api/platform/device using the given parameters
    API = device_indx[0]
    self.API = API
#             if 'cuda' == API:
#                 api = cluda.cuda_api()
#             elif 'ocl' == API:
#                 api = cluda.ocl_api()
#         api = device_indx[0]
    platform_number = device_indx[1]
    device_number = device_indx[2]
    
#         NUFFT_hsa.__init__(self, API, platform_number, device_number)
    
    platform = device_indx[3]
    device = device_indx[4]
#             self.thr = api.Thread(device)
    self.device = device
    self.thr = device_indx[5]
#             api = device_indx[6]
#             self.thr = api.Thread(device)  # pyopencl.create_some_context()
#         self.device = device  # : device name

#         """
#         Wavefront: as warp in cuda. Can control the width in a workgroup
#         Wavefront is required in spmv_vector as it improves data coalescence.
#         see cCSR_spmv and zSparseMatVec
#         """
    self.wavefront = device_indx[6]
#         if self.verbosity > 0:
#             print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront)
#             print('API = ', API)
#             print('thr = ',  self.thr
#                   )
    from ..src import re_subroutine  # import create_kernel_sets
    kernel_sets = re_subroutine.create_kernel_sets(self.API)

    prg = self.thr.compile(kernel_sets,
                           render_kwds=dict(LL=str(self.wavefront)),
                           fast_math=False)
    self.prg = prg
    self.processor = 'hsa'



def _plan_device(self, om, Nd, Kd, Jd, ft_axes=None, radix=None):
    """
    Design the multi-coil or single-coil memory reduced interpolator.


    :param om: The M off-grid locations in the frequency domain.
               Normalized between [-pi, pi]
    :param Nd: The matrix size of equispaced image.
                Example: Nd=(256, 256) for a 2D image;
                         Nd = (128, 128, 128) for a 3D image
    :param Kd: The matrix size of the oversampled frequency grid.
               Example: Kd=(512,512) for 2D image;
               Kd = (256,256,256) for a 3D image
    :param Jd: The interpolator size.
               Example: Jd=(6,6) for 2D image;
               Jd = (6,6,6) for a 3D image
    :param ft_axes: The dimensions to be transformed by FFT.
               Example: ft_axes = (0, 1) for 2D,
               ft_axes = (0, 1, 2) for 3D;
               ft_axes = None for all dimensions.
    :param radix: expert mode.
                If provided, the shape is Nd.
                The last axis is the number of parallel coils.
    :type om: numpy.float array, matrix size = (M, ndims)
    :type Nd: tuple, ndims integer elements.
    :type Kd: tuple, ndims integer elements.
    :type Jd: tuple, ndims integer elements.
    :type ft_axes: tuple, selected axes to be transformed.
    :returns: 0
    :rtype: int, float
    :Example:

    >>> import pynufft
    >>> device=pynufft.helper.device_list()[0]
    >>> NufftObj = pynufft.NUFFT(device)
    >>> NufftObj.plan(om, Nd, Kd, Jd)

    """

    self.ndims = len(Nd)  # dimension
    if ft_axes is None:
        ft_axes = range(0, self.ndims)
    self.ft_axes = ft_axes

    self.st = helper.plan(om, Nd, Kd, Jd, ft_axes=ft_axes,
                          format='pELL', radix=radix)
#     if batch is None:
    self.parallel_flag = 0
#     else:
#         self.parallel_flag = 1

#     if batch is None:
    self.batch = numpy.uint32(1)

#     else:
#         self.batch = numpy.uint32(batch)

    self.Nd = self.st['Nd']  # backup
    self.Kd = self.st['Kd']
    #  self.sn = numpy.asarray(self.st['sn'].astype(self.dtype),
    #                            order='C')# backup
#     if self.batch == 1:
#         self.multi_Nd = self.Nd
#         self.multi_Kd = self.Kd
#         self.multi_M = (self.st['M'], )
#         # Broadcasting the sense and scaling factor (Roll-off)
#         # self.sense2 = self.sense*numpy.reshape(self.sn, self.Nd + (1, ))
#     else:  # self.batch is 0:
#         self.multi_Nd = self.Nd + (self.batch, )
#         self.multi_Kd = self.Kd + (self.batch, )
#         self.multi_M = (self.st['M'], ) + (self.batch, )

    self.Kdprod = numpy.uint32(numpy.prod(self.st['Kd']))
    self.Jdprod = numpy.uint32(numpy.prod(self.st['Jd']))
    self.Ndprod = numpy.uint32(numpy.prod(self.st['Nd']))

    self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize(
                                                self.st['Nd'])
    # only return the Kd_elements
    self.Kd_elements = helper.strides_divide_itemsize(self.st['Kd'])[0]

    self._offload_device()

    return 0    

@push_cuda_context
def _offload_device(self):  # API, platform_number=0, device_number=0):
    """
    self.offload():

    Off-load NUFFT to the opencl or cuda device(s)

    :param API: define the device type, which can be 'cuda' or 'ocl'
    :param platform_number: define which platform to be used.
                            The default platform_number = 0.
    :param device_number: define which device to be used.
                        The default device_number = 0.
    :type API: string
    :type platform_number: int
    :type device_number: int
    :return: self: instance
    """

    self.pELL = {}  # dictionary

    self.pELL['nRow'] = numpy.uint32(self.st['pELL'].nRow)
    self.pELL['prodJd'] = numpy.uint32(self.st['pELL'].prodJd)
    self.pELL['sumJd'] = numpy.uint32(self.st['pELL'].sumJd)
    self.pELL['dim'] = numpy.uint32(self.st['pELL'].dim)
    self.pELL['Jd'] = self.thr.to_device(
        self.st['pELL'].Jd.astype(numpy.uint32))
    self.pELL['meshindex'] = self.thr.to_device(
        self.st['pELL'].meshindex.astype(numpy.uint32))
    self.pELL['kindx'] = self.thr.to_device(
        self.st['pELL'].kindx.astype(numpy.uint32))
    self.pELL['udata'] = self.thr.to_device(
        self.st['pELL'].udata.astype(self.dtype))

    self.volume = {}

    self.volume['Nd_elements'] = self.thr.to_device(
        numpy.asarray(self.Nd_elements, dtype=numpy.uint32))
    self.volume['Kd_elements'] = self.thr.to_device(
        numpy.asarray(self.Kd_elements, dtype=numpy.uint32))
    self.volume['invNd_elements'] = self.thr.to_device(
        self.invNd_elements.astype(numpy.float32))
    self.volume['Nd'] = self.thr.to_device(numpy.asarray(
        self.st['Nd'], dtype=numpy.uint32))
#     self.volume['gpu_coil_profile'] = self.thr.array(
#         self.multi_Nd, dtype=self.dtype).fill(1.0)

    Nd = self.st['Nd']

    self.tSN = {}
    self.tSN['Td_elements'] = self.thr.to_device(
        numpy.asarray(self.st['tSN'].Td_elements, dtype=numpy.uint32))
    self.tSN['invTd_elements'] = self.thr.to_device(
        self.st['tSN'].invTd_elements.astype(numpy.float32))
    self.tSN['Td'] = self.thr.to_device(
        numpy.asarray(self.st['tSN'].Td, dtype=numpy.uint32))
    self.tSN['Tdims'] = self.st['tSN'].Tdims
    self.tSN['tensor_sn'] = self.thr.to_device(
        self.st['tSN'].tensor_sn.astype(numpy.float32))

    self.Ndprod = numpy.int32(numpy.prod(self.st['Nd']))
    self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
    self.M = numpy.int32(self.st['M'])

    import reikna.fft
    self.fft = reikna.fft.FFT(
            numpy.empty(self.Kd, dtype=self.dtype),
            self.ft_axes).compile(self.thr, fast_math=False)

    self.zero_scalar = self.dtype(0.0+0.0j)
    del self.st['pELL']
    if self.verbosity > 0:
        print('End of offload')

@push_cuda_context
def _set_wavefront_device(self, wf):
#         try:
    self.wavefront = int(wf)#api.DeviceParameters(device).warp_size
    if self.verbosity > 0:
        print('Wavefront of OpenCL (as wrap of CUDA) = ', self.wavefront)

    from ..src import re_subroutine  # import create_kernel_sets
    kernel_sets = re_subroutine.create_kernel_sets(self.API)

    prg = self.thr.compile(kernel_sets,
                           render_kwds=dict(LL=str(self.wavefront)),
                           fast_math=False)
    self.prg = prg

# @push_cuda_context
# def _reset_sense_device_deprecated(self):
#     self.volume['gpu_coil_profile'].fill(1.0)
# 
# @push_cuda_context
# def _set_sense_device_deprecated(self, coil_profile_device):
#     self.volume['gpu_coil_profile'] = coil_profile_device


    # if coil_profile.shape == self.Nd + (self.batch, ):

@push_cuda_context
def to_device(self, x, shape=None):
    gx = self.thr.to_device(x.copy().astype(self.dtype))
#         g_image = self.thr.array(image.shape, dtype=self.dtype)
#         self.thr.to_device(image.astype(self.dtype), dest=g_image)
    return gx
 
@push_cuda_context
def to_host(self, data):
    return data.get() 

# @push_cuda_context
# def _s2x_device(self, s):
#     x = self.thr.array(self.multi_Nd, dtype=self.dtype)
# 
#     self.prg.cPopulate(
#         self.batch,
#         self.Ndprod,
#         s,
#         x,
#         local_size=None,
#         global_size=int(self.batch * self.Ndprod))
# 
#     self.prg.cMultiplyVecInplace(
#         numpy.uint32(1),
#         self.volume['gpu_coil_profile'],
#         x,
#         local_size=None,
#         global_size=int(self.batch*self.Ndprod))
# 
#     return x    

@push_cuda_context
def _x2xx_device(self, x):

    xx = self.thr.array(x.shape, dtype=self.dtype)
    self.thr.copy_array(x, dest=xx, )

    self.prg.cTensorMultiply(numpy.uint32(self.batch),
                             numpy.uint32(self.tSN['Tdims']),
                             self.tSN['Td'],
                             self.tSN['Td_elements'],
                             self.tSN['invTd_elements'],
                             self.tSN['tensor_sn'],
                             xx,
                             numpy.uint32(0),
                             local_size=None,
                             global_size=int(self.batch*self.Ndprod))
    # self.thr.synchronize()
    return xx    

@push_cuda_context
def _xx2k_device(self, xx):
    """
    Private: oversampled FFT on the heterogeneous device

    First, zeroing the self.k_Kd array
    Second, copy self.x_Nd array to self.k_Kd array by cSelect
    Third, inplace FFT
    """
    k = self.thr.array(self.Kd, dtype=self.dtype)
    
    k.fill(0)

    self.prg.cTensorCopy(
         self.batch,
         numpy.uint32(self.ndims),
         self.volume['Nd_elements'],
         self.volume['Kd_elements'],
         self.volume['invNd_elements'],
         xx,
         k,
         numpy.int32(1),  # Directions: Nd -> Kd, 1; Kd -> Nd, -1
         local_size=None,
         global_size=int(self.Ndprod))
    self.fft(k, k, inverse=False)
#         self.thr.synchronize()
    return k    

@push_cuda_context
def _k2y_device(self, k):
    """
    Private: interpolation by the Sparse Matrix-Vector Multiplication
    """

    y = self.thr.array(self.M, dtype=self.dtype).fill(0)
    self.prg.pELL_spmv_mCoil(
                        self.batch,
                        self.pELL['nRow'],
                        self.pELL['prodJd'],
                        self.pELL['sumJd'],
                        self.pELL['dim'],
                        self.pELL['Jd'],
                        # self.pELL_currsumJd,
                        self.pELL['meshindex'],
                        self.pELL['kindx'],
                        self.pELL['udata'],
                        k,
                        y,
                        local_size=int(self.wavefront),
                        global_size=int(self.pELL['nRow'] *
                                        self.batch * self.wavefront)
                        )
#         self.thr.synchronize()
    return y    

@push_cuda_context
def _y2k_device(self, y):
    """
    Private: gridding by the Sparse Matrix-Vector Multiplication
    Atomic_twosum together provide better accuracy than generic atomic_add. 
    See: ocl_add and cuda_add code-strings in atomic_add(), inside the re_subroutine.py. 
    
    """

#         kx = self.thr.array(self.multi_Kd, dtype=numpy.float32).fill(0.0)
#         ky = self.thr.array(self.multi_Kd, dtype=numpy.float32).fill(0.0)
    k = self.thr.array(self.Kd, dtype=numpy.complex64).fill(0.0)
#     res = self.thr.array(self.Kd, dtype=numpy.complex64).fill(0.0)
    # array which saves the residue of two sum
    
    self.prg.pELL_spmvh_mCoil(
                        self.batch,
                        self.pELL['nRow'],
                        self.pELL['prodJd'],
                        self.pELL['sumJd'],
                        self.pELL['dim'],
                        self.pELL['Jd'],
                        self.pELL['meshindex'],
                        self.pELL['kindx'],
                        self.pELL['udata'],
#                             kx, ky,
                        k,
#                         res,
                        y,
                        local_size=int(self.wavefront),
                        global_size=int(self.pELL['nRow'] *self.wavefront)#*
#                                             int(self.pELL['prodJd']) * int(self.batch))
                        )

    return k# + res    


@push_cuda_context
def _k2xx_device(self, k):
    """
    Private: the inverse FFT and image cropping (which is the reverse of
    _xx2k() method)
    """

    self.fft(k, k, inverse=True)

    xx = self.thr.array(self.Nd, dtype=self.dtype)
    xx.fill(0)

    self.prg.cTensorCopy(
                         self.batch,
                         numpy.uint32(self.ndims),
                         self.volume['Nd_elements'],
                         self.volume['Kd_elements'],
                         self.volume['invNd_elements'],
                         k,
                         xx,
                         numpy.int32(-1),
                         local_size=None,
                         global_size=int(self.Ndprod))
    return xx    


@push_cuda_context
def _xx2x_device(self, xx):
    x = self.thr.array(xx.shape, dtype=self.dtype)
    self.thr.copy_array(xx, dest=x, )

    self.prg.cTensorMultiply(numpy.uint32(self.batch),
                             numpy.uint32(self.tSN['Tdims']),
                             self.tSN['Td'],
                             self.tSN['Td_elements'],
                             self.tSN['invTd_elements'],
                             self.tSN['tensor_sn'],
                             x,
                             numpy.uint32(0),
                             local_size=None,
                             global_size=int(self.batch *
                                             self.Ndprod))
    
    return x    

# @push_cuda_context
# def _x2s_device(self, x):
#     s = self.thr.array(self.st['Nd'], dtype=self.dtype)
# 
#     self.prg.cMultiplyConjVecInplace(
#         numpy.uint32(1),
#         self.volume['gpu_coil_profile'],
#         x,
#         local_size=None,
#         global_size=int(self.batch*self.Ndprod))
# 
#     self.prg.cAggregate(
#         self.batch,
#         self.Ndprod,
#         x,
#         s,
#         local_size=int(self.wavefront),
#         global_size=int(self.batch*self.Ndprod*self.wavefront))
# 
#     return s    

# @push_cuda_context
# def _selfadjoint_one2many2one_device_deprecated(self, gx):
#     """
#     selfadjoint_one2many2one NUFFT on the heterogeneous device
# 
#     :param gx: The input gpu array, with size=Nd
#     :type gx: reikna gpu array with dtype =numpy.complex64
#     :return: gx: The output gpu array, with size=Nd
#     :rtype: reikna gpu array with dtype =numpy.complex64
#     """
# 
#     gy = self._forward_one2many_device(gx)
#     gx2 = self._adjoint_many2one_device(gy)
#     del gy
#     return gx2    
# 
# @push_cuda_context
# def _selfadjoint_one2many2one_legacy_deprecated(self, gx):
#     """
#     selfadjoint_one2many2one NUFFT (Teplitz) on the heterogeneous device
# 
#     :param gx: The input gpu array, with size=Nd
#     :type gx: reikna gpu array with dtype =numpy.complex64
#     :return: gx: The output gpu array, with size=Nd
#     :rtype: reikna gpu array with dtype =numpy.complex64
#     """
# 
#     gy = self._forward_one2many_legacy(gx)
#     gx2 = self._adjoint_many2one_legacy(gy)
#     del gy
#     return gx2    

@push_cuda_context
def _selfadjoint_device(self, gx):
    """
    selfadjoint NUFFT on the heterogeneous device

    :param gx: The input gpu array, with size=Nd
    :type gx: reikna gpu array with dtype =numpy.complex64
    :return: gx: The output gpu array, with size=Nd
    :rtype: reikna gpu array with dtype =numpy.complex64
    """

    gy = self._forward_device(gx)
    gx2 = self._adjoint_device(gy)
    del gy
    return gx2    

@push_cuda_context
def _selfadjoint_legacy(self, gx):
    """
    selfadjoint NUFFT on the heterogeneous device

    :param gx: The input gpu array, with size=Nd
    :type gx: reikna gpu array with dtype =numpy.complex64
    :return: gx: The output gpu array, with size=Nd
    :rtype: reikna gpu array with dtype =numpy.complex64
    """

    gy = self._forward_legacy(gx)
    gx2 = self._adjoint_legacy(gy)
    del gy
    return gx2    

@push_cuda_context
def _forward_device(self, gx):
    """
    Forward NUFFT on the heterogeneous device

    :param gx: The input gpu array, with size = Nd
    :type gx: reikna gpu array with dtype = numpy.complex64
    :return: gy: The output gpu array, with size = (M,)
    :rtype: reikna gpu array with dtype = numpy.complex64
    """
    xx = self._x2xx_device(gx)


    k = self._xx2k_device(xx)
    del xx
    gy = self._k2y_device(k)
    del k
    return gy    

@push_cuda_context
def _forward_legacy(self, gx):
    """
    Forward NUFFT on the heterogeneous device

    :param gx: The input gpu array, with size = Nd
    :type gx: reikna gpu array with dtype = numpy.complex64
    :return: gy: The output gpu array, with size = (M,)
    :rtype: reikna gpu array with dtype = numpy.complex64
    """
    xx = self._x2xx_device(gx)


    k = self._xx2k_device(xx)
    del xx
    gy = self._k2y_legacy(k)
    del k
    return gy    

@push_cuda_context
def _forward_one2many_device_deprecated(self, s):
    x = self._s2x_device(s)
    y = self._forward_device(x)
    return y    

@push_cuda_context
def _adjoint_many2one_device_deprecated(self, y):
    x = self._adjoint_device(y)
    s = self._x2s_device(x)
    return s    

@push_cuda_context
def _forward_one2many_legacy_deprecated(self, s):
    x = self._s2x_device(s)
    y = self._forward_legacy(x)
    return y    

@push_cuda_context
def _adjoint_many2one_legacy_deprecated(self, y):
    x = self._adjoint_legacy(y)
    s = self._x2s_device(x)
    return s    

@push_cuda_context
def _adjoint_device(self, gy):
    """
    Adjoint NUFFT on the heterogeneous device

    :param gy: The input gpu array, with size=(M,)
    :type: reikna gpu array with dtype =numpy.complex64
    :return: gx: The output gpu array, with size=Nd
    :rtype: reikna gpu array with dtype =numpy.complex64
    """
    k = self._y2k_device(gy)

    xx = self._k2xx_device(k)
    del k
    gx = self._xx2x_device(xx)
    del xx
    return gx   

@push_cuda_context
def _adjoint_legacy(self, gy):
    """
    Adjoint NUFFT on the heterogeneous device

    :param gy: The input gpu array, with size=(M,)
    :type: reikna gpu array with dtype =numpy.complex64
    :return: gx: The output gpu array, with size=Nd
    :rtype: reikna gpu array with dtype =numpy.complex64
    """
    k = self._y2k_legacy(gy)

    xx = self._k2xx_device(k)
    del k
    gx = self._xx2x_device(xx)
    del xx
    return gx   

@push_cuda_context
def release(self):
    try:
        del self.volume
    except:
        pass
    try:
        del self.tSN
    except:
        pass
    try:
        del self.prg
    except:
        pass
    try:
        del self.pELL
    except:
        pass
    try:
        del self.csr 
        del self.csrH
    except:
        pass
    try:
        self.thr.release()
    except:
        pass
    try:
        del self.thr     
    except:
        pass
    
@push_cuda_context
def _solve_device(self, gy, solver=None, *args, **kwargs):
    """
    The solver of NUFFT

    :param gy: data, reikna array, (M,) size
    :param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD'
    :param maxiter: the number of iterations
    :type gy: reikna array, dtype = numpy.complex64
    :type solver: string
    :type maxiter: int
    :return: reikna array with size Nd
    """
    from ..linalg.solve_device import solve
    return solve(self,  gy,  solver, *args, **kwargs)

@push_cuda_context
def _solve_legacy(self, gy, solver=None, *args, **kwargs):
    """
    The solver of NUFFT

    :param gy: data, reikna array, (M,) size
    :param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD'
    :param maxiter: the number of iterations
    :type gy: reikna array, dtype = numpy.complex64
    :type solver: string
    :type maxiter: int
    :return: reikna array with size Nd
    """
    from ..linalg.solve_legacy import solve as solve2
    return solve2(self,  gy,  solver, *args, **kwargs)

def _plan_legacy(self, om, Nd, Kd, Jd, ft_axes = None):
    """
    Design the min-max interpolator.
     
    :param om: The M off-grid locations in the frequency domain. Normalized between [-pi, pi]
    :param Nd: The matrix size of equispaced image. Example: Nd=(256,256) for a 2D image; Nd = (128,128,128) for a 3D image
    :param Kd: The matrix size of the oversampled frequency grid. Example: Kd=(512,512) for 2D image; Kd = (256,256,256) for a 3D image
    :param Jd: The interpolator size. Example: Jd=(6,6) for 2D image; Jd = (6,6,6) for a 3D image
    :type om: numpy.float array, matrix size = M * ndims
    :type Nd: tuple, ndims integer elements. 
    :type Kd: tuple, ndims integer elements. 
    :type Jd: tuple, ndims integer elements. 
    :returns: 0
    :rtype: int, float
    :Example:

    >>> import pynufft
    >>> NufftObj = pynufft.NUFFT_cpu()
    >>> NufftObj.plan(om, Nd, Kd, Jd) 
     
    """         
    self.ndims = len(Nd)  # dimension
    if ft_axes is None:
        ft_axes = range(0, self.ndims)
    self.ft_axes = ft_axes

    self.st = helper.plan(om, Nd, Kd, Jd, ft_axes=ft_axes,
                          format='CSR')
#     if batch is None:
    self.parallel_flag = 0
#     else:
#         self.parallel_flag = 1

#     if batch is None:
    self.batch = numpy.uint32(1)

#     else:
#         self.batch = numpy.uint32(batch)

    self.Nd = self.st['Nd']  # backup
    self.Kd = self.st['Kd']
    #  self.sn = numpy.asarray(self.st['sn'].astype(self.dtype),
    #                            order='C')# backup
#     if self.batch == 1:
#         self.multi_Nd = self.Nd
#         self.multi_Kd = self.Kd
#         self.multi_M = (self.st['M'], )
        # Broadcasting the sense and scaling factor (Roll-off)
        # self.sense2 = self.sense*numpy.reshape(self.sn, self.Nd + (1, ))
#     else:  # self.batch is 0:
#         self.multi_Nd = self.Nd + (self.batch, )
#         self.multi_Kd = self.Kd + (self.batch, )
#         self.multi_M = (self.st['M'], ) + (self.batch, )

    self.Kdprod = numpy.uint32(numpy.prod(self.st['Kd']))
    self.Jdprod = numpy.uint32(numpy.prod(self.st['Jd']))
    self.Ndprod = numpy.uint32(numpy.prod(self.st['Nd']))

    self.Nd_elements, self.invNd_elements = helper.strides_divide_itemsize(
                                                self.st['Nd'])
    # only return the Kd_elements
    self.Kd_elements = helper.strides_divide_itemsize(self.st['Kd'])[0]
    
    self.sp = self.st['p'].copy().tocsr()
    self.spH = (self.st['p'].getH().copy()).tocsr()        
    
    self._offload_legacy()

    
     
    return 0

@push_cuda_context
def _offload_legacy(self):
    """
    self.offload():
     
    Off-load NUFFT to the opencl or cuda device(s)
     
    :param API: define the device type, which can be 'cuda' or 'ocl'
    :param platform_number: define which platform to be used. The default platform_number = 0.
    :param device_number: define which device to be used. The default device_number = 0.
    :type API: string
    :type platform_number: int
    :type device_number: int
    :return: self: instance

    """


#     self.pELL = {}  # dictionary
# 
#     self.pELL['nRow'] = numpy.uint32(self.st['pELL'].nRow)
#     self.pELL['prodJd'] = numpy.uint32(self.st['pELL'].prodJd)
#     self.pELL['sumJd'] = numpy.uint32(self.st['pELL'].sumJd)
#     self.pELL['dim'] = numpy.uint32(self.st['pELL'].dim)
#     self.pELL['Jd'] = self.thr.to_device(
#         self.st['pELL'].Jd.astype(numpy.uint32))
#     self.pELL['meshindex'] = self.thr.to_device(
#         self.st['pELL'].meshindex.astype(numpy.uint32))
#     self.pELL['kindx'] = self.thr.to_device(
#         self.st['pELL'].kindx.astype(numpy.uint32))
#     self.pELL['udata'] = self.thr.to_device(
#         self.st['pELL'].udata.astype(self.dtype))

    self.volume = {}

    self.volume['Nd_elements'] = self.thr.to_device(
        numpy.asarray(self.Nd_elements, dtype=numpy.uint32))
    self.volume['Kd_elements'] = self.thr.to_device(
        numpy.asarray(self.Kd_elements, dtype=numpy.uint32))
    self.volume['invNd_elements'] = self.thr.to_device(
        self.invNd_elements.astype(numpy.float32))
    self.volume['Nd'] = self.thr.to_device(numpy.asarray(
        self.st['Nd'], dtype=numpy.uint32))
#     self.volume['gpu_coil_profile'] = self.thr.array(
#         self.multi_Nd, dtype=self.dtype).fill(1.0)

    Nd = self.st['Nd']

    self.tSN = {}
    self.tSN['Td_elements'] = self.thr.to_device(
        numpy.asarray(self.st['tSN'].Td_elements, dtype=numpy.uint32))
    self.tSN['invTd_elements'] = self.thr.to_device(
        self.st['tSN'].invTd_elements.astype(numpy.float32))
    self.tSN['Td'] = self.thr.to_device(
        numpy.asarray(self.st['tSN'].Td, dtype=numpy.uint32))
    self.tSN['Tdims'] = self.st['tSN'].Tdims
    self.tSN['tensor_sn'] = self.thr.to_device(
        self.st['tSN'].tensor_sn.astype(numpy.float32))

    self.Ndprod = numpy.int32(numpy.prod(self.st['Nd']))
    self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
    self.M = numpy.int32(self.st['M'])

    import reikna.fft
    self.fft = reikna.fft.FFT(
            numpy.empty(self.Kd, dtype=self.dtype),
            self.ft_axes).compile(self.thr, fast_math=False)

    self.zero_scalar = self.dtype(0.0+0.0j)
#     del self.st['pELL']
#     if self.verbosity > 0:
#         print('End of offload')
     
#         self.SnGPUArray = self.thr.to_device(  self.sn)
    self.csr = {}
    self.csrH = {}
    self.csr['data'] = self.thr.to_device( self.sp.data.astype(self.dtype))
    self.csr['indices'] = self.thr.to_device( self.sp.indices.astype(numpy.uint32))
    self.csr['indptr'] =  self.thr.to_device( self.sp.indptr.astype(numpy.uint32))
    self.csr['numrow'] = self.M
    self.csr['numcol'] = self.Kdprod
     

    del self.sp
    
    self.csrH['data'] = self.thr.to_device(  self.spH.data.astype(self.dtype))
    self.csrH['indices'] =  self.thr.to_device(  self.spH.indices.astype(numpy.uint32))
    self.csrH['indptr'] =  self.thr.to_device(  self.spH.indptr.astype(numpy.uint32))
    self.csrH['numrow'] = self.Kdprod
     
    del self.spH
    
#     import reikna.fft
# 
#     self.fft = reikna.fft.FFT(numpy.empty(self.st['Kd'], dtype=self.dtype), self.ft_axes).compile(self.thr, fast_math=False)
# 
#     self.zero_scalar=self.dtype(0.0+0.0j)
     

@push_cuda_context  
def _k2y_legacy(self, k):
    """
    Private: interpolation by the Sparse Matrix-Vector Multiplication
    """
    y =self.thr.array( self.st['M'], dtype=self.dtype).fill(0)
    self.prg.cCSR_spmv_vector(    
                     self.batch,                            
                       self.csr['numrow'], 
                       self.csr['indptr'],
                       self.csr['indices'],
                       self.csr['data'], 
                       k,
                       y,
                       local_size=int(self.wavefront),
                       global_size=int(self.csr['numrow']*self.wavefront*self.batch) 
                        )

#     self.thr.synchronize()
    return y    
@push_cuda_context
def _y2k_legacy(self, y):
    """
    Private: gridding by the Sparse Matrix-Vector Multiplication
    """
    k = self.thr.array(self.Kd, dtype = self.dtype)

    self.prg.cCSR_spmv_vector(
                        self.batch,
                       self.csrH['numrow'], 
                       self.csrH['indptr'],
                       self.csrH['indices'],
                       self.csrH['data'], 
                       y,
                       k,
                       local_size=int(self.wavefront),
                       global_size=int(self.csrH['numrow']*self.wavefront) 
                        )#,g_times_l=int(csrnumrow))

    self.thr.synchronize()
    return k    
back to top