Revision 6c2489a48c65fc09a4c12b7a243f29c60a6626d1 authored by Jyh-Miin Lin on 21 May 2019, 09:12:16 UTC, committed by Jyh-Miin Lin on 21 May 2019, 09:12:16 UTC
1 parent e9261fd
Raw File
.pynufft_hsa.py
"""
Class NUFFT on heterogeneous platforms
==================================================================
"""
from __future__ import division

from numpy.testing import (run_module_suite, assert_raises, assert_equal,
                           assert_almost_equal)

from unittest import skipIf
import numpy

import scipy.sparse  # TODO: refactor to remove this

from scipy.sparse.csgraph import _validation  # for cx_freeze debug

import numpy.fft
 
import scipy.linalg

dtype = numpy.complex64

from src._helper.helper import *


class NUFFT:
    """
    The class NUFFT belongs to pynufft_hsa, which offloads Non-Uniform Fast Fourier Transform (NUFFT) to heterogeneous devices.
   """

    def __init__(self):
        """
        Constructor.
        
        :param None:
        :type None: Python NoneType
        :return: NUFFT: the pynufft_hsa.NUFFT instance
        :rtype: NUFFT: the pynufft_hsa.NUFFT class
        :Example:

        >>> import pynufft
        >>> NufftObj = pynufft_hsa.NUFFT()


        .. note:: can be useful to emphasize
            important feature
        .. seealso:: :class:`MainClass2`
        .. warning:: arg2 must be non-zero.
        .. todo:: check that arg2 is non zero.
        """
        pass


    def plan(self, om, Nd, Kd, Jd):
        """
        Design the min-max interpolator.
        
        :param om: The M off-grid locations in the frequency domain. Normalized between [-pi, pi]
        :param Nd: The matrix size of equispaced image. Example: Nd=(256,256) for a 2D image; Nd = (128,128,128) for a 3D image
        :param Kd: The matrix size of the oversampled frequency grid. Example: Kd=(512,512) for 2D image; Kd = (256,256,256) for a 3D image
        :param Jd: The interpolator size. Example: Jd=(6,6) for 2D image; Jd = (6,6,6) for a 3D image
        :type om: numpy.float array, matrix size = M * ndims
        :type Nd: tuple, ndims integer elements. 
        :type Kd: tuple, ndims integer elements. 
        :type Jd: tuple, ndims integer elements. 
        :returns: 0
        :rtype: int, float
        :Example:

        >>> import pynufft
        >>> NufftObj = pynufft_hsa.NUFFT()
        >>> NufftObj.plan(om, Nd, Kd, Jd) 
        
        """         
        self.debug = 0  # debug

        n_shift = tuple(0*x for x in Nd)
        self.st = plan(om, Nd, Kd, Jd)
        
        self.Nd = self.st['Nd']  # backup
        self.sn = numpy.asarray(self.st['sn'].astype(dtype)  ,order='C')# backup
        self.ndims = len(self.st['Nd']) # dimension
        self._linear_phase(n_shift)  # calculate the linear phase thing
        
        # Calculate the density compensation function
        self._precompute_sp()
        del self.st['p'], self.st['sn']
        del self.st['p0'] 

        return 0

    def _precompute_sp(self):
        """

        Private: Precompute adjoint (gridding) and Toepitz interpolation matrix.
        
        :param None: 
        :type None: Python Nonetype
        :return: self: instance
        """
        try:
            self.sp = self.st['p']
            self.spH = (self.st['p'].getH().copy()).tocsr()
            self.spHsp =self.st['p'].getH().dot(self.st['p']).tocsr()
        except:
            print("errors occur in self.precompute_sp()")
            raise
#         self.truncate_selfadjoint( 1e-2)

    def offload(self, API, platform_number=0, device_number=0):
        """
        self.offload():
        
        Off-load NUFFT to the opencl or cuda device(s)
        
        :param API: define the device type, which can be 'cuda' or 'ocl'
        :param platform_number: define which platform to be used. The default platform_number = 0.
        :param device_number: define which device to be used. The default device_number = 0.
        :type API: string
        :type platform_number: int
        :type device_number: int
        :return: self: instance

        """
        from reikna import cluda
        import reikna.transformations
        from reikna.cluda import functions, dtypes
        try: # try to create api/platform/device using the given parameters
            if 'cuda' == API:
                api = cluda.cuda_api()
            elif 'ocl' == API:
                api = cluda.ocl_api()
     
            platform = api.get_platforms()[platform_number]
            
            device = platform.get_devices()[device_number]
        except: # if failed, find out what's going wrong?
            diagnose()
            
            return 1

        
#         print('device = ', device)
#         Create context from device
        self.thr = api.Thread(device) #pyopencl.create_some_context()
#         self.queue = pyopencl.CommandQueue( self.ctx)

#         """
#         Wavefront: as warp in cuda. Can control the width in a workgroup
#         Wavefront is required in spmv_vector as it improves data coalescence.
#         see cSparseMatVec and zSparseMatVec
#         """
        self.wavefront = api.DeviceParameters(device).warp_size
        print(api.DeviceParameters(device).max_work_group_size)
#         print(self.wavefront)
#         print(type(self.wavefront))
#          pyopencl.characterize.get_simd_group_size(device[0], dtype.size)
        from src.re_subroutine import cMultiplyScalar, cCopy, cAddScalar,cAddVec, cSparseMatVec, cSelect, cMultiplyVec, cMultiplyVecInplace, cMultiplyConjVec, cDiff, cSqrt, cAnisoShrink
        # import complex float routines
#         print(dtypes.ctype(dtype))
        prg = self.thr.compile( 
                                cMultiplyScalar.R + #cCopy.R, 
                                cCopy.R + 
                                cAddScalar.R + 
                                cSelect.R +cMultiplyConjVec.R + cAddVec.R+
                                cMultiplyVecInplace.R +cSparseMatVec.R+cDiff.R+ cSqrt.R+ cAnisoShrink.R+ cMultiplyVec.R,
                                render_kwds=dict(
                                    LL =  str(self.wavefront)), fast_math=False)
#                                fast_math = False)
#                                 "#define LL  "+ str(self.wavefront) + "   "+cSparseMatVec.R)
#                                ),
#                                 fast_math=False)
#         prg2 = pyopencl.Program(self.ctx, "#define LL "+ str(self.wavefront) + " "+cSparseMatVec.R).build()
        
        self.cMultiplyScalar = prg.cMultiplyScalar
#         self.cMultiplyScalar.set_scalar_arg_dtypes( cMultiplyScalar.scalar_arg_dtypes)
        self.cCopy = prg.cCopy
        self.cAddScalar = prg.cAddScalar
        self.cAddVec = prg.cAddVec
        self.cSparseMatVec = prg.cSparseMatVec     
        self.cSelect = prg.cSelect
        self.cMultiplyVecInplace = prg.cMultiplyVecInplace
        self.cMultiplyVec = prg.cMultiplyVec
        self.cMultiplyConjVec = prg.cMultiplyConjVec
        self.cDiff = prg.cDiff
        self.cSqrt= prg.cSqrt
        self.cAnisoShrink = prg.cAnisoShrink                                 

#         self.xx_Kd = pyopencl.array.zeros(self.queue, self.st['Kd'], dtype=dtype, order="C")
        self.k_Kd = self.thr.to_device(numpy.zeros(self.st['Kd'], dtype=dtype, order="C"))
        self.k_Kd2 = self.thr.to_device(numpy.zeros(self.st['Kd'], dtype=dtype, order="C"))
        self.y =self.thr.to_device( numpy.zeros((self.st['M'],), dtype=dtype, order="C"))
        self.x_Nd = self.thr.to_device(numpy.zeros(self.st['Nd'], dtype=dtype, order="C"))
#         self.xx_Nd =     pyopencl.array.zeros(self.queue, self.st['Nd'], dtype=dtype, order="C")

        self.NdCPUorder, self.KdCPUorder, self.nelem =     preindex_copy(self.st['Nd'], self.st['Kd'])
        self.NdGPUorder = self.thr.to_device( self.NdCPUorder)
        self.KdGPUorder =  self.thr.to_device( self.KdCPUorder)
        self.Ndprod = numpy.int32(numpy.prod(self.st['Nd']))
        self.Kdprod = numpy.int32(numpy.prod(self.st['Kd']))
        self.M = numpy.int32( self.st['M'])
        
        self.SnGPUArray = self.thr.to_device(  self.sn)
        
        self.sp_data = self.thr.to_device( self.sp.data.astype(dtype))
        self.sp_indices =self.thr.to_device( self.sp.indices.astype(numpy.int32))
        self.sp_indptr = self.thr.to_device( self.sp.indptr.astype(numpy.int32))
        self.sp_numrow =  self.M
        del self.sp
        self.spH_data = self.thr.to_device(  self.spH.data.astype(dtype))
        self.spH_indices = self.thr.to_device(  self.spH.indices)
        self.spH_indptr = self.thr.to_device(  self.spH.indptr)
        self.spH_numrow = self.Kdprod
        del self.spH
        self.spHsp_data = self.thr.to_device(  self.spHsp.data.astype(dtype))
        self.spHsp_indices = self.thr.to_device( self.spHsp.indices)
        self.spHsp_indptr =self.thr.to_device(  self.spHsp.indptr)
        self.spHsp_numrow = self.Kdprod
        del self.spHsp
#         import reikna.cluda
        import reikna.fft
#         api = 
#         self.thr = reikna.cluda.ocl_api().Thread(self.queue)        
        self.fft = reikna.fft.FFT(self.k_Kd, numpy.arange(0, self.ndims)).compile(self.thr, fast_math=False)
#         self.fft = reikna.fft.FFT(self.k_Kd).compile(thr, fast_math=True)
#         self.fft = FFT(self.ctx, self.queue,  self.k_Kd, fast_math=True)
#         self.ifft = FFT(self.ctx, self.queue, self.k_Kd2,  fast_math=True)
        self.zero_scalar=dtype(0.0+0.0j)
#     def solver(self,  gy, maxiter):#, solver='cg', maxiter=200):
    def solver(self,gy, solver=None, *args, **kwargs):
        """
        The solver of NUFFT
        :param gy: data, reikna array, (M,) size
        :param solver: could be 'cg', 'L1TVOLS', 'L1TVLAD' 
        :param maxiter: the number of iterations
        :type gy: reikna array, dtype = numpy.complex64
        :type solver: string
        :type maxiter: int
        :return: reikna array with size Nd
            
        
        """
        import src._solver.solver_hsa
        
            
        try:
            return src._solver.solver_hsa.solver(self,  gy,  solver, *args, **kwargs)
        except:
            if numpy.ndarray==type(gy):
                print("input gy must be a reikna array with dtype = numpy.complex64")
                raise TypeError
            else:
                print("wrong")
                raise TypeError

    def _pipe_density(self,maxiter):
        """
        Private: create the density function by iterative solution
        Generate pHp matrix
        """


        try:
            if maxiter < self.last_iter:
            
                W = self.st['W']
            else: #maxiter > self.last_iter
                W = self.st['W']
                for pp in range(0,maxiter - self.last_iter):
 
    #             E = self.st['p'].dot(V1.dot(W))
                    E = self.forward(self.adjoint(W))
                    W = (W/E)             
                self.last_iter = maxiter   
        except:
            W = self.thr.copy_array(self.y)
            self.cMultiplyScalar(self.zero_scalar, W, local_size=None, global_size=int(self.M))
    #         V1= self.st['p'].getH()
        #     VVH = V.dot(V.getH()) 
            
            for pp in range(0,1):
    #             E = self.st['p'].dot(V1.dot(W))
 
                E = self.forward(self.adjoint(W))
                W /= E
#                 self.cMultiplyVecInplace(self.SnGPUArray, self.x_Nd, local_size=None, global_size=int(self.Ndprod))


            self.last_iter = maxiter
        
        return W    
    def _linear_phase(self, n_shift):
        """
        Private: Select the center of FOV
        """
        om = self.st['om']
        M = self.st['M']
        final_shifts = tuple(
            numpy.array(n_shift) +
            numpy.array(self.st['Nd']) / 2)

        phase = numpy.exp(
            1.0j *
            numpy.sum(
                om * numpy.tile(
                    final_shifts,
                    (M,1)),
                1))
        # add-up all the linear phasees in all axes,

        self.st['p'] = scipy.sparse.diags(phase, 0).dot(self.st['p0'])
 

    def _truncate_selfadjoint(self, tol):
        """
        Yet to be tested.
        """
#         for pp in range(1, 8):
#             self.st['pHp'].setdiag(0,k=pp)
#             self.st['pHp'].setdiag(0,k=-pp)
        indix=numpy.abs(self.spHsp.data)< tol
        self.spHsp.data[indix]=0
 
        self.spHsp.eliminate_zeros()
        indix=numpy.abs(self.sp.data)< tol
        self.sp.data[indix]=0
 
        self.sp.eliminate_zeros()
        
    def forward(self, gx):
            """
            Forward NUFFT on the heterogeneous device
            
            :param gx: The input gpu array, with size=Nd
            :type: reikna gpu array with dtype =numpy.complex64
            :return: gy: The output gpu array, with size=(M,)
            :rtype: reikna gpu array with dtype =numpy.complex64
            """
            self.x_Nd =  self.thr.copy_array(gx)
            
            self._x2xx()

            self._xx2k()

            self._k2y()

            gy =  self.thr.copy_array(self.y)
            return gy
    
    def adjoint(self, gy):
            """
            Adjoint NUFFT on the heterogeneous device
            
            :param gy: The output gpu array, with size=(M,)
            :type: reikna gpu array with dtype =numpy.complex64
            :return: gx: The input gpu array, with size=Nd
            :rtype: reikna gpu array with dtype =numpy.complex64
            """        
            self.y = self.thr.copy_array(gy) 

            self._y2k()
            self._k2xx()
            self._xx2x()
            gx = self.thr.copy_array(self.x_Nd)
            return gx
    def selfadjoint(self, gx):
        """
        selfadjoint NUFFT (Teplitz) on the heterogeneous device
        
        :param gx: The input gpu array, with size=Nd
        :type: reikna gpu array with dtype =numpy.complex64
        :return: gx: The input gpu array, with size=Nd
        :rtype: reikna gpu array with dtype =numpy.complex64
        """                
        self.x_Nd = self.thr.copy_array(gx)
        self._x2xx()
        self._xx2k()
        self._k2y2k()
        self._k2xx()
        self._xx2x()
        gx2 = self.thr.copy_array(self.x_Nd)
        return gx2

    def _x2xx(self):
        """
        Private: Scaling on the heterogeneous device
        Inplace multiplication of self.x_Nd by the scaling factor self.SnGPUArray.
        """                
#         self.cMultiplyVecInplace(self.queue, (self.Ndprod,), None,  self.SnGPUArray.data, self.x_Nd.data)
        self.cMultiplyVecInplace(self.SnGPUArray, self.x_Nd, local_size=None, global_size=int(self.Ndprod))
        self.thr.synchronize()
    def _xx2k(self ):
        
        """
        Private: oversampled FFT on the heterogeneous device
        
        Firstly, zeroing the self.k_Kd array
        Second, copy self.x_Nd array to self.k_Kd array by cSelect
        Third: inplace FFT
        """
        
        self.cMultiplyScalar(self.zero_scalar, self.k_Kd, local_size=None, global_size=int(self.Kdprod))
        self.cSelect(self.NdGPUorder,      self.KdGPUorder,  self.x_Nd, self.k_Kd, local_size=None, global_size=int(self.Ndprod))
        self.fft( self.k_Kd,self.k_Kd,inverse=False)
        self.thr.synchronize()
    def _k2y(self ):
        """
        Private: interpolation by the Sparse Matrix-Vector Multiplication
        """
        self.cSparseMatVec(                                
                                   self.sp_numrow, 
                                   self.sp_indptr,
                                   self.sp_indices,
                                   self.sp_data, 
                                   self.k_Kd,
                                   self.y,
                                   local_size=int(self.wavefront),
                                   global_size=int(self.sp_numrow*self.wavefront) 
                                    )
        self.thr.synchronize()
    def _y2k(self):
        """
        Private: gridding by the Sparse Matrix-Vector Multiplication
        """
        self.cSparseMatVec(
                                   self.spH_numrow, 
                                   self.spH_indptr,
                                   self.spH_indices,
                                   self.spH_data, 
                                   self.y,
                                   self.k_Kd2,
                                   local_size=int(self.wavefront),
                                   global_size=int(self.spH_numrow*self.wavefront) 
                                    )#,g_times_l=int(csrnumrow))
#         return k
        self.thr.synchronize()
    def _k2y2k(self):
        """
        Private: the integrated interpolation-gridding by the Sparse Matrix-Vector Multiplication
        """
        self.cSparseMatVec( 
                                    
                                   self.spHsp_numrow, 
                                   self.spHsp_indptr,
                                   self.spHsp_indices,
                                   self.spHsp_data, 
                                   self.k_Kd,
                                   self.k_Kd2,
                                   local_size=int(self.wavefront),
                                   global_size=int(self.spHsp_numrow*self.wavefront) 
                                    )#,g_times_l=int(csrnumrow))

    def _k2xx(self):
        """
        Private: the inverse FFT and image cropping (which is the reverse of _xx2k() method)
        """
        self.fft( self.k_Kd2, self.k_Kd2,inverse=True)
#         self.x_Nd._zero_fill()
        self.cMultiplyScalar(self.zero_scalar, self.x_Nd,  local_size=None, global_size=int(self.Ndprod ))
#         self.cSelect(self.queue, (self.Ndprod,), None,   self.KdGPUorder.data,  self.NdGPUorder.data,     self.k_Kd2.data, self.x_Nd.data )
        self.cSelect(  self.KdGPUorder,  self.NdGPUorder,     self.k_Kd2, self.x_Nd, local_size=None, global_size=int(self.Ndprod ))
        
        self.thr.synchronize()
    def _xx2x(self ):
        """
        Private: rescaling, which is identical to the  _x2xx() method
        """
        self.cMultiplyVecInplace( self.SnGPUArray, self.x_Nd, local_size=None, global_size=int(self.Ndprod))
        self.thr.synchronize()

        
        
def benchmark():
    import cProfile
    import numpy
    #import matplotlib.pyplot
    import copy

    #cm = matplotlib.cm.gray
    # load example image
    import pkg_resources
    
    DATA_PATH = pkg_resources.resource_filename('pynufft', 'data/')
#     PHANTOM_FILE = pkg_resources.resource_filename('pynufft', 'data/phantom_256_256.txt')
    import numpy
    #import matplotlib.pyplot
    import scipy
    import scipy.misc
    # load example image
#     image = numpy.loadtxt(DATA_PATH +'phantom_256_256.txt')
    image = scipy.misc.face(gray=True)
#    image = scipy.misc.ascent()    
    image = scipy.misc.imresize(image, (256,256))
    
    image=image.astype(numpy.float)/numpy.max(image[...])
    #numpy.save('phantom_256_256',image)
    #matplotlib.pyplot.subplot(1,3,1)
    #matplotlib.pyplot.imshow(image, cmap=matplotlib.cm.gray)
    #matplotlib.pyplot.title("Load Scipy \"ascent\" image")
#     matplotlib.pyplot.show()
    print('loading image...')
#     image[128, 128] = 1.0
    Nd = (256, 256)  # image space size
    Kd = (512, 512)  # k-space size
    Jd = (6, 6)  # interpolation size

    # load k-space points
    om = numpy.load(DATA_PATH+'om2D.npz')['arr_0']

    # create object
    
#         else:
#             n_shift=tuple(list(n_shift)+numpy.array(Nd)/2)
    import pynufft
    nfft = pynufft.NUFFT()  # CPU
    nfft.plan(om, Nd, Kd, Jd)
#     nfft.initialize_gpu()
    import scipy.sparse
#     scipy.sparse.save_npz('tests/test.npz', nfft.st['p'])
    print("create NUFFT gpu object")
    NufftObj = NUFFT()
    print("plan nufft on gpu")
    NufftObj.plan(om, Nd, Kd, Jd)
    print("NufftObj planed")
#     print('sp close? = ', numpy.allclose( nfft.st['p'].data,  NufftObj.st['p'].data, atol=1e-1))
#     NufftObj.initialize_gpu()

    y = nfft.k2y(nfft.xx2k(nfft.x2xx(image)))
    print("send image to device")
    NufftObj.x_Nd = NufftObj.thr.to_device( image.astype(dtype))
    print("copy image to gx")
    gx = NufftObj.thr.copy_array(NufftObj.x_Nd)
    print('x close? = ', numpy.allclose(image, NufftObj.x_Nd.get() , atol=1e-4))
    NufftObj._x2xx()    
#     ttt2= NufftObj.thr.copy_array(NufftObj.x_Nd)
    print('xx close? = ', numpy.allclose(nfft.x2xx(image), NufftObj.x_Nd.get() , atol=1e-4))        

    NufftObj._xx2k()    
    
#     print(NufftObj.k_Kd.get(queue=NufftObj.queue, async=True).flags)
#     print(nfft.xx2k(nfft.x2xx(image)).flags)
    k = nfft.xx2k(nfft.x2xx(image))
    print('k close? = ', numpy.allclose(nfft.xx2k(nfft.x2xx(image)), NufftObj.k_Kd.get() , atol=1e-3*numpy.linalg.norm(k)))   
    
    
    NufftObj._k2y()    
    
    
    NufftObj._y2k()
    y2 = NufftObj.y.get(   )
    
    print('y close? = ', numpy.allclose(y, y2 ,  atol=1e-3*numpy.linalg.norm(y)))
#     print(numpy.mean(numpy.abs(nfft.y2k(y)-NufftObj.k_Kd2.get(queue=NufftObj.queue, async=False) )))
    print('k2 close? = ', numpy.allclose(nfft.y2k(y2), NufftObj.k_Kd2.get(), atol=1e-3*numpy.linalg.norm(nfft.y2k(y2)) ))   
    NufftObj._k2xx()
#     print('xx close? = ', numpy.allclose(nfft.k2xx(nfft.y2k(y2)), NufftObj.xx_Nd.get(queue=NufftObj.queue, async=False) , atol=0.1))
    NufftObj._xx2x()
    print('x close? = ', numpy.allclose(nfft.adjoint(y2), NufftObj.x_Nd.get() , atol=1e-3*numpy.linalg.norm(nfft.adjoint(y2))))
    image3 = NufftObj.x_Nd.get() 
    import time
    t0 = time.time()
    for pp in range(0,10):
#         y = nfft.k2y(nfft.xx2k(nfft.x2xx(image)))    
#             x = nfft.adjoint(y)
            y = nfft.forward(image)
#     y2 = NufftObj.y.get(   NufftObj.queue, async=False)
    t_cpu = (time.time() - t0)/10.0 
    print(t_cpu)
    
#     del nfft

    
    t0= time.time()
    for pp in range(0,20):
#         pass
#         NufftObj.adjoint()
        gy=NufftObj.forward(gx)        

#     NufftObj.thr.synchronize()
    t_cl = (time.time() - t0)/20
    print(t_cl)
    print("forward acceleration=", t_cpu/t_cl)

    t0 = time.time()
    for pp in range(0,10):
#         y = nfft.k2y(nfft.xx2k(nfft.x2xx(image)))    
            x = nfft.adjoint(y)
#             y = nfft.forward(image)
#     y2 = NufftObj.y.get(   NufftObj.queue, async=False)
    t_cpu = (time.time() - t0)/10.0 
    print(t_cpu)
    
#     del nfft

    
    t0= time.time()
    for pp in range(0,20):
#         pass
#         NufftObj.adjoint()
        gx=NufftObj.adjoint(gy)        

#     NufftObj.thr.synchronize()
    t_cl = (time.time() - t0)/20
    print(t_cl)
    print("adjoint acceleration=", t_cpu/t_cl)

    t0 = time.time()
    for pp in range(0,10):
#         y = nfft.k2y(nfft.xx2k(nfft.x2xx(image)))    
#             x = nfft.adjoint(y)
            x = nfft.selfadjoint(image)
#     y2 = NufftObj.y.get(   NufftObj.queue, async=False)
    t_cpu = (time.time() - t0)/10.0 
    print(t_cpu)
    
   
#     del nfft

    
    t0= time.time()
    for pp in range(0,20):
#         pass
#         NufftObj.adjoint()
        g2=NufftObj.selfadjoint(gx)        

#     NufftObj.thr.synchronize()
    t_cl = (time.time() - t0)/20
    print(t_cl)
    print("selfadjoint acceleration=", t_cpu/t_cl)



    maxiter = 100
    import time
    t0= time.time()
    x2 = nfft.solver(y2, 'cg',maxiter=maxiter)
#    x2 =  nfft.solver(y2, 'L1TVLAD',maxiter=maxiter, rho = 1)
    t1 = time.time()-t0
#     gy=NufftObj.thr.copy_array(NufftObj.thr.to_device(y2))
    
    t0= time.time()
    x = NufftObj.solver(gy,'cg', maxiter=maxiter)
#    x = NufftObj.solver(gy,'L1TVLAD', maxiter=maxiter, rho=1)
    
    t2 = time.time() - t0
    print(t1, t2)
    print('acceleration=', t1/t2 )

    t0= time.time()
#     x = NufftObj.solver(gy,'cg', maxiter=maxiter)
    x = NufftObj.solver(gy,'L1TVOLS', maxiter=maxiter, rho=2)

    
    t3 = time.time() - t0
    print(t2, t3)
    print('Speed of LAD vs OLS =', t3/t2 )


#     k = x.get()
#     x = nfft.k2xx(k)/nfft.st['sn']
#     return
    
    #matplotlib.pyplot.subplot(1, 3, 2)
    #matplotlib.pyplot.imshow( NufftObj.x_Nd.get().real, cmap= matplotlib.cm.gray)
    #matplotlib.pyplot.subplot(1, 3,3)
    #matplotlib.pyplot.imshow(x2.real, cmap= matplotlib.cm.gray)
    #matplotlib.pyplot.show()


def test_init():
    import cProfile
    import numpy
    import matplotlib.pyplot
    import copy

    cm = matplotlib.cm.gray
    # load example image
    import pkg_resources
    
    DATA_PATH = pkg_resources.resource_filename('pynufft', 'src/data/')
#     PHANTOM_FILE = pkg_resources.resource_filename('pynufft', 'data/phantom_256_256.txt')
    import numpy
    import matplotlib.pyplot
    import scipy
    # load example image
#     image = numpy.loadtxt(DATA_PATH +'phantom_256_256.txt')
#     image = scipy.misc.face(gray=True)
    image = scipy.misc.ascent()    
    image = scipy.misc.imresize(image, (256,256))
    
    image=image.astype(numpy.float)/numpy.max(image[...])
    #numpy.save('phantom_256_256',image)
    matplotlib.pyplot.subplot(1,3,1)
    matplotlib.pyplot.imshow(image, cmap=matplotlib.cm.gray)
    matplotlib.pyplot.title("Load Scipy \"ascent\" image")
#     matplotlib.pyplot.show()
    print('loading image...')
#     image[128, 128] = 1.0
    Nd = (256, 256)  # image space size
    Kd = (512, 512)  # k-space size
    Jd = (6, 6)  # interpolation size

    # load k-space points
    om = numpy.load(DATA_PATH+'om2D.npz')['arr_0']

    # create object
    
#         else:
#             n_shift=tuple(list(n_shift)+numpy.array(Nd)/2)
    import pynufft
    nfft = pynufft.NUFFT()  # CPU
    nfft.plan(om, Nd, Kd, Jd)
#     nfft.initialize_gpu()
    import scipy.sparse
#     scipy.sparse.save_npz('tests/test.npz', nfft.st['p'])

    NufftObj = NUFFT()

    NufftObj.plan(om, Nd, Kd, Jd)
    NufftObj.offload(API='ocl',   platform_number= 1 , device_number= 0)
#     print('sp close? = ', numpy.allclose( nfft.st['p'].data,  NufftObj.st['p'].data, atol=1e-1))
#     NufftObj.initialize_gpu()

    y = nfft.k2y(nfft.xx2k(nfft.x2xx(image)))
    NufftObj.x_Nd = NufftObj.thr.to_device( image.astype(dtype))
    gx = NufftObj.thr.copy_array(NufftObj.x_Nd)
    print('x close? = ', numpy.allclose(image, NufftObj.x_Nd.get() , atol=1e-4))
    NufftObj._x2xx()    
#     ttt2= NufftObj.thr.copy_array(NufftObj.x_Nd)
    print('xx close? = ', numpy.allclose(nfft.x2xx(image), NufftObj.x_Nd.get() , atol=1e-4))        

    NufftObj._xx2k()    
    
#     print(NufftObj.k_Kd.get(queue=NufftObj.queue, async=True).flags)
#     print(nfft.xx2k(nfft.x2xx(image)).flags)
    k = nfft.xx2k(nfft.x2xx(image))
    print('k close? = ', numpy.allclose(nfft.xx2k(nfft.x2xx(image)), NufftObj.k_Kd.get() , atol=1e-3*numpy.linalg.norm(k)))   
    
    NufftObj._k2y()    
    
    
    NufftObj._y2k()
    y2 = NufftObj.y.get(   )
    
    print('y close? = ', numpy.allclose(y, y2 ,  atol=1e-3*numpy.linalg.norm(y)))
#     print(numpy.mean(numpy.abs(nfft.y2k(y)-NufftObj.k_Kd2.get(queue=NufftObj.queue, async=False) )))
    print('k2 close? = ', numpy.allclose(nfft.y2k(y2), NufftObj.k_Kd2.get(), atol=1e-3*numpy.linalg.norm(nfft.y2k(y2)) ))   
    NufftObj._k2xx()
#     print('xx close? = ', numpy.allclose(nfft.k2xx(nfft.y2k(y2)), NufftObj.xx_Nd.get(queue=NufftObj.queue, async=False) , atol=0.1))
    NufftObj._xx2x()
    print('x close? = ', numpy.allclose(nfft.adjoint(y2), NufftObj.x_Nd.get() , atol=1e-3*numpy.linalg.norm(nfft.adjoint(y2))))
    image3 = NufftObj.x_Nd.get() 
    import time
    t0 = time.time()
    for pp in range(0,10):
#         y = nfft.k2y(nfft.xx2k(nfft.x2xx(image)))    
#             x = nfft.adjoint(y)
            y = nfft.forward(image)
#     y2 = NufftObj.y.get(   NufftObj.queue, async=False)
    t_cpu = (time.time() - t0)/10.0 
    print(t_cpu)
    
#     del nfft
        
    
    t0= time.time()
    for pp in range(0,100):
#         pass
#         NufftObj.adjoint()
        gy=NufftObj.forward(gx)        
        
#     NufftObj.thr.synchronize()
    t_cl = (time.time() - t0)/100
    print(t_cl)
    print('gy close? = ', numpy.allclose(y,gy.get(),  atol=1e-1))
    print("acceleration=", t_cpu/t_cl)
    maxiter =100
    import time
    t0= time.time()
#     x2 = nfft.solver(y2, 'cg',maxiter=maxiter)
    x2 =  nfft.solver(y2, 'L1TVLAD',maxiter=maxiter, rho = 2)
    t1 = time.time()-t0
#     gy=NufftObj.thr.copy_array(NufftObj.thr.to_device(y2))
    
    t0= time.time()
#     x = NufftObj.solver(gy,'dc', maxiter=maxiter)
    x = NufftObj.solver(gy,'L1TVLAD', maxiter=maxiter, rho=2)
    
    t2 = time.time() - t0
    print(t1, t2)
    print('acceleration=', t1/t2 )
#     k = x.get()
#     x = nfft.k2xx(k)/nfft.st['sn']
#     return
    
    matplotlib.pyplot.subplot(1, 3, 2)
    matplotlib.pyplot.imshow( x.get().real, cmap= matplotlib.cm.gray)
    matplotlib.pyplot.subplot(1, 3,3)
    matplotlib.pyplot.imshow(x2.real, cmap= matplotlib.cm.gray)
    matplotlib.pyplot.show()
def test_cAddScalar():

    dtype = numpy.complex64
    
    try:
        device=pyopencl.get_platforms()[1].get_devices()
        
    except:
        device=pyopencl.get_platforms()[0].get_devices()
    print('using cl device=',device,device[0].max_work_group_size, device[0].max_compute_units,pyopencl.characterize.get_simd_group_size(device[0], dtype.size))

    ctx = pyopencl.Context(device) #pyopencl.create_some_context()
    queue = pyopencl.CommandQueue(ctx)
    wavefront = pyopencl.characterize.get_simd_group_size(device[0], dtype.size)

#     B = routine(wavefront)
    import cl_subroutine.cAddScalar
    prg = pyopencl.Program(ctx, cl_subroutine.cAddScalar.R).build()
    
    AddScalar = prg.cAddScalar
    AddScalar.set_scalar_arg_dtypes(cl_subroutine.cAddScalar.scalar_arg_dtypes)
#     indata= numpy.arange(0,128).astype(dtype)
    indata = (numpy.random.randn(128,)+numpy.random.randn(128,)*1.0j).astype(dtype)      
    indata_g = pyopencl.array.to_device(queue, indata)
    scal= 0.1+0.1j
    AddScalar(queue, (128,),None,scal, indata_g.data)
    print(-indata[0]+indata_g.get()[0])
    
if __name__ == '__main__':
    import cProfile
#     cProfile.run('benchmark()')
    test_init()
#     test_cAddScalar()
#     cProfile.run('test_init()')
back to top