##### https://github.com/jyhmiinlin/pynufft
Tip revision: 14452db
solve_legacy.py
``````"""
HSA solvers
======================================
"""

"""
- Bugfix: fix the instability of cg due to alpha and beta which must be fetched from the device
"""
import numpy, scipy
dtype = numpy.complex64
import scipy
import numpy
from ..src._helper import helper

def cDiff(x, d_indx):
"""
(stable) Compute image gradient
Work with indxmap_diff(Nd).
...
"""
a2=numpy.asarray(x.copy(),order='C')
a2.flat =   a2 .flat[d_indx] - a2 .flat
return a2
def _create_kspace_sampling_density(nufft):
"""
(stable) Compute k-space sampling density from the nufft object
"""
y = numpy.ones((nufft.st['M'],),dtype = numpy.complex64)
gy = nufft.thr.to_device(y)
gk = nufft._y2k_legacy(gy)
w =  numpy.abs(gk.get())#**2) ))

nufft.st['w'] = w#self.nufftobj.vec2k(w)
RTR=nufft.st['w'] # see __init__() in class "nufft"
return RTR
# def _create_laplacian_kernel(nufft):
# #===============================================================================
# # #        # Laplacian oeprator, convolution kernel in spatial domain
# #         # related to constraint
# #===============================================================================
#     uker = numpy.zeros(nufft.st['Kd'][:],dtype=numpy.complex64,order='C')
#     n_dims= numpy.size(nufft.st['Nd'])
#
#     if n_dims == 1:
#         uker[0] = -2.0
#         uker[1] = 1.0
#         uker[-1] = 1.0
#     elif n_dims == 2:
#         uker[0,0] = -4.0
#         uker[1,0] = 1.0
#         uker[-1,0] = 1.0
#         uker[0,1] = 1.0
#         uker[0,-1] = 1.0
#     elif n_dims == 3:
#         uker[0,0,0] = -6.0
#         uker[1,0,0] = 1.0
#         uker[-1,0,0] = 1.0
#         uker[0,1,0] = 1.0
#         uker[0,-1,0] = 1.0
#         uker[0,0,1] = 1.0
#         uker[0,0,-1] = 1.0
#
#     uker =numpy.fft.fftn(uker) #, self.nufftobj.st['Kd'], range(0,numpy.ndim(uker)))
#     return uker
# def GBPDNA(nufft. gy, maxiter, rho):
#     def A(x):
#         gy = nufft.forward(x)
#         return gy
#     def AH(gy):
#         x2 = nufft.adjoint(gy)
#         return x2

def L1TVOLS(nufft, gy, maxiter, rho  ): # main function of solver
"""
L1-total variation regularized ordinary least square
"""
mu = 1.0
LMBD = rho*mu

def AHA(x):
return x2
def AH(gy):
return x2

uker_cpu = mu*_create_kspace_sampling_density(nufft)   - LMBD* helper.create_laplacian_kernel(nufft) # on cpu
uker = nufft.thr.to_device(uker_cpu.astype(numpy.complex64))
AHy = AH(gy) # on  device?
z = numpy.zeros(nufft.st['Nd'],dtype = numpy.complex64,order='C')
z_gpu = nufft.thr.to_device(z)
xkp1 = nufft.thr.copy_array(z_gpu)
AHyk = nufft.thr.copy_array(z_gpu)

#         self._allo_split_variables()
zz= []
bb = []
dd = []
d_indx, dt_indx = helper.indxmap_diff(nufft.st['Nd'])
ndims = len(nufft.st['Nd'])
s_tmp = []
for pp in range(0, ndims):
s_tmp += [0, ]
for jj in range(    0,  ndims): # n_dims + 1 for wavelets
d_indx[jj] = nufft.thr.to_device(d_indx[jj])
dt_indx[jj] = nufft.thr.to_device(dt_indx[jj])
#     z=numpy.zeros(nufft.st['Nd'], dtype = nufft.dtype, order='C')

#     ndims = len(nufft.st['Nd'])

for jj in range(    0,  ndims): # n_dims + 1 for wavelets

zz += [nufft.thr.copy_array(z_gpu),]
bb += [nufft.thr.copy_array(z_gpu),]
dd +=  [nufft.thr.copy_array(z_gpu),]
#     zf = nufft.thr.copy_array(z_gpu)
#     bf = nufft.thr.copy_array(z_gpu)
#     df = nufft.thr.copy_array(z_gpu)

n_dims = len(nufft.st['Nd'])#numpy.size(uf.shape)

tmp_gpu = nufft.thr.copy_array(z_gpu)

for outer in numpy.arange(0, maxiter):
#             for inner in numpy.arange(0,nInner):

# solve Ku = rhs
#                 rhs = (mu*(AHyk + df - bf) +  # right hand side
#                 LMBD*(cDiff(dd[0] - bb[0],  dt_indx[0])) +
#                 LMBD*(cDiff(dd[1] - bb[1],  dt_indx[1]))  )
rhs = nufft.thr.copy_array(AHyk)

#         rhs += df
#
#         rhs -= bf

#         rhs *= mu
nufft.prg.cMultiplyScalar( dtype(mu), rhs, local_size=None, global_size=int(nufft.Ndprod))

#         print(rhs.get())
for pp in range(0, ndims):
in_cDiff = nufft.thr.copy_array(dd[pp])

in_cDiff -= bb[pp]

#         out_cDiff = nufft.thr.empty_like(in_cDiff)
nufft.prg.cDiff(dt_indx[pp], in_cDiff, tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))

#         tmp_gpu *= LMBD
nufft.prg.cMultiplyScalar( dtype(LMBD), tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))

rhs += tmp_gpu

#         print(rhs.get())
#         in_cDiff = nufft.thr.copy_array(dd[1])
#
#         in_cDiff -= bb[1]
#
# #         out_cDiff = nufft.thr.empty_like(in_cDiff)
#         nufft.prg.cDiff(dt_indx[1], in_cDiff, tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))
#
# #         tmp_gpu *= LMBD
#         nufft.prg.cMultiplyScalar( dtype(LMBD), tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))
#
#         rhs += tmp_gpu

#         print(rhs.get())

# Note K = F' uker F
# so K-1 ~ F
#         xkp1 = nufft.k2xx(nufft.xx2k(rhs) / uker)
xx = nufft.thr.copy_array(rhs)

k = nufft._xx2k_device(xx)

k /= uker

#         nufft.k_Kd2 = nufft.thr.copy_array(nufft.k_Kd)
xkp1 = nufft._k2xx_device(k)
#         xkp1 = nufft.thr.copy_array(nufft.x_Nd)
#         print(xkp1.get())
#                 self._update_d(xkp1)

#         zz[0] = cDiff(xkp1,  d_indx[0])
#         zz[1] = cDiff(xkp1,  d_indx[1])
for pp in range(0, ndims):
nufft.prg.cDiff(d_indx[pp],  xkp1, zz[pp],  local_size=None, global_size=int(nufft.Ndprod))
#         nufft.prg.cDiff(d_indx[0],  xkp1, zz[0],  local_size=None, global_size=int(nufft.Ndprod))
#         nufft.prg.cDiff(d_indx[1],  xkp1, zz[1],  local_size=None, global_size=int(nufft.Ndprod))

#         zf = AHA(xkp1)  -AHy
zf = AHA(xkp1)

zf -= AHy

'''
soft-thresholding the edges
'''
for pp in range(0, ndims):
s_tmp[pp] = zz[pp] + bb[pp]
#         s1 = zz[0] + bb[0]
#
#         s2 = zz[1] + bb[1]

#         s = s1**2 + s2**2
#         s1 *= s1
#             nufft.prg.cMultiplyConjVec(s_tmp[pp], s_tmp[pp], tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))
for pp in range(0, ndims):
if pp > 0:
#                 s += tmp_gpu
nufft.prg.cHypot(s, s_tmp[pp], local_size=None, global_size=int(nufft.Ndprod))
#                 nufft.thr.synchronize()
else: # pp == 0
s = nufft.thr.copy_array(s_tmp[pp])
#         nufft.prg.cMultiplyConjVec(s1, s1, tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))
#         s = nufft.thr.copy_array(tmp_gpu)
# #         s2 *= s2
#         nufft.prg.cMultiplyConjVec(s2, s2, tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))
#         s += tmp_gpu

#         s = s1 + s2

#         nufft.prg.cSqrt(s, local_size=None, global_size=int(nufft.Ndprod))

s += 1e-6

threshold_value = dtype(1/LMBD)
#         r =(s > threshold_value)*(s-threshold_value)/s#numpy.maximum(s - threshold_value ,  0.0)/s
nufft.prg.cAnisoShrink(threshold_value, s, tmp_gpu, local_size=None, global_size=int(nufft.Ndprod))
tmp_gpu /=s

#         dd[0] = s1*r
#         dd[1] = s2*r
#         dd[0] = s1*tmp_gpu
for pp in range(0, ndims):
nufft.prg.cMultiplyVec(s_tmp[pp], tmp_gpu, dd[pp], local_size=None, global_size=int(nufft.Ndprod))
#         nufft.prg.cMultiplyVec(s1, tmp_gpu, dd[0], local_size=None, global_size=int(nufft.Ndprod))

#         dd[1] = s2*tmp_gpu

#         nufft.prg.cMultiplyVec(s2, tmp_gpu, dd[1], local_size=None, global_size=int(nufft.Ndprod))

#         tmp_gpu = zf+bf

#         threshold_value=dtype(1.0/mu)

#         df.real =0.0+ (df.real>threshold_value)*(df.real - threshold_value) +(df.real<= - threshold_value)*(df.real+threshold_value)
#         df.imag = 0.0+(df.imag>threshold_value)*(df.imag - threshold_value) +(df.imag<= - threshold_value)*(df.imag+threshold_value)

#         nufft.prg.cAnisoShrink(threshold_value,  tmp_gpu, df, local_size=None, global_size=int(nufft.Ndprod))

#                 df =     sy
# end of shrinkage
for pp in range(0, ndims):
bb[pp] += zz[pp] - dd[pp]
#         bb[0] += zz[0] - dd[0]
#         bb[1] += zz[1] - dd[1]
#         bf += zf - df
#                 self._update_b() # update b based on the current u
#         print(outer)

AHyk -= zf # Linearized Bregman iteration f^k+1 = f^k + f - Au

#     print(xkp1.get())
#             print(outer)
#     print('here')
#     nufft.x_Nd = nufft.thr.copy_array(xkp1)
return xkp1

def _pipe_density(nufft,maxiter):
"""
Private: create the density function in the data space by a iterative solution
Pipe et al. 1999
"""

try:
if maxiter < nufft.last_iter:

W = nufft.st['W']
else: #maxiter > nufft.last_iter
W = nufft.st['W']
for pp in range(0,maxiter - nufft.last_iter):

#             E = nufft.st['p'].dot(V1.dot(W))
W = (W/E)
nufft.last_iter = maxiter
except:
W = nufft.thr.copy_array(nufft.y)
#         nufft.prg.cMultiplyScalar(nufft.zero_scalar, W, local_size=None, global_size=int(nufft.M))
W.fill(0.0 + 0.0j)
#         V1= nufft.st['p'].getH()
#     VVH = V.dot(V.getH())

for pp in range(0,1):
#             E = nufft.st['p'].dot(V1.dot(W))

W /= E
#                 nufft.prg.cMultiplyVecInplace(self.SnGPUArray, self.x_Nd, local_size=None, global_size=int(self.Ndprod))

return W

def solve(nufft,gy, solver=None,  maxiter=30, *args, **kwargs):
"""
The solve function of NUFFT_hsa.
The current version supports solvers = 'cg' or 'L1TVOLS'.

:param nufft: NUFFT_hsa object
:param y: (M,) or (M, batch) array, non-uniform data. If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
:type y: numpy.complex64 reikna array
:return: x: Nd or Nd + (batch, ) image. L1TVOLS always returns Nd. 'cg' returns Nd + (batch, ) in batch mode.
:rtype: x: reikna array, complex64.
"""
# define the reduction kernel on the device
#     if None ==  solver:
#         solver  =   'cg'
if 'L1TVLAD' == solver:
x2=L1TVLAD(nufft, gy, maxiter=maxiter, *args, **kwargs  )
#         x2 = nufft.thr.copy_array(nufft.x_Nd)
return x2
elif 'L1TVOLS' == solver:
x2=L1TVOLS(nufft, gy, maxiter=maxiter, *args, **kwargs  )
#         x2 = nufft.thr.copy_array(nufft.x_Nd)
return x2
elif 'dc'   ==  solver:
"""
Density compensation method
nufft.st['W'] will be computed if doesn't exist
If nufft.st['W'] exist then x2 = nufft.adjoint(nufft.st['W']*y)
input:
y: (M,) array
output:
x2: Nd array
"""
print(solver, ":density compensation method. I won't recommend it as the GPU version is not needed! Try the CPU version")

#          nufft.st['W'] = nufft._pipe_density(maxiter=maxiter,*args, **kwargs)
#
#          x2 = nufft.adjoint(nufft.st['W']*gy)
return x2
#             return gx
elif 'cg' == solver:

from reikna.algorithms import Reduce, Predicate, predicate_sum

nufft.reduce_sum = Reduce(numpy.zeros(nufft.Kd, dtype = nufft.dtype), predicate_sum(dtype)).compile(nufft.thr)
#         nufft.reduce_sum  = nufft.reduce_sum.compile(nufft.thr)

# update: b = spH * gy
b = nufft._y2k_legacy(gy)

# Initialize x = b
x   =   nufft.thr.copy_array( b)
rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0+0.0j)
nufft.reduce_sum(rsold, x)
#         print('x',rsold)

# initialize r = b - A * x
r   =   nufft.thr.empty_like( b)

#         r.fill(0.0 + 0.0j)
y_tmp = nufft._k2y_legacy(x)

Ax = nufft._y2k_legacy(y_tmp)

del y_tmp
rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0 + 0.0j)
nufft.reduce_sum(rsold, Ax)
#         print('Ax',rsold)
nufft.prg.cAddVec(b, - Ax, r , local_size=None, global_size = int(nufft.batch * nufft.Kdprod))

#         nufft.thr.synchronize()
# p = r
p   =   nufft.thr.copy_array(r)

# rsold = r' * r
tmp_array = nufft.thr.empty_like( r)
#         tmp_array.fill(0.0 + 0.0j)
nufft.prg.cMultiplyConjVec(r, r, tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#         nufft.thr.synchronize()
rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsold.fill(0.0 + 0.0j)
nufft.reduce_sum(rsold, tmp_array)

# allocate Ap
#         Ap  =   nufft.thr.empty_like( b)

rsnew = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         rsnew.fill(0.0 + 0.0j)
tmp_sum = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
#         tmp_sum.fill(0.0 + 0.0j)
for pp in range(0, maxiter):

tmp_p = nufft._k2y_legacy(p)
Ap = nufft._y2k_legacy(tmp_p)
del tmp_p
#             alpha = rs_old/(p'*Ap)
nufft.prg.cMultiplyConjVec(p, Ap, tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
nufft.reduce_sum(tmp_sum, tmp_array)

alpha = rsold / tmp_sum
#             alpha_cpu = alpha.get()
#             if numpy.isnan(alpha_cpu):
#                 alpha_cpu = 0 # avoid singularity

#             print(tmp_sum, alpha, rsold)
#             print(pp,rsold , alpha, numpy.sum(tmp_array.get()) )
# x = x + alpha*p
p2 = nufft.thr.copy_array(p)

nufft.prg.cMultiplyScalar(alpha.get(), p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
#             nufft.prg.cAddVec(x, alpha, local_size=None, global_size=int(nufft.Kdprod))
x += p2

# r = r - alpha * Ap
p2= nufft.thr.copy_array(Ap)
#             nufft.thr.synchronize()
nufft.prg.cMultiplyScalar(alpha.get(), p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
r -= p2
#             print(pp, numpy.sum(x.get()), numpy.sum(r.get()))
# rs_new = r'*r

nufft.prg.cMultiplyConjVec(r,    r,  tmp_array, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
nufft.reduce_sum(rsnew, tmp_array)

# tmp_sum = p = r + (rs_new/rs_old)*p
beta = rsnew/rsold
#             beta_cpu = beta.get()
#             if numpy.isnan(beta_cpu):
#                 beta_cpu = 0
#             print(beta, rsnew, rsold)
p2= nufft.thr.copy_array(p)
nufft.prg.cMultiplyScalar(beta.get(),   p2,  local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
nufft.prg.cAddVec(r, p2, p, local_size=None, global_size=int(nufft.batch * nufft.Kdprod))
#             nufft.thr.synchronize()
p = r + p2

rsold =nufft.thr.copy_array( rsnew)
#             nufft.thr.synchronize()
# end of iteration

# copy result to k_Kd2
#         nufft.k_Kd2 = nufft.thr.copy_array(x)

# inverse FFT: k_Kd2 -> x_Nd
x2 = nufft._k2xx_device(x) # x is the solved k space

# rescale the SnGPUArray
# x2 /= nufft.volume['gpu_sense2']
#         x3 = nufft.x2s(x2) # combine multi-coil to single-coil
try:
x2 /= nufft.volume['SnGPUArray']
except:

nufft.prg.cTensorMultiply(numpy.uint32(nufft.batch),
numpy.uint32(nufft.tSN['Tdims']),
nufft.tSN['Td'],
nufft.tSN['Td_elements'],
nufft.tSN['invTd_elements'],
nufft.tSN['tensor_sn'],
x2,
numpy.uint32(1), # division, 1 is true
local_size = None, global_size = int(nufft.batch*nufft.Ndprod))

return x2``````