# solve_hsa.py
"""
HSA solvers
======================================
"""
"""
- Bugfix: fix the instability of cg due to alpha and beta which must be fetched from the device
"""
import numpy, scipy
dtype = numpy.complex64
# def L1TVLAD():
import scipy
import numpy
from ..src._helper import helper
def cDiff(x, d_indx):
    """
    (stable) Compute image gradient

    Work with indxmap_diff(Nd).

    :param x: input array; it is copied first, so the caller's data is
              left untouched
    :param d_indx: flat index map selecting, for every element, the
                   element it is differenced against
    :return: C-ordered copy of ``x`` whose flattened content is
             ``x.flat[d_indx] - x.flat``
    """
    result = numpy.asarray(x.copy(), order='C')
    # Gather the indexed neighbours before anything is written back.
    neighbours = result.flat[d_indx]
    result.flat = neighbours - result.flat
    return result
def _create_kspace_sampling_density(nufft):
    """
    (stable) Compute k-space sampling density from the nufft object

    A vector of ones with ``nufft.st['M']`` complex64 entries is sent to the
    device and passed through ``nufft.y2k``; the magnitude of the result
    (fetched back to the host) is the density.

    Side effect: the density is also stored in ``nufft.st['w']``.

    :param nufft: NUFFT_hsa object
    :return: the array stored in ``nufft.st['w']``
    """
    unit_samples = numpy.ones((nufft.st['M'],), dtype=numpy.complex64)
    gridded = nufft.y2k(nufft.thr.to_device(unit_samples))
    nufft.st['w'] = numpy.abs(gridded.get())
    # see __init__() in class "nufft"
    return nufft.st['w']
# def _create_laplacian_kernel(nufft):
# #===============================================================================
# # # # Laplacian operator, convolution kernel in spatial domain
# # # related to constraint
# #===============================================================================
# uker = numpy.zeros(nufft.st['Kd'][:],dtype=numpy.complex64,order='C')
# n_dims= numpy.size(nufft.st['Nd'])
#
# if n_dims == 1:
# uker[0] = -2.0
# uker[1] = 1.0
# uker[-1] = 1.0
# elif n_dims == 2:
# uker[0,0] = -4.0
# uker[1,0] = 1.0
# uker[-1,0] = 1.0
# uker[0,1] = 1.0
# uker[0,-1] = 1.0
# elif n_dims == 3:
# uker[0,0,0] = -6.0
# uker[1,0,0] = 1.0
# uker[-1,0,0] = 1.0
# uker[0,1,0] = 1.0
# uker[0,-1,0] = 1.0
# uker[0,0,1] = 1.0
# uker[0,0,-1] = 1.0
#
# uker =numpy.fft.fftn(uker) #, self.nufftobj.st['Kd'], range(0,numpy.ndim(uker)))
# return uker
# def GBPDNA(nufft. gy, maxiter, rho):
# def A(x):
# gy = nufft.forward(x)
# return gy
# def AH(gy):
# x2 = nufft.adjoint(gy)
# return x2
def L1TVLAD(nufft, gy, maxiter, rho):  # main function of solver
    """
    (testing) L1-total variation regularized least absolute deviation

    Runs ``maxiter`` outer iterations of a split-variable scheme with a
    linearized Bregman update, using ``nufft.thr`` for device arrays and
    ``nufft.prg`` for the kernels.  Every iteration

    1. assembles the right-hand side from the data term and the edge
       variables,
    2. solves ``K x = rhs`` by a division by ``uker`` in k-space,
    3. shrinks the edge variables ``dd`` and the data-fidelity variable
       ``df``,
    4. updates the Bregman variables ``bb`` / ``bf`` and the data term
       ``AHyk``.

    Side effect: ``nufft.st['w']`` is (re)computed through
    ``_create_kspace_sampling_density``.

    :param nufft: NUFFT_hsa object
    :param gy: non-uniform data, handed unchanged to ``nufft.adjoint``
    :param maxiter: number of outer iterations
    :param rho: regularization weight; ``LMBD = rho * mu`` with ``mu = 1.0``.
                ``1 / LMBD`` is evaluated, so ``rho`` must be non-zero.
    :return: the image returned by ``nufft.k2xx`` in the last iteration, or
             a zero array of shape ``nufft.st['Nd']`` if no iteration runs
    """
    mu = 1.0
    LMBD = rho * mu

    thr = nufft.thr
    prg = nufft.prg

    # Note K = F' uker F, so K^-1 is applied by dividing by uker in k-space.
    uker_cpu = (mu * _create_kspace_sampling_density(nufft)
                - LMBD * helper.create_laplacian_kernel(nufft))  # on cpu
    uker = thr.to_device(uker_cpu.astype(numpy.complex64))

    AHy = nufft.adjoint(gy)

    # A zero image used as the template for every work buffer.
    z_gpu = thr.to_device(
        numpy.zeros(nufft.st['Nd'], dtype=numpy.complex64, order='C'))
    xkp1 = thr.copy_array(z_gpu)
    AHyk = thr.copy_array(z_gpu)

    # Index maps for the finite differences (d_indx) and the transposed
    # differences (dt_indx), one per image dimension, moved to the device.
    d_indx, dt_indx = helper.indxmap_diff(nufft.st['Nd'])
    ndims = len(nufft.st['Nd'])
    for jj in range(ndims):  # n_dims + 1 for wavelets
        d_indx[jj] = thr.to_device(d_indx[jj])
        dt_indx[jj] = thr.to_device(dt_indx[jj])

    # Per-dimension split variables: differences (zz), Bregman variables
    # (bb) and shrunk edges (dd).
    zz = [thr.copy_array(z_gpu) for _ in range(ndims)]
    bb = [thr.copy_array(z_gpu) for _ in range(ndims)]
    dd = [thr.copy_array(z_gpu) for _ in range(ndims)]

    # Split variables of the data-fidelity term.
    bf = thr.copy_array(z_gpu)
    df = thr.copy_array(z_gpu)

    tmp_gpu = thr.copy_array(z_gpu)

    # Loop invariants: kernel launch size and the scalars fed to the kernels.
    n_elems = int(nufft.Ndprod)
    mu_scalar = dtype(mu)
    lmbd_scalar = dtype(LMBD)
    edge_threshold = dtype(1 / LMBD)
    data_threshold = dtype(1.0 / mu)

    for _ in numpy.arange(0, maxiter):
        # Solve K u = rhs with
        #   rhs = mu * (AHyk + df - bf)
        #         + LMBD * sum_p cDiff(dd[p] - bb[p], dt_indx[p])
        rhs = thr.copy_array(AHyk)
        rhs += df
        rhs -= bf
        prg.cMultiplyScalar(mu_scalar, rhs,
                            local_size=None, global_size=n_elems)

        for pp in range(ndims):
            in_cDiff = thr.copy_array(dd[pp])
            in_cDiff -= bb[pp]
            # NOTE(review): prg.cDiff(index, in, out) presumably mirrors the
            # CPU cDiff() of this module -- confirm against the kernel.
            prg.cDiff(dt_indx[pp], in_cDiff, tmp_gpu,
                      local_size=None, global_size=n_elems)
            prg.cMultiplyScalar(lmbd_scalar, tmp_gpu,
                                local_size=None, global_size=n_elems)
            rhs += tmp_gpu

        # xkp1 = k2xx(xx2k(rhs) / uker); rhs is not read again in this
        # iteration, so it is passed on without an extra copy.
        k = nufft.xx2k(rhs)
        k /= uker
        xkp1 = nufft.k2xx(k)

        # zz[p] = cDiff(xkp1, d_indx[p])
        for pp in range(ndims):
            prg.cDiff(d_indx[pp], xkp1, zz[pp],
                      local_size=None, global_size=n_elems)

        # zf = AHA(xkp1) - AHy
        zf = nufft.selfadjoint(xkp1)
        zf -= AHy

        # soft-thresholding the edges
        s_tmp = [zz[pp] + bb[pp] for pp in range(ndims)]

        # s starts as a copy of the first component; the remaining ones are
        # folded in by cHypot (in place on s).
        s = thr.copy_array(s_tmp[0])
        for pp in range(1, ndims):
            prg.cHypot(s, s_tmp[pp], local_size=None, global_size=n_elems)
        s += 1e-6  # keeps the division by s below away from zero

        # tmp_gpu = shrink(s, 1 / LMBD) / s, the per-pixel scaling of edges
        prg.cAnisoShrink(edge_threshold, s, tmp_gpu,
                         local_size=None, global_size=n_elems)
        tmp_gpu /= s

        # dd[p] = s_tmp[p] * tmp_gpu
        for pp in range(ndims):
            prg.cMultiplyVec(s_tmp[pp], tmp_gpu, dd[pp],
                             local_size=None, global_size=n_elems)

        # Shrink the data-fidelity variable: df = shrink(zf + bf, 1 / mu)
        tmp_gpu = zf + bf
        prg.cAnisoShrink(data_threshold, tmp_gpu, df,
                         local_size=None, global_size=n_elems)
        # end of shrinkage

        # Update the Bregman variables from the current estimate.
        for pp in range(ndims):
            bb[pp] += zz[pp] - dd[pp]
        bf += zf - df

        AHyk -= zf  # Linearized Bregman iteration f^k+1 = f^k + f - Au

    return xkp1
def L1TVOLS(nufft, gy, maxiter, rho):  # main function of solver
    """
    L1-total variation regularized ordinary least square

    Runs ``maxiter`` outer iterations of a split-variable scheme with a
    linearized Bregman update, using ``nufft.thr`` for device arrays and
    ``nufft.prg`` for the kernels.  Every iteration

    1. assembles the right-hand side from the data term and the edge
       variables,
    2. solves ``K x = rhs`` by a division by ``uker`` in k-space,
    3. shrinks the edge variables ``dd``,
    4. updates the Bregman variables ``bb`` and the data term ``AHyk``.

    Side effect: ``nufft.st['w']`` is (re)computed through
    ``_create_kspace_sampling_density``.

    :param nufft: NUFFT_hsa object
    :param gy: non-uniform data, handed unchanged to ``nufft.adjoint``
    :param maxiter: number of outer iterations
    :param rho: regularization weight; ``LMBD = rho * mu`` with ``mu = 1.0``.
                ``1 / LMBD`` is evaluated, so ``rho`` must be non-zero.
    :return: the image returned by ``nufft.k2xx`` in the last iteration, or
             a zero array of shape ``nufft.st['Nd']`` if no iteration runs
    """
    mu = 1.0
    LMBD = rho * mu

    thr = nufft.thr
    prg = nufft.prg

    # Note K = F' uker F, so K^-1 is applied by dividing by uker in k-space.
    uker_cpu = (mu * _create_kspace_sampling_density(nufft)
                - LMBD * helper.create_laplacian_kernel(nufft))  # on cpu
    uker = thr.to_device(uker_cpu.astype(numpy.complex64))

    AHy = nufft.adjoint(gy)

    # A zero image used as the template for every work buffer.
    z_gpu = thr.to_device(
        numpy.zeros(nufft.st['Nd'], dtype=numpy.complex64, order='C'))
    xkp1 = thr.copy_array(z_gpu)
    AHyk = thr.copy_array(z_gpu)

    # Index maps for the finite differences (d_indx) and the transposed
    # differences (dt_indx), one per image dimension, moved to the device.
    d_indx, dt_indx = helper.indxmap_diff(nufft.st['Nd'])
    ndims = len(nufft.st['Nd'])
    for jj in range(ndims):  # n_dims + 1 for wavelets
        d_indx[jj] = thr.to_device(d_indx[jj])
        dt_indx[jj] = thr.to_device(dt_indx[jj])

    # Per-dimension split variables: differences (zz), Bregman variables
    # (bb) and shrunk edges (dd).
    zz = [thr.copy_array(z_gpu) for _ in range(ndims)]
    bb = [thr.copy_array(z_gpu) for _ in range(ndims)]
    dd = [thr.copy_array(z_gpu) for _ in range(ndims)]

    tmp_gpu = thr.copy_array(z_gpu)

    # Loop invariants: kernel launch size and the scalars fed to the kernels.
    n_elems = int(nufft.Ndprod)
    mu_scalar = dtype(mu)
    lmbd_scalar = dtype(LMBD)
    edge_threshold = dtype(1 / LMBD)

    for _ in numpy.arange(0, maxiter):
        # Solve K u = rhs with
        #   rhs = mu * AHyk + LMBD * sum_p cDiff(dd[p] - bb[p], dt_indx[p])
        rhs = thr.copy_array(AHyk)
        prg.cMultiplyScalar(mu_scalar, rhs,
                            local_size=None, global_size=n_elems)

        for pp in range(ndims):
            in_cDiff = thr.copy_array(dd[pp])
            in_cDiff -= bb[pp]
            # NOTE(review): prg.cDiff(index, in, out) presumably mirrors the
            # CPU cDiff() of this module -- confirm against the kernel.
            prg.cDiff(dt_indx[pp], in_cDiff, tmp_gpu,
                      local_size=None, global_size=n_elems)
            prg.cMultiplyScalar(lmbd_scalar, tmp_gpu,
                                local_size=None, global_size=n_elems)
            rhs += tmp_gpu

        # xkp1 = k2xx(xx2k(rhs) / uker); rhs is not read again in this
        # iteration, so it is passed on without an extra copy.
        k = nufft.xx2k(rhs)
        k /= uker
        xkp1 = nufft.k2xx(k)

        # zz[p] = cDiff(xkp1, d_indx[p])
        for pp in range(ndims):
            prg.cDiff(d_indx[pp], xkp1, zz[pp],
                      local_size=None, global_size=n_elems)

        # zf = AHA(xkp1) - AHy
        zf = nufft.selfadjoint(xkp1)
        zf -= AHy

        # soft-thresholding the edges
        s_tmp = [zz[pp] + bb[pp] for pp in range(ndims)]

        # s starts as a copy of the first component; the remaining ones are
        # folded in by cHypot (in place on s).
        s = thr.copy_array(s_tmp[0])
        for pp in range(1, ndims):
            prg.cHypot(s, s_tmp[pp], local_size=None, global_size=n_elems)
        s += 1e-6  # keeps the division by s below away from zero

        # tmp_gpu = shrink(s, 1 / LMBD) / s, the per-pixel scaling of edges
        prg.cAnisoShrink(edge_threshold, s, tmp_gpu,
                         local_size=None, global_size=n_elems)
        tmp_gpu /= s

        # dd[p] = s_tmp[p] * tmp_gpu
        for pp in range(ndims):
            prg.cMultiplyVec(s_tmp[pp], tmp_gpu, dd[pp],
                             local_size=None, global_size=n_elems)
        # end of shrinkage

        # Update the Bregman variables from the current estimate.
        for pp in range(ndims):
            bb[pp] += zz[pp] - dd[pp]

        AHyk -= zf  # Linearized Bregman iteration f^k+1 = f^k + f - Au

    return xkp1
def _pipe_density(nufft, maxiter):
    """
    Private: create the density function in the data space by a iterative solution
    Pipe et al. 1999

    The weights are refined by repeating ``W <- W / forward(adjoint(W))``.
    When both ``nufft.last_iter`` and ``nufft.st['W']`` are available the
    cached weights are reused and only the missing
    ``maxiter - nufft.last_iter`` passes are run; otherwise the iteration
    starts from all-ones weights shaped like ``nufft.y`` and runs
    ``maxiter`` passes.

    Side effect: ``nufft.last_iter`` is set to ``maxiter`` whenever the
    refinement loop is entered.  ``nufft.st['W']`` is NOT written here; the
    caller stores the returned weights.

    :param nufft: NUFFT_hsa object
    :param maxiter: total number of refinement passes wanted
    :return: the density compensation weights W
    """
    try:
        # Resume from the weights left by a previous call.
        W = nufft.st['W']
        remaining = maxiter - nufft.last_iter
    except (AttributeError, KeyError, TypeError):
        # No usable cache: start from W = 1.  The start must be non-zero,
        # because an all-zero W gives E = 0 and the update becomes 0 / 0.
        W = nufft.thr.copy_array(nufft.y)
        W.fill(1.0 + 0.0j)
        remaining = maxiter

    if remaining >= 0:
        for _ in range(remaining):
            E = nufft.forward(nufft.adjoint(W))
            # Out-of-place, so a cached nufft.st['W'] is never half-updated.
            W = W / E
        nufft.last_iter = maxiter
    # remaining < 0: the cached weights already had more passes than asked
    # for, so they are returned unchanged.
    return W
def _cg(nufft, gy, maxiter):
    """
    Private: conjugate gradient solve in k-space, used by solve(..., 'cg').

    Iterates on ``A = y2k(k2y(.))`` with right-hand side ``b = y2k(gy)``,
    then converts the solved k-space to an image with ``nufft.k2xx`` and
    divides by the scaling factor.

    Side effect: a compiled reikna ``Reduce`` (sum) is stored in
    ``nufft.reduce_sum``.

    :param nufft: NUFFT_hsa object
    :param gy: non-uniform data, handed unchanged to ``nufft.y2k``
    :param maxiter: number of CG iterations
    :return: the rescaled image returned by ``nufft.k2xx``
    """
    from reikna.algorithms import Reduce, predicate_sum

    # define the reduction kernel on the device
    nufft.reduce_sum = Reduce(
        numpy.zeros(nufft.multi_Kd, dtype=nufft.dtype),
        predicate_sum(dtype)).compile(nufft.thr)
    n_elems = int(nufft.batch * nufft.Kdprod)

    # update: b = spH * gy
    b = nufft.y2k(gy)
    # Initialize x = b
    x = nufft.thr.copy_array(b)

    # initialize r = b - A * x
    r = nufft.thr.empty_like(b)
    Ax = nufft.y2k(nufft.k2y(x))
    nufft.prg.cAddVec(b, -Ax, r, local_size=None, global_size=n_elems)

    # p = r
    p = nufft.thr.copy_array(r)

    # rsold = r' * r
    tmp_array = nufft.thr.empty_like(r)
    nufft.prg.cMultiplyConjVec(r, r, tmp_array,
                               local_size=None, global_size=n_elems)
    rsold = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
    nufft.reduce_sum(rsold, tmp_array)

    rsnew = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)
    tmp_sum = nufft.thr.empty_like(nufft.reduce_sum.parameter.output)

    for _ in range(0, maxiter):
        Ap = nufft.y2k(nufft.k2y(p))

        # alpha = rs_old / (p' * Ap)
        nufft.prg.cMultiplyConjVec(p, Ap, tmp_array,
                                   local_size=None, global_size=n_elems)
        nufft.reduce_sum(tmp_sum, tmp_array)
        # alpha must be fetched from the device before it is handed to the
        # scalar kernel; one transfer serves both uses below.
        alpha = (rsold / tmp_sum).get()

        # x = x + alpha * p
        scaled = nufft.thr.copy_array(p)
        nufft.prg.cMultiplyScalar(alpha, scaled,
                                  local_size=None, global_size=n_elems)
        x += scaled

        # r = r - alpha * Ap
        scaled = nufft.thr.copy_array(Ap)
        nufft.prg.cMultiplyScalar(alpha, scaled,
                                  local_size=None, global_size=n_elems)
        r -= scaled

        # rs_new = r' * r
        nufft.prg.cMultiplyConjVec(r, r, tmp_array,
                                   local_size=None, global_size=n_elems)
        nufft.reduce_sum(rsnew, tmp_array)

        # p = r + (rs_new / rs_old) * p
        beta = (rsnew / rsold).get()  # fetched from the device, like alpha
        scaled = nufft.thr.copy_array(p)
        nufft.prg.cMultiplyScalar(beta, scaled,
                                  local_size=None, global_size=n_elems)
        p = r + scaled

        rsold = nufft.thr.copy_array(rsnew)
    # end of iteration

    # inverse FFT: solved k-space -> image
    x2 = nufft.k2xx(x)

    # rescale by the scaling factor
    try:
        x2 /= nufft.volume['SnGPUArray']
    except Exception:
        # Fall back to the tensor form of the scaling factor when the dense
        # array is unavailable or the division fails.
        nufft.prg.cTensorMultiply(numpy.uint32(nufft.batch),
                                  numpy.uint32(nufft.tSN['Tdims']),
                                  nufft.tSN['Td'],
                                  nufft.tSN['Td_elements'],
                                  nufft.tSN['invTd_elements'],
                                  nufft.tSN['tensor_sn'],
                                  x2,
                                  numpy.uint32(1),  # division, 1 is true
                                  local_size=None,
                                  global_size=int(nufft.batch * nufft.Ndprod))
    return x2


def solve(nufft, gy, solver=None, maxiter=30, *args, **kwargs):
    """
    The solve function of NUFFT_hsa.

    Supported values of ``solver``:

    - ``'cg'``: conjugate gradient in k-space (extra arguments are ignored)
    - ``'L1TVOLS'``: L1-total variation regularized ordinary least square
    - ``'L1TVLAD'``: L1-total variation regularized least absolute deviation
    - ``'dc'``: density compensation, ``nufft.adjoint(nufft.st['W'] * gy)``;
      ``nufft.st['W']`` is (re)computed by ``_pipe_density``

    Any other value, including the default ``None``, selects nothing and the
    function returns ``None``.

    :param nufft: NUFFT_hsa object
    :param gy: (M,) array, non-uniform data. If batch is provided, 'cg' and 'L1TVOLS' returns different image shape.
    :type gy: numpy.complex64 reikna array
    :param solver: name of the solver, see above
    :param maxiter: number of iterations handed to the selected solver
    :param args: extra positional arguments for the selected solver
                 (e.g. ``rho`` for 'L1TVOLS' / 'L1TVLAD')
    :param kwargs: extra keyword arguments for the selected solver
    :return: x: Nd image. L1TVOLS always returns Nd. 'cg' returns Nd.
    :rtype: x: reikna array, complex64.
    """
    # maxiter is forwarded positionally: forwarding it as a keyword ahead of
    # *args makes every extra positional argument collide with it.
    if 'L1TVLAD' == solver:
        return L1TVLAD(nufft, gy, maxiter, *args, **kwargs)
    elif 'L1TVOLS' == solver:
        return L1TVOLS(nufft, gy, maxiter, *args, **kwargs)
    elif 'dc' == solver:
        # Density compensation method
        # nufft.st['W'] will be computed if doesn't exist
        # If nufft.st['W'] exist then x2 = nufft.adjoint(nufft.st['W']*y)
        # input:
        #     y: (M,) array
        # output:
        #     x2: Nd array
        nufft.st['W'] = _pipe_density(nufft, maxiter, *args, **kwargs)
        return nufft.adjoint(nufft.st['W'] * gy)
    elif 'cg' == solver:
        return _cg(nufft, gy, maxiter)
    # Unrecognised solver: nothing is computed.
    return None