https://github.com/mikgroup/sigpy
Tip revision: 63348600f1dc8fb361e112d0be6fa14d69bac22c authored by Frank Ong on 04 March 2019, 00:44:51 UTC
Bump version: 0.1.0 → 0.1.0
Bump version: 0.1.0 → 0.1.0
Tip revision: 6334860
block.py
import numpy as np
import numba as nb
from sigpy import backend, config
__all__ = ['array_to_blocks', 'blocks_to_array']
def array_to_blocks(input, blk_shape, blk_strides):
"""Extract blocks from an array in a sliding window manner.
Args:
input (array): input array.
blk_shape (tuple): block shape. Must have same length as input ndim.
blk_strides (tuple): block strides.
Must have same length as input ndim.
Returns:
array: array of shape num_blks + blk_shape, where
num_blks = (input.shape - blk_shape + blk_strides) // blk_strides
Example:
>>> input = np.array([0, 1, 2, 3, 4, 5])
>>> print(array_to_blocks(input, [2], [2]))
[[0, 1],
[2, 3],
[4, 5]]
"""
ndim = input.ndim
if ndim != len(blk_shape) or ndim != len(blk_strides):
raise ValueError('Input must have same dimensions as blocks.')
num_blks = [(i - b + s) // s for i, b,
s in zip(input.shape, blk_shape, blk_strides)]
device = backend.get_device(input)
xp = device.xp
with device:
output = xp.zeros(num_blks + blk_shape, dtype=input.dtype)
if ndim == 1:
if device == backend.cpu_device:
_array_to_blocks1(output, input,
blk_shape[-1],
blk_strides[-1],
num_blks[-1])
else:
_array_to_blocks1_cuda(input,
blk_shape[-1],
blk_strides[-1],
num_blks[-1],
output,
size=num_blks[-1] * blk_shape[-1])
elif ndim == 2:
if device == backend.cpu_device:
_array_to_blocks2(output, input,
blk_shape[-1], blk_shape[-2],
blk_strides[-1], blk_strides[-2],
num_blks[-1], num_blks[-2])
else:
_array_to_blocks2_cuda(input,
blk_shape[-1], blk_shape[-2],
blk_strides[-1], blk_strides[-2],
num_blks[-1], num_blks[-2],
output,
size=num_blks[-1] * num_blks[-2] *
blk_shape[-1] * blk_shape[-2])
elif ndim == 3:
if device == backend.cpu_device:
_array_to_blocks3(output,
input,
blk_shape[-1],
blk_shape[-2],
blk_shape[-3],
blk_strides[-1],
blk_strides[-2],
blk_strides[-3],
num_blks[-1],
num_blks[-2],
num_blks[-3])
else:
_array_to_blocks3_cuda(input,
blk_shape[-1], blk_shape[-2],
blk_shape[-3],
blk_strides[-1], blk_strides[-2],
blk_strides[-3],
num_blks[-1], num_blks[-2],
num_blks[-3],
output,
size=num_blks[-1] * num_blks[-2] *
num_blks[-3] *
blk_shape[-1] * blk_shape[-2] *
blk_shape[-3])
return output
def blocks_to_array(input, oshape, blk_shape, blk_strides):
"""Accumulate blocks into an array in a sliding window manner.
Args:
input (array): input array of shape num_blks + blk_shape
oshape (tuple): output shape.
blk_shape (tuple): block shape. Must have same length as oshape.
blk_strides (tuple): block strides. Must have same length as oshape.
Returns:
array: array of shape oshape.
"""
ndim = len(blk_shape)
if 2 * ndim != input.ndim or ndim != len(blk_strides):
raise ValueError('Input must have same dimensions as blocks.')
num_blks = input.shape[:ndim]
device = backend.get_device(input)
xp = device.xp
with device:
output = xp.zeros(oshape, dtype=input.dtype)
if ndim == 1:
if device == backend.cpu_device:
_blocks_to_array1(output, input,
blk_shape[-1],
blk_strides[-1],
num_blks[-1])
else:
if np.issubdtype(input.dtype, np.floating):
_blocks_to_array1_cuda(input,
blk_shape[-1],
blk_strides[-1],
num_blks[-1],
output,
size=num_blks[-1] * blk_shape[-1])
else:
_blocks_to_array1_cuda_complex(input,
blk_shape[-1],
blk_strides[-1],
num_blks[-1],
output,
size=num_blks[-1] *
blk_shape[-1])
elif ndim == 2:
if device == backend.cpu_device:
_blocks_to_array2(output, input,
blk_shape[-1], blk_shape[-2],
blk_strides[-1], blk_strides[-2],
num_blks[-1], num_blks[-2])
else:
if np.issubdtype(input.dtype, np.floating):
_blocks_to_array2_cuda(input,
blk_shape[-1], blk_shape[-2],
blk_strides[-1], blk_strides[-2],
num_blks[-1], num_blks[-2],
output,
size=num_blks[-1] * num_blks[-2] *
blk_shape[-1] * blk_shape[-2])
else:
_blocks_to_array2_cuda_complex(
input,
blk_shape[-1], blk_shape[-2],
blk_strides[-1], blk_strides[-2],
num_blks[-1], num_blks[-2],
output,
size=num_blks[-1] * num_blks[-2] *
blk_shape[-1] * blk_shape[-2])
elif ndim == 3:
if device == backend.cpu_device:
_blocks_to_array3(output,
input,
blk_shape[-1],
blk_shape[-2],
blk_shape[-3],
blk_strides[-1],
blk_strides[-2],
blk_strides[-3],
num_blks[-1],
num_blks[-2],
num_blks[-3])
else:
if np.issubdtype(input.dtype, np.floating):
_blocks_to_array3_cuda(
input,
blk_shape[-1], blk_shape[-2], blk_shape[-3],
blk_strides[-1], blk_strides[-2], blk_strides[-3],
num_blks[-1], num_blks[-2], num_blks[-3],
output,
size=num_blks[-1] * num_blks[-2] *
num_blks[-3] * blk_shape[-1] * blk_shape[-2] *
blk_shape[-3])
else:
_blocks_to_array3_cuda_complex(
input,
blk_shape[-1], blk_shape[-2], blk_shape[-3],
blk_strides[-1], blk_strides[-2],
blk_strides[-3],
num_blks[-1], num_blks[-2], num_blks[-3],
output,
size=num_blks[-1] * num_blks[-2] * num_blks[-3] *
blk_shape[-1] * blk_shape[-2] * blk_shape[-3])
return output
@nb.jit(nopython=True, cache=True)
def _array_to_blocks1(output, input, Bx, Sx, Nx):
for nx in range(Nx):
for bx in range(Bx):
ix = nx * Sx + bx
if ix < input.shape[-1]:
output[nx, bx] = input[ix]
@nb.jit(nopython=True, cache=True)
def _array_to_blocks2(output, input, Bx, By, Sx, Sy, Nx, Ny):
for ny in range(Ny):
for nx in range(Nx):
for by in range(By):
for bx in range(Bx):
iy = ny * Sy + by
ix = nx * Sx + bx
if ix < input.shape[-1] and iy < input.shape[-2]:
output[ny, nx, by, bx] = input[iy, ix]
@nb.jit(nopython=True, cache=True)
def _array_to_blocks3(output, input, Bx, By, Bz, Sx, Sy, Sz, Nx, Ny, Nz):
for nz in range(Nz):
for ny in range(Ny):
for nx in range(Nx):
for bz in range(Bz):
for by in range(By):
for bx in range(Bx):
iz = nz * Sz + bz
iy = ny * Sy + by
ix = nx * Sx + bx
if (ix < input.shape[-1] and
iy < input.shape[-2] and
iz < input.shape[-3]):
output[nz, ny, nx, bz, by,
bx] = input[iz, iy, ix]
@nb.jit(nopython=True, cache=True)
def _blocks_to_array1(output, input, Bx, Sx, Nx):
for nx in range(Nx):
for bx in range(Bx):
ix = nx * Sx + bx
if ix < output.shape[-1]:
output[ix] += input[nx, bx]
@nb.jit(nopython=True, cache=True)
def _blocks_to_array2(output, input, Bx, By, Sx, Sy, Nx, Ny):
for ny in range(Ny):
for nx in range(Nx):
for by in range(By):
for bx in range(Bx):
iy = ny * Sy + by
ix = nx * Sx + bx
if ix < output.shape[-1] and iy < output.shape[-2]:
output[iy, ix] += input[ny, nx, by, bx]
@nb.jit(nopython=True, cache=True)
def _blocks_to_array3(output, input, Bx, By, Bz, Sx, Sy, Sz, Nx, Ny, Nz):
for nz in range(Nz):
for ny in range(Ny):
for nx in range(Nx):
for bz in range(Bz):
for by in range(By):
for bx in range(Bx):
iz = nz * Sz + bz
iy = ny * Sy + by
ix = nx * Sx + bx
if (ix < output.shape[-1]
and iy < output.shape[-2]
and iz < output.shape[-3]):
output[iz, iy, ix] += input[nz,
ny, nx, bz, by, bx]
if config.cupy_enabled:
import cupy as cp
_array_to_blocks1_cuda = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 Sx, int32 Nx',
'raw T output',
"""
const int ndim = input.ndim;
int nx = i / Bx;
i -= nx * Bx;
int bx = i;
int ix = nx * Sx + bx;
if (ix < input.shape()[ndim - 1]) {
int input_idx[] = {ix};
int output_idx[] = {nx, bx};
output[output_idx] = input[input_idx];
}
""",
name='_array_to_blocks1_cuda')
_array_to_blocks2_cuda = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 By, '
'int32 Sx, int32 Sy, int32 Nx, int32 Ny',
'raw T output',
"""
const int ndim = input.ndim;
int ny = i / Bx / By / Nx;
i -= ny * Bx * By * Nx;
int nx = i / Bx / By;
i -= nx * Bx * By;
int by = i / Bx;
i -= by * Bx;
int bx = i;
int iy = ny * Sy + by;
int ix = nx * Sx + bx;
if (ix < input.shape()[ndim - 1] && iy < input.shape()[ndim - 2]) {
int input_idx[] = {iy, ix};
int output_idx[] = {ny, nx, by, bx};
output[output_idx] = input[input_idx];
}
""",
name='_array_to_blocks2_cuda')
_array_to_blocks3_cuda = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 By, int32 Bz, '
'int32 Sx, int32 Sy, int32 Sz, int32 Nx, int32 Ny, int32 Nz',
'raw T output',
"""
const int ndim = input.ndim;
int nz = i / Bx / By / Bz / Nx / Ny;
i -= nz * Bx * By * Bz * Nx * Ny;
int ny = i / Bx / By / Bz / Nx;
i -= ny * Bx * By * Bz * Nx;
int nx = i / Bx / By / Bz;
i -= nx * Bx * By * Bz;
int bz = i / Bx / By;
i -= bz * Bx * By;
int by = i / Bx;
i -= by * Bx;
int bx = i;
int iz = nz * Sz + bz;
int iy = ny * Sy + by;
int ix = nx * Sx + bx;
if (ix < input.shape()[ndim - 1] && iy < input.shape()[ndim - 2]
&& iz < input.shape()[ndim - 3]) {
int input_idx[] = {iz, iy, ix};
int output_idx[] = {nz, ny, nx, bz, by, bx};
output[output_idx] = input[input_idx];
}
""",
name='_array_to_blocks3_cuda')
_blocks_to_array1_cuda = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 Sx, int32 Nx',
'raw T output',
"""
const int ndim = output.ndim;
int nx = i / Bx;
i -= nx * Bx;
int bx = i;
int ix = nx * Sx + bx;
if (ix < output.shape()[ndim - 1]) {
int input_idx[] = {nx, bx};
int output_idx[] = {ix};
atomicAdd(&output[output_idx], input[input_idx]);
}
""",
name='_blocks_to_array1_cuda')
_blocks_to_array1_cuda_complex = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 Sx, int32 Nx',
'raw T output',
"""
const int ndim = output.ndim;
int nx = i / Bx;
i -= nx * Bx;
int bx = i;
int ix = nx * Sx + bx;
if (ix < output.shape()[ndim - 1]) {
int input_idx[] = {nx, bx};
int output_idx[] = {ix};
atomicAdd(reinterpret_cast<T::value_type*>(&(output[output_idx])),
input[input_idx].real());
atomicAdd(
reinterpret_cast<T::value_type*>(&(output[output_idx])) + 1,
input[input_idx].imag());
}
""",
name='_blocks_to_array1_cuda_complex')
_blocks_to_array2_cuda = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 By, int32 Sx, int32 Sy, '
'int32 Nx, int32 Ny',
'raw T output',
"""
const int ndim = output.ndim;
int ny = i / Bx / By / Nx;
i -= ny * Bx * By * Nx;
int nx = i / Bx / By;
i -= nx * Bx * By;
int by = i / Bx;
i -= by * Bx;
int bx = i;
int iy = ny * Sy + by;
int ix = nx * Sx + bx;
if (ix < output.shape()[ndim - 1] && iy < output.shape()[ndim - 2]) {
int input_idx[] = {ny, nx, by, bx};
int output_idx[] = {iy, ix};
atomicAdd(&output[output_idx], input[input_idx]);
}
""",
name='_blocks_to_array2_cuda')
_blocks_to_array2_cuda_complex = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 By, int32 Sx, int32 Sy, '
'int32 Nx, int32 Ny',
'raw T output',
"""
const int ndim = output.ndim;
int ny = i / Bx / By / Nx;
i -= ny * Bx * By * Nx;
int nx = i / Bx / By;
i -= nx * Bx * By;
int by = i / Bx;
i -= by * Bx;
int bx = i;
int iy = ny * Sy + by;
int ix = nx * Sx + bx;
if (ix < output.shape()[ndim - 1] && iy < output.shape()[ndim - 2]) {
int input_idx[] = {ny, nx, by, bx};
int output_idx[] = {iy, ix};
atomicAdd(reinterpret_cast<T::value_type*>(&(output[output_idx])),
input[input_idx].real());
atomicAdd(
reinterpret_cast<T::value_type*>(&(output[output_idx])) + 1,
input[input_idx].imag());
}
""",
name='_blocks_to_array2_cuda_complex')
_blocks_to_array3_cuda = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 By, int32 Bz, '
'int32 Sx, int32 Sy, int32 Sz, int32 Nx, int32 Ny, int32 Nz',
'raw T output',
"""
const int ndim = output.ndim;
int nz = i / Bx / By / Bz / Nx / Ny;
i -= nz * Bx * By * Bz * Nx * Ny;
int ny = i / Bx / By / Bz / Nx;
i -= ny * Bx * By * Bz * Nx;
int nx = i / Bx / By / Bz;
i -= nx * Bx * By * Bz;
int bz = i / Bx / By;
i -= bz * Bx * By;
int by = i / Bx;
i -= by * Bx;
int bx = i;
int iz = nz * Sz + bz;
int iy = ny * Sy + by;
int ix = nx * Sx + bx;
if (ix < output.shape()[ndim - 1] &&
iy < output.shape()[ndim - 2] &&
iz < output.shape()[ndim - 3]) {
int input_idx[] = {nz, ny, nx, bz, by, bx};
int output_idx[] = {iz, iy, ix};
atomicAdd(&output[output_idx], input[input_idx]);
}
""",
name='_blocks_to_array3_cuda')
_blocks_to_array3_cuda_complex = cp.ElementwiseKernel(
'raw T input, int32 Bx, int32 By, int32 Bz, '
'int32 Sx, int32 Sy, int32 Sz, int32 Nx, int32 Ny, int32 Nz',
'raw T output',
"""
const int ndim = output.ndim;
int nz = i / Bx / By / Bz / Nx / Ny;
i -= nz * Bx * By * Bz * Nx * Ny;
int ny = i / Bx / By / Bz / Nx;
i -= ny * Bx * By * Bz * Nx;
int nx = i / Bx / By / Bz;
i -= nx * Bx * By * Bz;
int bz = i / Bx / By;
i -= bz * Bx * By;
int by = i / Bx;
i -= by * Bx;
int bx = i;
int iz = nz * Sz + bz;
int iy = ny * Sy + by;
int ix = nx * Sx + bx;
if (ix < output.shape()[ndim - 1] &&
iy < output.shape()[ndim - 2] &&
iz < output.shape()[ndim - 3]) {
int input_idx[] = {nz, ny, nx, bz, by, bx};
int output_idx[] = {iz, iy, ix};
atomicAdd(reinterpret_cast<T::value_type*>(&(output[output_idx])),
input[input_idx].real());
atomicAdd(reinterpret_cast<T::value_type*>(
&(output[output_idx])) + 1, input[input_idx].imag());
}
""",
name='_blocks_to_array3_cuda_complex')