Content - a1ba365931f04726db5657d652c5bf00f5277af8 - e808e53/interpolate_generator.py

interpolate_generator.py
"""
Fast image interpolation using a pyramid.
"""

import halide as hl


def _func_list(name, size):
    """Return a list containing `size` Funcs, named `name_n` for n in 0..size-1."""
    return [hl.Func("%s_%d" % (name, i)) for i in range(size)]


@hl.alias(
    interpolate_Mullapudi2016={"autoscheduler": "Mullapudi2016"},
)
@hl.generator()
class interpolate:
    levels = hl.GeneratorParam(10)

    input_buf = hl.InputBuffer(hl.Float(32), 3)
    output_buf = hl.OutputBuffer(hl.Float(32), 3)

    def generate(self):
        g = self

        x, y, c = hl.vars("x y c")

        # Input must have four color channels - rgba
        g.input_buf.dim(2).set_bounds(0, 4)

        downsampled = _func_list("downsampled", g.levels)
        downx = _func_list("downx", g.levels)
        interpolated = _func_list("interpolated", g.levels)
        upsampled = _func_list("upsampled", g.levels)
        upsampledx = _func_list("upsampledx", g.levels)

        clamped = hl.BoundaryConditions.repeat_edge(g.input_buf)

        downsampled[0][x, y, c] = hl.select(
            c < 3,
            clamped[x, y, c] * clamped[x, y, 3],
            clamped[x, y, 3],
        )

        for l in range(1, g.levels):
            prev = downsampled[l - 1]

            if l == 4:
                # Also add a boundary condition at a middle pyramid level
                # to prevent the footprint of the downsamplings to extend
                # too far off the base image. Otherwise we look 512
                # pixels off each edge.
                w = g.input_buf.width() / (1 << (l - 1))
                h = g.input_buf.height() / (1 << (l - 1))
                prev = hl.lambda_func(
                    x, y, c, prev[hl.clamp(x, 0, w), hl.clamp(y, 0, h), c]
                )

            downx[l][x, y, c] = (
                prev[x * 2 - 1, y, c] + 2 * prev[x * 2, y, c] + prev[x * 2 + 1, y, c]
            ) * 0.25

            downsampled[l][x, y, c] = (
                downx[l][x, y * 2 - 1, c]
                + 2 * downx[l][x, y * 2, c]
                + downx[l][x, y * 2 + 1, c]
            ) * 0.25

        interpolated[g.levels - 1][x, y, c] = downsampled[g.levels - 1][x, y, c]

        for l in range(g.levels - 2, -1, -1):
            upsampledx[l][x, y, c] = (
                interpolated[l + 1][x / 2, y, c]
                + interpolated[l + 1][(x + 1) / 2, y, c]
            ) / 2
            upsampled[l][x, y, c] = (
                upsampledx[l][x, y / 2, c] + upsampledx[l][x, (y + 1) / 2, c]
            ) / 2
            alpha = 1.0 - downsampled[l][x, y, 3]
            interpolated[l][x, y, c] = (
                downsampled[l][x, y, c] + alpha * upsampled[l][x, y, c]
            )

        g.output_buf[x, y, c] = interpolated[0][x, y, c] / interpolated[0][x, y, 3]

        # Schedule
        if g.using_autoscheduler():
            # nothing
            pass
        elif g.target().has_gpu_feature():
            # 0.86ms on a 2060 RTX
            yo, yi, xo, xi, ci, xii, yii = hl.vars("yo yi xo xi ci xii yii")

            (
                g.output_buf.bound(x, 0, g.input_buf.width())
                .bound(y, 0, g.input_buf.height())
                .bound(c, 0, 3)
                .reorder(c, x, y)
                .tile(x, y, xi, yi, 32, 32, hl.TailStrategy.RoundUp)
                .tile(xi, yi, xii, yii, 2, 2)
                .gpu_blocks(x, y)
                .gpu_threads(xi, yi)
                .unroll(xii)
                .unroll(yii)
                .unroll(c)
            )

            for l in range(1, g.levels):
                (
                    downsampled[l]
                    .compute_root()
                    .reorder(c, x, y)
                    .unroll(c)
                    .gpu_tile(x, y, xi, yi, 16, 16)
                )

            for l in range(3, g.levels, 2):
                (
                    interpolated[l]
                    .compute_root()
                    .reorder(c, x, y)
                    .tile(x, y, xi, yi, 32, 32, hl.TailStrategy.RoundUp)
                    .tile(xi, yi, xii, yii, 2, 2)
                    .gpu_blocks(x, y)
                    .gpu_threads(xi, yi)
                    .unroll(xii)
                    .unroll(yii)
                    .unroll(c)
                )

            (
                upsampledx[1]
                .compute_at(g.output_buf, x)
                .reorder(c, x, y)
                .tile(x, y, xi, yi, 2, 1)
                .unroll(xi)
                .unroll(yi)
                .unroll(c)
                .gpu_threads(x, y)
            )

            (
                interpolated[1]
                .compute_at(g.output_buf, x)
                .reorder(c, x, y)
                .tile(x, y, xi, yi, 2, 2)
                .unroll(xi)
                .unroll(yi)
                .unroll(c)
                .gpu_threads(x, y)
            )

            (
                interpolated[2]
                .compute_at(g.output_buf, x)
                .reorder(c, x, y)
                .unroll(c)
                .gpu_threads(x, y)
            )

        else:
            # 4.54ms on an Intel i9-9960X using 16 threads
            xo, xi, yo, yi = hl.vars("xo xi yo yi")
            vec = g.natural_vector_size(hl.Float(32))
            for l in range(1, g.levels - 1):
                # We must refer to the downsampled stages in the
                # upsampling later, so they must all be
                # compute_root or redundantly recomputed, as in
                # the local_laplacian app.
                (
                    downsampled[l]
                    .compute_root()
                    .reorder(x, c, y)
                    .split(y, yo, yi, 8)
                    .parallel(yo)
                    .vectorize(x, vec)
                )

            # downsampled[0] takes too long to compute_root, so
            # we'll redundantly recompute it instead.  Make a
            # separate clone of it in the first downsampled stage
            # so that we can schedule the two versions
            # separately.
            (
                downsampled[0]
                .clone_in(downx[1])
                .store_at(downsampled[1], yo)
                .compute_at(downsampled[1], yi)
                .reorder(c, x, y)
                .unroll(c)
                .vectorize(x, vec)
            )

            (
                g.output_buf.bound(x, 0, g.input_buf.width())
                .bound(y, 0, g.input_buf.height())
                .bound(c, 0, 3)
                .split(x, xo, xi, vec)
                .split(y, yo, yi, 32)
                .reorder(xi, c, xo, yi, yo)
                .unroll(c)
                .vectorize(xi)
                .parallel(yo)
            )

            for l in range(1, g.levels):
                (
                    interpolated[l]
                    .store_at(g.output_buf, yo)
                    .compute_at(g.output_buf, yi)
                    .vectorize(x, vec)
                )

        # Estimates (for autoscheduler; ignored otherwise)
        (
            g.input_buf.dim(0)
            .set_estimate(0, 1536)
            .dim(1)
            .set_estimate(0, 2560)
            .dim(2)
            .set_estimate(0, 4)
        )
        (
            g.output_buf.output_buffer()
            .dim(0)
            .set_estimate(0, 1536)
            .dim(1)
            .set_estimate(0, 2560)
            .dim(2)
            .set_estimate(0, 3)
        )


if __name__ == "__main__":
    hl.main()