Content - 2866b09e2392ec9c98d5efb180f24abe64734f96 - 749c418/apps/camera_pipe/camera_pipe_generator.cpp

visit type:
Tip revision: 39ffa7dd150219f2341f909707766848f51deb07 authored by Steven Johnson on 11 February 2022, 17:58:29 UTC
Merge branch 'abadams/reenable_unscheduled_stage_warning' into srj/tmp-unschedule-experiment
Tip revision: 39ffa7d
camera_pipe_generator.cpp
#include "Halide.h"
#include "halide_trace_config.h"
#include <stdint.h>

namespace {

using std::vector;

using namespace Halide;
using namespace Halide::ConciseCasts;

// Shared variables
Var x, y, c, yi, yo, yii, xi;

// Average two positive values rounding up
Expr avg(Expr a, Expr b) {
    Type wider = a.type().with_bits(a.type().bits() * 2);
    return cast(a.type(), (cast(wider, a) + b + 1) / 2);
}

Expr blur121(Expr a, Expr b, Expr c) {
    return avg(avg(a, c), b);
}

Func interleave_x(Func a, Func b) {
    Func out;
    out(x, y) = select((x % 2) == 0, a(x / 2, y), b(x / 2, y));
    return out;
}

Func interleave_y(Func a, Func b) {
    Func out;
    out(x, y) = select((y % 2) == 0, a(x, y / 2), b(x, y / 2));
    return out;
}

class Demosaic : public Halide::Generator<Demosaic> {
public:
    GeneratorParam<LoopLevel> intermed_compute_at{"intermed_compute_at", LoopLevel::inlined()};
    GeneratorParam<LoopLevel> intermed_store_at{"intermed_store_at", LoopLevel::inlined()};
    GeneratorParam<LoopLevel> output_compute_at{"output_compute_at", LoopLevel::inlined()};

    // Inputs and outputs
    Input<Func> deinterleaved{"deinterleaved", UInt(16), 3};
    Output<Func> output{"output", Int(16), 3};

    // Defines outputs using inputs
    void generate() {
        // These are the values we already know from the input
        // x_y = the value of channel x at a site in the input of channel y
        // gb refers to green sites in the blue rows
        // gr refers to green sites in the red rows

        // Give more convenient names to the four channels we know
        Func r_r, g_gr, g_gb, b_b;

        g_gr(x, y) = deinterleaved(x, y, 0);
        r_r(x, y) = deinterleaved(x, y, 1);
        b_b(x, y) = deinterleaved(x, y, 2);
        g_gb(x, y) = deinterleaved(x, y, 3);

        // These are the ones we need to interpolate
        Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b;

        // First calculate green at the red and blue sites

        // Try interpolating vertically and horizontally. Also compute
        // differences vertically and horizontally. Use interpolation in
        // whichever direction had the smallest difference.
        Expr gv_r = avg(g_gb(x, y - 1), g_gb(x, y));
        Expr gvd_r = absd(g_gb(x, y - 1), g_gb(x, y));
        Expr gh_r = avg(g_gr(x + 1, y), g_gr(x, y));
        Expr ghd_r = absd(g_gr(x + 1, y), g_gr(x, y));

        g_r(x, y) = select(ghd_r < gvd_r, gh_r, gv_r);

        Expr gv_b = avg(g_gr(x, y + 1), g_gr(x, y));
        Expr gvd_b = absd(g_gr(x, y + 1), g_gr(x, y));
        Expr gh_b = avg(g_gb(x - 1, y), g_gb(x, y));
        Expr ghd_b = absd(g_gb(x - 1, y), g_gb(x, y));

        g_b(x, y) = select(ghd_b < gvd_b, gh_b, gv_b);

        // Next interpolate red at gr by first interpolating, then
        // correcting using the error green would have had if we had
        // interpolated it in the same way (i.e. add the second derivative
        // of the green channel at the same place).
        Expr correction;
        correction = g_gr(x, y) - avg(g_r(x, y), g_r(x - 1, y));
        r_gr(x, y) = correction + avg(r_r(x - 1, y), r_r(x, y));

        // Do the same for other reds and blues at green sites
        correction = g_gr(x, y) - avg(g_b(x, y), g_b(x, y - 1));
        b_gr(x, y) = correction + avg(b_b(x, y), b_b(x, y - 1));

        correction = g_gb(x, y) - avg(g_r(x, y), g_r(x, y + 1));
        r_gb(x, y) = correction + avg(r_r(x, y), r_r(x, y + 1));

        correction = g_gb(x, y) - avg(g_b(x, y), g_b(x + 1, y));
        b_gb(x, y) = correction + avg(b_b(x, y), b_b(x + 1, y));

        // Now interpolate diagonally to get red at blue and blue at
        // red. Hold onto your hats; this gets really fancy. We do the
        // same thing as for interpolating green where we try both
        // directions (in this case the positive and negative diagonals),
        // and use the one with the lowest absolute difference. But we
        // also use the same trick as interpolating red and blue at green
        // sites - we correct our interpolations using the second
        // derivative of green at the same sites.

        correction = g_b(x, y) - avg(g_r(x, y), g_r(x - 1, y + 1));
        Expr rp_b = correction + avg(r_r(x, y), r_r(x - 1, y + 1));
        Expr rpd_b = absd(r_r(x, y), r_r(x - 1, y + 1));

        correction = g_b(x, y) - avg(g_r(x - 1, y), g_r(x, y + 1));
        Expr rn_b = correction + avg(r_r(x - 1, y), r_r(x, y + 1));
        Expr rnd_b = absd(r_r(x - 1, y), r_r(x, y + 1));

        r_b(x, y) = select(rpd_b < rnd_b, rp_b, rn_b);

        // Same thing for blue at red
        correction = g_r(x, y) - avg(g_b(x, y), g_b(x + 1, y - 1));
        Expr bp_r = correction + avg(b_b(x, y), b_b(x + 1, y - 1));
        Expr bpd_r = absd(b_b(x, y), b_b(x + 1, y - 1));

        correction = g_r(x, y) - avg(g_b(x + 1, y), g_b(x, y - 1));
        Expr bn_r = correction + avg(b_b(x + 1, y), b_b(x, y - 1));
        Expr bnd_r = absd(b_b(x + 1, y), b_b(x, y - 1));

        b_r(x, y) = select(bpd_r < bnd_r, bp_r, bn_r);

        // Resulting color channels
        Func r, g, b;

        // Interleave the resulting channels
        r = interleave_y(interleave_x(r_gr, r_r),
                         interleave_x(r_b, r_gb));
        g = interleave_y(interleave_x(g_gr, g_r),
                         interleave_x(g_b, g_gb));
        b = interleave_y(interleave_x(b_gr, b_r),
                         interleave_x(b_b, b_gb));

        // It's possible that some of the final additions of
        // correction terms underflowed, so reinterpret the output as
        // signed.
        output(x, y, c) = cast<int16_t>(mux(c, {r(x, y), g(x, y), b(x, y)}));

        // These are the stencil stages we want to schedule
        // separately. Everything else we'll just inline.
        intermediates.push_back(g_r);
        intermediates.push_back(g_b);
    }

    void schedule() {
        Pipeline p(output);

        if (auto_schedule) {
            // blank
        } else if (get_target().has_gpu_feature()) {
            Var xi, yi;
            for (Func f : intermediates) {
                f.compute_at(intermed_compute_at).gpu_threads(x, y);
            }
            output.compute_at(output_compute_at)
                .unroll(x, 2)
                .gpu_threads(x, y)
                .reorder(c, x, y)
                .unroll(c);
        } else {
            int vec = get_target().natural_vector_size(UInt(16));
            if (get_target().has_feature(Target::Xtensa)) {
                // Native vector size for 16-bit data.
                vec = 32;
            }
            bool use_hexagon = get_target().has_feature(Target::HVX);

            for (Func f : intermediates) {
                f.compute_at(intermed_compute_at)
                    .store_at(intermed_store_at)
                    .vectorize(x, 2 * vec, TailStrategy::RoundUp)
                    .fold_storage(y, 4);
            }
            intermediates[1].compute_with(
                intermediates[0], x,
                {{x, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignStart}});
            output.compute_at(output_compute_at)
                .vectorize(x)
                .unroll(y)
                .reorder(c, x, y)
                .unroll(c);
            if (use_hexagon) {
                output.hexagon();
                for (Func f : intermediates) {
                    f.align_storage(x, vec);
                }
            }
        }

        /* Optional tags to specify layout for HalideTraceViz */
        Halide::Trace::FuncConfig cfg;
        cfg.pos = {860, 340 - 220};
        cfg.max = 1024;
        for (Func f : intermediates) {
            std::string label = f.name();
            std::replace(label.begin(), label.end(), '_', '@');
            cfg.pos.y += 220;
            cfg.labels = {{label}};
            f.add_trace_tag(cfg.to_trace_tag());
        }
    }

private:
    // Intermediate stencil stages to schedule
    vector<Func> intermediates;
};

class CameraPipe : public Halide::Generator<CameraPipe> {
public:
    // Parameterized output type, because LLVM PTX (GPU) backend does not
    // currently allow 8-bit computations
    GeneratorParam<Type> result_type{"result_type", UInt(8)};

    Input<Buffer<uint16_t, 2>> input{"input"};
    Input<Buffer<float, 2>> matrix_3200{"matrix_3200"};
    Input<Buffer<float, 2>> matrix_7000{"matrix_7000"};
    Input<float> color_temp{"color_temp"};
    Input<float> gamma{"gamma"};
    Input<float> contrast{"contrast"};
    Input<float> sharpen_strength{"sharpen_strength"};
    Input<int> blackLevel{"blackLevel"};
    Input<int> whiteLevel{"whiteLevel"};
    Output<Buffer<uint8_t, 3>> processed{"processed"};

    void generate();

private:
    Func hot_pixel_suppression(Func input);
    Func deinterleave(Func raw);
    Func apply_curve(Func input);
    Func color_correct(Func input);
    Func sharpen(Func input);
};

Func CameraPipe::hot_pixel_suppression(Func input) {

    Expr a = max(input(x - 2, y), input(x + 2, y),
                 input(x, y - 2), input(x, y + 2));

    Func denoised;
    denoised(x, y) = clamp(input(x, y), 0, a);

    return denoised;
}

Func CameraPipe::deinterleave(Func raw) {
    // Deinterleave the color channels
    Func deinterleaved("deinterleaved");

    deinterleaved(x, y, c) = mux(c,
                                 {raw(2 * x, 2 * y),
                                  raw(2 * x + 1, 2 * y),
                                  raw(2 * x, 2 * y + 1),
                                  raw(2 * x + 1, 2 * y + 1)});
    return deinterleaved;
}

Func CameraPipe::color_correct(Func input) {
    // Get a color matrix by linearly interpolating between two
    // calibrated matrices using inverse kelvin.
    Expr kelvin = color_temp;

    Func matrix;
    Expr alpha = (1.0f / kelvin - 1.0f / 3200) / (1.0f / 7000 - 1.0f / 3200);
    Expr val = (matrix_3200(x, y) * alpha + matrix_7000(x, y) * (1 - alpha));
    matrix(x, y) = cast<int16_t>(val * 256.0f);  // Q8.8 fixed point

    if (!auto_schedule) {
        matrix.compute_root();
        if (get_target().has_gpu_feature()) {
            matrix.gpu_single_thread();
        }
    }

    Func corrected;
    Expr ir = cast<int32_t>(input(x, y, 0));
    Expr ig = cast<int32_t>(input(x, y, 1));
    Expr ib = cast<int32_t>(input(x, y, 2));

    Expr r = matrix(3, 0) + matrix(0, 0) * ir + matrix(1, 0) * ig + matrix(2, 0) * ib;
    Expr g = matrix(3, 1) + matrix(0, 1) * ir + matrix(1, 1) * ig + matrix(2, 1) * ib;
    Expr b = matrix(3, 2) + matrix(0, 2) * ir + matrix(1, 2) * ig + matrix(2, 2) * ib;

    r = cast<int16_t>(r / 256);
    g = cast<int16_t>(g / 256);
    b = cast<int16_t>(b / 256);
    corrected(x, y, c) = mux(c, {r, g, b});

    return corrected;
}

Func CameraPipe::apply_curve(Func input) {
    // copied from FCam
    Func curve("curve");

    Expr minRaw = 0 + blackLevel;
    Expr maxRaw = whiteLevel;

    // How much to upsample the LUT by when sampling it.
    int lutResample = 1;
    if (get_target().has_feature(Target::HVX)) {
        // On HVX, LUT lookups are much faster if they are to LUTs not
        // greater than 256 elements, so we reduce the tonemap to 256
        // elements and use linear interpolation to upsample it.
        lutResample = 8;
    }

    minRaw /= lutResample;
    maxRaw /= lutResample;

    Expr invRange = 1.0f / (maxRaw - minRaw);
    Expr b = 2.0f - pow(2.0f, contrast / 100.0f);
    Expr a = 2.0f - 2.0f * b;

    // Get a linear luminance in the range 0-1
    Expr xf = clamp(cast<float>(x - minRaw) * invRange, 0.0f, 1.0f);
    // Gamma correct it
    Expr g = pow(xf, 1.0f / gamma);
    // Apply a piecewise quadratic contrast curve
    Expr z = select(g > 0.5f,
                    1.0f - (a * (1.0f - g) * (1.0f - g) + b * (1.0f - g)),
                    a * g * g + b * g);

    // Convert to 8 bit and save
    Expr val = cast(result_type, clamp(z * 255.0f + 0.5f, 0.0f, 255.0f));
    // makeLUT add guard band outside of (minRaw, maxRaw]:
    curve(x) = select(x <= minRaw, 0, select(x > maxRaw, 255, val));

    if (!auto_schedule) {
        // It's a LUT, compute it once ahead of time.
        curve.compute_root();
        if (get_target().has_gpu_feature()) {
            Var xi;
            curve.gpu_tile(x, xi, 32);
        }
    }

    /* Optional tags to specify layout for HalideTraceViz */
    {
        Halide::Trace::FuncConfig cfg;
        cfg.labels = {{"tone curve"}};
        cfg.pos = {580, 1000};
        curve.add_trace_tag(cfg.to_trace_tag());
    }

    Func curved;

    if (lutResample == 1) {
        // Use clamp to restrict size of LUT as allocated by compute_root
        curved(x, y, c) = curve(clamp(input(x, y, c), 0, 1023));
    } else {
        // Use linear interpolation to sample the LUT.
        Expr in = input(x, y, c);
        Expr u0 = in / lutResample;
        Expr u = in % lutResample;
        Expr y0 = curve(clamp(u0, 0, 127));
        Expr y1 = curve(clamp(u0 + 1, 0, 127));
        curved(x, y, c) = cast<uint8_t>((cast<uint16_t>(y0) * lutResample + (y1 - y0) * u) / lutResample);
    }

    return curved;
}

Func CameraPipe::sharpen(Func input) {
    // Convert the sharpening strength to 2.5 fixed point. This allows sharpening in the range [0, 4].
    Func sharpen_strength_x32("sharpen_strength_x32");
    sharpen_strength_x32() = u8_sat(sharpen_strength * 32);
    if (!auto_schedule) {
        sharpen_strength_x32.compute_root();
        if (get_target().has_gpu_feature()) {
            sharpen_strength_x32.gpu_single_thread();
        }
    }

    /* Optional tags to specify layout for HalideTraceViz */
    {
        Halide::Trace::FuncConfig cfg;
        cfg.labels = {{"sharpen strength"}};
        cfg.pos = {10, 1000};
        sharpen_strength_x32.add_trace_tag(cfg.to_trace_tag());
    }

    // Make an unsharp mask by blurring in y, then in x.
    Func unsharp_y("unsharp_y");
    unsharp_y(x, y, c) = blur121(input(x, y - 1, c), input(x, y, c), input(x, y + 1, c));

    Func unsharp("unsharp");
    unsharp(x, y, c) = blur121(unsharp_y(x - 1, y, c), unsharp_y(x, y, c), unsharp_y(x + 1, y, c));

    Func mask("mask");
    mask(x, y, c) = cast<int16_t>(input(x, y, c)) - cast<int16_t>(unsharp(x, y, c));

    // Weight the mask with the sharpening strength, and add it to the
    // input to get the sharpened result.
    Func sharpened("sharpened");
    sharpened(x, y, c) = u8_sat(input(x, y, c) + (mask(x, y, c) * sharpen_strength_x32()) / 32);

    return sharpened;
}

void CameraPipe::generate() {
    // shift things inwards to give us enough padding on the
    // boundaries so that we don't need to check bounds. We're going
    // to make a 2560x1920 output image, just like the FCam pipe, so
    // shift by 16, 12.
    Func shifted;
    shifted(x, y) = input(x + 16, y + 12);

    Func denoised = hot_pixel_suppression(shifted);

    Func deinterleaved = deinterleave(denoised);

    auto demosaiced = create<Demosaic>();
    demosaiced->apply(deinterleaved);

    Func corrected = color_correct(demosaiced->output);

    Func curved = apply_curve(corrected);

    processed(x, y, c) = sharpen(curved)(x, y, c);

    /* ESTIMATES */
    // (This can be useful in conjunction with RunGen and benchmarks as well
    // as auto-schedule, so we do it in all cases.)
    input.set_estimates({{0, 2592}, {0, 1968}});
    matrix_3200.set_estimates({{0, 4}, {0, 3}});
    matrix_7000.set_estimates({{0, 4}, {0, 3}});
    color_temp.set_estimate(3700);
    gamma.set_estimate(2.0);
    contrast.set_estimate(50);
    sharpen_strength.set_estimate(1.0);
    blackLevel.set_estimate(25);
    whiteLevel.set_estimate(1023);
    processed.set_estimates({{0, 2592}, {0, 1968}, {0, 3}});

    // Schedule
    if (auto_schedule) {
        // nothing
    } else if (get_target().has_gpu_feature()) {

        // We can generate slightly better code if we know the output is even-sized
        if (!auto_schedule) {
            // TODO: The autoscheduler really ought to be able to
            // accommodate bounds on the output Func.
            Expr out_width = processed.width();
            Expr out_height = processed.height();
            processed.bound(c, 0, 3)
                .bound(x, 0, (out_width / 2) * 2)
                .bound(y, 0, (out_height / 2) * 2);
        }

        Var xi, yi, xii, xio;

        /* These tile factors obtain 1391us on a gtx 980. */
        int tile_x = 28;
        int tile_y = 12;

        if (get_target().has_feature(Target::D3D12Compute)) {
            // D3D12 SM 5.1 can only utilize a limited amount of
            // shared memory, so we use a slightly smaller
            // tile size.
            tile_x = 20;
            tile_y = 12;
        }

        processed.compute_root()
            .reorder(c, x, y)
            .unroll(x, 2)
            .gpu_tile(x, y, xi, yi, tile_x, tile_y);

        curved.compute_at(processed, x)
            .unroll(x, 2)
            .gpu_threads(x, y);

        corrected.compute_at(processed, x)
            .unroll(x, 2)
            .gpu_threads(x, y);

        demosaiced->output_compute_at.set({processed, x});
        demosaiced->intermed_compute_at.set({processed, x});

        denoised.compute_at(processed, x)
            .tile(x, y, xi, yi, 2, 2)
            .unroll(xi)
            .unroll(yi)
            .gpu_threads(x, y);

        deinterleaved.compute_at(processed, x)
            .unroll(x, 2)
            .gpu_threads(x, y)
            .reorder(c, x, y)
            .unroll(c);

    } else {

        Expr out_width = processed.width();
        Expr out_height = processed.height();

        // Depending on the HVX generation, we need 2 or 4 threads
        // to saturate HVX with work. For simplicity, we'll just
        // stick to 4 threads. On balance, the overhead should
        // not be much for the 2 extra threads that we create
        // on cores that have only two HVX contexts.
        Expr strip_size;
        if (get_target().has_feature(Target::HVX)) {
            strip_size = processed.dim(1).extent() / 4;
        } else {
            strip_size = 32;
        }
        strip_size = (strip_size / 2) * 2;

        int vec = get_target().natural_vector_size(UInt(16));
        if (get_target().has_feature(Target::HVX)) {
            vec = 64;
        }
        if (get_target().has_feature(Target::Xtensa)) {
            // Native vector size for 16-bit data.
            vec = 32;
        }

        processed
            .compute_root()
            .reorder(c, x, y)
            .split(y, yi, yii, 2, TailStrategy::RoundUp)
            .split(yi, yo, yi, strip_size / 2)
            .vectorize(x, 2 * vec, TailStrategy::RoundUp)
            .unroll(c)
            .parallel(yo);

        denoised
            .compute_at(processed, yi)
            .store_at(processed, yo)
            .prefetch(input, y, y, 2)
            .fold_storage(y, 4)
            .tile(x, y, x, y, xi, yi, 2 * vec, 2)
            .vectorize(xi)
            .unroll(yi);

        if (!get_target().has_feature(Target::Xtensa)) {
            denoised.prefetch(input, y, 2);
        }

        int deinterleaved_vector_size = get_target().has_feature(Target::Xtensa) ? vec : vec * 2;

        deinterleaved
            .compute_at(processed, yi)
            .store_at(processed, yo)
            .fold_storage(y, 4)
            .reorder(c, x, y)
            .vectorize(x, deinterleaved_vector_size, TailStrategy::RoundUp)
            .unroll(c);

        curved
            .compute_at(processed, yi)
            .store_at(processed, yo)
            .reorder(c, x, y)
            .tile(x, y, x, y, xi, yi, 2 * vec, 2, TailStrategy::RoundUp)
            .vectorize(xi)
            .unroll(yi)
            .unroll(c);

        corrected
            .compute_at(curved, x)
            .reorder(c, x, y)
            .vectorize(x)
            .unroll(c);

        demosaiced->intermed_compute_at.set({processed, yi});
        demosaiced->intermed_store_at.set({processed, yo});
        demosaiced->output_compute_at.set({curved, x});

        if (get_target().has_feature(Target::HVX)) {
            processed.hexagon();
            denoised.align_storage(x, vec);
            deinterleaved.align_storage(x, vec);
            corrected.align_storage(x, vec);
        }

        // We can generate slightly better code if we know the splits divide the extent.
        processed
            .bound(c, 0, 3)
            .bound(x, 0, ((out_width) / (2 * vec)) * (2 * vec))
            .bound(y, 0, (out_height / strip_size) * strip_size);

        /* Optional tags to specify layout for HalideTraceViz */
        {
            Halide::Trace::FuncConfig cfg;
            cfg.max = 1024;
            cfg.pos = {10, 348};
            cfg.labels = {{"input"}};
            input.add_trace_tag(cfg.to_trace_tag());

            cfg.pos = {305, 360};
            cfg.labels = {{"denoised"}};
            denoised.add_trace_tag(cfg.to_trace_tag());

            cfg.pos = {580, 120};
            const int y_offset = 220;
            cfg.strides = {{1, 0}, {0, 1}, {0, y_offset}};
            cfg.labels = {
                {"gr", {0, 0 * y_offset}},
                {"r", {0, 1 * y_offset}},
                {"b", {0, 2 * y_offset}},
                {"gb", {0, 3 * y_offset}},
            };
            deinterleaved.add_trace_tag(cfg.to_trace_tag());

            cfg.color_dim = 2;
            cfg.strides = {{1, 0}, {0, 1}, {0, 0}};
            cfg.pos = {1140, 360};
            cfg.labels = {{"demosaiced"}};
            processed.add_trace_tag(cfg.to_trace_tag());

            cfg.pos = {1400, 360};
            cfg.labels = {{"color-corrected"}};
            corrected.add_trace_tag(cfg.to_trace_tag());

            cfg.max = 256;
            cfg.pos = {1660, 360};
            cfg.labels = {{"gamma-corrected"}};
            curved.add_trace_tag(cfg.to_trace_tag());
        }
    }
};

}  // namespace

HALIDE_REGISTER_GENERATOR(CameraPipe, camera_pipe)
Browse the archive

https://github.com/halide/Halide