register_shuffle.cpp
#include "Halide.h"

using namespace Halide;

int main(int argc, char **argv) {
    Target t = get_jit_target_from_environment();
    if (!t.features_any_of({Target::CUDACapability50,
                            Target::CUDACapability61})) {
        printf("This test requires CUDA with compute capability 5.0 or greater\n");
        return 0;
    }

    {
        // Shuffle test to do a small convolution
        Func f, g;
        Var x, y;
        f(x, y) = x + y;
        g(x, y) = f(x - 1, y) + f(x + 1, y);
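
        // Stage f into registers across the warp: each of the 32 lanes
        // ends up holding two values of f (the 34-wide footprint rounds
        // up to two 32-wide chunks), so g's reads of f(x-1, y) and
        // f(x+1, y) become shuffles from neighboring lanes.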
        Var xo, xi, yi, yo;
        g.gpu_tile(x, y, xi, yi, 32, 2, TailStrategy::RoundUp).gpu_lanes(xi);
        f.compute_root();
        f.in(g).compute_at(g, yi).split(x, xo, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(xo);

        Buffer<int> out = g.realize(32, 4);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 2 * (x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Broadcast test - an outer product access pattern
        Func a, b, c;
        Var x, y;
        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();
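
        // The staged copy of 'a' puts a(x) in the lane whose index
        // matches x, so c's read of a(x) is lane-local; b's value for
        // the current y lives in a single lane and gets broadcast to
        // all 32 lanes.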
        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 32, 32, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .gpu_lanes(xi);

        // We're going to be computing 'a' and 'b' at block level, but
        // we want them in register, not shared, so we explicitly call
        // store_in.
        a.in(c).compute_at(c, x)
            .gpu_lanes(x)
            .store_in(MemoryType::Register);
        b.in(c).compute_at(c, x)
            .gpu_lanes(y)
            .store_in(MemoryType::Register);

        Buffer<float> out = c.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Vectorized broadcast test. Each lane is responsible for a
        // 2-vector from 'a' and a 2-vector from 'b' instead of a single
        // value.
        Func a, b, c;
        Var x, y;
        a(x) = cast<float>(x);
        b(y) = cast<float>(y);
        c(x, y) = a(x) + 100 * b(y);

        a.compute_root();
        b.compute_root();
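
        // Same outer-product pattern as above, but each lane owns a
        // 2-vector, so the broadcasts move whole 2-vectors between lanes.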
        Var xi, yi, yii;
        c.tile(x, y, xi, yi, 64, 64, TailStrategy::RoundUp)
            .gpu_blocks(x, y)
            .split(yi, yi, yii, 64).unroll(yii, 2).gpu_threads(yi)
            .vectorize(xi, 2).gpu_lanes(xi);
        a.in(c).compute_at(c, yi).vectorize(x, 2).gpu_lanes(x);
        b.in(c).compute_at(c, yi).vectorize(y, 2).gpu_lanes(y);

        Buffer<float> out = c.realize(64, 64);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float correct = x + 100 * y;
                float actual = out(x, y);
                // The floats are small integers, so they should be exact.
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // A stencil chain where many of the lanes will be masked
        Func a, b, c, d;
        Var x, y;
        a(x, y) = x + y;
        a.compute_root();
        b(x, y) = a(x - 1, y) + a(x, y) + a(x + 1, y);
        c(x, y) = b(x - 1, y) + b(x, y) + b(x + 1, y);
        d(x, y) = c(x - 1, y) + c(x, y) + c(x + 1, y);
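
        // 'a' is compute_root, so it's the wrapper a.in() that gets
        // staged into warp registers below; b and c are staged directly.
        // Lanes beyond each stage's footprint hold nothing useful and
        // stay masked.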
        Var xi, yi;
        // Compute 24-wide pieces of output per block. Should use 32
        // warp lanes to do so. The footprint on the input is 30, so
        // the last two lanes are always inactive. 26-wide blocks
        // would be a more efficient use of the gpu, but a less
        // interesting test.
        d.gpu_tile(x, y, xi, yi, 24, 2).gpu_lanes(xi);
        for (Func stage : {a.in(), b, c}) {
            stage.compute_at(d, yi).gpu_lanes(x);
        }

        Buffer<int> out = d.realize(24, 2);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 27 * (x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Same as above, but in half-warps
        Func a, b, c, d;
        Var x, y;
        a(x, y) = x + y;
        a.compute_root();
        b(x, y) = a(x - 1, y) + a(x, y) + a(x + 1, y);
        c(x, y) = b(x - 1, y) + b(x, y) + b(x + 1, y);
        d(x, y) = c(x - 1, y) + c(x, y) + c(x + 1, y);
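
        // The 10-wide output tiles need a 16-wide footprint of 'a', so
        // 16 lanes suffice; 16 is also a legal power-of-two shuffle
        // width, so the shuffles operate on half a warp.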
        Var xi, yi;
        // Compute 10-wide pieces of output per block. Should use 16
        // warp lanes to do so.
        d.gpu_tile(x, y, xi, yi, 10, 2).gpu_lanes(xi);
        for (Func stage : {a.in(), b, c}) {
            stage.compute_at(d, yi).gpu_lanes(x);
        }

        Buffer<int> out = d.realize(24, 2);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = 27 * (x + y);
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // A shuffle with a shift amount that depends on the y coord
        Func a, b;
        Var x, y;
        a(x, y) = x + y;
        b(x, y) = a(x + y, y);
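
        // 'a' is staged into lanes per row, and the read offset x + y
        // shifts with the y coordinate, so the shuffle's source lane is
        // computed at runtime rather than being a fixed offset.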
        Var xi, yi;
        b.gpu_tile(x, y, xi, yi, 16, 8, TailStrategy::RoundUp).gpu_lanes(xi);
        a.compute_at(b, yi).gpu_lanes(x);

        Buffer<int> out = b.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                int correct = x + 2 * y;
                int actual = out(x, y);
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Bilinear upsample
        Func f, upx, upy;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        upx(x, y) = 0.25f * f((x / 2) - 1 + 2 * (x % 2), y) + 0.75f * f(x / 2, y);
        upy(x, y) = 0.25f * upx(x, (y / 2) - 1 + 2 * (y % 2)) + 0.75f * upx(x, y / 2);

        // Compute 128x64 tiles of output, which require 66x34 tiles
        // of input. All intermediate data stored in lanes and
        // accessed using register shuffles.
        Var xi, yi, xii, yii;
        upy.tile(x, y, xi, yi, 128, 64, TailStrategy::RoundUp)
            .tile(xi, yi, xii, yii, 4, 8).vectorize(xii)
            .gpu_blocks(x, y).gpu_threads(yi).gpu_lanes(xi);
        upx.compute_at(upy, yi).unroll(x, 4).gpu_lanes(x).unroll(y);
        // Stage the input into lanes, doing two dense vector loads
        // per lane, and use register shuffles to do the upsample in x.
        f.in().compute_at(upy, yi).align_storage(x, 64)
            .vectorize(x, 2, TailStrategy::RoundUp)
            .split(x, x, xi, 32, TailStrategy::GuardWithIf)
            .reorder(xi, y, x).gpu_lanes(xi).unroll(x).unroll(y);
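
        // Presumably the mins are pinned to zero so the compiler can
        // assume tiles are aligned when simplifying the even/odd
        // indexing above.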
        upy.output_buffer().dim(0).set_min(0).dim(1).set_min(0);

        Buffer<float> out = upy.realize(128, 128);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                float correct = (x + y - 1) / 2.0f;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Box-downsample by a factor of 8 using summation within each
        // warp.
        Func f;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        Func s1, s2, s3, s4;
        s1(x, y) = f(2 * x, y) + f(2 * x + 1, y);
        s2(x, y) = s1(2 * x, y) + s1(2 * x + 1, y);
        s3(x, y) = s2(2 * x, y) + s2(2 * x + 1, y);
        s4(x, y) = s3(x, y);
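
        // Each stage sums two adjacent values of the previous one, so
        // the data per lane halves from stage to stage and the adjacent
        // reads become shuffles between neighboring lanes.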
        Var xi, yi;
        s4.gpu_tile(x, y, xi, yi, 64, 1, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi);
        s3.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s2.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s1.compute_at(s4, yi).split(x, x, xi, 32, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        f.in().compute_at(s4, yi).split(x, x, xi, 64, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi).unroll(x);

        Buffer<float> out = s4.realize(64, 64);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                // One factor of 8 from adding instead of averaging,
                // and another factor of 8 from the compression of the
                // coordinate system across x.
                float correct = (x * 8 + y) * 8 + 28;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // The same, with a narrower tile in x so that one warp is
        // divided up across many scanlines.
        Func f;
        Var x, y;
        f(x, y) = cast<float>(x + y);
        f.compute_root();

        Func s1, s2, s3, s4;
        s1(x, y) = f(2 * x, y) + f(2 * x + 1, y);
        s2(x, y) = s1(2 * x, y) + s1(2 * x + 1, y);
        s3(x, y) = s2(2 * x, y) + s2(2 * x + 1, y);
        s4(x, y) = s3(x, y);
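
        // With an 8-wide tile, one 32-thread warp spans several
        // scanlines, so the shuffles are confined to the small group of
        // lanes that share a scanline.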
        Var xi, yi;
        s4.gpu_tile(x, y, xi, yi, 8, 16, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi);
        s3.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s2.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        s1.compute_at(s4, yi).split(x, x, xi, 4, TailStrategy::RoundUp).gpu_lanes(xi).unroll(x);
        f.in().compute_at(s4, yi).split(x, x, xi, 8, TailStrategy::RoundUp).vectorize(xi, 2).gpu_lanes(xi).unroll(x);

        Buffer<float> out = s4.realize(32, 32);
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                float actual = out(x, y);
                float correct = (x * 8 + y) * 8 + 28;
                if (correct != actual) {
                    printf("out(%d, %d) = %f instead of %f\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        Buffer<uint8_t> buf(256, 256);
        buf.for_each_value([](uint8_t &x) {
            x = rand();
        });
        buf.set_host_dirty();

        // Store a small LUT in-register, populated at the warp
        // level.
        Func lut;
        Var x, y;
        lut(x) = cast<uint16_t>(x) + 1;

        Func curved;
        curved(x, y) = lut(buf(x, y));
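
        // curved indexes lut with image data, so the in-register gather
        // below turns into shuffles whose source lane is computed from
        // the loaded byte.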
        Var xi, yi, xo;
        curved.compute_root().tile(x, y, xi, yi, 32, 32)
            .gpu_blocks(x, y).gpu_threads(yi).gpu_lanes(xi);
        lut.compute_root();
        // Load the LUT into shared at the start of each block using warp 0.
        lut.in().compute_at(curved, x).split(x, xo, xi, 32 * 4).vectorize(xi, 4).gpu_lanes(xi).unroll(xo);
        // Load it from shared into registers for each warp.
        lut.in().in().compute_at(curved, yi).split(x, xo, xi, 32 * 4).vectorize(xi, 4).gpu_lanes(xi).unroll(xo);

        Buffer<uint16_t> out = curved.realize(buf.width(), buf.height());
        for (int y = 0; y < out.height(); y++) {
            for (int x = 0; x < out.width(); x++) {
                uint16_t actual = out(x, y);
                uint16_t correct = ((uint16_t)buf(x, y)) + 1;
                if (correct != actual) {
                    printf("out(%d, %d) = %d instead of %d\n",
                           x, y, actual, correct);
                    return -1;
                }
            }
        }
    }

    {
        // Test a case that caused combinatorial explosion
        Var x;
        Expr e = x;
        for (int i = 0; i < 10; i++) {
            e = fast_pow(e, e + 1);
        }

        Func f;
        f(x) = e;
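
        // No output check here: the test passes if lowering handles the
        // deeply nested expression without blowing up compile times.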
        Var xo, xi;
        f.gpu_tile(x, xo, xi, 32);
        f.realize(1024);
    }
printf("Success!\n");
return 0;
}