// File: 4283_store_at_gpu.cpp
#include "Halide.h"

using namespace Halide;

// Regression test for the atomic variant of Halide issue 4283.
//
// Builds a histogram of (x * x) % hist_size over [0, img_size) with an atomic
// update whose reduction domain is split across GPU blocks and threads, while
// the histogram storage is hoisted (store_at) to the parallel `y` loop of its
// consumer and computed (compute_at) at the consumer's `x` loop. The pipeline
// output is compared against a reference computed with plain host loops.
//
// NOTE(review): `T` (the histogram element type) and `check(...)` are not
// defined in the visible part of this file; they are assumed to be supplied
// by surrounding code (e.g. a template parameter and a test helper) -- confirm.
// NOTE(review): the gpu_blocks()/gpu_threads() schedule presumably requires a
// GPU-enabled JIT target to realize -- confirm against the test harness.
//
// Returns 0 after all iterations have been checked.
int main(int argc, char *argv[]) {
    const int img_size = 1000;   // number of samples fed into the histogram
    const int hist_size = 53;    // number of histogram bins
    const int out_size = 10;     // the output is out_size x out_size
    const int num_iters = 100;   // repeat count, see the loop below

    Func im, hist, final;
    Var x, y;
    RDom r(0, img_size);

    // Bin index of every sample.
    im(x) = (x * x) % hist_size;

    // Histogram: zero-initialize, then scatter-add one per sample.
    hist(x) = cast<T>(0);
    hist(im(r)) += cast<T>(1);

    // Consumer that reads the histogram at a wrapped diagonal index.
    final(x, y) = hist((x + y) % hist_size);

    // Storage for hist lives at final's parallel y loop while its values are
    // computed inside the x loop: this is the store_at/compute_at combination
    // that issue 4283 is about.
    final.compute_root().parallel(y);
    hist.store_at(final, y)
        .compute_at(final, x);

    // Many GPU threads may add into the same bin, so the update is marked
    // atomic before the reduction domain is split across blocks and threads.
    RVar ro, ri;
    hist.update()
        .atomic()
        .split(r, ro, ri, 32)
        .gpu_blocks(ro)
        .gpu_threads(ri);

    // Reference results computed on the host without Halide scheduling.
    Buffer<T> correct_hist(hist_size);
    Buffer<T> correct_final(out_size, out_size);
    correct_hist.fill(T(0));
    correct_final.fill(T(0));
    for (int i = 0; i < img_size; i++) {
        int idx = (i * i) % hist_size;
        correct_hist(idx) = correct_hist(idx) + T(1);
    }
    for (int i = 0; i < out_size; i++) {
        for (int j = 0; j < out_size; j++) {
            correct_final(i, j) = correct_hist((i + j) % hist_size);
        }
    }

    // A race condition may not show up on every run, so realize the pipeline
    // many times to make an intermittent failure likely to be observed.
    for (int iter = 0; iter < num_iters; iter++) {
        Buffer<T> out = final.realize(out_size, out_size);
        for (int i = 0; i < out_size; i++) {
            for (int j = 0; j < out_size; j++) {
                check(__LINE__, out(i, j), correct_final(i, j));
            }
        }
    }
    return 0;
}
// End of 4283_store_at_gpu.cpp