Content - 3e0270d5482e40ce0e1973b48b483287ad76e91b - d87d346/apps/nn_ops/Im2col.cpp

Im2col.cpp
#include <assert.h>
#include <memory.h>
#include <stdio.h>
#include <stdlib.h>

#include <limits>

#include "halide_benchmark.h"

#include "Im2col.h"

#include "HalideBuffer.h"

// Restricts x to the range [low, high]: a value below low is raised to low,
// and the result is then capped at high. If low > high the cap wins and
// high is returned.
int clamp(int x, int low, int high) {
    const int raised = (x < low) ? low : x;
    return (high < raised) ? high : raised;
}

// Benchmarks the Im2col pipeline on a randomly filled C x W x H x N uint8
// tensor, then checks every output element against a scalar reference
// computed on the host.
//
// Command line:
//   C W H N [stride pad_width pad_height filter_width filter_height byte_zero]
// The four dimensions are required. Each trailing parameter is optional and
// keeps its default when omitted.
//
// Returns 0 on success (and after printing the usage text), 1 when the
// parameters are invalid or the pipeline reports an error. Calls abort() on
// the first element that does not match the reference.
int main(int argc, char **argv) {
    if (argc < 5) {
        printf("Usage: %s C W H N [stride pad_width pad_height filter_width filter_height byte_zero]\n", argv[0]);
        return 0;
    }

    int C = atoi(argv[1]);
    int W = atoi(argv[2]);
    int H = atoi(argv[3]);
    int N = atoi(argv[4]);

    printf("Benchmarking %dx%dx%dx%d\n", C, W, H, N);

    // TODO(vksnk): do we need to align C, W, H, N?

    // These parameters lead to reasonable values for testing in
    // most cases.
    int stride = 1;
    int pad_width = 0;
    int pad_height = 0;
    int filter_width = 1;
    int filter_height = 1;
    uint8_t byte_zero = 0;

    // argv[k] is present exactly when argc > k, so each optional parameter
    // is picked up as soon as it is supplied.
    if (argc > 5) stride = atoi(argv[5]);
    if (argc > 6) pad_width = atoi(argv[6]);
    if (argc > 7) pad_height = atoi(argv[7]);
    if (argc > 8) filter_width = atoi(argv[8]);
    if (argc > 9) filter_height = atoi(argv[9]);
    if (argc > 10) byte_zero = static_cast<uint8_t>(atoi(argv[10]));

    // Reject values that would produce empty/negative extents or a division
    // by zero below (stride in the output size, filter_width in validation).
    if (C <= 0 || W <= 0 || H <= 0 || N <= 0) {
        printf("C, W, H and N must all be positive\n");
        return 1;
    }
    if (stride <= 0 || filter_width <= 0 || filter_height <= 0) {
        printf("stride, filter_width and filter_height must all be positive\n");
        return 1;
    }

    // Hexagon's device_malloc implementation will also set the host
    // pointer if it is null, giving a zero copy buffer.
    Halide::Runtime::Buffer<uint8_t> input_tensor(nullptr, C, W, H, N);

    // NOTE(review): the depth is C * W * H rather than
    // C * filter_width * filter_height. The reference check below expects
    // byte_zero wherever the derived window offset falls outside the input,
    // so the extra entries are still validated. Confirm the oversizing is
    // intentional.
    const int output_depth = C * W * H;
    // Output size should be:
    // ceil((input_size + 2 * padding - filter_size) / stride) + 1
    const int output_width =
        (W + 2 * pad_width - filter_width + stride - 1) / stride + 1;
    const int output_height =
        (H + 2 * pad_height - filter_height + stride - 1) / stride + 1;

    if (output_width <= 0 || output_height <= 0) {
        printf("Filter does not fit the padded input: output would be %dx%d\n",
               output_width, output_height);
        return 1;
    }

    Halide::Runtime::Buffer<uint8_t> output_tensor(nullptr,
                                                   output_depth, output_width, output_height, N);

#ifdef HALIDE_RUNTIME_HEXAGON
    input_tensor.device_malloc(halide_hexagon_device_interface());
    output_tensor.device_malloc(halide_hexagon_device_interface());
#else
    input_tensor.allocate();
    output_tensor.allocate();
#endif

    // Fill the input with pseudo-random bytes. rand() is never seeded here,
    // so the sequence is the C library's default one.
    input_tensor.for_each_value([](uint8_t &x) {
        x = static_cast<uint8_t>(rand());
    });

#ifdef HALIDE_RUNTIME_HEXAGON
    // To avoid the cost of powering HVX on in each call of the
    // pipeline, power it on once now. Also, set Hexagon performance to turbo.
    halide_hexagon_set_performance_mode(nullptr, halide_hexagon_power_turbo);
    halide_hexagon_power_hvx_on(nullptr);
#endif

    printf("Running pipeline...\n");
    // Remember any failure so that we do not go on to validate (and report
    // success for) output the pipeline never produced.
    bool pipeline_failed = false;
    double time = Halide::Tools::benchmark([&]() {
        int result = Im2col(input_tensor, stride, pad_width, pad_height,
                            filter_width, filter_height, byte_zero, output_tensor);
        if (result != 0) {
            printf("pipeline failed! %d\n", result);
            pipeline_failed = true;
        }
    });

    printf("Done, time: %g s\n", time);

#ifdef HALIDE_RUNTIME_HEXAGON
    // We're done with HVX, power it off, and reset the performance mode
    // to default to save power.
    halide_hexagon_power_hvx_off(nullptr);
    halide_hexagon_set_performance_mode(nullptr, halide_hexagon_power_default);
#endif

    if (pipeline_failed) {
        return 1;
    }

    // Copy the output back to the host. If the buffer is zero-copy (as
    // it should be on a real device), this will be a no-op.
    output_tensor.copy_to_host();

    // Validate that the algorithm did what we expect.
    output_tensor.for_each_element([&](int c, int x, int y, int b) {
        // Top-left corner of the window for output position (x, y); it may
        // lie outside the input when padding is non-zero.
        const int x_ungated_start = x * stride - pad_width;
        const int y_ungated_start = y * stride - pad_height;

        // The output depth index packs the input channel (c % C) and the
        // position within the filter window (c / C), with the window's x
        // offset varying fastest.
        const int element_location = c / C;
        const int x_offset = element_location % filter_width;
        const int y_offset = element_location / filter_width;

        const int x_input = x_ungated_start + x_offset;
        const int y_input = y_ungated_start + y_offset;

        // Anything read from outside the input is expected to be byte_zero.
        int32_t expected = byte_zero;
        if ((x_input >= 0) && (x_input < W) && (y_input >= 0) && (y_input < H)) {
            expected = static_cast<int32_t>(
                input_tensor(c % C, x_input, y_input, b));
        }

        const int32_t actual = output_tensor(c, x, y, b);
        if (expected != actual) {
            printf("Mismatch at %d %d %d %d: %d != %d\n",
                   c, x, y, b, expected, actual);
            abort();
        }
    });

    printf("Success!\n");
    return 0;
}