#ifndef BENCHMARK_H
#define BENCHMARK_H
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <functional>
#include <limits>
#if defined(__EMSCRIPTEN__)
#include <emscripten.h>
#endif
namespace Halide {
namespace Tools {
#if !(defined(__EMSCRIPTEN__) && defined(HALIDE_BENCHMARK_USE_EMSCRIPTEN_GET_NOW))
// Pick a clock for timing: high_resolution_clock when it is steady,
// falling back to steady_clock when it is not. (A non-steady clock can
// jump backwards, which would corrupt the measurements below.)
template<bool HighResIsSteady = std::chrono::high_resolution_clock::is_steady>
struct SteadyClock {
    using type = std::chrono::high_resolution_clock;
};

// Specialization selected when high_resolution_clock is not steady.
template<>
struct SteadyClock<false> {
    using type = std::chrono::steady_clock;
};

// Return the current time on the selected steady clock.
inline SteadyClock<>::type::time_point benchmark_now() {
    using clock = SteadyClock<>::type;
    return clock::now();
}

// Return the elapsed wall time, in seconds, between two time points
// obtained from benchmark_now().
inline double benchmark_duration_seconds(
    SteadyClock<>::type::time_point start,
    SteadyClock<>::type::time_point end) {
    // Implicit conversion to a double-based duration is exact here
    // (no precision is lost widening ticks to double seconds).
    const std::chrono::duration<double> elapsed = end - start;
    return elapsed.count();
}
#else // __EMSCRIPTEN__
// Emscripten's std::chrono::steady_clock and/or high_resolution_clock
// can throw an exception (!) if the runtime doesn't have a truly
// steady clock available. Advice from emscripten-discuss suggested
// that emscripten_get_now() is the best bet, as it is milliseconds
// (but returned as a double, with microseconds in the fractional portion),
// using either performance.now() or performance.hrtime() depending on the
// environment. Unfortunately, it's not guaranteed to be steady, and the
// auto-benchmark algorithm reacts badly to negative times, so we'll leave this
// disabled for now; you can opt-in to this by defining HALIDE_BENCHMARK_USE_EMSCRIPTEN_GET_NOW
// if you need to build for a wasm runtime with the exception behavior above.
// Current time as reported by emscripten_get_now(): milliseconds as a
// double, with sub-millisecond precision in the fractional part.
// NOTE: not guaranteed steady (may go backwards between calls); callers
// rely on benchmark_duration_seconds() to clamp negative intervals.
inline double benchmark_now() {
    return emscripten_get_now();
}
// Seconds elapsed between two benchmark_now() readings (which are in
// milliseconds). emscripten_get_now() is *not* guaranteed to be steady,
// so the difference is clamped to a tiny positive value. Clamping is
// arguably better than nothing, but still produces unpredictable results
// in the adaptive case (which is why this path is disabled by default).
inline double benchmark_duration_seconds(double start, double end) {
    const double seconds = (end - start) / 1000.0;
    return seconds < 1e-9 ? 1e-9 : seconds;
}
#endif
// Benchmark the operation 'op'. The number of iterations refers to
// how many times the operation is run for each time measurement, the
// result is the minimum over a number of sample runs. The result is the
// amount of time in seconds for one iteration.
//
// NOTE: it is usually simpler and more accurate to use the adaptive
// version of benchmark() later in this file; this function is provided
// for legacy code.
//
// IMPORTANT NOTE: Using this tool for timing GPU code may be misleading,
// as it does not account for time needed to synchronize to/from the GPU;
// if the callback doesn't include calls to device_sync(), the reported
// time may only be that to queue the requests; if the callback *does*
// include calls to device_sync(), it might exaggerate the sync overhead
// for real-world use. For now, callers using this to benchmark GPU
// code should measure with extreme caution.
// Legacy fixed-count benchmark: time `iterations` calls of 'op' per
// sample, take `samples` samples, and report the fastest observed
// per-iteration time in seconds.
inline double benchmark(uint64_t samples, uint64_t iterations, const std::function<void()> &op) {
    double best_sample = std::numeric_limits<double>::infinity();
    for (uint64_t s = 0; s < samples; s++) {
        const auto t_start = benchmark_now();
        for (uint64_t i = 0; i < iterations; i++) {
            op();
        }
        const auto t_end = benchmark_now();
        const double sample_seconds = benchmark_duration_seconds(t_start, t_end);
        if (sample_seconds < best_sample) {
            best_sample = sample_seconds;
        }
    }
    // Convert the best whole-sample time into time-per-iteration.
    return best_sample / iterations;
}
// Benchmark the operation 'op': run the operation until at least min_time
// has elapsed; the number of iterations is expanded as we
// progress (based on initial runs of 'op') to minimize overhead. The time
// reported will be that of the best single iteration.
//
// Most callers should be able to get good results without needing to specify
// custom BenchmarkConfig values.
//
// IMPORTANT NOTE: Using this tool for timing GPU code may be misleading,
// as it does not account for time needed to synchronize to/from the GPU;
// if the callback doesn't include calls to device_sync(), the reported
// time may only be that to queue the requests; if the callback *does*
// include calls to device_sync(), it might exaggerate the sync overhead
// for real-world use. For now, callers using this to benchmark GPU
// code should measure with extreme caution.
// NOTE(review): not referenced anywhere in this file's visible code;
// presumably a public constant kept for backward compatibility — confirm
// external users before removing.
constexpr uint64_t kBenchmarkMaxIterations = 1000000000;
// Tuning knobs for the adaptive benchmark() below. The defaults are
// appropriate for most callers; member order is part of the aggregate-
// initialization interface and must not change.
struct BenchmarkConfig {
    // Attempt to use this much time (in seconds) for the meaningful samples
    // taken; initial iterations will be done to find an iterations-per-sample
    // count that puts the total runtime in this ballpark.
    // (benchmark() clamps this to at least 10 microseconds.)
    double min_time{0.1};

    // Set an absolute upper time limit. Defaults to min_time * 4.
    // (benchmark() never lets this fall below min_time.)
    double max_time{0.1 * 4};

    // Maximum value for the computed iters-per-sample.
    // We need this for degenerate cases in which we have a
    // very coarse-grained timer (e.g. some Emscripten/browser environments),
    // and a very short operation being benchmarked; in these cases,
    // we can get timings that are effectively zero, which can explode
    // the predicted next-iter into ~100B or more. It should be unusual
    // that client code needs to adjust this value.
    uint64_t max_iters_per_sample{1000000};

    // Terminate when the relative difference between the best runtime
    // seen and the third-best runtime seen is no more than
    // this. Controls accuracy. The closer to zero this gets the more
    // reliable the answer, but the longer it may take to run.
    // (benchmark() clamps this to the range [0.001, 0.1].)
    double accuracy{0.03};
};
// Results of a single adaptive benchmark() run. Member order is part of
// the aggregate-initialization interface and must not change.
struct BenchmarkResult {
    // Best elapsed wall-clock time per iteration (seconds).
    double wall_time;

    // Number of samples used for measurement.
    // (There might be additional samples taken that are not used
    // for measurement.)
    uint64_t samples;

    // Total number of iterations across all samples.
    // (There might be additional iterations taken that are not used
    // for measurement.)
    uint64_t iterations;

    // Measured accuracy between the best and third-best result.
    // Will be <= config.accuracy unless max_time is exceeded.
    double accuracy;

    // Implicit conversion so a BenchmarkResult can be used directly
    // where a plain seconds-per-iteration value is expected.
    operator double() const {
        return wall_time;
    }
};
// Adaptive benchmark: grow iters-per-sample until a single sample takes
// roughly min_time seconds, then keep sampling until the best and
// third-best times agree to within config.accuracy (or max_time is hit).
inline BenchmarkResult benchmark(const std::function<void()> &op, const BenchmarkConfig &config = {}) {
    BenchmarkResult result{0, 0, 0};

    // Clamp config values to sane ranges: min_time at least 10us,
    // max_time no smaller than min_time, accuracy within [0.001, 0.1].
    // (accuracy is stored as a ratio >= 1.0 for direct comparison below.)
    const double min_time = std::max(10 * 1e-6, config.min_time);
    const double max_time = std::max(config.min_time, config.max_time);
    const double accuracy = 1.0 + std::min(std::max(0.001, config.accuracy), 0.1);

    // We will do (at least) kMinSamples samples; we will do additional
    // samples until the best and the kMinSamples'th-best results are
    // within the accuracy tolerance (or we run out of iterations).
    constexpr int kMinSamples = 3;
    // times[] is kept sorted ascending; the extra slot receives the
    // newest sample before re-sorting in the refinement loop below.
    double times[kMinSamples + 1] = {0};
    double total_time = 0;
    uint64_t iters_per_sample = 1;

    // Phase 1: search for an iters_per_sample that puts one sample's
    // runtime near min_time. Each pass redoes all kMinSamples samples
    // from scratch, so the counters are reset at the top.
    for (;;) {
        result.samples = 0;
        result.iterations = 0;
        total_time = 0;
        for (int i = 0; i < kMinSamples; i++) {
            times[i] = benchmark(1, iters_per_sample, op);
            result.samples++;
            result.iterations += iters_per_sample;
            // times[i] is per-iteration, so scale back up to sample time.
            total_time += times[i] * iters_per_sample;
        }
        std::sort(times, times + kMinSamples);

        // Any time result <= to this is considered 'zero' here.
        const double kTimeEpsilon = 1e-9;
        if (times[0] < kTimeEpsilon) {
            // If the fastest time is tiny, then trying to use it to predict next_iters
            // can just explode into something unpredictably huge, which could take far too
            // long to complete. Just double iters_per_sample and try again (or terminate if
            // we're over the max).
            iters_per_sample *= 2;
        } else {
            const double time_factor = std::max(times[0] * kMinSamples, kTimeEpsilon);
            // Done once the full set of samples would fill min_time.
            if (time_factor * iters_per_sample >= min_time) {
                break;
            }
            // Use an estimate based on initial times to converge faster.
            const double next_iters = std::max(min_time / time_factor,
                                               iters_per_sample * 2.0);
            // Round to nearest rather than truncating.
            iters_per_sample = (uint64_t)(next_iters + 0.5);
        }

        // Ensure we never explode beyond the max.
        if (iters_per_sample >= config.max_iters_per_sample) {
            iters_per_sample = config.max_iters_per_sample;
            break;
        }
    }

    // Phase 2: refine.
    // - Keep taking samples until we are accurate enough (even if we run over min_time).
    // - If we are already accurate enough but have time remaining, keep taking samples.
    // - No matter what, don't go over max_time; this is important, in case
    //   we happen to get faster results for the first samples, then happen to transition
    //   to throttled-down CPU state.
    while ((times[0] * accuracy < times[kMinSamples - 1] || total_time < min_time) &&
           total_time < max_time) {
        times[kMinSamples] = benchmark(1, iters_per_sample, op);
        result.samples++;
        result.iterations += iters_per_sample;
        total_time += times[kMinSamples] * iters_per_sample;
        // Re-sort so times[0..kMinSamples-1] stay the best seen so far.
        std::sort(times, times + kMinSamples + 1);
    }
    result.wall_time = times[0];
    result.accuracy = (times[kMinSamples - 1] / times[0]) - 1.0;
    return result;
}
} // namespace Tools
} // namespace Halide
#endif