https://github.com/halide/Halide
Raw File
Tip revision: f9e4c7878385f43cf88cca23d5bd663233e9e7da authored by Steven Johnson on 27 April 2021, 19:14:54 UTC
Add support for dynamic tensors to hannk (#5942)
Tip revision: f9e4c78
Profiling.cpp
#include <algorithm>
#include <limits>
#include <map>
#include <string>

#include "CodeGen_Internal.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "Profiling.h"
#include "Scope.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Util.h"

namespace Halide {
namespace Internal {

using std::map;
using std::string;
using std::vector;

namespace {

class InjectProfiling : public IRMutator {

public:
    map<string, int> indices;  // maps from func name -> index in buffer.

    vector<int> stack;  // What produce nodes are we currently inside of.

    string pipeline_name;

    InjectProfiling(const string &pipeline_name)
        : pipeline_name(pipeline_name) {
        stack.push_back(get_func_id("overhead"));
        // ID 0 is treated specially in the runtime as overhead
        internal_assert(stack.back() == 0);

        malloc_id = get_func_id("halide_malloc");
        free_id = get_func_id("halide_free");
        profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
        profiler_state = Variable::make(Handle(), "profiler_state");
        profiler_token = Variable::make(Int(32), "profiler_token");
    }

    map<int, uint64_t> func_stack_current;  // map from func id -> current stack allocation
    map<int, uint64_t> func_stack_peak;     // map from func id -> peak stack allocation

private:
    using IRMutator::visit;

    int malloc_id, free_id;
    Expr profiler_pipeline_state;
    Expr profiler_state;
    Expr profiler_token;

    struct AllocSize {
        bool on_stack;
        Expr size;
    };

    Scope<AllocSize> func_alloc_sizes;

    bool profiling_memory = true;

    // Strip down the tuple name, e.g. f.0 into f
    string normalize_name(const string &name) {
        vector<string> v = split_string(name, ".");
        internal_assert(!v.empty());
        return v[0];
    }

    int get_func_id(const string &name) {
        string norm_name = normalize_name(name);
        int idx = -1;
        map<string, int>::iterator iter = indices.find(norm_name);
        if (iter == indices.end()) {
            idx = (int)indices.size();
            indices[norm_name] = idx;
        } else {
            idx = iter->second;
        }
        return idx;
    }

    Stmt set_current_func(const Expr &id) {
        // This call gets inlined and becomes a single store instruction.
        return Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
                                         {profiler_state, profiler_token, id}, Call::Extern));
    }

    Expr compute_allocation_size(const vector<Expr> &extents,
                                 const Expr &condition,
                                 const Type &type,
                                 const std::string &name,
                                 bool &on_stack) {
        on_stack = true;

        Expr cond = simplify(condition);
        if (is_const_zero(cond)) {  // Condition always false
            return make_zero(UInt(64));
        }

        int64_t constant_size = Allocate::constant_allocation_size(extents, name);
        if (constant_size > 0) {
            int64_t stack_bytes = constant_size * type.bytes();
            if (can_allocation_fit_on_stack(stack_bytes)) {  // Allocation on stack
                return make_const(UInt(64), stack_bytes);
            }
        }

        // Check that the allocation is not scalar (if it were scalar
        // it would have constant size).
        internal_assert(!extents.empty());

        on_stack = false;
        Expr size = cast<uint64_t>(extents[0]);
        for (size_t i = 1; i < extents.size(); i++) {
            size *= extents[i];
        }
        size = simplify(Select::make(condition, size * type.bytes(), make_zero(UInt(64))));
        return size;
    }

    Stmt visit(const Allocate *op) override {
        int idx = get_func_id(op->name);

        vector<Expr> new_extents;
        bool all_extents_unmodified = true;
        for (size_t i = 0; i < op->extents.size(); i++) {
            new_extents.push_back(mutate(op->extents[i]));
            all_extents_unmodified &= new_extents[i].same_as(op->extents[i]);
        }
        Expr condition = mutate(op->condition);

        bool on_stack;
        Expr size = compute_allocation_size(new_extents, condition, op->type, op->name, on_stack);
        internal_assert(size.type() == UInt(64));
        func_alloc_sizes.push(op->name, {on_stack, size});

        // compute_allocation_size() might return a zero size, if the allocation is
        // always conditionally false. remove_dead_allocations() is called after
        // inject_profiling() so this is a possible scenario.
        if (!is_const_zero(size) && on_stack) {
            const uint64_t *int_size = as_const_uint(size);
            internal_assert(int_size != nullptr);  // Stack size is always a const int
            func_stack_current[idx] += *int_size;
            func_stack_peak[idx] = std::max(func_stack_peak[idx], func_stack_current[idx]);
            debug(3) << "  Allocation on stack: " << op->name
                     << "(" << size << ") in pipeline " << pipeline_name
                     << "; current: " << func_stack_current[idx]
                     << "; peak: " << func_stack_peak[idx] << "\n";
        }

        Stmt body = mutate(op->body);
        Expr new_expr;
        Stmt stmt;
        if (op->new_expr.defined()) {
            new_expr = mutate(op->new_expr);
        }
        if (all_extents_unmodified &&
            body.same_as(op->body) &&
            condition.same_as(op->condition) &&
            new_expr.same_as(op->new_expr)) {
            stmt = op;
        } else {
            stmt = Allocate::make(op->name, op->type, op->memory_type,
                                  new_extents, condition, body, new_expr, op->free_function);
        }

        if (!is_const_zero(size) && !on_stack && profiling_memory) {
            debug(3) << "  Allocation on heap: " << op->name
                     << "(" << size << ") in pipeline "
                     << pipeline_name << "\n";

            vector<Stmt> tasks{
                set_current_func(malloc_id),
                Evaluate::make(Call::make(Int(32), "halide_profiler_memory_allocate",
                                          {profiler_pipeline_state, idx, size}, Call::Extern)),
                stmt,
                set_current_func(stack.back())};

            stmt = Block::make(tasks);
        }

        return stmt;
    }

    Stmt visit(const Free *op) override {
        int idx = get_func_id(op->name);

        AllocSize alloc = func_alloc_sizes.get(op->name);
        internal_assert(alloc.size.type() == UInt(64));
        func_alloc_sizes.pop(op->name);

        Stmt stmt = IRMutator::visit(op);

        if (!is_const_zero(alloc.size)) {
            if (!alloc.on_stack) {
                if (profiling_memory) {
                    debug(3) << "  Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n";

                    vector<Stmt> tasks{
                        set_current_func(free_id),
                        Evaluate::make(Call::make(Int(32), "halide_profiler_memory_free",
                                                  {profiler_pipeline_state, idx, alloc.size}, Call::Extern)),
                        stmt,
                        set_current_func(stack.back())};

                    stmt = Block::make(tasks);
                }
            } else {
                const uint64_t *int_size = as_const_uint(alloc.size);
                internal_assert(int_size != nullptr);

                func_stack_current[idx] -= *int_size;
                debug(3) << "  Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name
                         << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n";
            }
        }
        return stmt;
    }

    Stmt visit(const ProducerConsumer *op) override {
        int idx;
        Stmt body;
        if (op->is_producer) {
            idx = get_func_id(op->name);
            stack.push_back(idx);
            body = mutate(op->body);
            stack.pop_back();
        } else {
            body = mutate(op->body);
            // At the beginning of the consume step, set the current task
            // back to the outer one.
            idx = stack.back();
        }

        body = Block::make(set_current_func(idx), body);

        return ProducerConsumer::make(op->name, op->is_producer, body);
    }

    Stmt incr_active_threads() {
        return Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
                                         {profiler_state}, Call::Extern));
    }

    Stmt decr_active_threads() {
        return Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
                                         {profiler_state}, Call::Extern));
    }

    Stmt visit_parallel_task(const Stmt &s) {
        if (const Fork *f = s.as<Fork>()) {
            return Fork::make(visit_parallel_task(f->first), visit_parallel_task(f->rest));
        } else if (const Acquire *a = s.as<Acquire>()) {
            return Acquire::make(a->semaphore, a->count, visit_parallel_task(a->body));
        } else {
            return Block::make({incr_active_threads(), mutate(s), decr_active_threads()});
        }
    }

    Stmt visit(const Acquire *op) override {
        Stmt s = visit_parallel_task(op);
        return Block::make({decr_active_threads(), s, incr_active_threads()});
    }

    Stmt visit(const Fork *op) override {
        Stmt s = visit_parallel_task(op);
        return Block::make({decr_active_threads(), s, incr_active_threads()});
    }

    Stmt visit(const For *op) override {
        Stmt body = op->body;

        // The for loop indicates a device transition or a
        // parallel job launch. Decrement the number of active
        // threads outside the loop, and increment it inside the
        // body.
        bool update_active_threads = (op->device_api == DeviceAPI::Hexagon ||
                                      op->is_unordered_parallel());

        if (update_active_threads) {
            body = Block::make({incr_active_threads(), body, decr_active_threads()});
        }

        // We profile by storing a token to global memory, so don't enter GPU loops
        if (op->device_api == DeviceAPI::Hexagon) {
            // TODO: This is for all offload targets that support
            // limited internal profiling, which is currently just
            // hexagon. We don't support per-func stats remotely,
            // which means we can't do memory accounting.
            bool old_profiling_memory = profiling_memory;
            profiling_memory = false;
            body = mutate(body);
            profiling_memory = old_profiling_memory;

            // Get the profiler state pointer from scratch inside the
            // kernel. There will be a separate copy of the state on
            // the DSP that the host side will periodically query.
            Expr get_state = Call::make(Handle(), "halide_profiler_get_state", {}, Call::Extern);
            body = substitute("profiler_state", Variable::make(Handle(), "hvx_profiler_state"), body);
            body = LetStmt::make("hvx_profiler_state", get_state, body);
        } else if (op->device_api == DeviceAPI::None ||
                   op->device_api == DeviceAPI::Host) {
            body = mutate(body);
        } else {
            body = op->body;
        }

        Stmt stmt = For::make(op->name, op->min, op->extent, op->for_type, op->device_api, body);

        if (update_active_threads) {
            stmt = Block::make({decr_active_threads(), stmt, incr_active_threads()});
        }
        return stmt;
    }
};

}  // namespace

Stmt inject_profiling(Stmt s, const string &pipeline_name) {
    InjectProfiling profiling(pipeline_name);
    s = profiling.mutate(s);

    int num_funcs = (int)(profiling.indices.size());

    Expr func_names_buf = Variable::make(Handle(), "profiling_func_names");

    Expr start_profiler = Call::make(Int(32), "halide_profiler_pipeline_start",
                                     {pipeline_name, num_funcs, func_names_buf}, Call::Extern);

    Expr get_state = Call::make(Handle(), "halide_profiler_get_state", {}, Call::Extern);

    Expr get_pipeline_state = Call::make(Handle(), "halide_profiler_get_pipeline_state", {pipeline_name}, Call::Extern);

    Expr profiler_token = Variable::make(Int(32), "profiler_token");

    Expr stop_profiler = Call::make(Handle(), Call::register_destructor,
                                    {Expr("halide_profiler_pipeline_end"), get_state}, Call::Intrinsic);

    bool no_stack_alloc = profiling.func_stack_peak.empty();
    if (!no_stack_alloc) {
        Expr func_stack_peak_buf = Variable::make(Handle(), "profiling_func_stack_peak_buf");

        Expr profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
        Stmt update_stack = Evaluate::make(Call::make(Int(32), "halide_profiler_stack_peak_update",
                                                      {profiler_pipeline_state, func_stack_peak_buf}, Call::Extern));
        s = Block::make(update_stack, s);
    }

    Expr profiler_state = Variable::make(Handle(), "profiler_state");
    Stmt incr_active_threads =
        Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
                                  {profiler_state}, Call::Extern));
    Stmt decr_active_threads =
        Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
                                  {profiler_state}, Call::Extern));
    s = Block::make({incr_active_threads, s, decr_active_threads});

    s = LetStmt::make("profiler_pipeline_state", get_pipeline_state, s);
    s = LetStmt::make("profiler_state", get_state, s);
    // If there was a problem starting the profiler, it will call an
    // appropriate halide error function and then return the
    // (negative) error code as the token.
    s = Block::make(AssertStmt::make(profiler_token >= 0, profiler_token), s);
    s = LetStmt::make("profiler_token", start_profiler, s);

    if (!no_stack_alloc) {
        for (int i = num_funcs - 1; i >= 0; --i) {
            s = Block::make(Store::make("profiling_func_stack_peak_buf",
                                        make_const(UInt(64), profiling.func_stack_peak[i]),
                                        i, Parameter(), const_true(), ModulusRemainder()),
                            s);
        }
        s = Block::make(s, Free::make("profiling_func_stack_peak_buf"));
        s = Allocate::make("profiling_func_stack_peak_buf", UInt(64),
                           MemoryType::Auto, {num_funcs}, const_true(), s);
    }

    for (const auto &p : profiling.indices) {
        s = Block::make(Store::make("profiling_func_names", p.first, p.second, Parameter(), const_true(), ModulusRemainder()), s);
    }

    s = Block::make(s, Free::make("profiling_func_names"));
    s = Allocate::make("profiling_func_names", Handle(),
                       MemoryType::Auto, {num_funcs}, const_true(), s);
    s = Block::make(Evaluate::make(stop_profiler), s);

    return s;
}

}  // namespace Internal
}  // namespace Halide
back to top