#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "Bounds.h"
#include "CSE.h"
#include "CodeGen_GPU_Dev.h"
#include "CompilerLogger.h"
#include "ExprUsesVar.h"
#include "FuseGPUThreadLoops.h"
#include "IR.h"
#include "IREquality.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "Monotonic.h"
#include "Simplify.h"
#include "Solve.h"
#include "Substitute.h"

namespace Halide {
namespace Internal {

using std::map;
using std::pair;
using std::sort;
using std::string;
using std::vector;

namespace {

string thread_names[] = {"__thread_id_x", "__thread_id_y", "__thread_id_z", "__thread_id_w"};
string block_names[] = {"__block_id_x", "__block_id_y", "__block_id_z", "__block_id_w"};

class ExtractBlockSize : public IRVisitor {
    Expr block_extent[4], block_count[4];
    string block_var_name[4];

    using IRVisitor::visit;

    void found_thread_for(int dim, const string &name, const Expr &extent) {
        internal_assert(dim >= 0 && dim < 4);
        if (!block_extent[dim].defined()) {
            block_extent[dim] = extent;
        } else {
            block_extent[dim] = simplify(Max::make(extent, block_extent[dim]));
        }
    }

    void found_block_for(int dim, const string &name, Expr extent) {
        internal_assert(dim >= 0 && dim < 4);
        internal_assert(!block_count[dim].defined());
        block_count[dim] = std::move(extent);
        block_var_name[dim] = name;
    }

    void visit(const For *op) override {
        for (int i = 0; i < 4; i++) {
            if (ends_with(op->name, thread_names[i])) {
                found_thread_for(i, op->name, op->extent);
            } else if (ends_with(op->name, block_names[i])) {
                found_block_for(i, op->name, op->extent);
            }
        }

        IRVisitor::visit(op);

        Scope<Interval> scope;
        scope.push(op->name, Interval(op->min, simplify(op->min + op->extent - 1)));

        // For non-rectangular thread loops, use a bounding box. We'll
        // inject if statements later.
        for (Expr &e : block_extent) {
            if (e.defined() && expr_uses_var(e, op->name)) {
                e = simplify(common_subexpression_elimination(e));
                e = simplify(bounds_of_expr_in_scope(e, scope).max);
            }
        }
    }

    void visit(const LetStmt *op) override {
        IRVisitor::visit(op);

        for (Expr &e : block_extent) {
            if (e.defined() && expr_uses_var(e, op->name)) {
                e = simplify(Let::make(op->name, op->value, e));
            }
        }
    }

public:
    int blocks_dimensions() const {
        for (int i = 0; i < 4; i++) {
            if (!block_count[i].defined()) {
                return i;
            }
        }
        return 4;
    }

    int threads_dimensions() const {
        for (int i = 0; i < 4; i++) {
            if (!block_extent[i].defined()) {
                return i;
            }
        }
        return 4;
    }

    Expr num_threads(int d) const {
        return block_extent[d];
    }

    Expr num_blocks(int d) const {
        return block_count[d];
    }

    Expr block_var(int d) const {
        return Variable::make(Int(32), block_var_name[d]);
    }

    Expr thread_var(int d) const {
        // Thread variables get canonical names
        return Variable::make(Int(32), "." + thread_names[d]);
    }
};

class NormalizeDimensionality : public IRMutator {
    using IRMutator::visit;

    const ExtractBlockSize &block_size;
    const DeviceAPI device_api;
    int depth;
    int max_depth;

    Stmt wrap(Stmt s) {
        if (depth != 0) {
            return mutate(s);
        }
        max_depth = 0;
        s = mutate(s);
        if (is_no_op(s)) {
            return s;
        }
        while (max_depth < block_size.threads_dimensions()) {
            string name = thread_names[max_depth];
            s = For::make("." + name, 0, 1, ForType::GPUThread, device_api, s);
            max_depth++;
        }
        return s;
    }

    Stmt visit(const Block *op) override {
        Stmt first = wrap(op->first);

        Stmt rest;
        if (op->rest.defined()) {
            rest = wrap(op->rest);
        }

        if (first.same_as(op->first) && rest.same_as(op->rest)) {
            return op;
        } else {
            return Block::make(first, rest);
        }
    }

    Stmt visit(const For *op) override {
        if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) {
            depth++;
            if (depth > max_depth) {
                max_depth = depth;
            }
            Stmt stmt = IRMutator::visit(op);
            depth--;
            return stmt;
        } else {
            return IRMutator::visit(op);
        }
    }

public:
    NormalizeDimensionality(const ExtractBlockSize &e, DeviceAPI device_api)
        : block_size(e), device_api(device_api), depth(0), max_depth(0) {
    }
};

class ReplaceForWithIf : public IRMutator {
    using IRMutator::visit;

    const ExtractBlockSize &block_size;

    Stmt visit(const For *op) override {
        if (CodeGen_GPU_Dev::is_gpu_thread_var(op->name)) {
            int dim;
            for (dim = 0; dim < 4; dim++) {
                if (ends_with(op->name, thread_names[dim])) {
                    break;
                }
            }

            internal_assert(dim >= 0 && dim < block_size.threads_dimensions());

            Stmt body = mutate(op->body);

            Expr var = Variable::make(Int(32), "." + thread_names[dim]);
            body = substitute(op->name, var + op->min, body);

            if (equal(op->extent, block_size.num_threads(dim))) {
                return body;
            } else {
                Expr cond = var < op->extent;
                return IfThenElse::make(cond, body, Stmt());
            }
        } else {
            return IRMutator::visit(op);
        }
    }

public:
    ReplaceForWithIf(const ExtractBlockSize &e)
        : block_size(e) {
    }
};
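// Pull GPUShared- and Heap-backed allocations out to the block level so that
// they can later be merged into a single allocation per memory type. Liveness
// is tracked in units of "barrier stages": each Block node visited at block
// level bumps barrier_stage, and two allocations whose stage ranges do not
// overlap may reuse the same storage. A rough, illustrative sketch of the
// intended effect (not literal IR):
//
//   allocate a[...] in GPUShared   // live during stages 0..1
//   allocate b[...] in GPUShared   // live during stages 2..3
//
// can share one combined shared allocation, with a and b indexing it at
// offsets chosen by the grouping logic below.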
class ExtractSharedAndHeapAllocations : public IRMutator {
    using IRMutator::visit;

    struct IntInterval {
        IntInterval()
            : IntInterval(0, 0) {
        }
        IntInterval(int min, int max)
            : min(min), max(max) {
        }
        int min;
        int max;
    };

    struct SharedAllocation {
        string name;
        Type type;
        Expr size;
        IntInterval liveness;    // Start and end of the barrier stage at which this allocation is used.
        MemoryType memory_type;  // Should be GPUShared or Heap
        bool striped_over_threads;
        bool size_computed_on_host;
    };

    struct AllocGroup {
        AllocGroup() = default;

        AllocGroup(const SharedAllocation &alloc)
            : name(alloc.name), widest_type(alloc.type),
              max_size(alloc.size), memory_type(alloc.memory_type) {
            group.push_back(alloc);
        }

        void insert(const SharedAllocation &alloc) {
            internal_assert(alloc.memory_type == memory_type);
            if (alloc.type.bytes() == widest_type.bytes()) {
                max_size = max(max_size, alloc.size);
            } else if (alloc.type.bytes() > widest_type.bytes()) {
                // Change units of max_size
                int size_ratio = alloc.type.bytes() / widest_type.bytes();
                max_size = max(max_size / size_ratio, alloc.size);
                widest_type = alloc.type;
            } else {
                int size_ratio = widest_type.bytes() / alloc.type.bytes();
                max_size = max(max_size, alloc.size / size_ratio);
            }
            group.push_back(alloc);
            name += "_" + alloc.name;
        }

        // Only need to check the back of the vector since we always insert
        // the most recent allocation at the back.
        bool is_free(int stage) const {
            return group.back().liveness.max < stage;
        }

        string name;
        Type widest_type;
        Expr max_size;                   // In units of the widest type
        vector<SharedAllocation> group;  // Groups of allocs that should be coalesced together
        MemoryType memory_type;          // All allocations in the group have this memory type
    };

public:
    vector<SharedAllocation> allocations;

private:
    map<string, SharedAllocation *> shared;
    bool in_threads;
    int barrier_stage;
    const DeviceAPI device_api;
    string thread_id_var_name, num_threads_var_name;
    bool may_merge_allocs_of_different_type;

    // A loop on the host used to compute the shared memory size
    Stmt host_side_preamble;

    void precompute_allocation_size(SharedAllocation &s) {
        Expr val = Load::make(Int(32), s.name + ".shared_size", 0, Buffer<>{},
                              Parameter{}, const_true(), ModulusRemainder{});
        Stmt update_size = Store::make(s.name + ".shared_size", max(s.size, val), 0,
                                       Parameter{}, const_true(), ModulusRemainder{});
        if (host_side_preamble.defined()) {
            host_side_preamble = Block::make(host_side_preamble, update_size);
        } else {
            host_side_preamble = update_size;
        }
        s.size_computed_on_host = true;
        s.size = Variable::make(Int(32), s.name + ".shared_size_var");
    }

    Stmt visit(const For *op) override {
        bool is_thread_loop = CodeGen_GPU_Dev::is_gpu_thread_var(op->name);
        ScopedValue<bool> old_in_threads(in_threads, in_threads || is_thread_loop);

        // Set aside the allocations we've found so far.
        vector<SharedAllocation> old;
        old.swap(allocations);

        // And any preamble
        Stmt old_preamble = host_side_preamble;
        host_side_preamble = Stmt();

        // Find allocations inside the loop body
        Stmt body = mutate(op->body);

        // Expand any new shared allocations found in the body using the loop bounds.
        Scope<Interval> scope;
        scope.push(op->name, Interval(op->min, simplify(op->min + op->extent - 1)));
        for (SharedAllocation &s : allocations) {
            // If the size depends on the loop variable, take the max
            // over all loop iterations
            if (expr_uses_var(s.size, op->name) && !s.size_computed_on_host) {
                s.size = simplify(common_subexpression_elimination(s.size));
                // It's worth working extra hard to remove any
                // repeated dependence on the block var
                s.size = solve_expression(s.size, op->name).result;
                s.size = simplify(common_subexpression_elimination(s.size));
                switch (is_monotonic(s.size, op->name)) {
                case Monotonic::Unknown:
                    // TODO: if bounds_of_expr_in_scope becomes more
                    // powerful than is_monotonic, it might be better
                    // to call it here. That would be risky though, as
                    // it's not exact.
                    debug(1) << "Shared allocation for " << s.name
                             << " has a size that is non-monotonic in the gpu block variable "
                             << op->name << ": " << s.size << "\n";
                    if (get_compiler_logger()) {
                        get_compiler_logger()->record_non_monotonic_loop_var(op->name, s.size);
                    }
                    precompute_allocation_size(s);
                    break;
                case Monotonic::Increasing:
                    s.size = substitute(op->name, simplify(op->min + op->extent - 1), s.size);
                    break;
                case Monotonic::Constant:
                    // The size expression used the variable, but we
                    // may have successfully eliminated it above, or
                    // is_monotonic might have detected that the
                    // dependence is false somehow. Just treat it as
                    // decreasing...
                case Monotonic::Decreasing:
                    s.size = substitute(op->name, op->min, s.size);
                    break;
                }
            }
            if (in_threads && op->is_parallel()) {
                // For parallel inner loops, make a separate slice per loop iteration
                s.size *= op->extent;
            }
        }

        // Add back on the allocations we set aside.
        if (!allocations.empty()) {
            allocations.insert(allocations.end(), old.begin(), old.end());
        } else {
            allocations.swap(old);
        }

        Expr new_min = mutate(op->min);
        Expr new_extent = mutate(op->extent);

        if (host_side_preamble.defined()) {
            string loop_name = unique_name('t');
            Expr v = Variable::make(Int(32), loop_name);
            host_side_preamble = substitute(op->name, v, host_side_preamble);
            host_side_preamble = For::make(loop_name, new_min, new_extent,
                                           ForType::Serial, DeviceAPI::None, host_side_preamble);
            if (old_preamble.defined()) {
                host_side_preamble = Block::make(old_preamble, host_side_preamble);
            }
        } else {
            host_side_preamble = old_preamble;
        }

        return For::make(op->name, new_min, new_extent, op->for_type, op->device_api, body);
    }

    Stmt visit(const Block *op) override {
        if (!in_threads && op->rest.defined()) {
            Stmt first = mutate(op->first);
            barrier_stage++;
            Stmt rest = mutate(op->rest);

            if (first.same_as(op->first) && rest.same_as(op->rest)) {
                return op;
            } else {
                return Block::make(first, rest);
            }
        } else {
            return IRMutator::visit(op);
        }
    }

    Stmt visit(const IfThenElse *op) override {
        Expr condition = mutate(op->condition);

        Stmt before_preamble = host_side_preamble;
        host_side_preamble = Stmt();

        Stmt then_case = mutate(op->then_case);
        Stmt then_preamble = host_side_preamble;
        host_side_preamble = Stmt();

        Stmt else_case = mutate(op->else_case);
        Stmt else_preamble = host_side_preamble;

        if (then_preamble.defined()) {
            host_side_preamble = IfThenElse::make(condition, then_preamble, else_preamble);
        } else if (else_preamble.defined()) {
            host_side_preamble = IfThenElse::make(!condition, else_preamble);
        }

        if (before_preamble.defined() && host_side_preamble.defined()) {
            host_side_preamble = Block::make(before_preamble, host_side_preamble);
        } else if (before_preamble.defined()) {
            host_side_preamble = before_preamble;
        }

        return IfThenElse::make(condition, then_case, else_case);
    }

    int alloc_node_counter = 0;

    Stmt visit(const Allocate *op) override {
        user_assert(!op->new_expr.defined())
            << "Allocate node inside GPU kernel has custom new expression.\n"
            << "(Memoization is not supported inside GPU kernels at present.)\n";

        bool fixed_size_thread_allocation = (op->constant_allocation_size() != 0) && in_threads;

        if ((fixed_size_thread_allocation &&
             op->memory_type != MemoryType::Heap &&
             op->memory_type != MemoryType::GPUShared &&
             op->memory_type != MemoryType::GPUTexture) ||
            op->memory_type == MemoryType::Register ||
            op->memory_type == MemoryType::Stack) {
            // These allocations go in register or local memory
            return IRMutator::visit(op);
        }

        user_assert(op->memory_type == MemoryType::Auto ||
                    op->memory_type == MemoryType::GPUShared ||
                    op->memory_type == MemoryType::GPUTexture ||
                    op->memory_type == MemoryType::Heap)
            << "Allocation " << op->name << " must live in shared or heap memory, "
            << "but is scheduled to live in " << op->memory_type << " memory.\n";

        SharedAllocation alloc;
        alloc.name = op->name + "." + std::to_string(alloc_node_counter++);
        alloc.type = op->type;
        alloc.liveness = IntInterval(barrier_stage, barrier_stage);
        alloc.size = 1;
        for (const auto &extent : op->extents) {
            alloc.size *= extent;
        }
        alloc.size = simplify(alloc.size);
        alloc.memory_type = op->memory_type;
        alloc.size_computed_on_host = false;
        alloc.striped_over_threads = in_threads;
        if (alloc.memory_type == MemoryType::Auto) {
            if (in_threads) {
                // Dynamic allocations within the threads loop go on
                // the heap by default.
                alloc.memory_type = MemoryType::Heap;
            } else {
                // Allocations at the blocks level go in shared by
                // default.
                alloc.memory_type = MemoryType::GPUShared;
            }
        }

        // Updates the liveness by checking for all uses
        shared.emplace(op->name, &alloc);
        Stmt stmt = IRMutator::visit(op);
        op = stmt.as<Allocate>();
        internal_assert(op);
        allocations.push_back(alloc);
        shared.erase(op->name);
        return op->body;
    }

    Expr mutate_index(SharedAllocation *alloc, const Expr &index) {
        Expr idx = mutate(index);
        if (alloc->striped_over_threads) {
            idx *= Variable::make(Int(32), num_threads_var_name);
            idx += Variable::make(Int(32), thread_id_var_name);
        }
        return idx;
    }

    Expr visit(const Load *op) override {
        auto it = shared.find(op->name);
        if (it != shared.end()) {
            SharedAllocation *alloc = it->second;
            alloc->liveness.max = barrier_stage;
            Expr predicate = mutate(op->predicate);
            Expr index = mutate_index(alloc, op->index);
            return Load::make(op->type, alloc->name, index, op->image,
                              op->param, predicate, op->alignment);
        } else {
            return IRMutator::visit(op);
        }
    }

    Stmt visit(const Store *op) override {
        auto it = shared.find(op->name);
        if (it != shared.end()) {
            SharedAllocation *alloc = it->second;
            alloc->liveness.max = barrier_stage;
            Expr predicate = mutate(op->predicate);
            Expr index = mutate_index(alloc, op->index);
            Expr value = mutate(op->value);
            return Store::make(alloc->name, value, index, op->param, predicate, op->alignment);
        } else {
            return IRMutator::visit(op);
        }
    }

    Stmt visit(const LetStmt *op) override {
        Expr value = mutate(op->value);

        // Set aside the allocations we've found so far.
        Stmt old_preamble = host_side_preamble;
        host_side_preamble = Stmt();
        vector<SharedAllocation> old;
        old.swap(allocations);

        Stmt body = mutate(op->body);

        // Wrap let expression for any allocations found within
        for (SharedAllocation &s : allocations) {
            if (expr_uses_var(s.size, op->name) && !s.size_computed_on_host) {
                s.size = Let::make(op->name, op->value, s.size);
                s.size = simplify(s.size);
            }
        }

        if (host_side_preamble.defined() &&
            stmt_uses_var(host_side_preamble, op->name)) {
            host_side_preamble = LetStmt::make(op->name, op->value, host_side_preamble);
        }
        if (old_preamble.defined()) {
            if (host_side_preamble.defined()) {
                host_side_preamble = Block::make(old_preamble, host_side_preamble);
            } else {
                host_side_preamble = old_preamble;
            }
        }

        // Add back on the allocations we set aside.
        if (!allocations.empty()) {
            allocations.insert(allocations.end(), old.begin(), old.end());
        } else {
            allocations.swap(old);
        }

        if (op->body.same_as(body) && value.same_as(op->value)) {
            return op;
        } else {
            return LetStmt::make(op->name, value, body);
        }
    }

    // Return the index into free_spaces where 'alloc' should be coalesced.
    // Return -1 if there isn't any suitable space.
    int find_best_fit(const vector<AllocGroup> &mem_allocs,
                      const vector<int> &free_spaces,
                      const SharedAllocation &alloc, int stage) {
        int free_idx = -1;
        Expr alloc_size = simplify(alloc.size);

        // We prefer to coalesce a dynamic-sized allocation with a dynamic-sized one
        // and a constant-sized alloc with a constant-sized one. If we can't find any
        // free space with a matching type, we pick the most-recently freed space of
        // the other type (e.g. pick a constant-sized free space for a dynamic-sized
        // allocation and vice versa). We prefer the most-recently freed space, as
        // stages that are close together usually have relatively similar allocation
        // sizes. For a constant-sized allocation, we prioritize the free space whose
        // size differs the least from 'alloc' (it can be smaller or larger; it does
        // not really matter since we take the max of the two as the new size).
        if (!is_const(alloc_size)) {  // dynamic-sized alloc
            for (int i = free_spaces.size() - 1; i >= 0; --i) {
                internal_assert(free_spaces[i] >= 0 && free_spaces[i] < (int)mem_allocs.size());
                internal_assert(mem_allocs[free_spaces[i]].is_free(stage));

                if (mem_allocs[free_spaces[i]].memory_type != alloc.memory_type) {
                    continue;
                }

                if (!may_merge_allocs_of_different_type &&
                    mem_allocs[free_spaces[i]].group[0].type != alloc.type) {
                    // Types must also match for OpenGLCompute
                    continue;
                }

                if (!is_const(mem_allocs[free_spaces[i]].max_size)) {
                    return i;
                } else if (free_idx == -1) {
                    free_idx = i;
                }
            }
        } else {  // constant-sized alloc
            int64_t diff = -1;
            for (int i = free_spaces.size() - 1; i >= 0; --i) {
                internal_assert(free_spaces[i] >= 0 && free_spaces[i] < (int)mem_allocs.size());
                internal_assert(mem_allocs[free_spaces[i]].is_free(stage));

                if (mem_allocs[free_spaces[i]].memory_type != alloc.memory_type) {
                    continue;
                }

                if (!may_merge_allocs_of_different_type &&
                    mem_allocs[free_spaces[i]].group[0].type != alloc.type) {
                    // Types must also match for OpenGLCompute
                    continue;
                }

                if (is_const(mem_allocs[free_spaces[i]].max_size)) {
                    const auto &candidate_group = mem_allocs[free_spaces[i]];
                    Expr size = alloc_size * alloc.type.bytes();
                    Expr dist = candidate_group.max_size * candidate_group.widest_type.bytes() - size;
                    const int64_t *current_diff = as_const_int(simplify(dist));
                    internal_assert(current_diff != nullptr);
                    int64_t abs_diff = std::abs(*current_diff);
                    if ((free_idx == -1) || (abs_diff < diff)) {
                        diff = abs_diff;
                        free_idx = i;
                    }
                } else if (free_idx == -1) {
                    free_idx = i;
                }
            }
        }

        return free_idx;
    }
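    // The grouping below is a greedy linear scan over barrier stages: at each
    // stage, allocations that become live are either folded into a free group
    // chosen by find_best_fit or start a new group, and allocations whose live
    // range ended at the previous stage hand their group back to the free
    // list. For example (illustrative): with A live over stages 0..1 and B
    // live over stages 2..3, A's group is freed when stage 2 is processed, so
    // B coalesces into it instead of creating a second group.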
    // Given some allocations, return a vector of allocation groups, where each
    // group consists of a number of allocations which should be coalesced
    // together in the shared memory.
    vector<AllocGroup> allocate_funcs(vector<SharedAllocation> &allocations) {
        // Sort based on the ascending order of the min liveness stage,
        // then on the ascending order of the max liveness stage.
        sort(allocations.begin(), allocations.end(),
             [](const SharedAllocation &lhs, const SharedAllocation &rhs) {
                 if (lhs.liveness.min < rhs.liveness.min) {
                     return true;
                 } else if (lhs.liveness.min == rhs.liveness.min) {
                     return lhs.liveness.max < rhs.liveness.max;
                 }
                 return false;
             });

        vector<AllocGroup> mem_allocs;
        vector<int> free_spaces;  // Contains indices of free spaces in mem_allocs
        int start_idx = 0;
        for (int stage = 0; stage <= barrier_stage; ++stage) {
            for (int i = start_idx; i < (int)allocations.size(); ++i) {
                if (allocations[i].liveness.min > stage) {
                    break;
                } else if (allocations[i].liveness.min == stage) {
                    // Allocate
                    int free_idx = find_best_fit(mem_allocs, free_spaces, allocations[i], stage);
                    if (free_idx != -1) {
                        mem_allocs[free_spaces[free_idx]].insert(allocations[i]);
                        free_spaces.erase(free_spaces.begin() + free_idx);
                    } else {
                        mem_allocs.emplace_back(allocations[i]);
                    }
                } else if (allocations[i].liveness.max == stage - 1) {
                    // Free
                    int free_idx = -1;
                    for (int j = 0; j < (int)mem_allocs.size(); ++j) {
                        // Find the index of the space to free
                        if (mem_allocs[j].group.back().name == allocations[i].name) {
                            free_idx = j;
                            break;
                        }
                    }
                    internal_assert(free_idx >= 0 && free_idx < (int)mem_allocs.size());
                    free_spaces.push_back(free_idx);
                    start_idx = i + 1;
                }
            }
        }

        return mem_allocs;
    }

    Expr get_block_id(const ExtractBlockSize &bs) const {
        Expr block_id = 0;
        for (int d = bs.blocks_dimensions() - 1; d >= 0; d--) {
            block_id *= bs.num_blocks(d);
            block_id += bs.block_var(d);
        }
        return block_id;
    }

    Expr max_over_blocks(const Expr &e, const ExtractBlockSize &bs) const {
        Scope<Interval> scope;
        for (int d = 0; d < bs.blocks_dimensions(); d++) {
            scope.push(bs.block_var(d).as<Variable>()->name, Interval(0, bs.num_blocks(d) - 1));
        }
        Interval in = bounds_of_expr_in_scope(simplify(e), scope);
        if (in.has_upper_bound()) {
            return in.max;
        } else {
            return Expr();
        }
    }

    struct GlobalAllocation {
        string name;
        Expr size;
        Type type;
    };
    vector<GlobalAllocation> global_allocations;
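    // rewrap_block (below) stitches the extracted allocations back in around
    // the fused block body. Groups with disjoint lifetimes are clustered per
    // memory type (and, for APIs that cannot mix element types in one buffer,
    // per element type); each cluster becomes one Allocate node (or one
    // heap-backed global allocation), and every original allocation is
    // rewritten to index into its cluster at an offset defined by a chain of
    // LetStmts. Roughly, with illustrative names rather than literal output:
    //
    //   let a__b.size = ...
    //   let a__b.0.offset = 0                  // or block_id * a__b.size for heap slices
    //   let a__b.1.offset = a__b.0.offset + (size of group 0)
    //   allocate a__b[a__b.size] in GPUShared
    //     ... loads/stores rewritten to a__b at their group offsets ...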
public:
    Stmt rewrap_block(Stmt s, const ExtractBlockSize &bs) {
        // Combine the allocations into groups that have disjoint
        // lifetimes, and then cluster the groups according to which
        // ones can share a single allocation. For cuda, opencl, and
        // similar we get one big combined allocation per memory
        // type. For vulkan, openglcompute and direct3d, we also
        // separate by element type.
        map<pair<MemoryType, Type>, vector<AllocGroup>> clustered_allocs;
        {
            vector<AllocGroup> mem_allocs = allocate_funcs(allocations);

            // Every allocation must belong to one group
            internal_assert(allocations.size() >= mem_allocs.size());

            // Sort the allocations by the max size in bytes of the primitive
            // types in the group. Because the type sizes are then decreasing
            // powers of two, doing this guarantees that all allocations are
            // aligned to their element type as long as the original one is
            // aligned to the widest type.
            sort(mem_allocs.begin(), mem_allocs.end(),
                 [](const AllocGroup &lhs, const AllocGroup &rhs) {
                     return lhs.widest_type.bytes() > rhs.widest_type.bytes();
                 });

            for (const auto &alloc : mem_allocs) {
                Type t = may_merge_allocs_of_different_type ? UInt(8) : alloc.widest_type;
                pair<MemoryType, Type> key{alloc.memory_type, t};
                clustered_allocs[key].push_back(alloc);
            }
        }

        for (auto &p : clustered_allocs) {
            vector<AllocGroup> &cluster = p.second;

            // Heap or shared?
            MemoryType memory_type = p.first.first;

            // Type of the combined Allocate node
            Type alloc_type = p.first.second;

            // Figure out a name for the cluster, the total size of
            // the cluster (in terms of the alloc_type), and the
            // widest type in the cluster (which may be wider than the
            // alloc_type).
            string name;
            Expr total_size = 0;
            Type widest_type;
            for (const auto &alloc : cluster) {
                if (name.empty()) {
                    name = alloc.name;
                    widest_type = alloc.widest_type;
                } else {
                    name += "__" + alloc.name;
                    if (alloc.widest_type.bytes() > widest_type.bytes()) {
                        widest_type = alloc.widest_type;
                    }
                }
                int ratio = alloc.widest_type.bytes() / alloc_type.bytes();
                internal_assert(ratio != 0)
                    << "alloc_type should have been at most as wide as the widest type in the group\n";
                total_size += alloc.max_size * ratio;
            }

            // Upgrade the alloc type to the widest type found, and
            // downgrade total_size accordingly.
            int ratio = widest_type.bytes() / alloc_type.bytes();
            internal_assert(ratio != 0)
                << "alloc_type should have been at most as wide as the widest type in the cluster\n";
            if (ratio != 1) {
                total_size += ratio - 1;
                total_size /= ratio;
            }
            alloc_type = widest_type;

            // Remove any dependence on the block vars by taking a max
            {
                Expr size = max_over_blocks(total_size, bs);
                internal_assert(size.defined())
                    << memory_type
                    << " memory used by GPU kernel varies with the block index in an unbounded way: "
                    << total_size << "\n";
                total_size = size;
            }

            const string total_size_name = name + ".size";
            Expr total_size_var = Variable::make(Int(32), total_size_name);

            // Make the allocation
            if (memory_type == MemoryType::Heap) {
                global_allocations.push_back(GlobalAllocation{name, total_size, alloc_type});
            } else {
                s = Allocate::make(name, alloc_type, memory_type, {total_size_var}, const_true(), s);
            }

            // Define a group offset for each group in the
            // cluster. The group offsets are in elements of
            // widest_type across the entire cluster. Using that,
            // define an individual offset for each allocation in the
            // group, using units of that allocation's type.
            for (int i = (int)(cluster.size()) - 1; i >= 0; i--) {
                Expr group_offset = Variable::make(Int(32), name + "." + std::to_string(i) + ".offset");

                for (const SharedAllocation &alloc : cluster[i].group) {
                    // Change units, as described above.
                    Expr offset = group_offset;
                    internal_assert(alloc.type.bytes() <= widest_type.bytes());
                    if (alloc.type.bytes() < widest_type.bytes()) {
                        offset *= (widest_type.bytes() / alloc.type.bytes());
                    }
                    offset = simplify(offset);

                    // Rewrite all loads and stores to point to the allocation
                    // cluster they belong to, with the appropriate offset into it.
                    class RewriteGroupAccess : public IRMutator {
                        using IRMutator::visit;

                        Expr visit(const Load *op) override {
                            if (op->name == alloc_name) {
                                return Load::make(op->type, cluster_name,
                                                  mutate(op->index) + offset,
                                                  op->image, op->param,
                                                  mutate(op->predicate), op->alignment);
                            } else {
                                return IRMutator::visit(op);
                            }
                        }

                        Stmt visit(const Store *op) override {
                            if (op->name == alloc_name) {
                                return Store::make(cluster_name, mutate(op->value),
                                                   mutate(op->index) + offset,
                                                   op->param, mutate(op->predicate), op->alignment);
                            } else {
                                return IRMutator::visit(op);
                            }
                        }

                        const string &alloc_name;
                        const string &cluster_name;
                        const Expr &offset;

                    public:
                        RewriteGroupAccess(const string &alloc_name,
                                           const string &cluster_name,
                                           const Expr &offset)
                            : alloc_name(alloc_name), cluster_name(cluster_name), offset(offset) {
                        }
                    } rewriter{alloc.name, name, offset};
                    s = rewriter.mutate(s);
                }

                // Define the group offset in terms of the previous group in the cluster
                Expr offset;
                if (i > 0) {
                    // Build off the last offset
                    offset = Variable::make(Int(32), name + "." + std::to_string(i - 1) + ".offset");
                    int ratio = (widest_type.bytes() / cluster[i - 1].widest_type.bytes());
                    internal_assert(ratio != 0);
                    offset += simplify((cluster[i - 1].max_size + ratio - 1) / ratio);
                } else {
                    if (memory_type == MemoryType::Heap) {
                        // One slice of a larger global allocation
                        offset = get_block_id(bs) * total_size_var;
                    } else {
                        // Base address for shared memory is zero
                        offset = 0;
                    }
                }

                s = LetStmt::make(group_offset.as<Variable>()->name, simplify(offset), s);
            }

            s = LetStmt::make(total_size_name, total_size, s);
        }

        // Resolve the thread_id and threads_per_block variables, uses of
        // which were injected above if any allocation was striped
        // over the threads.
        Expr thread_id = 0, num_threads = 1;
        for (int d = bs.threads_dimensions() - 1; d >= 0; d--) {
            num_threads *= bs.num_threads(d);
            thread_id *= bs.num_threads(d);
            thread_id += bs.thread_var(d);
        }
        if (stmt_uses_var(s, thread_id_var_name)) {
            s = LetStmt::make(thread_id_var_name, thread_id, s);
        }
        if (stmt_uses_var(s, num_threads_var_name)) {
            s = LetStmt::make(num_threads_var_name, num_threads, s);
        }

        return s;
    }

    Stmt rewrap_kernel_launch(Stmt s, const ExtractBlockSize &bs, DeviceAPI device_api) {
        for (const auto &alloc : global_allocations) {
            Expr total_size = alloc.size;

            Expr device_interface = make_device_interface_call(device_api);
            string buffer_name = alloc.name + ".buffer";
            Expr buffer_var = Variable::make(type_of<struct halide_buffer_t *>(), buffer_name);

            BufferBuilder builder;
            builder.mins.emplace_back(0);
            builder.extents.push_back(total_size);
            builder.strides.emplace_back(1);
            builder.type = alloc.type;
            builder.dimensions = 1 + bs.blocks_dimensions();
            for (int d = 0; d < bs.blocks_dimensions(); d++) {
                Expr next_stride = builder.strides.back() * builder.extents.back();
                builder.strides.push_back(next_stride);
                builder.extents.emplace_back(bs.num_blocks(d));
            }
            Expr buffer = builder.build();

            Expr allocate_heap_call = Call::make(Int(32), "halide_device_malloc",
                                                 {buffer_var, device_interface}, Call::Extern);
            string allocate_heap_result_var_name = unique_name('t');
            Expr allocate_heap_result_var = Variable::make(Int(32), allocate_heap_result_var_name);
            Stmt check_allocated = AssertStmt::make(allocate_heap_result_var == 0,
                                                    allocate_heap_result_var);
            Expr device_field = Call::make(Handle(), Call::buffer_get_device, {buffer_var}, Call::Extern);

            s = LetStmt::make(alloc.name, device_field, s);
            s = Block::make(check_allocated, s);
            s = LetStmt::make(allocate_heap_result_var_name, allocate_heap_call, s);
            s = Allocate::make(buffer_name, alloc.type, MemoryType::Auto, {},
                               const_true(), s, buffer, "halide_device_free_as_destructor");
        }

        s = compute_shared_memory_sizes_on_host(s);

        return s;
    }
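    // When a shared allocation's size is non-monotonic in a GPU block
    // variable, the size can't be resolved by substituting the loop bounds.
    // Instead, precompute_allocation_size records a host-side loop
    // (host_side_preamble) that scans the block extent and takes a running
    // max into a small scratch buffer named <alloc>.shared_size. The helper
    // below runs that preamble ahead of the kernel launch and defines the
    // <alloc>.shared_size_var lets that the kernel's size expressions use.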
    Stmt compute_shared_memory_sizes_on_host(Stmt result) {
        if (!host_side_preamble.defined()) {
            return result;
        }

        // Make all the let stmts that define the size vars
        for (auto &alloc : allocations) {
            if (alloc.size_computed_on_host) {
                string alloc_name = alloc.name + ".shared_size";
                string var_name = alloc.name + ".shared_size_var";
                Expr val = Load::make(Int(32), alloc_name, 0, Buffer<>{},
                                      Parameter{}, const_true(), ModulusRemainder{});
                result = LetStmt::make(var_name, val, result);
                alloc.size = Variable::make(Int(32), var_name);
            }
        }

        // Prefix the preamble
        result = Block::make(host_side_preamble, result);

        // Wrap the preamble in all the allocation nodes
        for (auto &alloc : allocations) {
            if (alloc.size_computed_on_host) {
                string alloc_name = alloc.name + ".shared_size";
                Stmt init = Store::make(alloc_name, 0, 0, Parameter{}, const_true(), ModulusRemainder{});
                result = Block::make(init, result);
                result = Allocate::make(alloc_name, Int(32), MemoryType::Stack,
                                        {1}, const_true(), result);
            }
        }
        return result;
    }

    ExtractSharedAndHeapAllocations(DeviceAPI d)
        : in_threads(false),
          barrier_stage(0),
          device_api(d),
          thread_id_var_name(unique_name('t')),
          num_threads_var_name(unique_name('t')),
          may_merge_allocs_of_different_type(device_api != DeviceAPI::OpenGLCompute &&
                                             device_api != DeviceAPI::D3D12Compute &&
                                             device_api != DeviceAPI::Vulkan &&
                                             device_api != DeviceAPI::WebGPU) {
    }
};

// Pull out any allocate node outside of the innermost thread
// block. Should only be run after shared allocations have already
// been extracted.
class ExtractRegisterAllocations : public IRMutator {
    using IRMutator::visit;

    struct RegisterAllocation {
        string name;
        string loop_var;  // The nearest enclosing loop over threads. Empty if it's at block level.
        Type type;
        Expr size;
        MemoryType memory_type;  // Should be Auto, Stack, or Register
    };

    bool in_lane_loop = false;

    Stmt visit(const For *op) override {
        ScopedValue<string> old_loop_var(loop_var);

        if (op->for_type == ForType::GPULane) {
            loop_var = op->name;
            internal_assert(!in_lane_loop);
            ScopedValue<bool> old_in_lane_loop(in_lane_loop, true);
            has_lane_loop = true;
            return IRMutator::visit(op);
        } else {
            if (op->for_type == ForType::GPUThread) {
                has_thread_loop = true;
                loop_var = op->name;
            }

            // Hoisting an allocation out of a vectorized for loop
            // would break here. We should already have hoisted
            // vectorized allocations.
            internal_assert(op->for_type != ForType::Vectorized);

            // Set aside the allocations we've found so far.
            vector<RegisterAllocation> old;
            old.swap(allocations);

            // Find allocations inside the loop body
            Stmt body = mutate(op->body);

            // Expand any new register allocations found in the body
            // using the loop bounds.
            Scope<Interval> scope;
            scope.push(op->name, Interval(op->min, simplify(op->min + op->extent - 1)));
            for (RegisterAllocation &s : allocations) {
                if (expr_uses_var(s.size, op->name)) {
                    s.size = bounds_of_expr_in_scope(s.size, scope).max;
                }
            }

            // Add back on the allocations we set aside.
            if (!allocations.empty()) {
                allocations.insert(allocations.end(), old.begin(), old.end());
            } else {
                allocations.swap(old);
            }

            return For::make(op->name, mutate(op->min), mutate(op->extent),
                             op->for_type, op->device_api, body);
        }
    }

    int alloc_node_counter = 0;
    Scope<string> alloc_renaming;

    Stmt visit(const Allocate *op) override {
        if (in_lane_loop) {
            return IRMutator::visit(op);
        }

        user_assert(op->memory_type == MemoryType::Stack ||
                    op->memory_type == MemoryType::Register ||
                    op->memory_type == MemoryType::Heap ||
                    op->memory_type == MemoryType::Auto)
            << "Allocation " << op->name
            << " is scheduled inside a loop over GPU threads, so "
            << "it must live in stack memory, heap memory, or registers. "
            << "Shared allocations at this loop level are not yet supported.\n";

        ScopedBinding<int> p(register_allocations, op->name, 0);

        RegisterAllocation alloc;
        alloc.name = op->name + "." + std::to_string(alloc_node_counter++);
        alloc.type = op->type;
        alloc.size = 1;
        alloc.loop_var = loop_var;
        for (const auto &extent : op->extents) {
            alloc.size *= extent;
        }
        alloc.size = simplify(mutate(alloc.size));
        alloc.memory_type = op->memory_type;
        allocations.push_back(alloc);

        {
            ScopedBinding<string> bind(alloc_renaming, op->name, alloc.name);
            return mutate(op->body);
        }
    }

    Expr visit(const Load *op) override {
        string new_name = op->name;
        if (alloc_renaming.contains(op->name)) {
            new_name = alloc_renaming.get(op->name);
        }
        return Load::make(op->type, new_name, mutate(op->index),
                          op->image, op->param, mutate(op->predicate), op->alignment);
    }

    Stmt visit(const Store *op) override {
        string new_name = op->name;
        if (alloc_renaming.contains(op->name)) {
            new_name = alloc_renaming.get(op->name);
        }
        return Store::make(new_name, mutate(op->value), mutate(op->index),
                           op->param, mutate(op->predicate), op->alignment);
    }

    template<typename ExprOrStmt, typename LetOrLetStmt>
    ExprOrStmt visit_let(const LetOrLetStmt *op) {
        ExprOrStmt body = op->body;

        body = mutate(op->body);

        Expr value = mutate(op->value);
        for (RegisterAllocation &s : allocations) {
            if (expr_uses_var(s.size, op->name)) {
                s.size = simplify(Let::make(op->name, value, s.size));
            }
        }

        if (op->body.same_as(body) && op->value.same_as(value)) {
            return op;
        } else {
            return LetOrLetStmt::make(op->name, value, body);
        }
    }

    Expr visit(const Let *op) override {
        return visit_let<Expr>(op);
    }

    Stmt visit(const LetStmt *op) override {
        return visit_let<Stmt>(op);
    }

    Scope<int> register_allocations;
    string loop_var;

public:
    vector<RegisterAllocation> allocations;

    Stmt rewrap(Stmt body, const string &loop_var) {
        for (RegisterAllocation &alloc : allocations) {
            if ((!loop_var.empty() && ends_with(alloc.loop_var, loop_var)) ||
                (loop_var.empty() && alloc.loop_var.empty())) {
                body = Allocate::make(alloc.name, alloc.type, alloc.memory_type,
                                      {alloc.size}, const_true(), body);
            }
        }
        return body;
    }

    bool has_lane_loop = false;
    bool has_thread_loop = false;
};
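// Insert gpu_thread_barrier intrinsics between consecutive statements at the
// block level. The Block visitor walks the "rest" statement first to record
// which shared/device buffers it loads from, then the "first" statement to
// record which buffers it stores to; if first writes something that rest
// reads, the barrier carries the matching memory-fence mask (Shared and/or
// Device). Serial loops at block level that required an internal barrier also
// get one at the end of each iteration, so the next iteration starts
// synchronized.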
class InjectThreadBarriers : public IRMutator {
    bool in_threads, injected_barrier;

    using IRMutator::visit;

    const ExtractSharedAndHeapAllocations &block_allocs;
    const ExtractRegisterAllocations &register_allocs;

    std::set<string> shared_stores;
    std::set<string> device_stores;
    std::set<string> shared_loads;
    std::set<string> device_loads;

    MemoryType memory_type_for_name(const std::string &name) {
        for (const auto &x : register_allocs.allocations) {
            if (x.name == name) {
                return x.memory_type;
            }
        }
        for (const auto &x : block_allocs.allocations) {
            if (x.name == name) {
                return x.memory_type;
            }
        }
        // Not allocated here, so must assume it's an input/output
        // of the shader
        return MemoryType::Auto;
    }

    Stmt make_barrier(int mask) {
        return Evaluate::make(Call::make(Int(32), Call::gpu_thread_barrier,
                                         {IntImm::make(Int(32), mask)},
                                         Call::Intrinsic));
    }

    Stmt visit(const For *op) override {
        ScopedValue<bool> old_in_threads(in_threads,
                                         (in_threads ||
                                          op->for_type == ForType::GPUThread ||
                                          op->for_type == ForType::GPULane));
        ScopedValue<bool> old_injected_barrier(injected_barrier, false);

        if (!is_parallel(op->for_type)) {
            Stmt body = mutate(op->body);

            // Serial for loops at the block level with internal
            // synchronization also need synchronization after each
            // loop iteration.
            if (!in_threads && injected_barrier) {
                // Any memory access fences should be handled by the
                // synchronizations within the block
                body = Block::make(body, make_barrier(0));
            }

            return For::make(op->name, op->min, op->extent,
                             op->for_type, op->device_api, body);
        } else {
            return IRMutator::visit(op);
        }
    }

    Stmt visit(const Store *op) override {
        debug(4) << "Encountered store to " << op->name << "\n";
        auto mem_type = memory_type_for_name(op->name);
        switch (mem_type) {
        case MemoryType::GPUShared:
            debug(4) << "  memory type is shared\n";
            shared_stores.insert(op->name);
            break;
        case MemoryType::Auto:
        case MemoryType::Heap:
        case MemoryType::GPUTexture:
            debug(4) << "  memory type is heap or auto\n";
            device_stores.insert(op->name);
            break;
        case MemoryType::Stack:
        case MemoryType::Register:
        case MemoryType::LockedCache:
        case MemoryType::VTCM:
        case MemoryType::AMXTile:
            break;
        }
        return IRMutator::visit(op);
    }

    Expr visit(const Load *op) override {
        debug(4) << "Encountered load from " << op->name << "\n";
        auto mem_type = memory_type_for_name(op->name);
        switch (mem_type) {
        case MemoryType::GPUShared:
            debug(4) << "  memory type is shared\n";
            shared_loads.insert(op->name);
            break;
        case MemoryType::Auto:
        case MemoryType::Heap:
        case MemoryType::GPUTexture:
            debug(4) << "  memory type is heap or auto\n";
            device_loads.insert(op->name);
            break;
        case MemoryType::Stack:
        case MemoryType::Register:
        case MemoryType::LockedCache:
        case MemoryType::VTCM:
        case MemoryType::AMXTile:
            break;
        }
        return IRMutator::visit(op);
    }

    Stmt visit(const Block *op) override {
        if (!in_threads && op->rest.defined()) {
            // First, we record which loads from shared/device memory occur
            // in the rest block
            Stmt rest = mutate(op->rest);

            // Now, record which stores occur in the first stmt
            // of this block
            shared_stores.clear();
            device_stores.clear();
            Stmt first = mutate(op->first);

            // If there are any loads in the rest part that load from
            // something stored in first, insert the appropriate
            // fence type
            int mask = 0;
            for (const auto &st : shared_stores) {
                auto elem = shared_loads.find(st);
                if (elem != shared_loads.end()) {
                    mask |= CodeGen_GPU_Dev::MemoryFenceType::Shared;
                    break;
                }
            }
            for (const auto &st : device_stores) {
                auto elem = device_loads.find(st);
                if (elem != device_loads.end()) {
                    mask |= CodeGen_GPU_Dev::MemoryFenceType::Device;
                    break;
                }
            }

            injected_barrier = true;

            return Block::make({first, make_barrier(mask), rest});
        } else {
            return IRMutator::visit(op);
        }
    }

public:
    InjectThreadBarriers(ExtractSharedAndHeapAllocations &sha,
                         ExtractRegisterAllocations &ra)
        : in_threads(false), injected_barrier(false),
          block_allocs(sha), register_allocs(ra) {
    }
};

class FuseGPUThreadLoopsSingleKernel : public IRMutator {
    using IRMutator::visit;

    const ExtractBlockSize &block_size;
    ExtractSharedAndHeapAllocations &block_allocations;

    Stmt visit(const For *op) override {
        if (ends_with(op->name, ".__block_id_x")) {
            Stmt body = op->body;

            // This is the innermost loop over blocks.
            debug(3) << "Fusing thread block:\n"
                     << body << "\n\n";

            NormalizeDimensionality n(block_size, op->device_api);
            body = n.mutate(body);

            debug(3) << "Normalized dimensionality:\n"
                     << body << "\n\n";

            Expr block_size_x = block_size.threads_dimensions() ? block_size.num_threads(0) : 1;

            ExtractRegisterAllocations register_allocs;

            ForType innermost_loop_type = ForType::GPUThread;
            if (block_size.threads_dimensions()) {
                body = register_allocs.mutate(body);
                if (register_allocs.has_lane_loop) {
                    innermost_loop_type = ForType::GPULane;
                }
            }

            debug(3) << "Extracted register-level allocations:\n"
                     << body << "\n\n";

            if (register_allocs.has_thread_loop) {
                // If there's no loop over threads, everything is already synchronous.
                InjectThreadBarriers i{block_allocations, register_allocs};
                body = i.mutate(body);
            }

            debug(3) << "Injected synchronization:\n"
                     << body << "\n\n";

            ReplaceForWithIf f(block_size);
            body = f.mutate(body);

            debug(3) << "Replaced for with if:\n"
                     << body << "\n\n";

            // There is always a loop over thread_id_x
            string thread_id = "." + thread_names[0];
            // Add back in any register-level allocations
            body = register_allocs.rewrap(body, thread_id);
            body = For::make(thread_id, 0, block_size_x, innermost_loop_type, op->device_api, body);

            // Rewrap the whole thing in other loops over threads
            for (int i = 1; i < block_size.threads_dimensions(); i++) {
                thread_id = "." + thread_names[i];
                body = register_allocs.rewrap(body, thread_id);
                body = For::make("." + thread_names[i], 0, block_size.num_threads(i),
                                 ForType::GPUThread, op->device_api, body);
            }
            thread_id.clear();
            body = register_allocs.rewrap(body, thread_id);

            debug(3) << "Rewrapped in for loops:\n"
                     << body << "\n\n";

            // Add back in the shared allocations
            body = block_allocations.rewrap_block(body, block_size);

            debug(3) << "Add back in shared allocations:\n"
                     << body << "\n\n";

            if (body.same_as(op->body)) {
                return op;
            } else {
                return For::make(op->name, op->min, op->extent,
                                 op->for_type, op->device_api, body);
            }
        } else {
            return IRMutator::visit(op);
        }
    }

public:
    FuseGPUThreadLoopsSingleKernel(const ExtractBlockSize &bs,
                                   ExtractSharedAndHeapAllocations &sm)
        : block_size(bs), block_allocations(sm) {
    }
};
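// Top-level driver for each kernel launch site: for every outermost loop over
// GPU blocks, measure the thread block size, pull shared/heap allocations up
// to the block level, fuse the thread loops within the kernel body using
// FuseGPUThreadLoopsSingleKernel, and finally wrap the kernel launch with any
// heap-backed global allocations and host-side size computations it needs.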
class FuseGPUThreadLoops : public IRMutator {
    using IRMutator::visit;

    Stmt visit(const For *op) override {
        user_assert(!(CodeGen_GPU_Dev::is_gpu_thread_var(op->name)))
            << "Loop over GPU thread variable \"" << op->name
            << "\" is outside of any loop over a GPU block variable. "
            << "This schedule is malformed. There must be a GPU block "
            << "variable, and it must be reordered to be outside all GPU "
            << "thread variables.\n";

        if (CodeGen_GPU_Dev::is_gpu_block_var(op->name)) {
            // Do the analysis of thread block size and shared memory
            // usage.
            ExtractBlockSize block_size;
            Stmt loop = Stmt(op);
            loop.accept(&block_size);

            ExtractSharedAndHeapAllocations block_allocations(op->device_api);
            loop = block_allocations.mutate(loop);

            debug(3) << "Pulled out shared allocations:\n"
                     << loop << "\n\n";

            // Mutate the inside of the kernel
            loop = FuseGPUThreadLoopsSingleKernel(block_size, block_allocations).mutate(loop);

            loop = block_allocations.rewrap_kernel_launch(loop, block_size, op->device_api);

            return loop;
        } else {
            return IRMutator::visit(op);
        }
    }
};

class ZeroGPULoopMins : public IRMutator {
    bool in_non_glsl_gpu = false;

    using IRMutator::visit;

    Stmt visit(const For *op) override {
        ScopedValue<bool> old_in_non_glsl_gpu(in_non_glsl_gpu);
        in_non_glsl_gpu = (in_non_glsl_gpu && op->device_api == DeviceAPI::None) ||
                          (op->device_api == DeviceAPI::CUDA) ||
                          (op->device_api == DeviceAPI::OpenCL) ||
                          (op->device_api == DeviceAPI::Metal) ||
                          (op->device_api == DeviceAPI::D3D12Compute) ||
                          (op->device_api == DeviceAPI::Vulkan);

        Stmt stmt = IRMutator::visit(op);
        if (CodeGen_GPU_Dev::is_gpu_var(op->name) && !is_const_zero(op->min)) {
            op = stmt.as<For>();
            internal_assert(op);
            Expr adjusted = Variable::make(Int(32), op->name) + op->min;
            Stmt body = substitute(op->name, adjusted, op->body);
            stmt = For::make(op->name, 0, op->extent, op->for_type, op->device_api, body);
        }
        return stmt;
    }

public:
    ZeroGPULoopMins() = default;
};

class ValidateGPULoopNesting : public IRVisitor {
    int gpu_block_depth = 0, gpu_thread_depth = 0;
    string innermost_block_var, innermost_thread_var;

    using IRVisitor::visit;

    void visit(const For *op) override {
        ScopedValue<string> old_innermost_block_var(innermost_block_var);
        ScopedValue<string> old_innermost_thread_var(innermost_thread_var);
        ScopedValue<int> old_gpu_block_depth(gpu_block_depth);
        ScopedValue<int> old_gpu_thread_depth(gpu_thread_depth);

        for (int i = 1; i <= 4; i++) {
            if (ends_with(op->name, block_names[4 - i])) {
                user_assert(i > gpu_block_depth)
                    << "Invalid schedule: Loop over " << op->name
                    << " cannot be inside of loop over " << innermost_block_var << "\n";
                user_assert(gpu_thread_depth == 0)
                    << "Invalid schedule: Loop over " << op->name
                    << " cannot be inside of loop over " << innermost_thread_var << "\n";
                innermost_block_var = op->name;
                gpu_block_depth = i;
            }
            if (ends_with(op->name, thread_names[4 - i])) {
                user_assert(i > gpu_thread_depth)
                    << "Invalid schedule: Loop over " << op->name
                    << " cannot be inside of loop over " << innermost_thread_var << "\n";
                user_assert(gpu_block_depth > 0)
                    << "Invalid schedule: Loop over " << op->name
                    << " must be inside a loop over gpu blocks\n";
                innermost_thread_var = op->name;
                gpu_thread_depth = i;
            }
        }
        IRVisitor::visit(op);
    }
};

}  // namespace

// Also used by InjectImageIntrinsics
Stmt zero_gpu_loop_mins(const Stmt &s) {
    return ZeroGPULoopMins().mutate(s);
}

namespace {

// Find the innermost GPU block of a statement.
class FindInnermostGPUBlock : public IRVisitor {
    using IRVisitor::visit;

    void visit(const For *op) override {
        if (CodeGen_GPU_Dev::is_gpu_block_var(op->name)) {
            // Set the last found GPU block to found_gpu_block.
            found_gpu_block = op;
        }
        IRVisitor::visit(op);
    }

public:
    const For *found_gpu_block = nullptr;
};
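// Predicates that sit between loops over GPU blocks can't stay where they are
// once the block and thread loops are fused, so NormalizeIfStatements (below)
// pushes each such condition down to the innermost GPU block loop, where
// AddConditionToALoop wraps that loop's body in an equivalent IfThenElse.
// Illustrative shape of the rewrite:
//
//   for blocks.x { if (cond) { for blocks.y { ... } } }
//     ==>
//   for blocks.x { for blocks.y { if (cond) { ... } } }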
// Given a condition and a loop, add the condition to the loop body.
class AddConditionToALoop : public IRMutator {
    using IRMutator::visit;

    Stmt visit(const For *op) override {
        if (op != loop) {
            return IRMutator::visit(op);
        }
        return For::make(op->name, op->min, op->extent, op->for_type, op->device_api,
                         IfThenElse::make(condition, op->body, Stmt()));
    }

public:
    AddConditionToALoop(const Expr &condition, const For *loop)
        : condition(condition), loop(loop) {
    }

    const Expr &condition;
    const For *loop;
};

// Push if statements that sit between GPU blocks through all the GPU blocks.
// Throw an error if such an if statement has an else clause.
class NormalizeIfStatements : public IRMutator {
    using IRMutator::visit;

    bool inside_gpu_blocks = false;

    Stmt visit(const For *op) override {
        if (!CodeGen_GPU_Dev::is_gpu_block_var(op->name)) {
            return IRMutator::visit(op);
        }
        ScopedValue<bool> old_inside_gpu_blocks(inside_gpu_blocks, true);
        return IRMutator::visit(op);
    }

    Stmt visit(const IfThenElse *op) override {
        if (!inside_gpu_blocks) {
            return IRMutator::visit(op);
        }
        FindInnermostGPUBlock find;
        op->accept(&find);

        if (find.found_gpu_block != nullptr) {
            internal_assert(!op->else_case.defined())
                << "Found an if statement with an else case between two GPU blocks.\n";
            return AddConditionToALoop(op->condition, find.found_gpu_block).mutate(op->then_case);
        }
        return IRMutator::visit(op);
    }
};

}  // namespace

Stmt fuse_gpu_thread_loops(Stmt s) {
    ValidateGPULoopNesting validate;
    s.accept(&validate);
    // NormalizeIfStatements pushes any predicates sitting between GPU blocks
    // into the innermost GPU block. FuseGPUThreadLoops then merges each
    // predicate into the fused GPU thread loops.
    s = NormalizeIfStatements().mutate(s);
    s = FuseGPUThreadLoops().mutate(s);
    s = ZeroGPULoopMins().mutate(s);
    return s;
}

}  // namespace Internal
}  // namespace Halide