Revision f2d750f355fccadcd03af51bcb58af724719859c authored by Roman Lebedev on 14 February 2024, 20:35:52 UTC, committed by GitHub on 14 February 2024, 20:35:52 UTC
```
[2336/4154] /usr/bin/clang++-17 -DHALIDE_ENABLE_RTTI -DHALIDE_VERSION_MAJOR=17 -DHALIDE_VERSION_MINOR=0 -DHALIDE_VERSION_PATCH=0 -DHALIDE_WITH_EXCEPTIONS -I/build/halide-17.0.0/test/common -I/build/halide-17.0.0/tools -I/build/halide-17.0.0/build/stage-1/halide/include -g -fdebug-default-version=4 -fprofile-use=/build/halide-17.0.0/build-profile/default.profdata -fcs-profile-generate -Xclang -mllvm -Xclang -vp-counters-per-site=100.0 -fuse-ld=lld-17 -Wl,--build-id=sha1 -std=c++17 -flto=thin -fPIE -fvisibility=hidden -fvisibility-inlines-hidden -Winvalid-pch -Xclang -include-pch -Xclang /build/halide-17.0.0/build/stage-1/halide/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /build/halide-17.0.0/build/stage-1/halide/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx -MD -MT test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o -MF test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o.d -o test/correctness/CMakeFiles/correctness_float16_t.dir/float16_t.cpp.o -c /build/halide-17.0.0/test/correctness/float16_t.cpp
<...>
ld.lld-17: error: undefined hidden symbol: __extendhfsf2
>>> referenced by float16_t.cpp:391 (/build/halide-17.0.0/test/correctness/float16_t.cpp:391)
>>>               lto.tmp:(main)
>>> did you mean: __extendbfsf2
>>> defined in: /lib/x86_64-linux-gnu/libgcc_s.so.1
clang++-17: error: linker command failed with exit code 1 (use -v to see invocation)
```
1 parent 40a622f
StageStridedLoads.cpp
#include "StageStridedLoads.h"
#include "CSE.h"
#include "IREquality.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRVisitor.h"
#include "Scope.h"
#include "Simplify.h"
#include "Substitute.h"
namespace Halide {
namespace Internal {
namespace {
class FindStridedLoads : public IRVisitor {
public:
struct Key {
// The buffer being accessed.
std::string buf;
// The base index being accessed, without any constant offset.
Expr base;
// The stride and lanes of the vector access.
int64_t stride;
int lanes;
// The loaded type.
Type type;
// The Allocate node the load belongs to. nullptr for loads from external buffers.
const Allocate *allocation;
// The Stmt over which the load definitely happens, and definitely
// refers to the same buffer as other loads with the same name. nullptr
// means global scope.
const IRNode *scope;
bool operator<(const Key &other) const {
// Check fields in order of cost to compare
if (stride < other.stride) {
return true;
} else if (stride > other.stride) {
return false;
} else if (lanes < other.lanes) {
return true;
} else if (lanes > other.lanes) {
return false;
} else if (scope < other.scope) {
return true;
} else if (scope > other.scope) {
return false;
} else if (allocation < other.allocation) {
return true;
} else if (allocation > other.allocation) {
return false;
} else if (type < other.type) {
return true;
} else if (other.type < type) {
return false;
} else if (buf < other.buf) {
return true;
} else if (buf > other.buf) {
return false;
} else {
return graph_less_than(base, other.base);
}
}
};
// Each entry maps from an offset from the base to a vector of identical
// Load nodes with that offset.
std::map<Key, std::map<int64_t, std::vector<const Load *>>> found_loads;
// The current scope over which accesses definitely occur.
const IRNode *scope = nullptr;
Scope<const Allocate *> allocation_scope;
std::map<const IRNode *, const IRNode *> parent_scope;
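// For each scope node, the scope that encloses it, so that the passes
// below can search enclosing scopes for overlapping loads.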
protected:
void visit(const Load *op) override {
if (is_const_one(op->predicate)) {
// We want to give ourselves the best possible chance at recognizing
// a naked Ramp, so we simplify and substitute in lets (and take
// care to treat the index expression as a graph until the next
// CSE).
Expr idx = substitute_in_all_lets(simplify(common_subexpression_elimination(op->index)));
if (const Ramp *r = idx.as<Ramp>()) {
const int64_t *stride_ptr = as_const_int(r->stride);
int64_t stride = stride_ptr ? *stride_ptr : 0;
Expr base = r->base;
int64_t offset = 0;
const Add *base_add = base.as<Add>();
const int64_t *offset_ptr = base_add ? as_const_int(base_add->b) : nullptr;
if (offset_ptr) {
base = base_add->a;
offset = *offset_ptr;
}
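// e.g. an index Ramp(x*16 + 7, 2, 8) has now been decomposed into
// base = x*16, offset = 7, with stride 2 and 8 lanes.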
// TODO: We do not yet handle nested vectorization here for
// ramps which have not already collapsed. We could potentially
// handle more interesting types of shuffle than simple flat slices.
if (stride >= 2 && stride < r->lanes && r->stride.type().is_scalar()) {
const IRNode *s = scope;
const Allocate *a = nullptr;
if (allocation_scope.contains(op->name)) {
a = allocation_scope.get(op->name);
}
found_loads[Key{op->name, base, stride, r->lanes, op->type, a, s}][offset].push_back(op);
}
}
}
IRVisitor::visit(op);
}
void visit(const For *op) override {
if (can_prove(op->extent > 0)) {
// The loop body definitely runs
IRVisitor::visit(op);
} else {
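// The loop may run zero times, so loads in the body only
// definitely happen within the body itself. Give the body its
// own scope.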
const IRNode *child_scope = op->body.get();
parent_scope[child_scope] = scope;
ScopedValue<const IRNode *> bind(scope, child_scope);
IRVisitor::visit(op);
}
}
void visit(const IfThenElse *op) override {
op->condition.accept(this);
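// Each branch only definitely executes within itself, so each
// branch gets its own scope.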
{
const IRNode *child_scope = op->then_case.get();
parent_scope[child_scope] = scope;
ScopedValue<const IRNode *> bind(scope, child_scope);
op->then_case.accept(this);
}
if (op->else_case.defined()) {
const IRNode *child_scope = op->else_case.get();
parent_scope[child_scope] = scope;
ScopedValue<const IRNode *> bind(scope, child_scope);
op->else_case.accept(this);
}
}
void visit(const Allocate *op) override {
// Provide a mapping from load nodes to paddable allocations they belong
// to.
ScopedBinding<const Allocate *> bind(allocation_scope, op->name, op);
IRVisitor::visit(op);
}
using IRVisitor::visit;
};
// Replace a bunch of load expressions in a stmt
class ReplaceStridedLoads : public IRMutator {
public:
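// The Load nodes to replace, keyed by the enclosing Allocate (nullptr
// for loads from external buffers) and the Load node itself.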
std::map<std::pair<const Allocate *, const Load *>, Expr> replacements;
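// The minimum padding each internal allocation needs so that the staged
// dense loads stay in bounds.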
std::map<const Allocate *, int> padding;
Scope<const Allocate *> allocation_scope;
protected:
Expr visit(const Load *op) override {
const Allocate *alloc = nullptr;
if (allocation_scope.contains(op->name)) {
alloc = allocation_scope.get(op->name);
}
auto it = replacements.find({alloc, op});
if (it != replacements.end()) {
return mutate(it->second);
} else {
return IRMutator::visit(op);
}
}
Stmt visit(const Allocate *op) override {
ScopedBinding bind(allocation_scope, op->name, op);
auto it = padding.find(op);
Stmt s = IRMutator::visit(op);
if (it == padding.end()) {
return s;
} else {
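// Rebuild the Allocate node with enough padding to cover the
// widest staged dense load.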
op = s.as<Allocate>();
internal_assert(op);
return Allocate::make(op->name, op->type, op->memory_type,
op->extents, op->condition,
op->body, op->new_expr, op->free_function,
std::max(it->second, op->padding));
}
}
using IRMutator::visit;
};
} // namespace
Stmt stage_strided_loads(const Stmt &s) {
FindStridedLoads finder;
ReplaceStridedLoads replacer;
// Find related clusters of strided loads anywhere in the stmt. Although
// the search itself is global, it requires expressions to match exactly,
// so in practice it only finds loads inside the same loops and let
// statements.
s.accept(&finder);
for (const auto &l : finder.found_loads) {
const FindStridedLoads::Key &k = l.first;
const Allocate *alloc = k.allocation;
const std::map<int64_t, std::vector<const Load *>> &v = l.second;
// Find clusters of strided loads that can share the same dense load.
for (auto load = v.begin(); load != v.end();) {
// If there is any other load at the same base at an offset at least
// stride-1 ahead, it's safe to do a big dense load. Note that we're
// assuming that it's always valid to load addresses between two
// valid addresses, which rules out games involving protected pages
// at the end of scanlines.
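// e.g. with stride 3, a load at offset 0 can be staged densely once
// some other load in this cluster sits at offset 2 or beyond.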
const bool can_lift = l.second.lower_bound(load->first + k.stride - 1) != l.second.end();
if (!can_lift) {
load++;
continue;
}
// We have a complete cluster of loads. Make a single dense load
int lanes = k.lanes * k.stride;
int64_t first_offset = load->first;
Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes);
Type t = k.type.with_lanes(lanes);
const Load *op = load->second[0];
Expr shared_load = Load::make(t, k.buf, idx, op->image, op->param,
const_true(lanes), op->alignment);
shared_load = common_subexpression_elimination(shared_load);
for (; load != v.end() && load->first < first_offset + k.stride; load++) {
Expr shuf = Shuffle::make_slice(shared_load, load->first - first_offset, k.stride, k.lanes);
for (const Load *l : load->second) {
replacer.replacements.emplace(std::make_pair(alloc, l), shuf);
}
}
}
// Do the same in reverse to catch any loads that didn't fall into a
// cluster above, but for which we know it's safe to do a dense load
// before their start.
for (auto load = v.rbegin(); load != v.rend(); load++) {
if (replacer.replacements.count({alloc, load->second[0]})) {
continue;
}
int64_t delta = k.stride - 1;
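// It's safe to start the dense load stride-1 lanes early if some
// other load in this cluster begins at or before that point.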
const bool can_lift = l.second.upper_bound(load->first - delta) != l.second.begin();
if (!can_lift) {
continue;
}
int lanes = k.lanes * k.stride;
int64_t first_offset = load->first - delta;
Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes);
Type t = k.type.with_lanes(lanes);
const Load *op = load->second[0];
Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param,
const_true(lanes), op->alignment - delta);
dense_load = common_subexpression_elimination(dense_load);
Expr shuf = Shuffle::make_slice(dense_load, delta, k.stride, k.lanes);
for (const Load *l : load->second) {
replacer.replacements.emplace(std::make_pair(alloc, l), shuf);
}
}
// Look for any loads we can densify because an overlapping load occurs
// in any parent scope.
for (auto load = v.rbegin(); load != v.rend(); load++) {
if (replacer.replacements.count({alloc, load->second[0]})) {
continue;
}
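// Walk outwards through the enclosing scopes, widening the range of
// offsets at which this same strided access is known to occur.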
int64_t min_offset = load->first;
int64_t max_offset = load->first;
const IRNode *scope = k.scope;
while (scope) {
const IRNode *parent = finder.parent_scope[scope];
auto parent_key = k;
parent_key.scope = parent;
auto it = finder.found_loads.find(parent_key);
if (it != finder.found_loads.end() && !it->second.empty()) {
min_offset = std::min(it->second.begin()->first, min_offset);
max_offset = std::max(it->second.rbegin()->first, max_offset);
}
scope = parent;
}
if (max_offset - min_offset < k.stride - 1) {
continue;
}
int64_t offset = std::max(load->first - (k.stride - 1), min_offset);
int lanes = k.lanes * k.stride;
Expr idx = Ramp::make(k.base + (int)offset, make_one(k.base.type()), lanes);
Type t = k.type.with_lanes(lanes);
const Load *op = load->second[0];
Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param,
const_true(lanes), op->alignment);
dense_load = common_subexpression_elimination(dense_load);
Expr shuf = Shuffle::make_slice(dense_load, load->first - offset, k.stride, k.lanes);
for (const Load *l : load->second) {
replacer.replacements.emplace(std::make_pair(alloc, l), shuf);
}
}
// Densify any remaining strided loads to internal allocations by
// padding the allocation, and densify any remaining strided loads to
// external allocations by doing a dense load at a trimmed size. We rely
// on codegen to do a good job at loading vectors of a funny size.
for (auto load = v.begin(); load != v.end(); load++) { // NOLINT
if (replacer.replacements.count({alloc, load->second[0]})) {
continue;
}
int lanes = k.lanes * k.stride;
bool may_pad = k.allocation && !k.allocation->new_expr.defined();
int delta = (int)(k.stride - 1);
if (may_pad) {
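// Record the padding this allocation needs, keeping the maximum
// over all staged loads.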
auto p = replacer.padding.insert({k.allocation, delta});
if (!p.second) {
p.first->second = std::max(p.first->second, delta);
}
int64_t first_offset = load->first;
Expr idx = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), lanes);
Type t = k.type.with_lanes(lanes);
const Load *op = load->second[0];
Expr dense_load = Load::make(t, k.buf, idx, op->image, op->param,
const_true(lanes), op->alignment);
dense_load = common_subexpression_elimination(dense_load);
Expr shuf = Shuffle::make_slice(dense_load, load->first - first_offset, k.stride, k.lanes);
for (const Load *l : load->second) {
replacer.replacements.emplace(std::make_pair(alloc, l), shuf);
}
} else if (k.lanes % 2 == 0) {
// Do two overlapping half-sized dense loads and mush them together.
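// e.g. for k.lanes = 8, stride 2 (so delta = 1): load lanes [0..7]
// and [7..14] densely, slice out offsets 0,2,4,6 from the first and
// 8,10,12,14 from the second, and concatenate. Nothing is read past
// offset 14, which the original strided load touched anyway.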
int64_t first_offset = load->first;
int half_lanes = lanes / 2;
internal_assert(delta <= half_lanes);
Expr idx1 = Ramp::make(k.base + (int)first_offset, make_one(k.base.type()), half_lanes);
Expr idx2 = Ramp::make(k.base + (int)first_offset + half_lanes - delta, make_one(k.base.type()), half_lanes);
Type t = k.type.with_lanes(half_lanes);
const Load *op = load->second[0];
Expr dense_load1 = Load::make(t, k.buf, idx1, op->image, op->param,
const_true(half_lanes), op->alignment);
Expr dense_load2 = Load::make(t, k.buf, idx2, op->image, op->param,
const_true(half_lanes), op->alignment + half_lanes - delta);
dense_load1 = common_subexpression_elimination(dense_load1);
dense_load2 = common_subexpression_elimination(dense_load2);
Expr shuf1 = Shuffle::make_slice(dense_load1, 0, k.stride, k.lanes / 2);
Expr shuf2 = Shuffle::make_slice(dense_load2, delta, k.stride, k.lanes / 2);
Expr shuf = Shuffle::make_concat({shuf1, shuf2});
for (const Load *l : load->second) {
replacer.replacements.emplace(std::make_pair(alloc, l), shuf);
}
}
}
}
return replacer.mutate(s);
}
} // namespace Internal
} // namespace Halide