#include <algorithm>
#include <sstream>

#include "AlignLoads.h"
#include "CSE.h"
#include "CodeGen_Internal.h"
#include "CodeGen_Posix.h"
#include "Debug.h"
#include "HexagonOptimize.h"
#include "IREquality.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "LLVM_Headers.h"
#include "LoopCarry.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Target.h"
#include "Util.h"

namespace Halide {
namespace Internal {

using std::string;
using std::vector;

using namespace llvm;

#ifdef WITH_HEXAGON

namespace {

/** A code generator that emits Hexagon code from a given Halide stmt. */
class CodeGen_Hexagon : public CodeGen_Posix {
public:
    /** Create a Hexagon code generator for the given Hexagon target. */
    CodeGen_Hexagon(const Target &);

protected:
    void compile_func(const LoweredFunc &f,
                      const std::string &simple_name,
                      const std::string &extern_name) override;

    void init_module() override;

    std::string mcpu() const override;
    std::string mattrs() const override;
    int isa_version;
    bool use_soft_float_abi() const override;
    int native_vector_bits() const override;

    llvm::Function *define_hvx_intrinsic(llvm::Function *intrin, Type ret_ty,
                                         const std::string &name,
                                         std::vector<Type> arg_types, int flags);

    int is_hvx_v65_or_later() const {
        return (isa_version >= 65);
    }

    using CodeGen_Posix::visit;

    /** Nodes for which we want to emit specific hexagon intrinsics */
    ///@{
    void visit(const Max *) override;
    void visit(const Min *) override;
    void visit(const Call *) override;
    void visit(const Mul *) override;
    void visit(const Select *) override;
    void visit(const Allocate *) override;
    ///@}

    /** We ask for an extra vector on each allocation to enable fast
     * clamped ramp loads. */
    int allocation_padding(Type type) const override {
        return CodeGen_Posix::allocation_padding(type) + native_vector_bits() / 8;
    }

    /** Call an LLVM intrinsic, potentially casting the operands to
     * match the type of the function. */
    ///@{
    llvm::Value *call_intrin_cast(llvm::Type *ret_ty, llvm::Function *F,
                                  std::vector<llvm::Value *> Ops);
    llvm::Value *call_intrin_cast(llvm::Type *ret_ty, int id,
                                  std::vector<llvm::Value *> Ops);
    ///@}

    /** Define overloads of CodeGen_LLVM::call_intrin that determine
     * the intrin_lanes from the type, and allows the function to
     * return null if the maybe option is true and the intrinsic is
     * not found. */
    ///@{
    llvm::Value *call_intrin(Type t, const std::string &name,
                             std::vector<Expr>, bool maybe = false);
    llvm::Value *call_intrin(llvm::Type *t, const std::string &name,
                             std::vector<llvm::Value *>, bool maybe = false);
    ///@}

    /** Override CodeGen_LLVM to use hexagon intrinsics when possible. */
    ///@{
    llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &v) override;
    llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
                                 const std::vector<int> &indices) override;
    using CodeGen_Posix::shuffle_vectors;
    ///@}

    /** Generate a LUT lookup using vlut instructions. */
    ///@{
    llvm::Value *vlut(llvm::Value *lut, llvm::Value *indices,
                      int min_index = 0, int max_index = 1 << 30);
    llvm::Value *vlut(llvm::Value *lut, const std::vector<int> &indices);
    ///@}

    llvm::Value *vdelta(llvm::Value *lut, const std::vector<int> &indices);

    /** Because HVX intrinsics operate on vectors of i32, using them
     * requires a lot of extraneous bitcasts, which make it difficult
     * to manipulate the IR. This function avoids generating redundant
     * bitcasts. */
    llvm::Value *create_bitcast(llvm::Value *v, llvm::Type *ty);

private:
    /** Generates code for computing the size of an allocation from a
     * list of its extents and its size.
     * Fires a runtime assert (halide_error) if the size overflows 2^31 - 1,
     * the maximum positive number an int32_t can hold. */
    llvm::Value *codegen_cache_allocation_size(const std::string &name, Type type,
                                               const std::vector<Expr> &extents);

    /** Generate a LUT (8/16 bit, max_index < 256) lookup using vlut instructions. */
    llvm::Value *vlut256(llvm::Value *lut, llvm::Value *indices,
                         int min_index = 0, int max_index = 255);
};

CodeGen_Hexagon::CodeGen_Hexagon(const Target &t)
    : CodeGen_Posix(t) {
    if (target.has_feature(Halide::Target::HVX_v66)) {
        isa_version = 66;
    } else if (target.has_feature(Halide::Target::HVX_v65)) {
        isa_version = 65;
    } else {
        isa_version = 62;
    }
    user_assert(target.has_feature(Target::HVX))
        << "Creating a Codegen target for Hexagon without the hvx target feature.\n";
}

Stmt call_halide_qurt_hvx_lock(const Target &target) {
    Expr hvx_lock = Call::make(Int(32), "halide_qurt_hvx_lock", {}, Call::Extern);
    string hvx_lock_result_name = unique_name("hvx_lock_result");
    Expr hvx_lock_result_var = Variable::make(Int(32), hvx_lock_result_name);
    Stmt check_hvx_lock = LetStmt::make(
        hvx_lock_result_name, hvx_lock,
        AssertStmt::make(EQ::make(hvx_lock_result_var, 0), hvx_lock_result_var));
    return check_hvx_lock;
}

Stmt call_halide_qurt_hvx_unlock() {
    Expr hvx_unlock =
        Call::make(Int(32), "halide_qurt_hvx_unlock", {}, Call::Extern);
    string hvx_unlock_result_name = unique_name("hvx_unlock_result");
    Expr hvx_unlock_result_var = Variable::make(Int(32), hvx_unlock_result_name);
    Stmt check_hvx_unlock =
        LetStmt::make(hvx_unlock_result_name, hvx_unlock,
                      AssertStmt::make(EQ::make(hvx_unlock_result_var, 0),
                                       hvx_unlock_result_var));
    return check_hvx_unlock;
}

// Wrap the stmt in a call to qurt_hvx_lock, calling qurt_hvx_unlock
// as a destructor if successful.
Stmt acquire_hvx_context(Stmt stmt, const Target &target) {
    // Modify the stmt to add a call to halide_qurt_hvx_lock, and
    // register a destructor to call halide_qurt_hvx_unlock.
    Stmt check_hvx_lock = call_halide_qurt_hvx_lock(target);
    Expr dummy_obj = reinterpret(Handle(), cast<uint64_t>(1));
    Expr hvx_unlock =
        Call::make(Handle(), Call::register_destructor,
                   {Expr("halide_qurt_hvx_unlock_as_destructor"), dummy_obj},
                   Call::Intrinsic);

    stmt = Block::make(Evaluate::make(hvx_unlock), stmt);
    stmt = Block::make(check_hvx_lock, stmt);
    return stmt;
}

bool is_dense_ramp(const Expr &x) {
    const Ramp *r = x.as<Ramp>();
    if (!r) {
        return false;
    }

    return is_const_one(r->stride);
}

// In Hexagon, we assume that we can read one vector past the end of
// buffers. Using this assumption, this mutator replaces vector
// predicated dense loads with scalar predicated dense loads.
class SloppyUnpredicateLoadsAndStores : public IRMutator {
    using IRMutator::visit;

    // The first and last lanes of all monotonic vectors in scope
    Scope<std::pair<Expr, Expr>> monotonic_vectors;

    // If a vector monotonically increases or decreases across the
    // lanes, return the first and last lane.
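    // For example (illustrative, not from the original source): for a
    // predicate such as
    //   ramp(x, 1, 8) < broadcast(y, 8)
    // the first and last lanes are the scalars (x < y) and (x + 7 < y).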
    std::pair<Expr, Expr> get_extreme_lanes(const Expr &e) {
        if (const Ramp *r = e.as<Ramp>()) {
            return {r->base, r->base + r->stride * (r->lanes - 1)};
        } else if (const Broadcast *b = e.as<Broadcast>()) {
            return {b->value, b->value};
        } else if (const LT *op = e.as<LT>()) {
            if (!op->a.type().is_bool()) {
                auto a = get_extreme_lanes(op->a);
                auto b = get_extreme_lanes(op->b);
                if (a.first.defined() && b.first.defined()) {
                    return {a.first < b.first, a.second < b.second};
                }
            }
        } else if (const LE *op = e.as<LE>()) {
            if (!op->a.type().is_bool()) {
                auto a = get_extreme_lanes(op->a);
                auto b = get_extreme_lanes(op->b);
                if (a.first.defined() && b.first.defined()) {
                    return {a.first <= b.first, a.second <= b.second};
                }
            }
        } else if (const Variable *op = e.as<Variable>()) {
            if (monotonic_vectors.contains(op->name)) {
                return monotonic_vectors.get(op->name);
            }
        } else if (const Let *op = e.as<Let>()) {
            auto v = get_extreme_lanes(op->value);
            ScopedBinding<std::pair<Expr, Expr>> bind(v.first.defined(),
                                                      monotonic_vectors, op->name, v);
            return get_extreme_lanes(op->body);
        }
        return {Expr(), Expr()};
    }

    Expr visit(const Let *op) override {
        auto v = get_extreme_lanes(op->value);
        ScopedBinding<std::pair<Expr, Expr>> bind(
            op->value.type().is_vector() && v.first.defined(),
            monotonic_vectors, op->name, v);
        return IRMutator::visit(op);
    }

    Expr visit(const Load *op) override {
        if (is_const_one(op->predicate)) {
            // These are handled fine
            return IRMutator::visit(op);
        }

        Expr predicate = mutate(op->predicate);
        Expr index = mutate(op->index);

        if (is_dense_ramp(index) || index.as<Broadcast>()) {
            // Make the predicate into a scalar that is true if any of the lanes are
            // true.
            Expr condition;

            // If the predicate is monotonic increasing or decreasing
            // over the vector lanes, we can just check the last or
            // first lane, respectively. We won't bother to
            // distinguish between the two cases though, so we just or
            // them both together.
            auto v = get_extreme_lanes(predicate);
            if (v.first.defined()) {
                internal_assert(v.first.type() == Bool() &&
                                v.second.type() == Bool())
                    << "The extreme lanes of a bool vector should be scalar bools\n";
                condition = simplify(v.first || v.second);
            } else {
                // Take an OR over all lanes.
                condition = VectorReduce::make(VectorReduce::Or, predicate, 1);
                condition = simplify(condition);
            }

            Expr load = Load::make(op->type, op->name, index, op->image, op->param,
                                   const_true(op->type.lanes()), op->alignment);

            return Call::make(op->type, Call::if_then_else, {condition, load},
                              Call::PureIntrinsic);
        } else {
            // It's a predicated vector gather. Just scalarize. We'd
            // prefer to keep it in a loop, but that would require
            // some sort of loop Expr. Another option would be
            // introducing a set of runtime functions to do predicated
            // loads.
            Expr load = Load::make(op->type, op->name, index, op->image, op->param,
                                   const_true(op->type.lanes()), op->alignment);

            return Call::make(op->type, Call::if_then_else, {predicate, load},
                              Call::PureIntrinsic);
        }
    }

    Stmt visit(const Store *op) override {
        if (is_const_one(op->predicate)) {
            return IRMutator::visit(op);
        }

        Expr predicate = mutate(op->predicate);
        Expr value = mutate(op->value);
        Expr index = mutate(op->index);

        int lanes = value.type().lanes();

        if (const Broadcast *scalar_pred = predicate.as<Broadcast>()) {
            Stmt unpredicated_store =
                Store::make(op->name, value, index, op->param,
                            const_true(lanes), op->alignment);
            return IfThenElse::make(scalar_pred->value, unpredicated_store);
        }

        if (predicate.same_as(op->predicate) && value.same_as(op->value) &&
            index.same_as(op->index)) {
            return op;
        } else {
            return Store::make(op->name, value, index, op->param, predicate,
                               op->alignment);
        }
    }
};

Stmt sloppy_unpredicate_loads_and_stores(const Stmt &s) {
    return SloppyUnpredicateLoadsAndStores().mutate(s);
}

class InjectHVXLocks : public IRMutator {
public:
    InjectHVXLocks(const Target &t)
        : target(t) {
        uses_hvx_var = Variable::make(Bool(), "uses_hvx");
    }
    bool uses_hvx = false;

private:
    Expr uses_hvx_var;
    using IRMutator::visit;

    // Primarily, we do two things when we encounter a parallel for loop.
    // First, we check if the parallel for loop uses_hvx and accordingly
    // acquire_hvx_context, i.e. acquire and release HVX locks.
    // Then we insert a conditional unlock before the for loop, let's call
    // this the prolog, and a conditional lock after the for loop, which
    // we shall call the epilog. So the code for a parallel loop that uses
    // hvx should look like so.
    //
    //     if (uses_hvx_var) {
    //         halide_qurt_hvx_unlock();
    //     }
    //     parallel_for {
    //         halide_qurt_hvx_lock();
    //         ...
    //         ...
    //         halide_qurt_hvx_unlock();
    //     }
    //     if (uses_hvx_var) {
    //         halide_qurt_hvx_lock();
    //     }
    //
    // When we move up to the enclosing scope we substitute the value of uses_hvx
    // into the IR, which should convert the conditionals to constants.
    Stmt visit(const For *op) override {
        if (op->for_type == ForType::Parallel) {
            bool old_uses_hvx = uses_hvx;
            uses_hvx = false;

            Stmt body = mutate(op->body);
            Stmt s;
            if (uses_hvx) {
                body = acquire_hvx_context(body, target);
                body = substitute("uses_hvx", true, body);
                Stmt new_for = For::make(op->name, op->min, op->extent, op->for_type,
                                         op->device_api, body);
                Stmt prolog =
                    IfThenElse::make(uses_hvx_var, call_halide_qurt_hvx_unlock());
                Stmt epilog =
                    IfThenElse::make(uses_hvx_var, call_halide_qurt_hvx_lock(target));
                s = Block::make({prolog, new_for, epilog});
                debug(4) << "Wrapping prolog & epilog around par loop\n"
                         << s << "\n";
            } else {
                // We do not substitute false for "uses_hvx" into the body as we do in
                // the true case because we want to defer that to an enclosing scope.
                // The logic is that in case this scope doesn't use_hvx (we are here in
                // the else because of that) then an enclosing scope might. However,
                // substituting false for "uses_hvx" at this stage will remove the
                // prolog and epilog checks that will be needed as the enclosing scope
                // uses hvx. This is exhibited by the following code structure
                //
                //     for_par(z..) {      // uses hvx
                //         for_par(y..) {  // doesn't use hvx
                //             for_par(x..) {  // uses hvx
                //                 vector code
                //             }
                //         }
                //         vector code
                //     }
                //
                // If we substitute false in the else here, we'll get
                //
                //     for_par(z..) {
                //         halide_qurt_hvx_lock();
                //         for_par(y..) {
                //             if (false) {
                //                 halide_qurt_hvx_unlock();  // will get optimized away.
                //             }
                //             for_par(x..) {
                //                 halide_qurt_hvx_lock();  // double lock. Not good.
                //                 vector code
                //                 halide_qurt_hvx_unlock();
                //             }
                //             if (false) {
                //                 halide_qurt_hvx_lock();
                //             }
                //         }
                //         vector code
                //         halide_qurt_hvx_unlock();
                //     }
                s = For::make(op->name, op->min, op->extent, op->for_type,
                              op->device_api, body);
            }

            uses_hvx = old_uses_hvx;
            return s;
        }
        return IRMutator::visit(op);
    }
    Expr visit(const Variable *op) override {
        uses_hvx = uses_hvx || op->type.is_vector();
        return op;
    }
    Expr visit(const Ramp *op) override {
        uses_hvx = uses_hvx || op->type.is_vector();
        return op;
    }
    Expr visit(const Broadcast *op) override {
        uses_hvx = uses_hvx || op->lanes > 1;
        return op;
    }
    Expr visit(const Call *op) override {
        uses_hvx = uses_hvx || op->type.is_vector();
        return op;
    }

    Target target;
};

Stmt inject_hvx_lock_unlock(Stmt body, const Target &target) {
    InjectHVXLocks i(target);
    body = i.mutate(body);
    if (i.uses_hvx) {
        body = acquire_hvx_context(body, target);
    }
    body = substitute("uses_hvx", i.uses_hvx, body);
    body = simplify(body);
    return body;
}

void CodeGen_Hexagon::compile_func(const LoweredFunc &f,
                                   const string &simple_name,
                                   const string &extern_name) {
    CodeGen_Posix::begin_func(f.linkage, simple_name, extern_name, f.args);

    Stmt body = f.body;

    debug(1) << "Unpredicating loads and stores...\n";
    // Replace dense vector predicated loads with sloppy scalarized
    // predicates, and scalarize predicated stores
    body = sloppy_unpredicate_loads_and_stores(body);
    debug(2) << "Lowering after unpredicating loads/stores:\n"
             << body << "\n\n";

    if (is_hvx_v65_or_later()) {
        // Generate vscatter-vgathers before optimize_hexagon_shuffles.
        debug(1) << "Looking for vscatter-vgather...\n";
        body = scatter_gather_generator(body);
    }

    debug(1) << "Optimizing shuffles...\n";
    // vlut always indexes 64 bytes of the LUT at a time, even in 128 byte mode.
    const int lut_alignment = 64;
    body = optimize_hexagon_shuffles(body, lut_alignment);
    debug(2) << "Lowering after optimizing shuffles:\n"
             << body << "\n\n";

    debug(1) << "Aligning loads for HVX....\n";
    body = align_loads(body, target.natural_vector_size(Int(8)), 8);
    body = common_subexpression_elimination(body);
    // Don't simplify here, otherwise it will re-collapse the loads we
    // want to carry across loop iterations.
    debug(2) << "Lowering after aligning loads:\n"
             << body << "\n\n";

    debug(1) << "Carrying values across loop iterations...\n";
    // Use at most 16 vector registers for carrying values.
    body = loop_carry(body, 16);
    body = simplify(body);
    debug(2) << "Lowering after forwarding stores:\n"
             << body << "\n\n";

    // Optimize the IR for Hexagon.
    debug(1) << "Optimizing Hexagon instructions...\n";
    body = optimize_hexagon_instructions(body, target);

    debug(1) << "Adding calls to qurt_hvx_lock, if necessary...\n";
    body = inject_hvx_lock_unlock(body, target);

    debug(1) << "Hexagon function body:\n";
    debug(1) << body << "\n";

    body.accept(this);

    CodeGen_Posix::end_func(f.args);
}

struct HvxIntrinsic {
    enum {
        BroadcastScalarsToWords = 1 << 0,  // Some intrinsics need scalar arguments
                                           // broadcasted up to 32 bits.
        v65OrLater = 1 << 1,
    };
    llvm::Intrinsic::ID id;
    halide_type_t ret_type;
    const char *name;
    halide_type_t arg_types[4];
    int flags;
};

// TODO: these should probably be declared constexpr, but that would
// require marking various halide_type_t methods as constexpr, and an
// obscure bug in MSVC2017 can cause compilation failures for them.
// The bug appears to be fixed in MSVC2019, so when we move to that
// as a baseline for Windows, this should be revisited.
halide_type_t i8 = halide_type_t(halide_type_int, 8); halide_type_t i16 = halide_type_t(halide_type_int, 16); halide_type_t i32 = halide_type_t(halide_type_int, 32); halide_type_t u8 = halide_type_t(halide_type_uint, 8); halide_type_t u16 = halide_type_t(halide_type_uint, 16); halide_type_t u32 = halide_type_t(halide_type_uint, 32); // Define vectors that are 1x and 2x the Hexagon HVX width -- // Note that we use placeholders here (which we fix up when processing // the table) as we don't know the HVX width until we know the target // we're using; this approach lets us make a compact table with static // data, rather than having to assemble it at runtime. constexpr int kOneX = 64 * 8; halide_type_t i8v1 = i8.with_lanes(kOneX / 8); halide_type_t i16v1 = i16.with_lanes(kOneX / 16); halide_type_t i32v1 = i32.with_lanes(kOneX / 32); halide_type_t u8v1 = u8.with_lanes(kOneX / 8); halide_type_t u16v1 = u16.with_lanes(kOneX / 16); halide_type_t u32v1 = u32.with_lanes(kOneX / 32); halide_type_t i8v2 = i8v1.with_lanes(i8v1.lanes * 2); halide_type_t i16v2 = i16v1.with_lanes(i16v1.lanes * 2); halide_type_t i32v2 = i32v1.with_lanes(i32v1.lanes * 2); halide_type_t u8v2 = u8v1.with_lanes(u8v1.lanes * 2); halide_type_t u16v2 = u16v1.with_lanes(u16v1.lanes * 2); halide_type_t u32v2 = u32v1.with_lanes(u32v1.lanes * 2); // clang-format off #define INTRINSIC_128B(id) llvm::Intrinsic::hexagon_V6_##id##_128B const HvxIntrinsic intrinsic_wrappers[] = { // Zero/sign extension: {INTRINSIC_128B(vzb), u16v2, "zxt.vub", {u8v1}}, {INTRINSIC_128B(vzh), u32v2, "zxt.vuh", {u16v1}}, {INTRINSIC_128B(vsb), i16v2, "sxt.vb", {i8v1}}, {INTRINSIC_128B(vsh), i32v2, "sxt.vh", {i16v1}}, // Similar to zxt/sxt, but without deinterleaving the result. {INTRINSIC_128B(vunpackub), u16v2, "unpack.vub", {u8v1}}, {INTRINSIC_128B(vunpackuh), u32v2, "unpack.vuh", {u16v1}}, {INTRINSIC_128B(vunpackb), i16v2, "unpack.vb", {i8v1}}, {INTRINSIC_128B(vunpackh), i32v2, "unpack.vh", {i16v1}}, // Truncation: // (Yes, there really are two fs in the b versions, and 1 f in // the h versions.) {INTRINSIC_128B(vshuffeb), i8v1, "trunc.vh", {i16v2}}, {INTRINSIC_128B(vshufeh), i16v1, "trunc.vw", {i32v2}}, {INTRINSIC_128B(vshuffob), i8v1, "trunclo.vh", {i16v2}}, {INTRINSIC_128B(vshufoh), i16v1, "trunclo.vw", {i32v2}}, // Downcast with saturation: {INTRINSIC_128B(vsathub), u8v1, "trunc_satub.vh", {i16v2}}, {INTRINSIC_128B(vsatwh), i16v1, "trunc_sath.vw", {i32v2}}, {INTRINSIC_128B(vsatuwuh), u16v1, "trunc_satuh.vuw", {u32v2}}, {INTRINSIC_128B(vroundhub), u8v1, "trunc_satub_rnd.vh", {i16v2}}, {INTRINSIC_128B(vroundhb), i8v1, "trunc_satb_rnd.vh", {i16v2}}, {INTRINSIC_128B(vrounduhub), u8v1, "trunc_satub_rnd.vuh", {u16v2}}, {INTRINSIC_128B(vroundwuh), u16v1, "trunc_satuh_rnd.vw", {i32v2}}, {INTRINSIC_128B(vroundwh), i16v1, "trunc_sath_rnd.vw", {i32v2}}, {INTRINSIC_128B(vrounduwuh), u16v1, "trunc_satuh_rnd.vuw", {u32v2}}, // vpack does not interleave its input. {INTRINSIC_128B(vpackhub_sat), u8v1, "pack_satub.vh", {i16v2}}, {INTRINSIC_128B(vpackwuh_sat), u16v1, "pack_satuh.vw", {i32v2}}, {INTRINSIC_128B(vpackhb_sat), i8v1, "pack_satb.vh", {i16v2}}, {INTRINSIC_128B(vpackwh_sat), i16v1, "pack_sath.vw", {i32v2}}, {INTRINSIC_128B(vpackeb), i8v1, "pack.vh", {i16v2}}, {INTRINSIC_128B(vpackeh), i16v1, "pack.vw", {i32v2}}, {INTRINSIC_128B(vpackob), i8v1, "packhi.vh", {i16v2}}, {INTRINSIC_128B(vpackoh), i16v1, "packhi.vw", {i32v2}}, // Widening adds. There are other instructions that add two vub and two vuh // but do not widen. 
// To differentiate those from the widening ones, we encode the return type // in the name here. {INTRINSIC_128B(vaddubh), u16v2, "add_vuh.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vaddhw), i32v2, "add_vw.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vadduhw), u32v2, "add_vuw.vuh.vuh", {u16v1, u16v1}}, // Widening subtracts. There are other instructions that subtact two vub and // two vuh but do not widen. // To differentiate those from the widening ones, we encode the return type // in the name here. {INTRINSIC_128B(vsububh), i16v2, "sub_vh.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vsubhw), i32v2, "sub_vw.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vsubuhw), i32v2, "sub_vw.vuh.vuh", {u16v1, u16v1}}, // Adds/subtract of unsigned values with saturation. {INTRINSIC_128B(vaddubsat), u8v1, "sat_add.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vadduhsat), u16v1, "sat_add.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vadduwsat), u32v1, "sat_add.vuw.vuw", {u32v1, u32v1}}, {INTRINSIC_128B(vaddhsat), i16v1, "sat_add.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vaddwsat), i32v1, "sat_add.vw.vw", {i32v1, i32v1}}, {INTRINSIC_128B(vaddubsat_dv), u8v2, "sat_add.vub.vub.dv", {u8v2, u8v2}}, {INTRINSIC_128B(vadduhsat_dv), u16v2, "sat_add.vuh.vuh.dv", {u16v2, u16v2}}, {INTRINSIC_128B(vadduwsat_dv), u32v2, "sat_add.vuw.vuw.dv", {u32v2, u32v2}}, {INTRINSIC_128B(vaddhsat_dv), i16v2, "sat_add.vh.vh.dv", {i16v2, i16v2}}, {INTRINSIC_128B(vaddwsat_dv), i32v2, "sat_add.vw.vw.dv", {i32v2, i32v2}}, {INTRINSIC_128B(vsububsat), i8v1, "sat_sub.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vsubuhsat), i16v1, "sat_sub.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vsubhsat), i16v1, "sat_sub.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vsubwsat), i32v1, "sat_sub.vw.vw", {i32v1, i32v1}}, {INTRINSIC_128B(vsububsat_dv), i8v2, "sat_sub.vub.vub.dv", {u8v2, u8v2}}, {INTRINSIC_128B(vsubuhsat_dv), i16v2, "sat_sub.vuh.vuh.dv", {u16v2, u16v2}}, {INTRINSIC_128B(vsubhsat_dv), i16v2, "sat_sub.vh.vh.dv", {i16v2, i16v2}}, {INTRINSIC_128B(vsubwsat_dv), i32v2, "sat_sub.vw.vw.dv", {i32v2, i32v2}}, // Absolute value: {INTRINSIC_128B(vabsh), u16v1, "abs.vh", {i16v1}}, {INTRINSIC_128B(vabsw), u32v1, "abs.vw", {i32v1}}, {INTRINSIC_128B(vabsb), u8v1, "abs.vb", {i8v1}, HvxIntrinsic::v65OrLater}, // Absolute difference: {INTRINSIC_128B(vabsdiffub), u8v1, "absd.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vabsdiffuh), u16v1, "absd.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vabsdiffh), u16v1, "absd.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vabsdiffw), u32v1, "absd.vw.vw", {i32v1, i32v1}}, // Averaging: {INTRINSIC_128B(vavgub), u8v1, "avg.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vavguh), u16v1, "avg.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vavguw), u32v1, "avg.vuw.vuw", {u32v1, u32v1}, HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vavgb), i8v1, "avg.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vavgh), i16v1, "avg.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vavgw), i32v1, "avg.vw.vw", {i32v1, i32v1}}, {INTRINSIC_128B(vavgubrnd), u8v1, "avg_rnd.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vavguhrnd), u16v1, "avg_rnd.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vavguwrnd), u32v1, "avg_rnd.vuw.vuw", {u32v1, u32v1}, HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vavgbrnd), i8v1, "avg_rnd.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vavghrnd), i16v1, "avg_rnd.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vavgwrnd), i32v1, "avg_rnd.vw.vw", {i32v1, i32v1}}, // This one is weird: i8_sat((u8 - u8)/2). It both saturates and averages. 
{INTRINSIC_128B(vnavgub), i8v1, "navg.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vnavgb), i8v1, "navg.vb.vb", {i8v1, i8v1}, HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vnavgh), i16v1, "navg.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vnavgw), i32v1, "navg.vw.vw", {i32v1, i32v1}}, // Non-widening multiplication: {INTRINSIC_128B(vmpyih), i16v1, "mul.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vmpyihb), i16v1, "mul.vh.b", {i16v1, i8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyiwh), i32v1, "mul.vw.h", {i32v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyiwb), i32v1, "mul.vw.b", {i32v1, i8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyih_acc), i16v1, "add_mul.vh.vh.vh", {i16v1, i16v1, i16v1}}, {INTRINSIC_128B(vmpyihb_acc), i16v1, "add_mul.vh.vh.b", {i16v1, i16v1, i8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyiwh_acc), i32v1, "add_mul.vw.vw.h", {i32v1, i32v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyiwb_acc), i32v1, "add_mul.vw.vw.b", {i32v1, i32v1, i8}, HvxIntrinsic::BroadcastScalarsToWords}, // Widening vector multiplication: {INTRINSIC_128B(vmpyubv), u16v2, "mpy.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vmpyuhv), u32v2, "mpy.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vmpybv), i16v2, "mpy.vb.vb", {i8v1, i8v1}}, {INTRINSIC_128B(vmpyhv), i32v2, "mpy.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vmpyubv_acc), u16v2, "add_mpy.vuh.vub.vub", {u16v2, u8v1, u8v1}}, {INTRINSIC_128B(vmpyuhv_acc), u32v2, "add_mpy.vuw.vuh.vuh", {u32v2, u16v1, u16v1}}, {INTRINSIC_128B(vmpybv_acc), i16v2, "add_mpy.vh.vb.vb", {i16v2, i8v1, i8v1}}, {INTRINSIC_128B(vmpyhv_acc), i32v2, "add_mpy.vw.vh.vh", {i32v2, i16v1, i16v1}}, // Inconsistencies: both are vector instructions despite the // missing 'v', and the signedness is indeed swapped. {INTRINSIC_128B(vmpybusv), i16v2, "mpy.vub.vb", {u8v1, i8v1}}, {INTRINSIC_128B(vmpyhus), i32v2, "mpy.vh.vuh", {i16v1, u16v1}}, {INTRINSIC_128B(vmpybusv_acc), i16v2, "add_mpy.vh.vub.vb", {i16v2, u8v1, i8v1}}, {INTRINSIC_128B(vmpyhus_acc), i32v2, "add_mpy.vw.vh.vuh", {i32v2, i16v1, u16v1}}, // Widening scalar multiplication: {INTRINSIC_128B(vmpyub), u16v2, "mpy.vub.ub", {u8v1, u8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyuh), u32v2, "mpy.vuh.uh", {u16v1, u16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyh), i32v2, "mpy.vh.h", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpybus), i16v2, "mpy.vub.b", {u8v1, i8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyub_acc), u16v2, "add_mpy.vuh.vub.ub", {u16v2, u8v1, u8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyuh_acc), u32v2, "add_mpy.vuw.vuh.uh", {u32v2, u16v1, u16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpybus_acc), i16v2, "add_mpy.vh.vub.b", {i16v2, u8v1, i8}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyhsat_acc), i32v2, "satw_add_mpy.vw.vh.h", {i32v2, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, // Widening vector multiplication, with horizontal reduction. 
{INTRINSIC_128B(vrmpyubv), u32v1, "add_4mpy.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vrmpybv), i32v1, "add_4mpy.vb.vb", {i8v1, i8v1}}, {INTRINSIC_128B(vrmpybusv), i32v1, "add_4mpy.vub.vb", {i8v1, i8v1}}, {INTRINSIC_128B(vrmpyubv_acc), u32v1, "acc_add_4mpy.vuw.vub.vub", {u32v1, u8v1, u8v1}}, {INTRINSIC_128B(vrmpybv_acc), i32v1, "acc_add_4mpy.vw.vb.vb", {i32v1, i8v1, i8v1}}, {INTRINSIC_128B(vrmpybusv_acc), i32v1, "acc_add_4mpy.vw.vub.vb", {i32v1, i8v1, i8v1}}, // Widening scalar multiplication, with horizontal reduction. {INTRINSIC_128B(vdmpybus), i16v1, "add_2mpy.vub.b", {u8v1, i32}}, {INTRINSIC_128B(vdmpyhb), i32v1, "add_2mpy.vh.b", {i16v1, i32}}, {INTRINSIC_128B(vdmpybus_acc), i16v1, "acc_add_2mpy.vh.vub.b", {i16v1, u8v1, i32}}, {INTRINSIC_128B(vdmpyhb_acc), i32v1, "acc_add_2mpy.vw.vh.b", {i32v1, i16v1, i32}}, // Saturating versions of vdmpy. {INTRINSIC_128B(vdmpyhsat), i32v1, "add_2mpy.vh.h", {i16v1, i32}}, {INTRINSIC_128B(vdmpyhsusat), i32v1, "add_2mpy.vh.uh", {i16v1, u32}}, {INTRINSIC_128B(vdmpyhvsat), i32v1, "add_2mpy.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vmpabus), i16v2, "add_2mpy.vub.vub.b.b", {i8v2, i32}}, {INTRINSIC_128B(vmpabus_acc), i16v2, "acc_add_2mpy.vh.vub.vub.b.b", {i16v2, i8v2, i32}}, {INTRINSIC_128B(vmpahb), i32v2, "add_2mpy.vh.vh.b.b", {i16v2, i32}}, {INTRINSIC_128B(vmpahb_acc), i32v2, "acc_add_2mpy.vw.vh.vh.b.b", {i32v2, i16v2, i32}}, // TODO: These don't generate correctly because the vectors // aren't interleaved correctly. //{ vdmpybus_dv, i16v2, "add_2mpy.vub.b.dv", {u8v2, i32} }, //{ vdmpyhb_dv, i32v2, "add_2mpy.vh.b.dv", {i16v2, i32} }, //{ vdmpybus_dv_acc, i16v2, "acc_add_2mpy.vh.vub.b.dv", {i16v2, u8v2, i32} }, //{ vdmpyhb_dv_acc, i32v2, "acc_add_2mpy.vw.vh.b.dv", {i32v2, i16v2, i32} }, // vtmpy // TODO: These (and many vdmpy variants) should have 16-bit scalars with BroadcastScalarsToWords, so // we don't need to replicate the arguments in HexagonOptimize.cpp. However, this triggers opaque // failures in LLVM. {INTRINSIC_128B(vtmpybus), i16v2, "add_3mpy.vub.b", {u8v2, i32}}, {INTRINSIC_128B(vtmpyb), i16v2, "add_3mpy.vb.b", {i8v2, i32}}, {INTRINSIC_128B(vtmpyhb), i32v2, "add_3mpy.vh.b", {u16v2, i32}}, {INTRINSIC_128B(vtmpybus_acc), i16v2, "acc_add_3mpy.vh.vub.b", {i16v2, u8v2, i32}}, {INTRINSIC_128B(vtmpyb_acc), i16v2, "acc_add_3mpy.vh.vb.b", {i16v2, i8v2, i32}}, {INTRINSIC_128B(vtmpyhb_acc), i32v2, "acc_add_3mpy.vw.vh.b", {i32v2, u16v2, i32}}, {INTRINSIC_128B(vrmpybus), i32v1, "add_4mpy.vub.b", {u8v1, i32}}, {INTRINSIC_128B(vrmpyub), u32v1, "add_4mpy.vub.ub", {u8v1, u32}}, {INTRINSIC_128B(vrmpybus_acc), i32v1, "acc_add_4mpy.vw.vub.b", {i32v1, u8v1, i32}}, {INTRINSIC_128B(vrmpyub_acc), u32v1, "acc_add_4mpy.vuw.vub.ub", {u32v1, u8v1, u32}}, // Multiply keep high half, with multiplication by 2. 
{INTRINSIC_128B(vmpyhvsrs), i16v1, "trunc_satw_mpy2_rnd.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vmpyhss), i16v1, "trunc_satw_mpy2.vh.h", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, {INTRINSIC_128B(vmpyhsrs), i16v1, "trunc_satw_mpy2_rnd.vh.h", {i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords}, // Min/max: {INTRINSIC_128B(vmaxub), u8v1, "max.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vmaxuh), u16v1, "max.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vmaxh), i16v1, "max.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vmaxw), i32v1, "max.vw.vw", {i32v1, i32v1}}, {INTRINSIC_128B(vminub), u8v1, "min.vub.vub", {u8v1, u8v1}}, {INTRINSIC_128B(vminuh), u16v1, "min.vuh.vuh", {u16v1, u16v1}}, {INTRINSIC_128B(vminh), i16v1, "min.vh.vh", {i16v1, i16v1}}, {INTRINSIC_128B(vminw), i32v1, "min.vw.vw", {i32v1, i32v1}}, // Shifts // We map arithmetic and logical shifts to just "shr", depending on type. {INTRINSIC_128B(vlsrhv), u16v1, "shr.vuh.vh", {u16v1, u16v1}}, {INTRINSIC_128B(vlsrwv), u32v1, "shr.vuw.vw", {u32v1, u32v1}}, {INTRINSIC_128B(vasrhv), i16v1, "shr.vh.vh", {i16v1, u16v1}}, {INTRINSIC_128B(vasrwv), i32v1, "shr.vw.vw", {i32v1, u32v1}}, // Rounding shift right {INTRINSIC_128B(vasrhubrndsat), u8v1, "trunc_satub_shr_rnd.vh", {i16v2, u16}}, {INTRINSIC_128B(vasrhbrndsat), i8v1, "trunc_satb_shr_rnd.vh", {i16v2, u16}}, {INTRINSIC_128B(vasruhubrndsat), u8v1, "trunc_satub_shr_rnd.vuh", {u16v2, u16}, HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vasrwuhrndsat), u16v1, "trunc_satuh_shr_rnd.vw", {i32v2, u32}}, {INTRINSIC_128B(vasrwhrndsat), i16v1, "trunc_sath_shr_rnd.vw", {i32v2, u32}}, {INTRINSIC_128B(vasruwuhrndsat), u16v1, "trunc_satuh_shr_rnd.vuw", {u32v2, u32}}, {INTRINSIC_128B(vaslhv), u16v1, "shl.vuh.vh", {u16v1, u16v1}}, {INTRINSIC_128B(vaslwv), u32v1, "shl.vuw.vw", {u32v1, u32v1}}, {INTRINSIC_128B(vaslhv), i16v1, "shl.vh.vh", {i16v1, u16v1}}, {INTRINSIC_128B(vaslwv), i32v1, "shl.vw.vw", {i32v1, u32v1}}, {INTRINSIC_128B(vlsrh), u16v1, "shr.vuh.h", {u16v1, u16}}, {INTRINSIC_128B(vlsrw), u32v1, "shr.vuw.w", {u32v1, u32}}, {INTRINSIC_128B(vasrh), i16v1, "shr.vh.h", {i16v1, u16}}, {INTRINSIC_128B(vasrw), i32v1, "shr.vw.w", {i32v1, u32}}, {INTRINSIC_128B(vaslh), u16v1, "shl.vuh.h", {u16v1, u16}}, {INTRINSIC_128B(vaslw), u32v1, "shl.vuw.w", {u32v1, u32}}, {INTRINSIC_128B(vaslh), i16v1, "shl.vh.h", {i16v1, u16}}, {INTRINSIC_128B(vaslw), i32v1, "shl.vw.w", {i32v1, u32}}, {INTRINSIC_128B(vasrh_acc), i16v1, "add_shr.vh.vh.uh", {i16v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords | HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vaslh_acc), i16v1, "add_shl.vh.vh.uh", {i16v1, i16v1, i16}, HvxIntrinsic::BroadcastScalarsToWords | HvxIntrinsic::v65OrLater}, {INTRINSIC_128B(vasrw_acc), i32v1, "add_shr.vw.vw.uw", {i32v1, i32v1, i32}}, {INTRINSIC_128B(vaslw_acc), i32v1, "add_shl.vw.vw.uw", {i32v1, i32v1, i32}}, {INTRINSIC_128B(vasrwh), i16v1, "trunc_shr.vw.uw", {i32v2, u32}}, {INTRINSIC_128B(vasrhubsat), u8v1, "trunc_satub_shr.vh.uh", {i16v2, u16}}, {INTRINSIC_128B(vasrwuhsat), u16v1, "trunc_satuh_shr.vw.uw", {i32v2, u32}}, {INTRINSIC_128B(vasrwhsat), i16v1, "trunc_sath_shr.vw.uw", {i32v2, u32}}, {INTRINSIC_128B(vror), u8v1, "vror", {u8v1, i32}}, // Bit counting {INTRINSIC_128B(vnormamth), u16v1, "cls.vh", {u16v1}}, {INTRINSIC_128B(vnormamtw), u32v1, "cls.vw", {u32v1}}, }; // clang-format on // TODO: Many variants of the above functions are missing. They // need to be implemented in the runtime module, or via // fall-through to CodeGen_LLVM. 
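// Illustrative sketch (derived from the table and init_module() below, not from
// the original source): for the "add_vuh.vub.vub" entry, init_module() creates a
// wrapper named "halide.hexagon.add_vuh.vub.vub" that takes two <128 x i8>
// operands, bitcasts them to the <32 x i32> vectors that
// llvm.hexagon.V6.vaddubh.128B expects, and bitcasts the <64 x i32> result back
// to <128 x i16>.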
void CodeGen_Hexagon::init_module() {
    CodeGen_Posix::init_module();

    // LLVM's HVX vector intrinsics don't include the type of the
    // operands; they all operate on vectors of 32 bit integers. To make
    // it easier to generate code, we define wrapper intrinsics with
    // the correct type (plus the necessary bitcasts).

    const auto fix_lanes = [&](const halide_type_t &t) -> halide_type_t {
        if (t.lanes == 1) {
            return t;
        }
        const int lanes_actual = ((int)t.lanes * native_vector_bits()) / kOneX;
        return t.with_lanes(lanes_actual);
    };

    vector<Type> arg_types;
    for (const HvxIntrinsic &i : intrinsic_wrappers) {
        llvm::Intrinsic::ID id = i.id;
        internal_assert(id != llvm::Intrinsic::not_intrinsic);
        // Get the real intrinsic.
        llvm::Function *intrin = llvm::Intrinsic::getDeclaration(module.get(), id);
        halide_type_t ret_type = fix_lanes(i.ret_type);
        arg_types.clear();
        for (const auto &a : i.arg_types) {
            if (a.bits == 0) {
                break;
            }
            arg_types.emplace_back(fix_lanes(a));
        }
        define_hvx_intrinsic(intrin, ret_type, i.name, arg_types, i.flags);
    }
}

llvm::Function *CodeGen_Hexagon::define_hvx_intrinsic(llvm::Function *intrin,
                                                      Type ret_ty,
                                                      const string &name,
                                                      vector<Type> arg_types,
                                                      int flags) {
    internal_assert(intrin) << "Null definition for intrinsic '" << name << "'\n";
    llvm::FunctionType *intrin_ty = intrin->getFunctionType();
    bool broadcast_scalar_word = flags & HvxIntrinsic::BroadcastScalarsToWords;
    bool v65OrLater = flags & HvxIntrinsic::v65OrLater;

    if (v65OrLater && !is_hvx_v65_or_later()) {
        return nullptr;
    }

    // Get the types of the arguments we want to pass.
    vector<llvm::Type *> llvm_arg_types;
    llvm_arg_types.reserve(arg_types.size());
    for (Type i : arg_types) {
        llvm_arg_types.push_back(llvm_type_of(i));
    }

    // Make a wrapper intrinsic.
    llvm::FunctionType *wrapper_ty =
        llvm::FunctionType::get(llvm_type_of(ret_ty), llvm_arg_types, false);
    llvm::Function *wrapper =
        llvm::Function::Create(wrapper_ty, llvm::GlobalValue::InternalLinkage,
                               "halide.hexagon." + name, module.get());
    llvm::BasicBlock *block =
        llvm::BasicBlock::Create(module->getContext(), "entry", wrapper);
    IRBuilderBase::InsertPoint here = builder->saveIP();
    builder->SetInsertPoint(block);

    vector<Value *> args;
    for (Value &arg : wrapper->args()) {
        args.push_back(&arg);
    }

    if (args.size() + 1 == intrin_ty->getNumParams()) {
        // This intrinsic needs the first argument split into the high and low
        // vectors.
        Value *dv = args[0];
        int vec_lanes = native_vector_bits() / arg_types[0].bits();
        Value *low = slice_vector(dv, 0, vec_lanes);
        Value *high = slice_vector(dv, vec_lanes, vec_lanes);

        args[0] = high;
        args.insert(args.begin() + 1, low);

        Type split_type = arg_types.front().with_lanes(arg_types.front().lanes() / 2);
        arg_types[0] = split_type;
        arg_types.insert(arg_types.begin() + 1, split_type);
    }

    // Replace args with bitcasts if necessary.
    internal_assert(args.size() == intrin_ty->getNumParams());
    for (size_t i = 0; i < args.size(); i++) {
        llvm::Type *arg_ty = intrin_ty->getParamType(i);
        if (args[i]->getType() != arg_ty) {
            if (arg_ty->isVectorTy()) {
                args[i] = builder->CreateBitCast(args[i], arg_ty);
            } else {
                if (broadcast_scalar_word) {
                    llvm::Function *fn = nullptr;
                    // We know it is a scalar type. We can have 8 bit, 16 bit or 32 bit
                    // types only.
unsigned bits = arg_types[i].bits(); const char *fn_name = ""; switch (bits) { case 8: fn_name = "halide.hexagon.dup4.b"; break; case 16: fn_name = "halide.hexagon.dup2.h"; break; default: internal_error << "unhandled broadcast_scalar_word in define_hvx_intrinsic"; } fn = module->getFunction(fn_name); internal_assert(fn) << "Unable to find function " << fn_name << " in define_hvx_intrinsic."; args[i] = builder->CreateCall(fn, {args[i]}); } else if (args[i]->getType()->isIntegerTy()) { args[i] = builder->CreateIntCast(args[i], arg_ty, arg_types[i].is_int()); } else { args[i] = builder->CreateBitCast(args[i], arg_ty); } } } } // Call the real intrinsic. Value *ret = builder->CreateCall(intrin, args); // Cast the result, if necessary. if (ret->getType() != wrapper_ty->getReturnType()) { ret = builder->CreateBitCast(ret, wrapper_ty->getReturnType()); } builder->CreateRet(ret); // Always inline these wrappers. wrapper->addFnAttr(llvm::Attribute::AlwaysInline); builder->restoreIP(here); llvm::verifyFunction(*wrapper); return wrapper; } Value *CodeGen_Hexagon::create_bitcast(Value *v, llvm::Type *ty) { if (BitCastInst *c = dyn_cast(v)) { return create_bitcast(c->getOperand(0), ty); } else if (isa(v)) { return UndefValue::get(ty); } else if (v->getType() != ty) { v = builder->CreateBitCast(v, ty); } return v; } Value *CodeGen_Hexagon::call_intrin_cast(llvm::Type *ret_ty, llvm::Function *F, vector Ops) { llvm::FunctionType *FType = F->getFunctionType(); internal_assert(FType->getNumParams() == Ops.size()); for (unsigned I = 0; I < FType->getNumParams(); ++I) { Ops[I] = create_bitcast(Ops[I], FType->getParamType(I)); } Value *ret = builder->CreateCall(F, Ops); return create_bitcast(ret, ret_ty); } Value *CodeGen_Hexagon::call_intrin_cast(llvm::Type *ret_ty, int id, vector Ops) { llvm::Function *intrin = llvm::Intrinsic::getDeclaration(module.get(), (llvm::Intrinsic::ID)id); return call_intrin_cast(ret_ty, intrin, std::move(Ops)); } Value *CodeGen_Hexagon::interleave_vectors(const vector &v) { llvm::Type *v_ty = v[0]->getType(); llvm::Type *element_ty = get_vector_element_type(v_ty); int element_bits = element_ty->getScalarSizeInBits(); int native_elements = native_vector_bits() / element_ty->getScalarSizeInBits(); int result_elements = get_vector_num_elements(v_ty) * v.size(); if (v.size() == 2) { // Interleaving two vectors. Value *a = v[0]; Value *b = v[1]; if (result_elements == native_elements && (element_bits == 8 || element_bits == 16)) { llvm::Type *native_ty = get_vector_type(element_ty, native_elements); // This is an interleave of two half native vectors, use // vshuff. llvm::Intrinsic::ID vshuff = element_bits == 8 ? INTRINSIC_128B(vshuffb) : INTRINSIC_128B(vshuffh); return call_intrin_cast(native_ty, vshuff, {concat_vectors({a, b})}); } else { // Break them into native vectors, use vshuffvdd, and // concatenate the shuffled results. llvm::Type *native2_ty = get_vector_type(element_ty, native_elements * 2); Value *bytes = codegen(-static_cast(element_bits / 8)); vector ret; for (int i = 0; i < result_elements / 2; i += native_elements) { Value *a_i = slice_vector(a, i, native_elements); Value *b_i = slice_vector(b, i, native_elements); Value *ret_i = call_intrin_cast( native2_ty, INTRINSIC_128B(vshuffvdd), {b_i, a_i, bytes}); if ((i + native_elements) * 2 > result_elements) { // This is the last vector, and it has some extra // elements. Slice it down. 
ret_i = slice_vector(ret_i, 0, result_elements - i * 2); } ret.push_back(ret_i); } return concat_vectors(ret); } } else if (v.size() == 3) { // Interleaving 3 vectors - this generates awful code if we let LLVM do it, // so we use vdelta. Value *lut = concat_vectors(v); std::vector indices; for (int i = 0; i < get_vector_num_elements(v_ty); i++) { for (size_t j = 0; j < v.size(); j++) { indices.push_back(j * get_vector_num_elements(v_ty) + i); } } return vdelta(lut, indices); } return CodeGen_Posix::interleave_vectors(v); } // Check if indices form a strided ramp, allowing undef elements to // pretend to be part of the ramp. bool is_strided_ramp(const vector &indices, int &start, int &stride) { int size = static_cast(indices.size()); // To find the proposed start and stride, find two non-undef elements. int x0 = -1; int x1 = -1; for (int i = 0; i < size; i++) { if (indices[i] != -1) { if (x0 == -1) { x0 = i; } else { x1 = i; break; } } } if (x1 == -1) { // If we don't have enough non-undef elements, we can pretend // the ramp is anything we want! stride = 1; start = x0 != -1 ? indices[x0] - x0 : 0; return true; } int dx = x1 - x0; int dy = indices[x1] - indices[x0]; stride = dy / dx; start = indices[x0] - stride * x0; // Verify that all of the non-undef elements are part of the strided ramp. for (int i = 0; i < size; i++) { if (indices[i] != -1 && indices[i] != start + i * stride) { return false; } } return true; } bool is_concat_or_slice(const vector &indices) { // Skip undef elements at the beginning and the end. size_t begin = 0; while (begin < indices.size() && indices[begin] == -1) { ++begin; } size_t end = indices.size(); while (end > 1 && indices[end - 1] == -1) { --end; } // Check that the remaining elements are a dense ramp. for (size_t i = begin; i + 1 < end; i++) { if (indices[i] + 1 != indices[i + 1]) { return false; } } return true; } Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, const vector &indices) { llvm::Type *a_ty = a->getType(); llvm::Type *b_ty = b->getType(); internal_assert(a_ty == b_ty); int a_elements = static_cast(get_vector_num_elements(a_ty)); llvm::Type *element_ty = get_vector_element_type(a->getType()); internal_assert(element_ty); int element_bits = element_ty->getScalarSizeInBits(); int native_elements = native_vector_bits() / element_bits; llvm::Type *native_ty = get_vector_type(element_ty, native_elements); llvm::Type *native2_ty = get_vector_type(element_ty, native_elements * 2); int result_elements = static_cast(indices.size()); internal_assert(result_elements > 0); llvm::Type *result_ty = get_vector_type(element_ty, result_elements); // Try to rewrite shuffles that only access the elements of b. int min = indices[0]; for (size_t i = 1; i < indices.size(); i++) { if (indices[i] != -1 && indices[i] < min) { min = indices[i]; } } if (min >= a_elements) { vector shifted_indices(indices); for (int &i : shifted_indices) { if (i != -1) { i -= a_elements; } } return shuffle_vectors(b, UndefValue::get(b->getType()), shifted_indices); } // Try to rewrite shuffles that only access the elements of a. int max = *std::max_element(indices.begin(), indices.end()); if (max < a_elements) { BitCastInst *a_cast = dyn_cast(a); CallInst *a_call = dyn_cast(a_cast ? 
a_cast->getOperand(0) : a); llvm::Function *vcombine = llvm::Intrinsic::getDeclaration( module.get(), INTRINSIC_128B(vcombine)); if (a_call && a_call->getCalledFunction() == vcombine) { // Rewrite shuffle(vcombine(a, b), x) to shuffle(a, b) return shuffle_vectors( create_bitcast(a_call->getArgOperand(1), native_ty), create_bitcast(a_call->getArgOperand(0), native_ty), indices); } else if (ShuffleVectorInst *a_shuffle = dyn_cast(a)) { bool is_identity = true; for (int i = 0; i < a_elements; i++) { int mask_i = a_shuffle->getMaskValue(i); is_identity = is_identity && (mask_i == i || mask_i == -1); } if (is_identity) { return shuffle_vectors(a_shuffle->getOperand(0), a_shuffle->getOperand(1), indices); } } } // Try to rewrite shuffles of (maybe strided) ramps. int start = 0, stride = 0; if (!is_strided_ramp(indices, start, stride)) { if (is_concat_or_slice(indices)) { // Let LLVM handle concat or slices. return CodeGen_Posix::shuffle_vectors(a, b, indices); } return vdelta(concat_vectors({a, b}), indices); } if (stride == 1) { if (result_ty == native2_ty && a_ty == native_ty && b_ty == native_ty) { // This is a concatenation of a and b, where a and b are // native vectors. Use vcombine. internal_assert(start == 0); return call_intrin_cast(native2_ty, INTRINSIC_128B(vcombine), {b, a}); } if (result_ty == native_ty && a_ty == native2_ty && max < a_elements) { // Extract a and b from a double vector. b = call_intrin_cast(native_ty, INTRINSIC_128B(hi), {a}); a = call_intrin_cast(native_ty, INTRINSIC_128B(lo), {a}); a_ty = a->getType(); b_ty = b->getType(); a_elements = get_vector_num_elements(a_ty); } if (start == 0 && result_ty == a_ty) { return a; } if (start == a_elements && result_ty == b_ty) { return b; } if (result_ty == native_ty && a_ty == native_ty && b_ty == native_ty) { // Use valign to select a subset of the concatenation of a // and b. int bytes_off = start * (element_bits / 8); int reverse_bytes = (native_vector_bits() / 8) - bytes_off; llvm::Intrinsic::ID intrin_id = INTRINSIC_128B(valignb); // v(l)align is a bit more efficient if the offset fits in // 3 bits, so if the offset is with in 3 bits from the // high end, use vlalign instead. if (bytes_off <= 7) { intrin_id = INTRINSIC_128B(valignbi); } else if (reverse_bytes <= 7) { intrin_id = INTRINSIC_128B(vlalignbi); bytes_off = reverse_bytes; } return call_intrin_cast(native_ty, intrin_id, {b, a, codegen(bytes_off)}); } return CodeGen_Posix::shuffle_vectors(a, b, indices); } else if (stride == 2 && (start == 0 || start == 1)) { // For stride 2 shuffles, we can use vpack or vdeal. // It's hard to use call_intrin here. We'll just slice and // concat manually. Value *ab = max < a_elements ? a : concat_vectors({a, b}); vector ret; for (int i = 0; i < result_elements; i += native_elements) { Value *ab_i0 = slice_vector(ab, i * 2, native_elements); Value *ab_i1 = slice_vector(ab, i * 2 + native_elements, native_elements); Value *ret_i; if (element_bits == 8) { llvm::Intrinsic::ID intrin = start == 0 ? INTRINSIC_128B(vpackeb) : INTRINSIC_128B(vpackob); ret_i = call_intrin_cast(native_ty, intrin, {ab_i1, ab_i0}); } else if (element_bits == 16) { llvm::Intrinsic::ID intrin = start == 0 ? INTRINSIC_128B(vpackeh) : INTRINSIC_128B(vpackoh); ret_i = call_intrin_cast(native_ty, intrin, {ab_i1, ab_i0}); } else if (element_bits % 8 == 0) { // Need to use vdealw, followed by lo/hi. // TODO: Is there a better instruction? This generates a // double vector, then only uses half of the result. 
int element_bytes = element_bits / 8; Value *packed = call_intrin_cast( native2_ty, INTRINSIC_128B(vdealvdd), {ab_i1, ab_i0, ConstantInt::get(i32_t, -element_bytes)}); llvm::Intrinsic::ID intrin = start == 0 ? INTRINSIC_128B(lo) : INTRINSIC_128B(hi); ret_i = call_intrin_cast(native_ty, intrin, {packed}); } else { return CodeGen_Posix::shuffle_vectors(a, b, indices); } if (i + native_elements > result_elements) { // This is the last vector, and it has a few extra // elements. Slice it down. ret_i = slice_vector(ret_i, 0, result_elements - i); } ret.push_back(ret_i); } return concat_vectors(ret); } // Use a general delta operation. return vdelta(concat_vectors({a, b}), indices); } Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index, int max_index) { llvm::Type *lut_ty = lut->getType(); llvm::Type *idx_ty = idx->getType(); internal_assert(isa(lut_ty)); internal_assert(isa(idx_ty)); internal_assert(idx_ty->getScalarSizeInBits() == 8); internal_assert(min_index >= 0); internal_assert(max_index < 256); llvm::Intrinsic::ID vlut, vlut_acc, vshuff; if (lut_ty->getScalarSizeInBits() == 8) { // We can use vlut32. vlut = INTRINSIC_128B(vlutvvb); vlut_acc = INTRINSIC_128B(vlutvvb_oracc); vshuff = INTRINSIC_128B(vshuffb); } else { // We can use vlut16. vlut = INTRINSIC_128B(vlutvwh); vlut_acc = INTRINSIC_128B(vlutvwh_oracc); vshuff = INTRINSIC_128B(vshuffh); } // There are two dimensions in which we need to slice up the // inputs. First, if the index is larger than a native vector, we // need to slice up the operation into native vectors of // indices. Second, the LUT may need to be broken into several // stages, and that may need to be further broken up into select // operations. // Split up the LUT into native vectors, using the max_index to // indicate how many we need. max_index = std::min(max_index, get_vector_num_elements(lut_ty) - 1); int native_idx_elements = native_vector_bits() / 8; int native_lut_elements = native_vector_bits() / lut_ty->getScalarSizeInBits(); // The vlut instructions work on pairs of LUTs interleaved, with // each lut containing lut_slice_elements. We need to interleave // pairs of the native LUTs to make a full set of native LUTs. vector lut_slices; for (int i = 0; i <= max_index; i += native_lut_elements) { Value *lut_slice = slice_vector(lut, i, native_lut_elements); lut_slice = call_intrin_cast(lut_slice->getType(), vshuff, {lut_slice}); lut_slices.push_back(lut_slice); } internal_assert(!lut_slices.empty()); llvm::Type *native_result_ty = get_vector_type( get_vector_element_type(lut_ty), native_idx_elements); // The result will have the same number of elements as idx. int idx_elements = get_vector_num_elements(idx_ty); // Each LUT has 1 pair of even/odd mask values for HVX 64, 2 for // HVX 128. We may not need all of the passes, if the LUT has // fewer than half of the elements in an HVX 128 vector. constexpr int lut_passes = 2; vector result; for (int i = 0; i < idx_elements; i += native_idx_elements) { Value *idx_i = slice_vector(idx, i, native_idx_elements); if (lut_ty->getScalarSizeInBits() == 16) { // vlut16 deinterleaves its output. We can either // interleave the result, or the indices. It's slightly // cheaper to interleave the indices (they are single // vectors, vs. the result which is a double vector), and // if the indices are constant (which is true for boundary // conditions) this should get lifted out of any loops. 
idx_i = call_intrin_cast( idx_i->getType(), INTRINSIC_128B(vshuffb), {idx_i}); } Value *result_i = nullptr; for (int j = 0; j < static_cast(lut_slices.size()); j++) { for (int k = 0; k < lut_passes; k++) { int pass_index = lut_passes * j + k; Value *mask[2] = { ConstantInt::get(i32_t, 2 * pass_index + 0), ConstantInt::get(i32_t, 2 * pass_index + 1), }; if (result_i == nullptr) { // The first native LUT, use vlut. result_i = call_intrin_cast(native_result_ty, vlut, {idx_i, lut_slices[j], mask[0]}); result_i = call_intrin_cast(native_result_ty, vlut_acc, {result_i, idx_i, lut_slices[j], mask[1]}); } else if (max_index >= pass_index * native_lut_elements / lut_passes) { // Not the first native LUT, accumulate the LUT // with the previous result. for (Value *v : mask) { result_i = call_intrin_cast(native_result_ty, vlut_acc, {result_i, idx_i, lut_slices[j], v}); } } } } result.push_back(result_i); } return slice_vector(concat_vectors(result), 0, idx_elements); } bool is_power_of_two(int x) { return (x & (x - 1)) == 0; } // vdelta and vrdelta are instructions that take an input vector and // pass it through a network made up of levels. Each element x at each // level i can either take the element from the previous level at the // same position x, or take the element from the previous level at y, // where y is x with the bit at position i flipped. This forms a // butterfly network. vdelta and vrdelta have the same structure, // except the ordering of the levels is flipped. // Find a descriptor of the path between x1 and x2. To find the path // between element x1 and element x2, the algorithm is the same for // both vdelta and vrdelta. To get from x1 to x2, we need to take the // switch path at level i if bit i of x1 and x2 are not the same. The // path is an integer where the bit at position i indicates the switch // that jumps by i elements should be on. int generate_delta_path(int x1, int x2) { int result = 0; for (int delta = 1; x1 != x2; delta *= 2) { if ((x1 & delta) != (x2 & delta)) { result |= delta; } x1 &= ~delta; x2 &= ~delta; } return result; } // Generate the switch descriptors for a vdelta or vrdelta // instruction. To do this, we need to generate the switch descriptors // of each output to input path, and then make sure that none of the // switches need conflicting settings. bool generate_vdelta(const std::vector &indices, bool reverse, std::vector &switches) { int width = (int)indices.size(); internal_assert(is_power_of_two(width)); switches.resize(width); // For each switch bit, we have a bit indicating whether we // already care about the switch position. std::vector switches_used(switches.size()); std::fill(switches.begin(), switches.end(), 0); std::fill(switches_used.begin(), switches_used.end(), 0); for (int out = 0; out < width; out++) { int in = indices[out]; if (in == -1) { // We don't care what the output is at this index. continue; } int path = generate_delta_path(out, in); int x = out; // Follow the path backwards, setting the switches we need as // we go. This is the only place where vdelta and vrdelta // differ. For vdelta, we start with the small jumps, vrdelta // starts with the large jumps. int start = reverse ? (1 << 30) : 1; for (int delta = start; path != 0; delta = reverse ? delta / 2 : delta * 2) { int switch_state = path & delta; if ((switches_used[x] & delta) != 0) { // This switch is already set... if ((switches[x] & delta) != switch_state) { // ... and it is set to the wrong thing. We can't represent this // shuffle. 
return false; } } else { // This switch is not already set, set it to the value we want, and mark // it used. switches_used[x] |= delta; switches[x] |= switch_state; } // Update our position in the network. if (switch_state) { x ^= delta; } path &= ~delta; } } return true; } // Try generating vdelta/vrdelta before falling back to vlut. Value *CodeGen_Hexagon::vdelta(Value *lut, const vector &indices) { llvm::Type *lut_ty = lut->getType(); int lut_elements = get_vector_num_elements(lut_ty); llvm::Type *element_ty = get_vector_element_type(lut_ty); int element_bits = element_ty->getScalarSizeInBits(); int native_elements = native_vector_bits() / element_ty->getScalarSizeInBits(); int result_elements = indices.size(); if (element_bits == 1) { // If this is a vector of booleans, convert it to a vector of ints, // do the shuffle, and convert back. llvm::Type *new_lut_ty = get_vector_type(i8_t, lut_elements); Value *i8_lut = builder->CreateIntCast(lut, new_lut_ty, true); Value *result = vdelta(i8_lut, indices); return builder->CreateIntCast(result, lut_ty, true); } else if (element_bits != 8) { // If the input is not a vector of 8 bit elements, replicate the // indices and cast the LUT. int replicate = element_bits / 8; internal_assert(replicate != 0); llvm::Type *new_lut_ty = get_vector_type(i8_t, lut_elements * replicate); Value *i8_lut = builder->CreateBitCast(lut, new_lut_ty); vector i8_indices(indices.size() * replicate); for (size_t i = 0; i < indices.size(); i++) { for (int j = 0; j < replicate; j++) { i8_indices[i * replicate + j] = indices[i] * replicate + j; } } Value *result = vdelta(i8_lut, i8_indices); llvm::Type *result_ty = get_vector_type(get_vector_element_type(lut_ty), indices.size()); return builder->CreateBitCast(result, result_ty); } // We can only use vdelta to produce a single native vector at a // time. Break the input into native vector length shuffles. if (result_elements != native_elements) { vector ret; for (int i = 0; i < result_elements; i += native_elements) { vector indices_i(native_elements); for (int j = 0; j < native_elements; j++) { if (i + j < result_elements) { indices_i[j] = indices[i + j]; } else { indices_i[j] = -1; } } Value *ret_i = vdelta(lut, indices_i); if (result_elements - i < native_elements) { // This was a fractional vector at the end, slice the part we want. ret_i = slice_vector(ret_i, 0, result_elements - i); } ret.push_back(ret_i); } return concat_vectors(ret); } internal_assert(result_elements == native_elements); // We can only use vdelta to shuffle a single native vector of // input. If we have more than one, we need to break it into // multiple vdelta operations, and combine them with select. if (lut_elements != native_elements) { Value *ret = nullptr; for (int i = 0; i < lut_elements; i += native_elements) { Value *lut_i = slice_vector(lut, i, native_elements); vector indices_i(native_elements); vector mask(native_elements); bool all_used = true; bool none_used = true; for (int j = 0; j < native_elements; j++) { int idx = indices[j] - i; if (0 <= idx && idx < native_elements) { indices_i[j] = idx; mask[j] = ConstantInt::get(i1_t, 1); none_used = false; } else { indices_i[j] = -1; mask[j] = ConstantInt::get(i1_t, 0); all_used = false; } } Value *ret_i = vdelta(lut_i, indices_i); if (all_used || ret == nullptr) { // If the mask is all ones, or this is the first result, we don't need // to preserve past results. ret = ret_i; } else if (!none_used) { // Create a condition value for which elements of the range are valid // for this index. 
ret = builder->CreateSelect(ConstantVector::get(mask), ret_i, ret); } } return ret; } // We now have a single native vector to native vector shuffle. Try // Generating a vdelta or vrdelta. for (bool reverse : {false, true}) { std::vector switches; if (generate_vdelta(indices, reverse, switches)) { vector control_elements(switches.size()); for (int i = 0; i < (int)switches.size(); i++) { control_elements[i] = ConstantInt::get(i8_t, switches[i]); } Value *control = ConstantVector::get(control_elements); llvm::Intrinsic::ID vdelta = reverse ? INTRINSIC_128B(vrdelta) : INTRINSIC_128B(vdelta); return call_intrin_cast(lut_ty, vdelta, {lut, control}); } } // TODO: If the above fails, we might be able to use a vdelta and // vrdelta instruction together to implement the shuffle. // TODO: If the vdelta results are sparsely used, it might be // better to use vlut. return vlut(lut, indices); } Value *create_vector(llvm::Type *ty, int val) { llvm::Type *scalar_ty = ty->getScalarType(); Constant *value = ConstantInt::get(scalar_ty, val); return ConstantVector::getSplat(element_count(get_vector_num_elements(ty)), value); } Value *CodeGen_Hexagon::vlut(Value *lut, Value *idx, int min_index, int max_index) { const unsigned idx_elem_size = idx->getType()->getScalarSizeInBits(); internal_assert(idx_elem_size <= 16) << "Index element for lookup tables must be <= 16 bits in size.\n"; llvm::Type *lut_ty = lut->getType(); llvm::Type *result_ty = get_vector_type(get_vector_element_type(lut_ty), get_vector_num_elements(idx->getType())); const unsigned idx_elems = get_vector_num_elements(idx->getType()); // Construct a new index with 16-bit elements. unsigned idx16_elems = idx_elems; Value *idx16 = (idx_elem_size == 8) ? call_intrin(get_vector_type(i16_t, idx_elems), "halide.hexagon.unpack.vub", {idx}) : idx; const int replicate = lut_ty->getScalarSizeInBits() / 16; if (replicate > 1) { // Replicate the LUT indices and use vlut16. // For LUT32: create two indices: // - 2 * index // - 2 * index + 1 // Interleave the two index vectors and use vlut16 with the new index // vector. vector indices; Value *replicate_val = ConstantInt::get(i8_t, replicate); for (int i = 0; i < replicate; i++) { Value *pos = ConstantInt::get(idx16->getType(), i); indices.emplace_back(call_intrin(idx16->getType(), "halide.hexagon.add_mul.vh.vh.b", {pos, idx16, replicate_val})); } idx16 = interleave_vectors(indices); idx16_elems *= replicate; min_index = min_index * replicate; max_index = (max_index + 1) * replicate - 1; internal_assert(max_index <= 32676) << "Index range for lookup table must be <= 32676\n"; lut_ty = get_vector_type(i16_t, get_vector_num_elements(lut_ty) * replicate); lut = builder->CreateBitCast(lut, lut_ty); } llvm::Type *i8x_t = get_vector_type(i8_t, idx16_elems); llvm::Type *i16x_t = get_vector_type(i16_t, idx16_elems); Value *minus_one = create_vector(i16x_t, -1); // If we can do this with one vlut, do it now. if (max_index < 256) { // If the idx already had 8 bit elements and no replication was needed, // we can use idx else we need to pack idx16. Value *idx8 = (idx_elem_size == 16 || idx_elems != idx16_elems) ? call_intrin(i8x_t, "halide.hexagon.pack.vh", {idx16}) : idx; Value *result_val = vlut256(lut, idx8, min_index, max_index); return builder->CreateBitCast(result_val, result_ty); } // We need to break the index up into ranges of up to 256, and select // the ranges together after using vlut on each range. 
This vector // contains the result of each range, and a condition vector // indicating whether the result should be used. vector> ranges; for (int min_index_i = 0; min_index_i < max_index; min_index_i += 256) { // Make a vector of the indices shifted such that the min of // this range is at 0. Use 16-bit indices for this. Value *min_index_i_val = create_vector(i16x_t, min_index_i); Value *indices = builder->CreateSub(idx16, min_index_i_val); // Create a condition value for which elements of the range are valid // for this index. Value *use_index = builder->CreateICmpSGT(indices, minus_one); // After we've eliminated the invalid elements, we can // truncate to 8 bits, as vlut requires. indices = call_intrin(i8x_t, "halide.hexagon.pack.vh", {indices}); int range_extent_i = std::min(max_index - min_index_i, 255); Value *range_i = vlut256(slice_vector(lut, min_index_i, range_extent_i), indices, 0, range_extent_i); ranges.emplace_back(range_i, use_index); } // TODO: This could be reduced hierarchically instead of in // order. However, this requires the condition for the select to be // quite tricky. Value *result = ranges[0].first; for (size_t i = 1; i < ranges.size(); i++) { result = builder->CreateSelect(ranges[i].second, ranges[i].first, result); } return builder->CreateBitCast(result, result_ty); } Value *CodeGen_Hexagon::vlut(Value *lut, const vector &indices) { // TODO: We can take advantage of the fact that we know the // indices at compile time to implement a few // optimizations. First, we can avoid running the vlut // instructions for ranges of the LUT for which we know we don't // have any indices. This wil happen often for strided // ramps. Second, we can do the shuffling of the indices necessary // at compile time. vector llvm_indices; llvm_indices.reserve(indices.size()); int min_index = get_vector_num_elements(lut->getType()); int max_index = 0; for (int i : indices) { if (i != -1) { min_index = std::min(min_index, i); max_index = std::max(max_index, i); } llvm_indices.push_back(ConstantInt::get(i16_t, i)); } // We use i16 indices because we can't support LUTs with more than // 32k elements anyways without massive stack spilling (the LUT // must fit in registers), and it costs some runtime performance // due to the conversion to 8 bit. This is also crazy and should // never happen. internal_assert(max_index < std::numeric_limits::max()) << "vlut of more than 32k elements not supported \n"; return vlut(lut, ConstantVector::get(llvm_indices), min_index, max_index); } Value *CodeGen_Hexagon::call_intrin(Type result_type, const string &name, vector args, bool maybe) { llvm::Function *fn = module->getFunction(name); if (maybe && !fn) { return nullptr; } internal_assert(fn) << "Function '" << name << "' not found\n"; if (get_vector_num_elements(fn->getReturnType()) * 2 <= result_type.lanes()) { // We have fewer than half as many lanes in our intrinsic as // we have in the call. Check to see if a double vector // version of this intrinsic exists. 
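    // Sketch of the naming convention assumed here: a runtime wrapper such as
    // "halide.hexagon.avg.vub.vub" (name shown only for illustration) returns
    // a single native vector; if the runtime module also declares
    // "halide.hexagon.avg.vub.vub.dv", that double-vector variant is preferred
    // when the call wants at least twice as many lanes. Either way, the
    // generic helper below is passed the chosen function's lane count so it
    // can split the call into as many pieces as needed.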
llvm::Function *fn2 = module->getFunction(name + ".dv"); if (fn2) { fn = fn2; } } fn->addFnAttr(llvm::Attribute::ReadNone); fn->addFnAttr(llvm::Attribute::NoUnwind); return CodeGen_Posix::call_intrin(result_type, get_vector_num_elements(fn->getReturnType()), fn, std::move(args)); } Value *CodeGen_Hexagon::call_intrin(llvm::Type *result_type, const string &name, vector args, bool maybe) { llvm::Function *fn = module->getFunction(name); if (maybe && !fn) { return nullptr; } internal_assert(fn) << "Function '" << name << "' not found\n"; if (get_vector_num_elements(fn->getReturnType()) * 2 <= get_vector_num_elements(result_type)) { // We have fewer than half as many lanes in our intrinsic as // we have in the call. Check to see if a double vector // version of this intrinsic exists. llvm::Function *fn2 = module->getFunction(name + ".dv"); if (fn2) { fn = fn2; } } fn->addFnAttr(llvm::Attribute::ReadNone); fn->addFnAttr(llvm::Attribute::NoUnwind); return CodeGen_Posix::call_intrin(result_type, get_vector_num_elements(fn->getReturnType()), fn, std::move(args)); } string CodeGen_Hexagon::mcpu() const { if (target.has_feature(Halide::Target::HVX_v66)) { return "hexagonv66"; } else if (target.has_feature(Halide::Target::HVX_v65)) { return "hexagonv65"; } else { return "hexagonv62"; } } string CodeGen_Hexagon::mattrs() const { std::stringstream attrs; attrs << "+hvx-length128b"; attrs << ",+long-calls"; if (target.has_feature(Target::HVX)) { attrs << ",+hvxv" << isa_version; } return attrs.str(); } bool CodeGen_Hexagon::use_soft_float_abi() const { return false; } int CodeGen_Hexagon::native_vector_bits() const { return 128 * 8; } Expr maybe_scalar(Expr x) { const Broadcast *xb = x.as(); if (xb) { return xb->value; } else { return x; } } void CodeGen_Hexagon::visit(const Mul *op) { if (op->type.is_vector()) { value = call_intrin(op->type, "halide.hexagon.mul" + type_suffix(op->a, op->b), {op->a, op->b}, true /*maybe*/); if (value) { return; } // Hexagon has mostly widening multiplies. Try to find a // widening multiply we can use. // TODO: It would probably be better to just define a bunch of // mul.*.* functions in the runtime HVX modules so the above // implementation can be used unconditionally. value = call_intrin(op->type, "halide.hexagon.mpy" + type_suffix(op->a, op->b), {op->a, op->b}, true /*maybe*/); if (value) { // We found a widening op, we need to narrow back // down. The widening multiply deinterleaved the result, // but the trunc operation reinterleaves. Type wide = op->type.widen(); value = call_intrin(llvm_type_of(op->type), "halide.hexagon.trunc" + type_suffix(wide, false), {value}); return; } internal_error << "Unhandled HVX multiply " << op->a.type() << "*" << op->b.type() << "\n" << Expr(op) << "\n"; } else { CodeGen_Posix::visit(op); } } void CodeGen_Hexagon::visit(const Call *op) { internal_assert(op->is_extern() || op->is_intrinsic()) << "Can only codegen extern calls and intrinsics\n"; // Map Halide functions to Hexagon intrinsics, plus a boolean // indicating if the intrinsic has signed variants or not. 
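    // For example (suffix shown for illustration): a saturating_add of two
    // signed 16-bit vectors maps through the "halide.hexagon.sat_add" entry,
    // and type_suffix appends an operand-type suffix (something like
    // ".vh.vh"), giving the runtime-module function that call_intrin looks
    // up. The bool records whether the intrinsic has distinct signed
    // variants, which type_suffix needs when building that suffix. If the
    // maybe-lookup fails, control falls through to the generic handling at
    // the end of this function.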
static std::map> functions = { {Call::get_intrinsic_name(Call::absd), {"halide.hexagon.absd", true}}, {Call::get_intrinsic_name(Call::halving_add), {"halide.hexagon.avg", true}}, {Call::get_intrinsic_name(Call::rounding_halving_add), {"halide.hexagon.avg_rnd", true}}, {Call::get_intrinsic_name(Call::halving_sub), {"halide.hexagon.navg", true}}, {Call::get_intrinsic_name(Call::saturating_add), {"halide.hexagon.sat_add", true}}, {Call::get_intrinsic_name(Call::saturating_sub), {"halide.hexagon.sat_sub", true}}, }; if (is_native_interleave(op)) { internal_assert( op->type.lanes() % (native_vector_bits() * 2 / op->type.bits()) == 0); } if (starts_with(op->name, "halide.hexagon.")) { // Handle all of the intrinsics we generated in // hexagon_optimize. I'm not sure why this is different than // letting it fall through to CodeGen_LLVM. value = call_intrin(op->type, op->name, op->args); return; } if (op->type.is_vector()) { auto i = functions.find(op->name); if (i != functions.end()) { string intrin = i->second.first + type_suffix(op->args, i->second.second); value = call_intrin(op->type, intrin, op->args, true /*maybe*/); if (value) { return; } } else if (op->is_intrinsic(Call::shift_left) || op->is_intrinsic(Call::shift_right)) { internal_assert(op->args.size() == 2); string instr = op->is_intrinsic(Call::shift_left) ? "halide.hexagon.shl" : "halide.hexagon.shr"; Expr b = maybe_scalar(op->args[1]); // Make b signed. Shifts are only well defined if this wouldn't overflow. b = cast(b.type().with_code(Type::Int), b); value = call_intrin(op->type, instr + type_suffix(op->args[0], b), {op->args[0], b}); return; } else if (op->is_intrinsic(Call::dynamic_shuffle)) { internal_assert(op->args.size() == 4); const int64_t *min_index = as_const_int(op->args[2]); const int64_t *max_index = as_const_int(op->args[3]); internal_assert(min_index && max_index); Value *lut = codegen(op->args[0]); Value *idx = codegen(op->args[1]); value = vlut(lut, idx, *min_index, *max_index); return; } else if (op->is_intrinsic(Call::abs)) { internal_assert(op->args.size() == 1); Type ty = op->args[0].type(); if ((ty.is_vector() && ty.is_int())) { if (ty.bits() != 8 || is_hvx_v65_or_later()) { value = call_intrin(op->type, "halide.hexagon.abs" + type_suffix(op->args[0]), op->args); return; } } } else if (op->is_intrinsic(Call::cast_mask)) { internal_error << "cast_mask should already have been handled in HexagonOptimize\n"; } } if (op->is_intrinsic(Call::prefetch)) { internal_assert((op->args.size() == 4) || (op->args.size() == 6)) << "Hexagon only supports 1D or 2D prefetch\n"; const int elem_size = op->type.bytes(); const Expr &base_address = op->args[0]; const Expr &base_offset = op->args[1]; const Expr &extent0 = op->args[2]; const Expr &stride0 = op->args[3]; Expr width_bytes = extent0 * stride0 * elem_size; Expr height, stride_bytes; if (op->args.size() == 6) { const Expr &extent1 = op->args[4]; const Expr &stride1 = op->args[5]; height = extent1; stride_bytes = stride1 * elem_size; } else { height = 1; stride_bytes = 1; } vector args; args.push_back(codegen_buffer_pointer(codegen(base_address), op->type, base_offset)); args.push_back(codegen(width_bytes)); args.push_back(codegen(height)); args.push_back(codegen(stride_bytes)); llvm::Function *prefetch_fn = module->getFunction("_halide_prefetch_2d"); internal_assert(prefetch_fn); // The first argument is a pointer, which has type i8*. We // need to cast the argument, which might be a pointer to a // different type. 
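        // Illustrative values: prefetching a 64 x 16 tile of 32-bit data with
        // stride0 == 1 and stride1 == 256 yields width_bytes == 256,
        // height == 16 and stride_bytes == 1024, so _halide_prefetch_2d is
        // called as (ptr, 256, 16, 1024). The pointer returned by
        // codegen_buffer_pointer is typed for the element type (i32* in this
        // example), hence the bitcast below to the i8* parameter type taken
        // from the prototype.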
llvm::Type *ptr_type = prefetch_fn->getFunctionType()->params()[0]; args[0] = builder->CreateBitCast(args[0], ptr_type); value = builder->CreateCall(prefetch_fn, args); return; } if (op->is_intrinsic(Call::hvx_gather)) { internal_assert(op->args.size() == 5); internal_assert(op->type.bits() == 16 || op->type.bits() == 32); int index_lanes = op->type.lanes(); int intrin_lanes = native_vector_bits() / op->type.bits(); string name = "halide.hexagon.vgather"; name += (op->type.bits() == 16) ? ".h.h" : ".w.w"; llvm::Function *fn = module->getFunction(name); Value *dst_buffer = codegen(op->args[0]); Value *src_ptr = codegen(op->args[2]); Value *size = codegen(op->args[3]); Value *index = codegen(op->args[4]); // Cut up the indices into appropriately-sized pieces. for (int start = 0; start < index_lanes; start += intrin_lanes) { vector args; Value *new_index = slice_vector(index, start, intrin_lanes); args.push_back(dst_buffer); args.push_back(codegen(op->args[1] + start)); args.push_back(src_ptr); args.push_back(size); args.push_back(new_index); value = builder->CreateCall(fn, args); } return; } else if (op->is_intrinsic(Call::hvx_scatter) || op->is_intrinsic(Call::hvx_scatter_acc)) { internal_assert(op->args.size() == 4); internal_assert(op->type.bits() == 16 || op->type.bits() == 32); int index_lanes = op->type.lanes(); int intrin_lanes = native_vector_bits() / op->type.bits(); string name = "halide.hexagon.vscatter"; name += op->is_intrinsic(Call::hvx_scatter_acc) ? "_acc" : ""; name += (op->type.bits() == 16) ? ".h.h" : ".w.w"; llvm::Function *fn = module->getFunction(name); Value *src_ptr = codegen(op->args[0]); Value *size = codegen(op->args[1]); Value *index = codegen(op->args[2]); Value *val = codegen(op->args[3]); Value *args[4]; args[0] = src_ptr; args[1] = size; // Cut up the indices into appropriately-sized pieces. 
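        // For example: with 32-bit elements intrin_lanes is
        // native_vector_bits() / 32 == 32, so a 128-lane scatter issues four
        // vscatter calls, each taking a 32-lane slice of the index and value
        // vectors while reusing the same buffer pointer and size arguments.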
        for (int start = 0; start < index_lanes; start += intrin_lanes) {
            args[2] = slice_vector(index, start, intrin_lanes);
            args[3] = slice_vector(val, start, intrin_lanes);
            value = builder->CreateCall(fn, args);
        }
        return;
    } else if (op->is_intrinsic(Call::hvx_scatter_release)) {
        internal_assert(op->args.size() == 1);
        Value *ptr = codegen(op->args[0]);
        llvm::Function *fn = module->getFunction("halide.hexagon.scatter.release");
        value = builder->CreateCall(fn, {ptr});
        return;
    } else if (op->is_intrinsic(Call::sorted_avg) && op->type.is_vector() &&
               ((op->type.is_uint() &&
                 (op->type.bits() == 8 || op->type.bits() == 16)) ||
                (op->type.is_int() &&
                 (op->type.bits() == 16 || op->type.bits() == 32)))) {
        value = codegen(Call::make(
            op->type, "halide.hexagon.avg" + type_suffix(op->args[0], op->args[1]),
            {op->args[0], op->args[1]}, Call::PureExtern));
        return;
    }

    CodeGen_Posix::visit(op);
}

void CodeGen_Hexagon::visit(const Max *op) {
    if (op->type.is_vector()) {
        value =
            call_intrin(op->type, "halide.hexagon.max" + type_suffix(op->a, op->b),
                        {op->a, op->b}, true /*maybe*/);
        if (!value) {
            Expr equiv = Select::make(op->a > op->b, op->a, op->b);
            equiv = common_subexpression_elimination(equiv);
            value = codegen(equiv);
        }
    } else {
        CodeGen_Posix::visit(op);
    }
}

void CodeGen_Hexagon::visit(const Min *op) {
    if (op->type.is_vector()) {
        value =
            call_intrin(op->type, "halide.hexagon.min" + type_suffix(op->a, op->b),
                        {op->a, op->b}, true /*maybe*/);
        if (!value) {
            Expr equiv = Select::make(op->a > op->b, op->b, op->a);
            equiv = common_subexpression_elimination(equiv);
            value = codegen(equiv);
        }
    } else {
        CodeGen_Posix::visit(op);
    }
}

void CodeGen_Hexagon::visit(const Select *op) {
    if (op->condition.type().is_scalar() && op->type.is_vector()) {
        // Implement scalar conditions on vector values with if-then-else.
        value = codegen(Call::make(op->type, Call::if_then_else,
                                   {op->condition, op->true_value, op->false_value},
                                   Call::PureIntrinsic));
    } else {
        CodeGen_Posix::visit(op);
    }
}

Value *CodeGen_Hexagon::codegen_cache_allocation_size(
    const std::string &name, Type type, const std::vector<Expr> &extents) {
    // Compute size from list of extents checking for overflow.

    Expr overflow = make_zero(UInt(32));
    Expr total_size = make_const(UInt(32), type.lanes() * type.bytes());

    // We'll multiply all the extents into the 32-bit value
    // total_size. We'll also track (total_size >> 24) as a 32-bit
    // value to check for overflow as we go. The loop invariant will
    // be that either the overflow Expr is non-zero, or total_size_hi
    // only occupies the bottom 8-bits. Overflow could be more simply
    // checked for using division, but that's slower at runtime. This
    // method generates much better assembly.
    Expr total_size_hi = make_zero(UInt(32));

    Expr low_mask = make_const(UInt(32), (uint32_t)(0xffffff));

    for (const auto &extent : extents) {
        Expr next_extent = cast(UInt(32), extent);

        // Update total_size >> 24. This math can't overflow due to
        // the loop invariant:
        total_size_hi *= next_extent;
        // Deal with carry from the low bits. Still can't overflow.
        total_size_hi += ((total_size & low_mask) * next_extent) >> 24;

        // Update total_size. This may overflow.
        total_size *= next_extent;

        // We can check for overflow by asserting that total_size_hi
        // is still an 8-bit number.
        overflow = overflow | (total_size_hi >> 8);
    }

    Expr max_size = make_const(UInt(32), target.maximum_buffer_size());
    Expr size_check = (overflow == 0) && (total_size <= max_size);

    // For constant-sized allocations this check should simplify away.
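    // For example, a locked-cache allocation of 16-bit elements with constant
    // extents {64, 32} gives total_size == 4096 and overflow == 0 at compile
    // time, so (assuming 4096 <= target.maximum_buffer_size()) size_check
    // folds to true, no runtime assertion is emitted, and codegen(total_size)
    // just produces the constant.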
size_check = common_subexpression_elimination(simplify(size_check)); if (!is_const_one(size_check)) { create_assertion( codegen(size_check), Call::make(Int(32), "halide_error_buffer_allocation_too_large", {name, Cast::make(UInt(64), total_size), Cast::make(UInt(64), max_size)}, Call::Extern)); } total_size = simplify(total_size); return codegen(total_size); } void CodeGen_Hexagon::visit(const Allocate *alloc) { if (sym_exists(alloc->name)) { user_error << "Can't have two different buffers with the same name: " << alloc->name << "\n"; } if (alloc->memory_type == MemoryType::LockedCache) { // We are not allowing Customized memory allocation for Locked Cache user_assert(!alloc->new_expr.defined()) << "Custom Expression not allowed for Memory Type Locked Cache\n"; Value *llvm_size = nullptr; int32_t constant_bytes = Allocate::constant_allocation_size(alloc->extents, alloc->name); if (constant_bytes > 0) { constant_bytes *= alloc->type.bytes(); llvm_size = codegen(Expr(constant_bytes)); } else { llvm_size = codegen_cache_allocation_size(alloc->name, alloc->type, alloc->extents); } // Only allocate memory if the condition is true, otherwise 0. Value *llvm_condition = codegen(alloc->condition); if (llvm_size != nullptr) { llvm_size = builder->CreateSelect( llvm_condition, llvm_size, ConstantInt::get(llvm_size->getType(), 0)); } Allocation allocation; allocation.constant_bytes = constant_bytes; allocation.stack_bytes = 0; allocation.type = alloc->type; allocation.ptr = nullptr; allocation.destructor = nullptr; allocation.destructor_function = nullptr; allocation.name = alloc->name; // Call Halide_Locked_Cache_Alloc llvm::Function *alloc_fn = module->getFunction("halide_locked_cache_malloc"); internal_assert(alloc_fn) << "Could not find halide_locked_cache_malloc in module\n"; llvm::Function::arg_iterator arg_iter = alloc_fn->arg_begin(); ++arg_iter; // skip the user context * llvm_size = builder->CreateIntCast(llvm_size, arg_iter->getType(), false); debug(4) << "Creating call to halide_locked_cache_malloc for allocation " << alloc->name << " of size " << alloc->type.bytes(); for (const Expr &e : alloc->extents) { debug(4) << " x " << e; } debug(4) << "\n"; Value *args[2] = {get_user_context(), llvm_size}; Value *call = builder->CreateCall(alloc_fn, args); // Fix the type to avoid pointless bitcasts later call = builder->CreatePointerCast( call, llvm_type_of(alloc->type)->getPointerTo()); allocation.ptr = call; // Assert that the allocation worked. Value *check = builder->CreateIsNotNull(allocation.ptr); if (llvm_size) { Value *zero_size = builder->CreateIsNull(llvm_size); check = builder->CreateOr(check, zero_size); } create_assertion(check, Call::make(Int(32), "halide_error_out_of_memory", std::vector(), Call::Extern)); std::string free_function_string; // Register a destructor for this allocation. if (alloc->free_function.empty()) { free_function_string = "halide_locked_cache_free"; } llvm::Function *free_fn = module->getFunction(free_function_string); internal_assert(free_fn) << "Could not find " << alloc->free_function << " in module.\n"; allocation.destructor = register_destructor(free_fn, allocation.ptr, OnError); allocation.destructor_function = free_fn; // Push the allocation base pointer onto the symbol table debug(3) << "Pushing allocation called " << alloc->name << " onto the symbol table\n"; allocations.push(alloc->name, allocation); sym_push(alloc->name, allocation.ptr); codegen(alloc->body); // If there was no early free, free it now. 
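        // Roughly: register_destructor(..., OnError) above arms
        // halide_locked_cache_free to run only if a later assertion fails; on
        // the normal path the trigger_destructor call below frees the block
        // immediately and disables that pending destructor, so the intent is
        // that the allocation is released exactly once either way.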
        if (allocations.contains(alloc->name)) {
            Allocation alloc_obj = allocations.get(alloc->name);
            internal_assert(alloc_obj.destructor);
            trigger_destructor(alloc_obj.destructor_function, alloc_obj.destructor);

            allocations.pop(alloc->name);
            sym_pop(alloc->name);
        }
    } else if (alloc->memory_type == MemoryType::VTCM &&
               !alloc->new_expr.defined()) {
        if (!is_hvx_v65_or_later()) {
            user_error << "VTCM store_in requires HVX_v65 or later.\n";
        }
        // Calculate size of allocation.
        Expr size = alloc->type.bytes();
        for (const auto &extent : alloc->extents) {
            size *= extent;
        }
        size += allocation_padding(alloc->type);
        Expr new_expr =
            Call::make(Handle(), "halide_vtcm_malloc", {size}, Call::Extern);
        string free_function = "halide_vtcm_free";
        Stmt new_alloc = Allocate::make(
            alloc->name, alloc->type, alloc->memory_type, alloc->extents,
            alloc->condition, alloc->body, new_expr, free_function);
        new_alloc.accept(this);
    } else {
        // For all other memory types
        CodeGen_Posix::visit(alloc);
    }
}

}  // namespace

std::unique_ptr<CodeGen_Posix> new_CodeGen_Hexagon(const Target &target) {
    return std::make_unique<CodeGen_Hexagon>(target);
}

#else  // WITH_HEXAGON

std::unique_ptr<CodeGen_Posix> new_CodeGen_Hexagon(const Target &target) {
    user_error << "hexagon not enabled for this build of Halide.\n";
    return nullptr;
}

#endif  // WITH_HEXAGON

}  // namespace Internal
}  // namespace Halide