#include <chrono>
#include <mutex>
#include <sstream>

#include "CPlusPlusMangle.h"
#include "CSE.h"
#include "CodeGen_Internal.h"
#include "CodeGen_LLVM.h"
#include "CodeGen_Posix.h"
#include "CodeGen_Targets.h"
#include "CompilerLogger.h"
#include "Debug.h"
#include "Deinterleave.h"
#include "EmulateFloat16Math.h"
#include "ExprUsesVar.h"
#include "ExternFuncArgument.h"
#include "FindIntrinsics.h"
#include "IREquality.h"
#include "IROperator.h"
#include "IRPrinter.h"
#include "IntegerDivisionTable.h"
#include "JITModule.h"
#include "LLVM_Headers.h"
#include "LLVM_Runtime_Linker.h"
#include "Lerp.h"
#include "LowerParallelTasks.h"
#include "Pipeline.h"
#include "Simplify.h"
#include "Util.h"

// MSVC won't set __cplusplus correctly unless certain compiler flags are set
// (and CMake doesn't set those flags for you even if you specify C++17),
// so we need to check against _MSVC_LANG as well, for completeness.
#if !(__cplusplus >= 201703L || _MSVC_LANG >= 201703L)
#error "Halide requires C++17 or later; please upgrade your compiler."
#endif

namespace Halide {

std::unique_ptr<llvm::Module> codegen_llvm(const Module &module, llvm::LLVMContext &context) {
    std::unique_ptr<Internal::CodeGen_LLVM> cg(Internal::CodeGen_LLVM::new_for_target(module.target(), context));
    return cg->compile(module);
}

namespace Internal {

using namespace llvm;
using std::map;
using std::ostringstream;
using std::pair;
using std::string;
using std::vector;

// Define a local empty inline function for each target
// to disable initialization.
#define LLVM_TARGET(target)                    \
    inline void Initialize##target##Target() { \
    }
#include <llvm/Config/Targets.def>
#undef LLVM_TARGET

#define LLVM_ASM_PARSER(target)                   \
    inline void Initialize##target##AsmParser() { \
    }
#include <llvm/Config/AsmParsers.def>
#undef LLVM_ASM_PARSER

#define LLVM_ASM_PRINTER(target)                   \
    inline void Initialize##target##AsmPrinter() { \
    }
#include <llvm/Config/AsmPrinters.def>
#undef LLVM_ASM_PRINTER

#define InitializeTarget(target)          \
    LLVMInitialize##target##Target();     \
    LLVMInitialize##target##TargetInfo(); \
    LLVMInitialize##target##TargetMC();

#define InitializeAsmParser(target) \
    LLVMInitialize##target##AsmParser();

#define InitializeAsmPrinter(target) \
    LLVMInitialize##target##AsmPrinter();

// Override above empty init function with macro for supported targets.
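// For illustration: when WITH_X86 is defined, the overrides below make
// InitializeX86Target() expand to
//     LLVMInitializeX86Target();
//     LLVMInitializeX86TargetInfo();
//     LLVMInitializeX86TargetMC();
// instead of resolving to the empty inline stub generated above, so only the
// backends compiled into this build of Halide are actually initialized.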
#ifdef WITH_ARM
#define InitializeARMTarget() InitializeTarget(ARM)
#define InitializeARMAsmParser() InitializeAsmParser(ARM)
#define InitializeARMAsmPrinter() InitializeAsmPrinter(ARM)
#endif

#ifdef WITH_NVPTX
#define InitializeNVPTXTarget() InitializeTarget(NVPTX)
// #define InitializeNVPTXAsmParser() InitializeAsmParser(NVPTX)  // there is no ASM parser for NVPTX
#define InitializeNVPTXAsmPrinter() InitializeAsmPrinter(NVPTX)
#endif

#ifdef WITH_AMDGPU
#define InitializeAMDGPUTarget() InitializeTarget(AMDGPU)
#define InitializeAMDGPUAsmParser() InitializeAsmParser(AMDGPU)
#define InitializeAMDGPUAsmPrinter() InitializeAsmPrinter(AMDGPU)
#endif

#ifdef WITH_AARCH64
#define InitializeAArch64Target() InitializeTarget(AArch64)
#define InitializeAArch64AsmParser() InitializeAsmParser(AArch64)
#define InitializeAArch64AsmPrinter() InitializeAsmPrinter(AArch64)
#endif

#ifdef WITH_HEXAGON
#define InitializeHexagonTarget() InitializeTarget(Hexagon)
#define InitializeHexagonAsmParser() InitializeAsmParser(Hexagon)
#define InitializeHexagonAsmPrinter() InitializeAsmPrinter(Hexagon)
#endif

#ifdef WITH_POWERPC
#define InitializePowerPCTarget() InitializeTarget(PowerPC)
#define InitializePowerPCAsmParser() InitializeAsmParser(PowerPC)
#define InitializePowerPCAsmPrinter() InitializeAsmPrinter(PowerPC)
#endif

#ifdef WITH_RISCV
#define InitializeRISCVTarget() InitializeTarget(RISCV)
#define InitializeRISCVAsmParser() InitializeAsmParser(RISCV)
#define InitializeRISCVAsmPrinter() InitializeAsmPrinter(RISCV)
#endif

#ifdef WITH_X86
#define InitializeX86Target() InitializeTarget(X86)
#define InitializeX86AsmParser() InitializeAsmParser(X86)
#define InitializeX86AsmPrinter() InitializeAsmPrinter(X86)
#endif

#ifdef WITH_WEBASSEMBLY
#define InitializeWebAssemblyTarget() InitializeTarget(WebAssembly)
#define InitializeWebAssemblyAsmParser() InitializeAsmParser(WebAssembly)
#define InitializeWebAssemblyAsmPrinter() InitializeAsmPrinter(WebAssembly)
#endif

namespace {

llvm::Value *CreateConstGEP1_32(IRBuilderBase *builder, llvm::Type *gep_type,
                                Value *ptr, unsigned index) {
    return builder->CreateConstGEP1_32(gep_type, ptr, index);
}

llvm::Value *CreateInBoundsGEP(IRBuilderBase *builder, llvm::Type *gep_type,
                               Value *ptr, ArrayRef<Value *> index_list) {
    return builder->CreateInBoundsGEP(gep_type, ptr, index_list);
}

// Get the LLVM linkage corresponding to a Halide linkage type.
llvm::GlobalValue::LinkageTypes llvm_linkage(LinkageType t) {
    // TODO(dsharlet): For some reason, marking internal functions as
    // private linkage on OSX is causing some of the static tests to
    // fail. Figure out why so we can remove this.
    return llvm::GlobalValue::ExternalLinkage;

    // switch (t) {
    // case LinkageType::ExternalPlusArgv:
    // case LinkageType::ExternalPlusMetadata:
    // case LinkageType::External:
    //     return llvm::GlobalValue::ExternalLinkage;
    // default:
    //     return llvm::GlobalValue::PrivateLinkage;
    // }
}

}  // namespace
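// Note on the wild_* members initialized below: they are wildcard Exprs
// (Variables named "*") that backend peephole optimizers feed to expr_match()
// to pattern-match IR. For example (hypothetical pattern), a backend might
// look for widening multiplies with something like
//     expr_match(cast(Int(32, 0), wild_i16x_) * cast(Int(32, 0), wild_i16x_), e, matches)
// The vector wildcards are built with 0 lanes, which the matcher treats as
// "any number of lanes".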
CodeGen_LLVM::CodeGen_LLVM(const Target &t)
    : builder(nullptr),
      target(t),
      wild_u1x_(Variable::make(UInt(1, 0), "*")),
      wild_i8x_(Variable::make(Int(8, 0), "*")),
      wild_u8x_(Variable::make(UInt(8, 0), "*")),
      wild_i16x_(Variable::make(Int(16, 0), "*")),
      wild_u16x_(Variable::make(UInt(16, 0), "*")),
      wild_i32x_(Variable::make(Int(32, 0), "*")),
      wild_u32x_(Variable::make(UInt(32, 0), "*")),
      wild_i64x_(Variable::make(Int(64, 0), "*")),
      wild_u64x_(Variable::make(UInt(64, 0), "*")),
      wild_f32x_(Variable::make(Float(32, 0), "*")),
      wild_f64x_(Variable::make(Float(64, 0), "*")),
      wild_u1_(Variable::make(UInt(1), "*")),
      wild_i8_(Variable::make(Int(8), "*")),
      wild_u8_(Variable::make(UInt(8), "*")),
      wild_i16_(Variable::make(Int(16), "*")),
      wild_u16_(Variable::make(UInt(16), "*")),
      wild_i32_(Variable::make(Int(32), "*")),
      wild_u32_(Variable::make(UInt(32), "*")),
      wild_i64_(Variable::make(Int(64), "*")),
      wild_u64_(Variable::make(UInt(64), "*")),
      wild_f32_(Variable::make(Float(32), "*")),
      wild_f64_(Variable::make(Float(64), "*")),
      strict_float(t.has_feature(Target::StrictFloat)),
      llvm_large_code_model(t.has_feature(Target::LLVMLargeCodeModel)) {
    initialize_llvm();
}

void CodeGen_LLVM::set_context(llvm::LLVMContext &context) {
    this->context = &context;
    effective_vscale = target_vscale();
}

std::unique_ptr<CodeGen_LLVM> CodeGen_LLVM::new_for_target(const Target &target,
                                                           llvm::LLVMContext &context) {
    std::unique_ptr<CodeGen_LLVM> result;
    if (target.arch == Target::X86) {
        result = new_CodeGen_X86(target);
    } else if (target.arch == Target::ARM) {
        result = new_CodeGen_ARM(target);
    } else if (target.arch == Target::POWERPC) {
        result = new_CodeGen_PowerPC(target);
    } else if (target.arch == Target::Hexagon) {
        result = new_CodeGen_Hexagon(target);
    } else if (target.arch == Target::WebAssembly) {
        result = new_CodeGen_WebAssembly(target);
    } else if (target.arch == Target::RISCV) {
        result = new_CodeGen_RISCV(target);
    }
    user_assert(result) << "Unknown target architecture: " << target.to_string() << "\n";
    result->set_context(context);
    return result;
}

void CodeGen_LLVM::initialize_llvm() {
    static std::once_flag init_llvm_once;
    std::call_once(init_llvm_once, []() {
        // You can hack in command-line args to llvm with the
        // environment variable HL_LLVM_ARGS, e.g. HL_LLVM_ARGS="-print-after-all"
        std::string args = get_env_variable("HL_LLVM_ARGS");
        if (!args.empty()) {
            vector<string> arg_vec = split_string(args, " ");
            vector<const char *> c_arg_vec;
            c_arg_vec.push_back("llc");
            for (const std::string &s : arg_vec) {
                c_arg_vec.push_back(s.c_str());
            }
            // TODO: Remove after opaque pointers become the default in LLVM.
            // This is here to document how to turn on opaque pointers, for testing, in LLVM 15.
            // c_arg_vec.push_back("-opaque-pointers");
            cl::ParseCommandLineOptions((int)(c_arg_vec.size()), &c_arg_vec[0], "Halide compiler\n");
        }
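        // For example, a hypothetical generator invocation:
        //     HL_LLVM_ARGS="-print-after-all -time-passes" ./my_generator -g my_gen -o .
        // Any flag that llc/opt accept can be smuggled in this way, since the
        // whole string is handed to LLVM's global option registry above.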
        InitializeNativeTarget();
        InitializeNativeTargetAsmPrinter();
        InitializeNativeTargetAsmParser();

#define LLVM_TARGET(target) \
    Initialize##target##Target();
#include <llvm/Config/Targets.def>
#undef LLVM_TARGET

#define LLVM_ASM_PARSER(target) \
    Initialize##target##AsmParser();
#include <llvm/Config/AsmParsers.def>
#undef LLVM_ASM_PARSER

#define LLVM_ASM_PRINTER(target) \
    Initialize##target##AsmPrinter();
#include <llvm/Config/AsmPrinters.def>
#undef LLVM_ASM_PRINTER
    });
}

void CodeGen_LLVM::init_context() {
    // Ensure our IRBuilder is using the current context.
    builder = std::make_unique<IRBuilder<>>(*context);

    // Branch weights for very likely branches
    llvm::MDBuilder md_builder(*context);
    very_likely_branch = md_builder.createBranchWeights(1 << 30, 0);
    default_fp_math_md = md_builder.createFPMath(0.0);
    strict_fp_math_md = md_builder.createFPMath(0.0);
    builder->setDefaultFPMathTag(default_fp_math_md);

    llvm::FastMathFlags fast_flags;
    fast_flags.setNoNaNs();
    fast_flags.setNoInfs();
    fast_flags.setNoSignedZeros();
    // Don't use approximate reciprocals for division. It's too inaccurate even for Halide.
    // fast_flags.setAllowReciprocal();
    // Theoretically, setAllowReassoc could be setUnsafeAlgebra for earlier versions, but that
    // turns on all the flags.
    fast_flags.setAllowReassoc();
    fast_flags.setAllowContract(true);
    fast_flags.setApproxFunc();
    builder->setFastMathFlags(fast_flags);

    // Define some types
    void_t = llvm::Type::getVoidTy(*context);
    i1_t = llvm::Type::getInt1Ty(*context);
    i8_t = llvm::Type::getInt8Ty(*context);
    i16_t = llvm::Type::getInt16Ty(*context);
    i32_t = llvm::Type::getInt32Ty(*context);
    i64_t = llvm::Type::getInt64Ty(*context);
    f16_t = llvm::Type::getHalfTy(*context);
    f32_t = llvm::Type::getFloatTy(*context);
    f64_t = llvm::Type::getDoubleTy(*context);

    // Ensure no Value pointers carry over from previous context.
    struct_type_recovery.clear();
}

void CodeGen_LLVM::init_module() {
    init_context();

    // Start with a module containing the initial module for this target.
    module = get_initial_module_for_target(target, context);
}
namespace {

struct MangledNames {
    string simple_name;
    string extern_name;
    string argv_name;
    string metadata_name;
};

MangledNames get_mangled_names(const std::string &name,
                               LinkageType linkage,
                               NameMangling mangling,
                               const std::vector<LoweredArgument> &args,
                               const Target &target) {
    std::vector<std::string> namespaces;
    MangledNames names;
    names.simple_name = extract_namespaces(name, namespaces);
    names.extern_name = names.simple_name;
    names.argv_name = names.simple_name + "_argv";
    names.metadata_name = names.simple_name + "_metadata";

    if (linkage != LinkageType::Internal &&
        ((mangling == NameMangling::Default &&
          target.has_feature(Target::CPlusPlusMangling)) ||
         mangling == NameMangling::CPlusPlus)) {
        std::vector<ExternFuncArgument> mangle_args;
        for (const auto &arg : args) {
            if (arg.kind == Argument::InputScalar) {
                mangle_args.emplace_back(make_zero(arg.type));
            } else if (arg.kind == Argument::InputBuffer ||
                       arg.kind == Argument::OutputBuffer) {
                mangle_args.emplace_back(Buffer<>());
            }
        }
        names.extern_name = cplusplus_function_mangled_name(names.simple_name, namespaces, type_of<int>(), mangle_args, target);
        halide_handle_cplusplus_type inner_type(halide_cplusplus_type_name(halide_cplusplus_type_name::Simple, "void"), {}, {},
                                                {halide_handle_cplusplus_type::Pointer, halide_handle_cplusplus_type::Pointer});
        Type void_star_star(Handle(1, &inner_type));
        names.argv_name = cplusplus_function_mangled_name(names.argv_name, namespaces, type_of<int>(),
                                                          {ExternFuncArgument(make_zero(void_star_star))}, target);
        names.metadata_name = cplusplus_function_mangled_name(names.metadata_name, namespaces,
                                                              type_of<const struct halide_filter_metadata_t *>(), {}, target);
    }
    return names;
}

MangledNames get_mangled_names(const LoweredFunc &f, const Target &target) {
    return get_mangled_names(f.name, f.linkage, f.name_mangling, f.args, target);
}

}  // namespace
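// For illustration (hypothetical filter): a function declared as "ns::blur"
// with Target::CPlusPlusMangling and a single int32 scalar argument yields
//     simple_name   = "blur"
//     extern_name   = mangling of "int ns::blur(int)" (e.g. "_ZN2ns4blurEi" on Itanium-ABI platforms)
//     argv_name     = mangling of "int ns::blur_argv(void **)"
//     metadata_name = mangling of "const halide_filter_metadata_t *ns::blur_metadata()"
// Without C++ mangling, the three names are just "blur", "blur_argv", and
// "blur_metadata".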
llvm::FunctionType *CodeGen_LLVM::signature_to_type(const ExternSignature &signature) {
    internal_assert(void_t != nullptr && halide_buffer_t_type != nullptr);
    llvm::Type *ret_type =
        signature.is_void_return() ? void_t : llvm_type_of(upgrade_type_for_argument_passing(signature.ret_type()));
    std::vector<llvm::Type *> llvm_arg_types;
    for (const Type &t : signature.arg_types()) {
        if (t == type_of<struct halide_buffer_t *>()) {
            llvm_arg_types.push_back(halide_buffer_t_type->getPointerTo());
        } else {
            llvm_arg_types.push_back(llvm_type_of(upgrade_type_for_argument_passing(t)));
        }
    }
    return llvm::FunctionType::get(ret_type, llvm_arg_types, false);
}

/*static*/
std::unique_ptr<llvm::Module> CodeGen_LLVM::compile_trampolines(
    const Target &target,
    llvm::LLVMContext &context,
    const std::string &suffix,
    const std::vector<std::pair<std::string, ExternSignature>> &externs) {
    std::unique_ptr<CodeGen_LLVM> codegen(new_for_target(target, context));
    codegen->init_codegen("trampolines" + suffix);
    for (const std::pair<std::string, ExternSignature> &e : externs) {
        const std::string &callee_name = e.first;
        const std::string wrapper_name = callee_name + suffix;
        llvm::FunctionType *fn_type = codegen->signature_to_type(e.second);
        // callee might already be present for builtins, e.g. halide_print
        llvm::Function *callee = codegen->module->getFunction(callee_name);
        if (!callee) {
            callee = llvm::Function::Create(fn_type, llvm::Function::ExternalLinkage,
                                            callee_name, codegen->module.get());
        }
        std::vector<bool> buffer_args(e.second.arg_types().size());
        size_t index = 0;
        for (const Type &t : e.second.arg_types()) {
            buffer_args[index++] = (t == type_of<struct halide_buffer_t *>());
        }
        codegen->add_argv_wrapper(callee, wrapper_name, /*result_in_argv*/ true, buffer_args);
    }
    return codegen->finish_codegen();
}

void CodeGen_LLVM::init_codegen(const std::string &name, bool any_strict_float) {
    init_module();

    internal_assert(module && context);

    debug(1) << "Target triple of initial module: " << module->getTargetTriple() << "\n";

    module->setModuleIdentifier(name);

    // Add some target specific info to the module as metadata.
    module->addModuleFlag(llvm::Module::Warning, "halide_use_soft_float_abi", use_soft_float_abi() ? 1 : 0);
    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_target", MDString::get(*context, mcpu_target()));
    module->addModuleFlag(llvm::Module::Warning, "halide_mcpu_tune", MDString::get(*context, mcpu_tune()));
    module->addModuleFlag(llvm::Module::Warning, "halide_mattrs", MDString::get(*context, mattrs()));
    module->addModuleFlag(llvm::Module::Warning, "halide_mabi", MDString::get(*context, mabi()));
    module->addModuleFlag(llvm::Module::Warning, "halide_use_pic", use_pic() ? 1 : 0);
    module->addModuleFlag(llvm::Module::Warning, "halide_use_large_code_model", llvm_large_code_model ? 1 : 0);
    module->addModuleFlag(llvm::Module::Warning, "halide_per_instruction_fast_math_flags", any_strict_float);
    if (effective_vscale != 0) {
        module->addModuleFlag(llvm::Module::Warning, "halide_vscale_range",
                              MDString::get(*context, std::to_string(effective_vscale) + ", " +
                                                          std::to_string(effective_vscale)));
    }

    // Ensure some types we need are defined
    halide_buffer_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_buffer_t");
    internal_assert(halide_buffer_t_type) << "Did not find halide_buffer_t in initial module";

    type_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_type_t");
    internal_assert(type_t_type) << "Did not find halide_type_t in initial module";

    dimension_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_dimension_t");
    internal_assert(dimension_t_type) << "Did not find halide_dimension_t in initial module";

    metadata_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_filter_metadata_t");
    internal_assert(metadata_t_type) << "Did not find halide_filter_metadata_t in initial module";

    argument_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_filter_argument_t");
    internal_assert(argument_t_type) << "Did not find halide_filter_argument_t in initial module";

    scalar_value_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_scalar_value_t");
    internal_assert(scalar_value_t_type) << "Did not find halide_scalar_value_t in initial module";

    device_interface_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_device_interface_t");
    internal_assert(device_interface_t_type) << "Did not find halide_device_interface_t in initial module";

    pseudostack_slot_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_pseudostack_slot_t");
    internal_assert(pseudostack_slot_t_type) << "Did not find halide_pseudostack_slot_t in initial module";

    semaphore_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_semaphore_t");
    internal_assert(semaphore_t_type) << "Did not find halide_semaphore_t in initial module";
}
module"; } std::unique_ptr CodeGen_LLVM::compile(const Module &input) { init_codegen(input.name(), input.any_strict_float()); internal_assert(module && context && builder) << "The CodeGen_LLVM subclass should have made an initial module before calling CodeGen_LLVM::compile\n"; // Generate the code for this module. debug(1) << "Generating llvm bitcode...\n"; for (const auto &b : input.buffers()) { compile_buffer(b); } vector function_names; // Declare all functions for (const auto &f : input.functions()) { const auto names = get_mangled_names(f, get_target()); function_names.push_back(names); // Deduce the types of the arguments to our function vector arg_types(f.args.size()); for (size_t i = 0; i < f.args.size(); i++) { if (f.args[i].is_buffer()) { arg_types[i] = halide_buffer_t_type->getPointerTo(); } else { arg_types[i] = llvm_type_of(upgrade_type_for_argument_passing(f.args[i].type)); } } FunctionType *func_t = FunctionType::get(i32_t, arg_types, false); function = llvm::Function::Create(func_t, llvm_linkage(f.linkage), names.extern_name, module.get()); set_function_attributes_from_halide_target_options(*function); // Mark the buffer args as no alias and save indication for add_argv_wrapper if needed std::vector buffer_args(f.args.size()); for (size_t i = 0; i < f.args.size(); i++) { bool is_buffer = f.args[i].is_buffer(); buffer_args[i] = is_buffer; if (is_buffer) { function->addParamAttr(i, Attribute::NoAlias); } } // sym_push helpfully calls setName, which we don't want symbol_table.push("::" + f.name, function); // If the Func is externally visible, also create the argv wrapper and metadata. // (useful for calling from JIT and other machine interfaces). if (f.linkage == LinkageType::ExternalPlusArgv || f.linkage == LinkageType::ExternalPlusMetadata) { add_argv_wrapper(function, names.argv_name, false, buffer_args); if (f.linkage == LinkageType::ExternalPlusMetadata) { embed_metadata_getter(names.metadata_name, names.simple_name, f.args, input.get_metadata_name_map()); } } // Workaround for https://github.com/halide/Halide/issues/635: // For historical reasons, Halide-generated AOT code // defines user_context as `void const*`, but expects all // define_extern code with user_context usage to use `void *`. This // usually isn't an issue, but if both the caller and callee of the // pass a user_context, *and* c_plus_plus_name_mangling is enabled, // we get link errors because of this dichotomy. Fixing this // "correctly" (ie so that everything always uses identical types for // user_context in all cases) will require a *lot* of downstream // churn (see https://github.com/halide/Halide/issues/7298), // so this is a workaround: Add a wrapper with `void*` // ucon -> `void const*` ucon. In most cases this will be ignored // (and probably dead-stripped), but in these cases it's critical. // // (Note that we don't check to see if c_plus_plus_name_mangling is // enabled, since that would have to be done on the caller side, and // this is purely a callee-side fix.) 
if (f.linkage != LinkageType::Internal && target.has_feature(Target::CPlusPlusMangling) && target.has_feature(Target::UserContext)) { int wrapper_ucon_index = -1; auto wrapper_args = f.args; // make a copy auto wrapper_llvm_arg_types = arg_types; // make a copy for (int i = 0; i < (int)wrapper_args.size(); i++) { if (wrapper_args[i].name == "__user_context" && wrapper_args[i].type == type_of()) { // Update the type of the user_context argument to be void* rather than void const* wrapper_args[i].type = type_of(); wrapper_llvm_arg_types[i] = llvm_type_of(upgrade_type_for_argument_passing(wrapper_args[i].type)); wrapper_ucon_index = i; } } if (wrapper_ucon_index >= 0) { const auto wrapper_names = get_mangled_names(f.name, f.linkage, f.name_mangling, wrapper_args, target); FunctionType *wrapper_func_t = FunctionType::get(i32_t, wrapper_llvm_arg_types, false); llvm::Function *wrapper_func = llvm::Function::Create(wrapper_func_t, llvm::GlobalValue::ExternalLinkage, wrapper_names.extern_name, module.get()); set_function_attributes_from_halide_target_options(*wrapper_func); llvm::BasicBlock *wrapper_block = llvm::BasicBlock::Create(module->getContext(), "entry", wrapper_func); builder->SetInsertPoint(wrapper_block); std::vector wrapper_call_args; for (auto &arg : wrapper_func->args()) { wrapper_call_args.push_back(&arg); } wrapper_call_args[wrapper_ucon_index] = builder->CreatePointerCast(wrapper_call_args[wrapper_ucon_index], llvm_type_of(type_of())); llvm::CallInst *wrapper_result = builder->CreateCall(function, wrapper_call_args); // This call should never inline wrapper_result->setIsNoInline(); builder->CreateRet(wrapper_result); internal_assert(!verifyFunction(*wrapper_func, &llvm::errs())); } } } // Define all functions int idx = 0; for (const auto &f : input.functions()) { const auto names = function_names[idx++]; run_with_large_stack([&]() { compile_func(f, names.simple_name, names.extern_name); }); } debug(2) << "llvm::Module pointer: " << module.get() << "\n"; return finish_codegen(); } std::unique_ptr CodeGen_LLVM::finish_codegen() { llvm::for_each(*module, set_function_attributes_from_halide_target_options); // Verify the module is ok internal_assert(!verifyModule(*module, &llvm::errs())); debug(2) << "Done generating llvm bitcode\n"; // Optimize CodeGen_LLVM::optimize_module(); if (target.has_feature(Target::EmbedBitcode)) { std::string halide_command = "halide target=" + target.to_string(); embed_bitcode(module.get(), halide_command); } // Disown the module and return it. return std::move(module); } void CodeGen_LLVM::begin_func(LinkageType linkage, const std::string &name, const std::string &extern_name, const std::vector &args) { current_function_args = args; function = module->getFunction(extern_name); if (!function) { internal_assert(function) << "Could not find a function of name " << extern_name << " in module\n"; } debug(1) << "Generating llvm bitcode prolog for function " << name << "...\n"; // Null out the destructor block. 
destructor_block = nullptr; // Make the initial basic block BasicBlock *block = BasicBlock::Create(*context, "entry", function); builder->SetInsertPoint(block); // Put the arguments in the symbol table { size_t i = 0; for (auto &arg : function->args()) { if (args[i].is_buffer()) { sym_push(args[i].name + ".buffer", &arg); } else { Type passed_type = upgrade_type_for_argument_passing(args[i].type); if (args[i].type != passed_type) { llvm::Value *a = builder->CreateBitCast(&arg, llvm_type_of(args[i].type)); sym_push(args[i].name, a); } else { sym_push(args[i].name, &arg); } } i++; } } } void CodeGen_LLVM::end_func(const std::vector &args) { return_with_error_code(ConstantInt::get(i32_t, 0)); // Remove the arguments from the symbol table for (const auto &arg : args) { if (arg.is_buffer()) { sym_pop(arg.name + ".buffer"); } else { sym_pop(arg.name); } } internal_assert(!verifyFunction(*function, &llvm::errs())); current_function_args.clear(); } void CodeGen_LLVM::compile_func(const LoweredFunc &f, const std::string &simple_name, const std::string &extern_name) { // Generate the function declaration and argument unpacking code. begin_func(f.linkage, simple_name, extern_name, f.args); // If building with MSAN, ensure that calls to halide_msan_annotate_buffer_is_initialized() // happen for every output buffer if the function succeeds. if (f.linkage != LinkageType::Internal && target.has_feature(Target::MSAN)) { llvm::Function *annotate_buffer_fn = module->getFunction("halide_msan_annotate_buffer_is_initialized_as_destructor"); internal_assert(annotate_buffer_fn) << "Could not find halide_msan_annotate_buffer_is_initialized_as_destructor in module\n"; annotate_buffer_fn->addParamAttr(0, Attribute::NoAlias); for (const auto &arg : f.args) { if (arg.kind == Argument::OutputBuffer) { register_destructor(annotate_buffer_fn, sym_get(arg.name + ".buffer"), OnSuccess); } } } // Generate the function body. debug(1) << "Generating llvm bitcode for function " << f.name << "...\n"; f.body.accept(this); // Show one time warning and clear it. for (auto it = onetime_warnings.begin(); it != onetime_warnings.end(); it = onetime_warnings.erase(it)) { user_warning << "In function " << f.name << ", " << it->second; } // Clean up and return. end_func(f.args); } // Given a range of iterators of constant ints, get a corresponding vector of llvm::Constant. template std::vector get_constants(llvm::Type *t, It begin, It end) { std::vector ret; for (It i = begin; i != end; i++) { ret.push_back(ConstantInt::get(t, *i)); } return ret; } BasicBlock *CodeGen_LLVM::get_destructor_block() { if (!destructor_block) { // Create it if it doesn't exist. IRBuilderBase::InsertPoint here = builder->saveIP(); destructor_block = BasicBlock::Create(*context, "destructor_block", function); builder->SetInsertPoint(destructor_block); // The first instruction in the destructor block is a phi node // that collects the error code. PHINode *error_code = builder->CreatePHI(i32_t, 0); // Calls to destructors will get inserted here. // The last instruction is the return op that returns it. builder->CreateRet(error_code); // Jump back to where we were. 
builder->restoreIP(here); } internal_assert(destructor_block->getParent() == function); return destructor_block; } Value *CodeGen_LLVM::register_destructor(llvm::Function *destructor_fn, Value *obj, DestructorType when) { // Create a null-initialized stack slot to track this object llvm::Type *void_ptr = i8_t->getPointerTo(); llvm::Value *stack_slot = create_alloca_at_entry(void_ptr, 1, true); // Cast the object to llvm's representation of void * obj = builder->CreatePointerCast(obj, void_ptr); // Put it in the stack slot builder->CreateStore(obj, stack_slot); // Passing the constant null as the object means the destructor // will never get called. { llvm::Constant *c = dyn_cast(obj); if (c && c->isNullValue()) { internal_error << "Destructors must take a non-null object\n"; } } // Switch to the destructor block, and add code that cleans up // this object if the contents of the stack slot is not nullptr. IRBuilderBase::InsertPoint here = builder->saveIP(); BasicBlock *dtors = get_destructor_block(); builder->SetInsertPoint(dtors->getFirstNonPHI()); PHINode *error_code = dyn_cast(dtors->begin()); internal_assert(error_code) << "The destructor block is supposed to start with a phi node\n"; llvm::Value *should_call = nullptr; switch (when) { case Always: should_call = ConstantInt::get(i1_t, 1); break; case OnError: should_call = builder->CreateIsNotNull(error_code); break; case OnSuccess: should_call = builder->CreateIsNull(error_code); break; } llvm::Function *call_destructor = module->getFunction("call_destructor"); internal_assert(call_destructor); internal_assert(destructor_fn); internal_assert(should_call); Value *args[] = {get_user_context(), destructor_fn, stack_slot, should_call}; builder->CreateCall(call_destructor, args); // Switch back to the original location builder->restoreIP(here); // Return the stack slot so that it's possible to cleanup the object early. return stack_slot; } void CodeGen_LLVM::trigger_destructor(llvm::Function *destructor_fn, Value *stack_slot) { llvm::Function *call_destructor = module->getFunction("call_destructor"); internal_assert(call_destructor); internal_assert(destructor_fn); stack_slot = builder->CreatePointerCast(stack_slot, i8_t->getPointerTo()->getPointerTo()); Value *should_call = ConstantInt::get(i1_t, 1); Value *args[] = {get_user_context(), destructor_fn, stack_slot, should_call}; builder->CreateCall(call_destructor, args); } void CodeGen_LLVM::compile_buffer(const Buffer<> &buf) { // Embed the buffer declaration as a global. internal_assert(buf.defined()); user_assert(buf.data()) << "Can't embed buffer " << buf.name() << " because it has a null host pointer.\n"; user_assert(!buf.device_dirty()) << "Can't embed Image \"" << buf.name() << "\"" << " because it has a dirty device pointer\n"; Constant *type_fields[] = { ConstantInt::get(i8_t, buf.type().code()), ConstantInt::get(i8_t, buf.type().bits()), ConstantInt::get(i16_t, buf.type().lanes())}; Constant *shape = nullptr; if (buf.dimensions()) { size_t shape_size = buf.dimensions() * sizeof(halide_dimension_t); vector shape_blob((char *)buf.raw_buffer()->dim, (char *)buf.raw_buffer()->dim + shape_size); shape = create_binary_blob(shape_blob, buf.name() + ".shape"); shape = ConstantExpr::getPointerCast(shape, dimension_t_type->getPointerTo()); } else { shape = ConstantPointerNull::get(dimension_t_type->getPointerTo()); } // For now, we assume buffers that aren't scalar are constant, // while scalars can be mutated. 
    // This accommodates all our existing use cases, which is that all
    // buffers are constant, except those used to store stateful module
    // information in offloading runtimes.
    bool constant = buf.dimensions() != 0;

    vector<char> data_blob((const char *)buf.data(), (const char *)buf.data() + buf.size_in_bytes());

    Constant *fields[] = {
        ConstantInt::get(i64_t, 0),                                         // device
        ConstantPointerNull::get(device_interface_t_type->getPointerTo()),  // device_interface
        create_binary_blob(data_blob, buf.name() + ".data", constant),      // host
        ConstantInt::get(i64_t, halide_buffer_flag_host_dirty),             // flags
        ConstantStruct::get(type_t_type, type_fields),                      // type
        ConstantInt::get(i32_t, buf.dimensions()),                          // dimensions
        shape,                                                              // dim
        ConstantPointerNull::get(i8_t->getPointerTo()),                     // padding
    };
    Constant *buffer_struct = ConstantStruct::get(halide_buffer_t_type, fields);

    // Embed the halide_buffer_t and make it point to the data array.
    GlobalVariable *global = new GlobalVariable(*module, halide_buffer_t_type,
                                                false, GlobalValue::PrivateLinkage,
                                                nullptr, buf.name() + ".buffer");
    global->setInitializer(buffer_struct);

    // Finally, dump it in the symbol table
    Constant *zero[] = {ConstantInt::get(i32_t, 0)};
    Constant *global_ptr = ConstantExpr::getInBoundsGetElementPtr(halide_buffer_t_type, global, zero);
    sym_push(buf.name() + ".buffer", global_ptr);
}

Constant *CodeGen_LLVM::embed_constant_scalar_value_t(const Expr &e) {
    if (!e.defined()) {
        return Constant::getNullValue(scalar_value_t_type->getPointerTo());
    }

    internal_assert(!e.type().is_handle()) << "Should never see Handle types here.";

    llvm::Value *val = codegen(e);
    llvm::Constant *constant = dyn_cast<llvm::Constant>(val);
    internal_assert(constant);

    // Verify that the size of the LLVM value is the size we expected.
    internal_assert((uint64_t)constant->getType()->getPrimitiveSizeInBits() == (uint64_t)e.type().bits());

    // It's important that we allocate a full scalar_value_t_type here,
    // even if the type of the value is smaller; downstream consumers should
    // be able to correctly load an entire scalar_value_t_type regardless of its
    // type, and if we emit just (say) a uint8 value here, the pointer may be
    // misaligned and/or the storage after may be unmapped. LLVM doesn't support
    // unions directly, so we'll fake it by making a constant array of the elements
    // we need, setting the first to the constant we want, and setting the rest
    // to all-zeros. (This happens to work because sizeof(halide_scalar_value_t)
    // is evenly divisible by sizeof(any union field).)
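    // For example (illustrative): embedding the uint8 value 3 produces
    //     @storage = private constant [8 x i8] [i8 3, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0], align 8
    // since sizeof(halide_scalar_value_t) == 8, and the returned pointer is
    // @storage reinterpreted as a halide_scalar_value_t *, so consumers can
    // load the full 8 bytes no matter which union field they read.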
const size_t value_size = e.type().bytes(); internal_assert(value_size > 0 && value_size <= sizeof(halide_scalar_value_t)); const size_t array_size = sizeof(halide_scalar_value_t) / value_size; internal_assert(array_size * value_size == sizeof(halide_scalar_value_t)); vector array_entries(array_size, Constant::getNullValue(constant->getType())); array_entries[0] = constant; llvm::ArrayType *array_type = ArrayType::get(constant->getType(), array_size); GlobalVariable *storage = new GlobalVariable( *module, array_type, /*isConstant*/ true, GlobalValue::PrivateLinkage, ConstantArray::get(array_type, array_entries)); // Ensure that the storage is aligned for halide_scalar_value_t storage->setAlignment(llvm::Align((int)sizeof(halide_scalar_value_t))); Constant *zero[] = {ConstantInt::get(i32_t, 0)}; return ConstantExpr::getBitCast( ConstantExpr::getInBoundsGetElementPtr(array_type, storage, zero), scalar_value_t_type->getPointerTo()); } Constant *CodeGen_LLVM::embed_constant_expr(Expr e, llvm::Type *t) { internal_assert(t != scalar_value_t_type); if (!e.defined()) { return Constant::getNullValue(t->getPointerTo()); } internal_assert(!e.type().is_handle()) << "Should never see Handle types here."; if (!is_const(e)) { e = simplify(e); internal_assert(is_const(e)) << "Should only see constant values for estimates."; } llvm::Value *val = codegen(e); llvm::Constant *constant = dyn_cast(val); internal_assert(constant); GlobalVariable *storage = new GlobalVariable( *module, constant->getType(), /*isConstant*/ true, GlobalValue::PrivateLinkage, constant); Constant *zero[] = {ConstantInt::get(i32_t, 0)}; return ConstantExpr::getBitCast( ConstantExpr::getInBoundsGetElementPtr(constant->getType(), storage, zero), t->getPointerTo()); } // Make a wrapper to call the function with an array of pointer // args. This is easier for the JIT to call than a function with an // unknown (at compile time) argument list. If result_in_argv is false, // the internal function result is returned as the wrapper function // result; if result_in_argv is true, the internal function result // is stored as the last item in the argv list (which must be one // longer than the number of arguments), and the wrapper's actual // return type is always 'void'. llvm::Function *CodeGen_LLVM::add_argv_wrapper(llvm::Function *fn, const std::string &name, bool result_in_argv, std::vector &arg_is_buffer) { llvm::Type *wrapper_result_type = result_in_argv ? 
void_t : i32_t; llvm::Type *wrapper_args_t[] = {i8_t->getPointerTo()->getPointerTo()}; llvm::FunctionType *wrapper_func_t = llvm::FunctionType::get(wrapper_result_type, wrapper_args_t, false); llvm::Function *wrapper_func = llvm::Function::Create(wrapper_func_t, llvm::GlobalValue::ExternalLinkage, name, module.get()); llvm::BasicBlock *wrapper_block = llvm::BasicBlock::Create(module->getContext(), "entry", wrapper_func); builder->SetInsertPoint(wrapper_block); llvm::Value *arg_array = iterator_to_pointer(wrapper_func->arg_begin()); std::vector wrapper_args; for (llvm::Function::arg_iterator i = fn->arg_begin(); i != fn->arg_end(); i++) { // Get the address of the nth argument llvm::Value *ptr = CreateConstGEP1_32(builder.get(), i8_t->getPointerTo(), arg_array, wrapper_args.size()); ptr = builder->CreateLoad(i8_t->getPointerTo(), ptr); if (arg_is_buffer[i->getArgNo()]) { // Cast the argument to a halide_buffer_t * wrapper_args.push_back(builder->CreatePointerCast(ptr, halide_buffer_t_type->getPointerTo())); } else { // Cast to the appropriate type and load ptr = builder->CreatePointerCast(ptr, i->getType()->getPointerTo()); wrapper_args.push_back(builder->CreateLoad(i->getType(), ptr)); } } debug(4) << "Creating call from wrapper to actual function\n"; llvm::CallInst *result = builder->CreateCall(fn, wrapper_args); // This call should never inline result->setIsNoInline(); if (result_in_argv) { llvm::Value *result_in_argv_ptr = CreateConstGEP1_32(builder.get(), i8_t->getPointerTo(), arg_array, wrapper_args.size()); if (fn->getReturnType() != void_t) { result_in_argv_ptr = builder->CreateLoad(i8_t->getPointerTo(), result_in_argv_ptr); // Cast to the appropriate type and store result_in_argv_ptr = builder->CreatePointerCast(result_in_argv_ptr, fn->getReturnType()->getPointerTo()); builder->CreateStore(result, result_in_argv_ptr); } builder->CreateRetVoid(); } else { // We could probably support other types as return values, // but int32 results are all that have actually been tested. internal_assert(fn->getReturnType() == i32_t); builder->CreateRet(result); } internal_assert(!verifyFunction(*wrapper_func, &llvm::errs())); return wrapper_func; } llvm::Function *CodeGen_LLVM::embed_metadata_getter(const std::string &metadata_name, const std::string &function_name, const std::vector &args, const MetadataNameMap &metadata_name_map) { Constant *zero = ConstantInt::get(i32_t, 0); const int num_args = (int)args.size(); auto map_string = [&metadata_name_map](const std::string &from) -> std::string { auto it = metadata_name_map.find(from); return it == metadata_name_map.end() ? from : it->second; }; vector arguments_array_entries; for (int arg = 0; arg < num_args; ++arg) { llvm::StructType *type_t_type = get_llvm_struct_type_by_name(module.get(), "struct.halide_type_t"); internal_assert(type_t_type) << "Did not find halide_type_t in module.\n"; Constant *type_fields[] = { ConstantInt::get(i8_t, args[arg].type.code()), ConstantInt::get(i8_t, args[arg].type.bits()), ConstantInt::get(i16_t, 1)}; Constant *type = ConstantStruct::get(type_t_type, type_fields); auto argument_estimates = args[arg].argument_estimates; if (args[arg].type.is_handle()) { // Handle values are always emitted into metadata as "undefined", regardless of // what sort of Expr is provided. 
argument_estimates = ArgumentEstimates{}; } Constant *buffer_estimates_array_ptr; if (args[arg].is_buffer() && !argument_estimates.buffer_estimates.empty()) { internal_assert((int)argument_estimates.buffer_estimates.size() == args[arg].dimensions); vector buffer_estimates_array_entries; for (const auto &be : argument_estimates.buffer_estimates) { Expr min = be.min; if (min.defined()) { min = cast(min); } Expr extent = be.extent; if (extent.defined()) { extent = cast(extent); } buffer_estimates_array_entries.push_back(embed_constant_expr(min, i64_t)); buffer_estimates_array_entries.push_back(embed_constant_expr(extent, i64_t)); } llvm::ArrayType *buffer_estimates_array = ArrayType::get(i64_t->getPointerTo(), buffer_estimates_array_entries.size()); GlobalVariable *buffer_estimates_array_storage = new GlobalVariable( *module, buffer_estimates_array, /*isConstant*/ true, GlobalValue::PrivateLinkage, ConstantArray::get(buffer_estimates_array, buffer_estimates_array_entries)); Value *zeros[] = {zero, zero}; buffer_estimates_array_ptr = ConstantExpr::getInBoundsGetElementPtr(buffer_estimates_array, buffer_estimates_array_storage, zeros); } else { buffer_estimates_array_ptr = Constant::getNullValue(i64_t->getPointerTo()->getPointerTo()); } Constant *argument_fields[] = { create_string_constant(map_string(args[arg].name)), ConstantInt::get(i32_t, args[arg].kind), ConstantInt::get(i32_t, args[arg].dimensions), type, embed_constant_scalar_value_t(argument_estimates.scalar_def), embed_constant_scalar_value_t(argument_estimates.scalar_min), embed_constant_scalar_value_t(argument_estimates.scalar_max), embed_constant_scalar_value_t(argument_estimates.scalar_estimate), buffer_estimates_array_ptr}; arguments_array_entries.push_back(ConstantStruct::get(argument_t_type, argument_fields)); } llvm::ArrayType *arguments_array = ArrayType::get(argument_t_type, num_args); GlobalVariable *arguments_array_storage = new GlobalVariable( *module, arguments_array, /*isConstant*/ true, GlobalValue::PrivateLinkage, ConstantArray::get(arguments_array, arguments_array_entries)); Constant *version = ConstantInt::get(i32_t, halide_filter_metadata_t::VERSION); Value *zeros[] = {zero, zero}; Constant *metadata_fields[] = { /* version */ version, /* num_arguments */ ConstantInt::get(i32_t, num_args), /* arguments */ ConstantExpr::getInBoundsGetElementPtr(arguments_array, arguments_array_storage, zeros), /* target */ create_string_constant(target.to_string()), /* name */ create_string_constant(function_name)}; GlobalVariable *metadata_storage = new GlobalVariable( *module, metadata_t_type, /*isConstant*/ true, GlobalValue::PrivateLinkage, ConstantStruct::get(metadata_t_type, metadata_fields), metadata_name + "_storage"); llvm::FunctionType *func_t = llvm::FunctionType::get(metadata_t_type->getPointerTo(), false); llvm::Function *metadata_getter = llvm::Function::Create(func_t, llvm::GlobalValue::ExternalLinkage, metadata_name, module.get()); llvm::BasicBlock *block = llvm::BasicBlock::Create(module->getContext(), "entry", metadata_getter); builder->SetInsertPoint(block); builder->CreateRet(metadata_storage); internal_assert(!verifyFunction(*metadata_getter, &llvm::errs())); return metadata_getter; } llvm::Type *CodeGen_LLVM::llvm_type_of(const Type &t) const { return llvm_type_of(context, t, effective_vscale); } void CodeGen_LLVM::optimize_module() { debug(3) << "Optimizing module\n"; auto time_start = std::chrono::high_resolution_clock::now(); if (debug::debug_level() >= 3) { module->print(dbgs(), nullptr, false, true); } 
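    // The New Pass Manager setup below is roughly the in-process equivalent of
    //     opt -passes='default<O3>' in.ll -S -o out.ll
    // (illustrative), with PipelineTuningOptions re-enabling LLVM's loop
    // transforms only when Target::EnableLLVMLoopOpt is set, since Halide
    // normally does its own loop scheduling.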
std::unique_ptr tm = make_target_machine(*module); const bool do_loop_opt = get_target().has_feature(Target::EnableLLVMLoopOpt); PipelineTuningOptions pto; pto.LoopInterleaving = do_loop_opt; pto.LoopVectorization = do_loop_opt; pto.SLPVectorization = true; // Note: SLP vectorization has no analogue in the Halide scheduling model pto.LoopUnrolling = do_loop_opt; // Clear ScEv info for all loops. Certain Halide applications spend a very // long time compiling in forgetLoop, and prefer to forget everything // and rebuild SCEV (aka "Scalar Evolution") from scratch. // Sample difference in compile time reduction at the time of this change was // 21.04 -> 14.78 using current ToT release build. (See also https://reviews.llvm.org/rL358304) pto.ForgetAllSCEVInLoopUnroll = true; llvm::PassBuilder pb(tm.get(), pto); bool debug_pass_manager = false; // These analysis managers have to be declared in this order. llvm::LoopAnalysisManager lam; llvm::FunctionAnalysisManager fam; llvm::CGSCCAnalysisManager cgam; llvm::ModuleAnalysisManager mam; // Register all the basic analyses with the managers. pb.registerModuleAnalyses(mam); pb.registerCGSCCAnalyses(cgam); pb.registerFunctionAnalyses(fam); pb.registerLoopAnalyses(lam); pb.crossRegisterProxies(lam, fam, cgam, mam); ModulePassManager mpm; using OptimizationLevel = llvm::OptimizationLevel; OptimizationLevel level = OptimizationLevel::O3; if (get_target().has_feature(Target::SanitizerCoverage)) { pb.registerOptimizerLastEPCallback( [&](ModulePassManager &mpm, OptimizationLevel level) { SanitizerCoverageOptions sanitizercoverage_options; // Mirror what -fsanitize=fuzzer-no-link would enable. // See https://github.com/halide/Halide/issues/6528 sanitizercoverage_options.CoverageType = SanitizerCoverageOptions::SCK_Edge; sanitizercoverage_options.IndirectCalls = true; sanitizercoverage_options.TraceCmp = true; sanitizercoverage_options.Inline8bitCounters = true; sanitizercoverage_options.PCTable = true; // Due to TLS differences, stack depth tracking is only enabled on Linux if (get_target().os == Target::OS::Linux) { sanitizercoverage_options.StackDepth = true; } #if LLVM_VERSION >= 160 mpm.addPass(SanitizerCoveragePass(sanitizercoverage_options)); #else mpm.addPass(ModuleSanitizerCoveragePass(sanitizercoverage_options)); #endif }); } if (get_target().has_feature(Target::ASAN)) { #if LLVM_VERSION >= 150 // Nothing, ASanGlobalsMetadataAnalysis no longer exists #else pb.registerPipelineStartEPCallback([&](ModulePassManager &mpm, OptimizationLevel) { mpm.addPass(RequireAnalysisPass()); }); #endif pb.registerPipelineStartEPCallback([](ModulePassManager &mpm, OptimizationLevel) { AddressSanitizerOptions asan_options; // default values are good... asan_options.UseAfterScope = true; // ...except this one constexpr bool use_global_gc = false; constexpr bool use_odr_indicator = true; constexpr auto destructor_kind = AsanDtorKind::Global; #if LLVM_VERSION >= 160 mpm.addPass(AddressSanitizerPass( asan_options, use_global_gc, use_odr_indicator, destructor_kind)); #else mpm.addPass(ModuleAddressSanitizerPass( asan_options, use_global_gc, use_odr_indicator, destructor_kind)); #endif }); } // Target::MSAN handling is sprinkled throughout the codebase, // there is no need to run MemorySanitizerPass here. 
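    // (Illustrative: users opt in to these instrumentation passes from the
    // Halide side via target strings such as "host-asan" or "x86-64-linux-tsan";
    // the corresponding Target features are what the has_feature() checks here
    // and below are testing.)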
if (get_target().has_feature(Target::TSAN)) { pb.registerOptimizerLastEPCallback( [](ModulePassManager &mpm, OptimizationLevel level) { mpm.addPass( createModuleToFunctionPassAdaptor(ThreadSanitizerPass())); }); } for (auto &function : *module) { if (get_target().has_feature(Target::ASAN)) { function.addFnAttr(Attribute::SanitizeAddress); } if (get_target().has_feature(Target::MSAN)) { function.addFnAttr(Attribute::SanitizeMemory); } if (get_target().has_feature(Target::TSAN)) { // Do not annotate any of Halide's low-level synchronization code as it has // tsan interface calls to mark its behavior and is much faster if // it is not analyzed instruction by instruction. if (!(function.getName().startswith("_ZN6Halide7Runtime8Internal15Synchronization") || // TODO: this is a benign data race that re-initializes the detected features; // we should really fix it properly inside the implementation, rather than disabling // it here as a band-aid. function.getName().startswith("halide_default_can_use_target_features") || function.getName().startswith("halide_mutex_") || function.getName().startswith("halide_cond_"))) { function.addFnAttr(Attribute::SanitizeThread); } } } if (tm) { tm->registerPassBuilderCallbacks(pb); } mpm = pb.buildPerModuleDefaultPipeline(level, debug_pass_manager); mpm.run(*module, mam); if (llvm::verifyModule(*module, &errs())) { report_fatal_error("Transformation resulted in an invalid module\n"); } debug(3) << "After LLVM optimizations:\n"; if (debug::debug_level() >= 2) { module->print(dbgs(), nullptr, false, true); } auto *logger = get_compiler_logger(); if (logger) { auto time_end = std::chrono::high_resolution_clock::now(); std::chrono::duration diff = time_end - time_start; logger->record_compilation_time(CompilerLogger::Phase::LLVM, diff.count()); } } void CodeGen_LLVM::sym_push(const string &name, llvm::Value *value) { if (!value->getType()->isVoidTy()) { value->setName(name); } symbol_table.push(name, value); } void CodeGen_LLVM::sym_pop(const string &name) { symbol_table.pop(name); } llvm::Value *CodeGen_LLVM::sym_get(const string &name, bool must_succeed) const { // look in the symbol table if (!symbol_table.contains(name)) { if (must_succeed) { std::ostringstream err; err << "Symbol not found: " << name << "\n"; if (debug::debug_level() > 0) { err << "The following names are in scope:\n" << symbol_table << "\n"; } internal_error << err.str(); } else { return nullptr; } } return symbol_table.get(name); } bool CodeGen_LLVM::sym_exists(const string &name) const { return symbol_table.contains(name); } Value *CodeGen_LLVM::codegen(const Expr &e) { internal_assert(e.defined()); debug(4) << "Codegen: " << e.type() << ", " << e << "\n"; value = nullptr; e.accept(this); internal_assert(value) << "Codegen of an expr did not produce an llvm value\n" << e; // Halide's type system doesn't distinguish between scalars and // vectors of size 1, so if a codegen method returned a vector of // size one, just extract it out as a scalar. if (e.type().is_scalar() && value->getType()->isVectorTy()) { internal_assert(get_vector_num_elements(value->getType()) == 1); value = builder->CreateExtractElement(value, ConstantInt::get(i32_t, 0)); } // Make sure fixed/vscale property of vector types match what is expected. 
    if (!value->getType()->isVoidTy()) {
        value = convert_fixed_or_scalable_vector_type(value, llvm_type_of(e.type()));
    }

    // TODO: skip this correctness check for bool vectors,
    // as eliminate_bool_vectors() will cause a discrepancy for some backends
    // (eg OpenCL, HVX, WASM); for now we're just ignoring the assert, but
    // in the long run we should improve the smarts. See https://github.com/halide/Halide/issues/4194.
    const bool is_bool_vector = e.type().is_bool() && e.type().lanes() > 1;
    bool types_match = is_bool_vector ||
                       e.type().is_handle() ||
                       value->getType()->isVoidTy() ||
                       value->getType() == llvm_type_of(e.type());
    if (!types_match && debug::debug_level() > 0) {
        debug(1) << "Unexpected LLVM type for generated expression. Expected (llvm_type_of(e.type())): ";
        llvm_type_of(e.type())->print(dbgs(), true);
        debug(1) << " got (value->getType()): ";
        value->print(dbgs(), true);
        debug(1) << "\n";
    }
    internal_assert(types_match)
        << "Codegen of Expr " << e << " of type " << e.type()
        << " did not produce llvm IR of the corresponding llvm type.\n";
    return value;
}

void CodeGen_LLVM::codegen(const Stmt &s) {
    internal_assert(s.defined());
    debug(4) << "Codegen: " << s << "\n";
    value = nullptr;
    s.accept(this);
}

namespace {

bool is_power_of_two(int x) {
    return (x & (x - 1)) == 0;
}

int next_power_of_two(int x) {
    return static_cast<int>(1) << static_cast<int>(std::ceil(std::log2(x)));
}

}  // namespace

Type CodeGen_LLVM::upgrade_type_for_arithmetic(const Type &t) const {
    if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) {
        return Float(32, t.lanes());
    } else if (t.is_int_or_uint() && !is_power_of_two(t.bits())) {
        return t.with_bits(next_power_of_two(t.bits()));
    } else {
        return t;
    }
}

Type CodeGen_LLVM::upgrade_type_for_argument_passing(const Type &t) const {
    if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) {
        return t.with_code(halide_type_uint);
    } else {
        return t;
    }
}

Type CodeGen_LLVM::upgrade_type_for_storage(const Type &t) const {
    if (t.is_bfloat() || (t.is_float() && t.bits() < 32)) {
        return t.with_code(halide_type_uint);
    } else if (t.is_bool()) {
        return t.with_bits(8);
    } else if (t.is_handle()) {
        return UInt(64, t.lanes());
    } else if (t.is_int_or_uint() && !is_power_of_two(t.bits())) {
        return t.with_bits(next_power_of_two(t.bits()));
    } else {
        return t;
    }
}

void CodeGen_LLVM::visit(const IntImm *op) {
    value = ConstantInt::getSigned(llvm_type_of(op->type), op->value);
}

void CodeGen_LLVM::visit(const UIntImm *op) {
    value = ConstantInt::get(llvm_type_of(op->type), op->value);
}

void CodeGen_LLVM::visit(const FloatImm *op) {
    if (op->type.is_bfloat()) {
        codegen(reinterpret(BFloat(16), make_const(UInt(16), bfloat16_t(op->value).to_bits())));
    } else if (op->type.bits() == 16) {
        codegen(reinterpret(Float(16), make_const(UInt(16), float16_t(op->value).to_bits())));
    } else {
        value = ConstantFP::get(llvm_type_of(op->type), op->value);
    }
}

void CodeGen_LLVM::visit(const StringImm *op) {
    value = create_string_constant(op->value);
}

void CodeGen_LLVM::visit(const Cast *op) {
    Halide::Type src = op->value.type();
    Halide::Type dst = op->type;

    if (upgrade_type_for_arithmetic(src) != src ||
        upgrade_type_for_arithmetic(dst) != dst) {
        // Handle casts to and from types for which we don't have native support.
        debug(4) << "Emulating cast from " << src << " to " << dst << "\n";
        if ((src.is_float() && src.bits() < 32) ||
            (dst.is_float() && dst.bits() < 32)) {
            string warn_msg = "(b)float16 type operations are emulated, which is likely to slow down performance. "
" "If your target supports native (b)float16 operations, " "it could be improved by adding Target feature to enable it.\n"; onetime_warnings.try_emplace(WarningKind::EmulatedFloat16, warn_msg); Expr equiv = lower_float16_cast(op); internal_assert(equiv.type() == op->type); codegen(equiv); } else { internal_error << "Cast from type: " << src << " to " << dst << " unimplemented\n"; } return; } if (const Call *c = Call::as_intrinsic(op->value, {Call::lerp})) { // We want to codegen a cast of a lerp as a single thing, because it can // be done more intelligently than a lerp followed by a cast. Type t = upgrade_type_for_arithmetic(c->type); Type wt = upgrade_type_for_arithmetic(c->args[2].type()); Expr e = lower_lerp(op->type, cast(t, c->args[0]), cast(t, c->args[1]), cast(wt, c->args[2]), target); codegen(e); return; } value = codegen(op->value); llvm::Type *llvm_dst = llvm_type_of(dst); if (dst.is_handle() && src.is_handle()) { value = builder->CreateBitCast(value, llvm_dst); } else if (dst.is_handle() || src.is_handle()) { internal_error << "Can't cast from " << src << " to " << dst << "\n"; } else if (!src.is_float() && !dst.is_float()) { // Widening integer casts either zero extend or sign extend, // depending on the source type. Narrowing integer casts // always truncate. value = builder->CreateIntCast(value, llvm_dst, src.is_int()); } else if (src.is_float() && dst.is_int()) { value = builder->CreateFPToSI(value, llvm_dst); } else if (src.is_float() && dst.is_uint()) { // fptoui has undefined behavior on overflow. Seems reasonable // to get an unspecified uint on overflow, but because uint1s // are stored in uint8s for float->uint1 casts this undefined // behavior manifests itself as uint1 values greater than 1, // which could in turn break our bounds inference // guarantees. So go via uint8 in this case. if (dst.bits() < 8) { value = builder->CreateFPToUI(value, llvm_type_of(dst.with_bits(8))); value = builder->CreateIntCast(value, llvm_dst, false); } else { value = builder->CreateFPToUI(value, llvm_dst); } } else if (src.is_int() && dst.is_float()) { value = builder->CreateSIToFP(value, llvm_dst); } else if (src.is_uint() && dst.is_float()) { value = builder->CreateUIToFP(value, llvm_dst); } else { internal_assert(src.is_float() && dst.is_float()); // Float widening or narrowing value = builder->CreateFPCast(value, llvm_dst); } } void CodeGen_LLVM::visit(const Reinterpret *op) { Type dst = op->type; llvm::Type *llvm_dst = llvm_type_of(dst); value = codegen(op->value); // Our `Reinterpret` expr directly maps to LLVM IR bitcast/ptrtoint/inttoptr // instructions with no additional handling required: // * bitcast between vectors and scalars is well-formed. // * ptrtoint/inttoptr implicitly truncates/zero-extends the integer // to match the pointer size. 
    value = builder->CreateBitOrPointerCast(value, llvm_dst);
}

void CodeGen_LLVM::visit(const Variable *op) {
    value = sym_get(op->name);
}

template<typename Op>
bool CodeGen_LLVM::try_to_fold_vector_reduce(const Expr &a, Expr b) {
    const VectorReduce *red = a.as<VectorReduce>();
    if (!red) {
        red = b.as<VectorReduce>();
        b = a;
    }
    if (red &&
        ((std::is_same<Op, Add>::value && red->op == VectorReduce::Add) ||
         (std::is_same<Op, Min>::value && red->op == VectorReduce::Min) ||
         (std::is_same<Op, Max>::value && red->op == VectorReduce::Max) ||
         (std::is_same<Op, Mul>::value && red->op == VectorReduce::Mul) ||
         (std::is_same<Op, And>::value && red->op == VectorReduce::And) ||
         (std::is_same<Op, Or>::value && red->op == VectorReduce::Or) ||
         (std::is_same<Op, Call>::value && red->op == VectorReduce::SaturatingAdd))) {
        codegen_vector_reduce(red, b);
        return true;
    }
    return false;
}

void CodeGen_LLVM::visit(const Add *op) {
    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Add::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    // Some backends can fold the add into a vector reduce
    if (try_to_fold_vector_reduce<Add>(op->a, op->b)) {
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (op->type.is_float()) {
        if (!try_vector_predication_intrinsic("llvm.vp.fadd", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateFAdd(a, b);
        }
    } else if (op->type.is_int() && op->type.bits() >= 32) {
        // We tell llvm integers don't wrap, so that it generates good
        // code for loop indices.
        // TODO(zvookin): This needs vector predication, but I can't
        // see a way to do it. May go away when introducing a correct
        // index type instead of using int32_t.
        value = builder->CreateNSWAdd(a, b);
    } else {
        if (!try_vector_predication_intrinsic("llvm.vp.add", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateAdd(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const Sub *op) {
    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Sub::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (op->type.is_float()) {
        if (!try_vector_predication_intrinsic("llvm.vp.fsub", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateFSub(a, b);
        }
    } else if (op->type.is_int() && op->type.bits() >= 32) {
        // We tell llvm integers don't wrap, so that it generates good
        // code for loop indices.
        // TODO(zvookin): This needs vector predication, but I can't
        // see a way to do it. May go away when introducing a correct
        // index type instead of using int32_t.
        value = builder->CreateNSWSub(a, b);
    } else {
        if (!try_vector_predication_intrinsic("llvm.vp.sub", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateSub(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const Mul *op) {
    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Mul::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    if (try_to_fold_vector_reduce<Mul>(op->a, op->b)) {
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (op->type.is_float()) {
        if (!try_vector_predication_intrinsic("llvm.vp.fmul", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateFMul(a, b);
        }
    } else if (op->type.is_int() && op->type.bits() >= 32) {
        // We tell llvm integers don't wrap, so that it generates good
        // code for loop indices.
        // TODO(zvookin): This needs vector predication, but I can't
        // see a way to do it. May go away when introducing a correct
        // index type instead of using int32_t.
        value = builder->CreateNSWMul(a, b);
    } else {
        if (!try_vector_predication_intrinsic("llvm.vp.mul", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateMul(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const Div *op) {
    user_assert(!is_const_zero(op->b)) << "Division by constant zero in expression: " << Expr(op) << "\n";

    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Div::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    if (op->type.is_float()) {
        // Don't call codegen() multiple times within an argument list:
        // order-of-evaluation isn't guaranteed and can vary by compiler,
        // leading to different LLVM IR ordering, which makes comparing
        // output hard.
        Value *a = codegen(op->a);
        Value *b = codegen(op->b);
        if (!try_vector_predication_intrinsic("llvm.vp.fdiv", llvm_type_of(t), t.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateFDiv(a, b);
        }
    } else {
        value = codegen(lower_int_uint_div(op->a, op->b));
    }
}

void CodeGen_LLVM::visit(const Mod *op) {
    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Mod::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    if (op->type.is_float()) {
        value = codegen(simplify(op->a - op->b * floor(op->a / op->b)));
    } else {
        value = codegen(lower_int_uint_mod(op->a, op->b));
    }
}

void CodeGen_LLVM::visit(const Min *op) {
    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Min::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    if (try_to_fold_vector_reduce<Min>(op->a, op->b)) {
        return;
    }

    string a_name = unique_name('a');
    string b_name = unique_name('b');
    Expr a = Variable::make(op->a.type(), a_name);
    Expr b = Variable::make(op->b.type(), b_name);
    value = codegen(Let::make(a_name, op->a,
                              Let::make(b_name, op->b,
                                        select(a < b, a, b))));
}

void CodeGen_LLVM::visit(const Max *op) {
    Type t = upgrade_type_for_arithmetic(op->type);
    if (t != op->type) {
        codegen(cast(op->type, Max::make(cast(t, op->a), cast(t, op->b))));
        return;
    }

    if (try_to_fold_vector_reduce<Max>(op->a, op->b)) {
        return;
    }

    string a_name = unique_name('a');
    string b_name = unique_name('b');
    Expr a = Variable::make(op->a.type(), a_name);
    Expr b = Variable::make(op->b.type(), b_name);
    value = codegen(Let::make(a_name, op->a,
                              Let::make(b_name, op->b,
                                        select(a > b, a, b))));
}

void CodeGen_LLVM::visit(const EQ *op) {
    Type t = upgrade_type_for_arithmetic(op->a.type());
    if (t != op->a.type()) {
        codegen(EQ::make(cast(t, op->a), cast(t, op->b)));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (t.is_float()) {
        if (!try_vector_predication_comparison("llvm.vp.fcmp", op->type, AllEnabledMask(), a, b, "oeq")) {
            value = builder->CreateFCmpOEQ(a, b);
        }
    } else {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "eq")) {
            value = builder->CreateICmpEQ(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const NE *op) {
    Type t = upgrade_type_for_arithmetic(op->a.type());
    if (t != op->a.type()) {
        codegen(NE::make(cast(t, op->a), cast(t, op->b)));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (t.is_float()) {
        if (!try_vector_predication_comparison("llvm.vp.fcmp", op->type, AllEnabledMask(), a, b, "one")) {
            value = builder->CreateFCmpONE(a, b);
        }
    } else {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "ne")) {
            value = builder->CreateICmpNE(a, b);
        }
    }
}
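// [Added commentary, not in the original source] The comparison visitors
// below all follow the same shape: floating-point comparisons use LLVM's
// *ordered* predicates (oeq/one/olt/...), which evaluate to false if either
// operand is NaN, while integer comparisons pick signed (slt/sle/...) or
// unsigned (ult/ule/...) predicates based on the Halide type.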
void CodeGen_LLVM::visit(const LT *op) {
    Type t = upgrade_type_for_arithmetic(op->a.type());
    if (t != op->a.type()) {
        codegen(LT::make(cast(t, op->a), cast(t, op->b)));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);

    if (t.is_float()) {
        if (!try_vector_predication_comparison("llvm.vp.fcmp", op->type, AllEnabledMask(), a, b, "olt")) {
            value = builder->CreateFCmpOLT(a, b);
        }
    } else if (t.is_int()) {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "slt")) {
            value = builder->CreateICmpSLT(a, b);
        }
    } else {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "ult")) {
            value = builder->CreateICmpULT(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const LE *op) {
    Type t = upgrade_type_for_arithmetic(op->a.type());
    if (t != op->a.type()) {
        codegen(LE::make(cast(t, op->a), cast(t, op->b)));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);

    if (t.is_float()) {
        if (!try_vector_predication_comparison("llvm.vp.fcmp", op->type, AllEnabledMask(), a, b, "ole")) {
            value = builder->CreateFCmpOLE(a, b);
        }
    } else if (t.is_int()) {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "sle")) {
            value = builder->CreateICmpSLE(a, b);
        }
    } else {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "ule")) {
            value = builder->CreateICmpULE(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const GT *op) {
    Type t = upgrade_type_for_arithmetic(op->a.type());
    if (t != op->a.type()) {
        codegen(GT::make(cast(t, op->a), cast(t, op->b)));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);

    if (t.is_float()) {
        if (!try_vector_predication_comparison("llvm.vp.fcmp", op->type, AllEnabledMask(), a, b, "ogt")) {
            value = builder->CreateFCmpOGT(a, b);
        }
    } else if (t.is_int()) {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "sgt")) {
            value = builder->CreateICmpSGT(a, b);
        }
    } else {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "ugt")) {
            value = builder->CreateICmpUGT(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const GE *op) {
    Type t = upgrade_type_for_arithmetic(op->a.type());
    if (t != op->a.type()) {
        codegen(GE::make(cast(t, op->a), cast(t, op->b)));
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);

    if (t.is_float()) {
        if (!try_vector_predication_comparison("llvm.vp.fcmp", op->type, AllEnabledMask(), a, b, "oge")) {
            value = builder->CreateFCmpOGE(a, b);
        }
    } else if (t.is_int()) {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "sge")) {
            value = builder->CreateICmpSGE(a, b);
        }
    } else {
        if (!try_vector_predication_comparison("llvm.vp.icmp", op->type, AllEnabledMask(), a, b, "uge")) {
            value = builder->CreateICmpUGE(a, b);
        }
    }
}

void CodeGen_LLVM::visit(const And *op) {
    if (try_to_fold_vector_reduce<And>(op->a, op->b)) {
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(),
                                          AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
        value = builder->CreateAnd(a, b);
    }
}

void CodeGen_LLVM::visit(const Or *op) {
    if (try_to_fold_vector_reduce<Or>(op->a, op->b)) {
        return;
    }

    Value *a = codegen(op->a);
    Value *b = codegen(op->b);
    if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(),
                                          AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
        value = builder->CreateOr(a, b);
    }
}
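// [Added commentary, not in the original source] A recurring pattern in
// these visitors: try_vector_predication_intrinsic first offers the
// operation as an llvm.vp.* call for targets that use vector predication
// (e.g. RISC-V V); if that helper declines, the line below it falls back
// to the plain LLVM IR instruction.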
void CodeGen_LLVM::visit(const Not *op) {
    Value *a = codegen(op->a);
    if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(),
                                          AllEnabledMask(), {VPArg(a, 0)})) {
        value = builder->CreateNot(a);
    }
}

void CodeGen_LLVM::visit(const Select *op) {
    Value *cmp = codegen(op->condition);
    if (use_llvm_vp_intrinsics &&
        op->type.is_vector() &&
        op->condition.type().is_scalar()) {
        cmp = create_broadcast(cmp, op->type.lanes());
    }
    Value *a = codegen(op->true_value);
    Value *b = codegen(op->false_value);
    if (!try_vector_predication_intrinsic("llvm.vp.select", llvm_type_of(op->type), op->type.lanes(),
                                          NoMask(), {VPArg(cmp), VPArg(a, 0), VPArg(b)})) {
        value = builder->CreateSelect(cmp, a, b);
    }
}

namespace {
Expr promote_64(const Expr &e) {
    if (const Add *a = e.as<Add>()) {
        return Add::make(promote_64(a->a), promote_64(a->b));
    } else if (const Sub *s = e.as<Sub>()) {
        return Sub::make(promote_64(s->a), promote_64(s->b));
    } else if (const Mul *m = e.as<Mul>()) {
        return Mul::make(promote_64(m->a), promote_64(m->b));
    } else if (const Min *m = e.as<Min>()) {
        return Min::make(promote_64(m->a), promote_64(m->b));
    } else if (const Max *m = e.as<Max>()) {
        return Max::make(promote_64(m->a), promote_64(m->b));
    } else {
        return cast(Int(64), e);
    }
}
}  // namespace

Value *CodeGen_LLVM::codegen_buffer_pointer(const string &buffer, Halide::Type type, Expr index) {
    // Find the base address from the symbol table
    Value *base_address = symbol_table.get(buffer);
    return codegen_buffer_pointer(base_address, type, std::move(index));
}

Value *CodeGen_LLVM::codegen_buffer_pointer(Value *base_address, Halide::Type type, Expr index) {
    // Promote index to 64-bit on targets that use 64-bit pointers.
    llvm::DataLayout d(module.get());
    if (promote_indices() && d.getPointerSize() == 8) {
        index = promote_64(index);
    }

    // Peel off a constant offset as a second GEP. This helps LLVM's
    // aliasing analysis, especially for backends that do address
    // computation in 32 bits but use 64-bit pointers.
    if (const Add *add = index.as<Add>()) {
        if (const int64_t *offset = as_const_int(add->b)) {
            Value *base = codegen_buffer_pointer(base_address, type, add->a);
            Value *off = codegen(make_const(Int(8 * d.getPointerSize()), *offset));
            return CreateInBoundsGEP(builder.get(), llvm_type_of(type), base, off);
        }
    }

    return codegen_buffer_pointer(base_address, type, codegen(index));
}

Value *CodeGen_LLVM::codegen_buffer_pointer(const string &buffer, Halide::Type type, Value *index) {
    // Find the base address from the symbol table
    Value *base_address = symbol_table.get(buffer);
    return codegen_buffer_pointer(base_address, type, index);
}

Value *CodeGen_LLVM::codegen_buffer_pointer(Value *base_address, Halide::Type type, Value *index) {
    type = upgrade_type_for_storage(type);
    llvm::Type *load_type = llvm_type_of(type);
    unsigned address_space = base_address->getType()->getPointerAddressSpace();
    llvm::Type *pointer_load_type = load_type->getPointerTo(address_space);
    // TODO: This can likely be removed once opaque pointers are default
    // in all supported LLVM versions.
    base_address = builder->CreatePointerCast(base_address, pointer_load_type);

    llvm::Constant *constant_index = dyn_cast<llvm::Constant>(index);
    if (constant_index && constant_index->isZeroValue()) {
        return base_address;
    }
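    // [Added commentary, not in the original source] The widening below
    // matters because a narrower index would otherwise force an implicit
    // extension inside the GEP; making it explicit (with sign extension,
    // since Halide indices are signed) keeps the address arithmetic
    // transparent to LLVM's optimizers, e.g. %i64 = sext i32 %i to i64.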
    // Promote index to 64-bit on targets that use 64-bit pointers.
    llvm::DataLayout d(module.get());
    if (d.getPointerSize() == 8) {
        llvm::Type *index_type = index->getType();
        llvm::Type *desired_index_type = i64_t;
        if (isa<llvm::VectorType>(index_type)) {
            desired_index_type = VectorType::get(desired_index_type,
                                                 dyn_cast<llvm::VectorType>(index_type)->getElementCount());
        }
        index = builder->CreateIntCast(index, desired_index_type, true);
    }

    return CreateInBoundsGEP(builder.get(), load_type, base_address, index);
}

void CodeGen_LLVM::add_tbaa_metadata(llvm::Instruction *inst, string buffer, const Expr &index) {
    // Get the unique name for the block of memory this allocate node
    // is using.
    buffer = get_allocation_name(buffer);

    // If the index is constant, we generate some TBAA info that helps
    // LLVM understand our loads/stores aren't aliased.
    bool constant_index = false;
    int64_t base = 0;
    int64_t width = 1;
    if (index.defined()) {
        if (const Ramp *ramp = index.as<Ramp>()) {
            const int64_t *pstride = as_const_int(ramp->stride);
            const int64_t *pbase = as_const_int(ramp->base);
            if (pstride && pbase) {
                // We want to find the smallest aligned width and offset
                // that contains this ramp.
                int64_t stride = *pstride;
                base = *pbase;
                internal_assert(base >= 0);
                width = next_power_of_two(ramp->lanes * stride);

                while (base % width) {
                    base -= base % width;
                    width *= 2;
                }
                constant_index = true;
            }
        } else {
            const int64_t *pbase = as_const_int(index);
            if (pbase) {
                base = *pbase;
                constant_index = true;
            }
        }
    }

    llvm::MDBuilder builder(*context);

    // Add type-based-alias-analysis metadata to the pointer, so that
    // loads and stores to different buffers can get reordered.
    MDNode *tbaa = builder.createTBAARoot("Halide buffer");
    tbaa = builder.createTBAAScalarTypeNode(buffer, tbaa);

    // We also add metadata for constant indices to allow loads and
    // stores to the same buffer to get reordered.
    if (constant_index) {
        for (int w = 1024; w >= width; w /= 2) {
            int64_t b = (base / w) * w;
            std::stringstream level;
            level << buffer << ".width" << w << ".base" << b;
            tbaa = builder.createTBAAScalarTypeNode(level.str(), tbaa);
        }
    }

    tbaa = builder.createTBAAStructTagNode(tbaa, tbaa, 0);
    inst->setMetadata("tbaa", tbaa);
}

void CodeGen_LLVM::function_does_not_access_memory(llvm::Function *fn) {
#if LLVM_VERSION >= 160
    fn->addFnAttr("memory(none)");
#else
    fn->addFnAttr(llvm::Attribute::ReadNone);
#endif
}

void CodeGen_LLVM::visit(const Load *op) {
    // If the type should be stored as some other type, insert a reinterpret cast.
    Type storage_type = upgrade_type_for_storage(op->type);
    if (op->type != storage_type) {
        codegen(reinterpret(op->type, Load::make(storage_type, op->name,
                                                 op->index, op->image,
                                                 op->param, op->predicate,
                                                 op->alignment)));
        return;
    }

    // Predicated load
    if (!is_const_one(op->predicate)) {
        codegen_predicated_load(op);
        return;
    }

    // There are several cases. Different architectures may wish to override some.
    if (op->type.is_scalar()) {
        // Scalar loads
        Value *ptr = codegen_buffer_pointer(op->name, op->type, op->index);
        LoadInst *load = builder->CreateAlignedLoad(llvm_type_of(op->type), ptr,
                                                    llvm::Align(op->type.bytes()));
        add_tbaa_metadata(load, op->name, op->index);
        value = load;
    } else {
        const Ramp *ramp = op->index.as<Ramp>();
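        // [Added commentary, not in the original source] The branches below
        // pick a vector-load strategy: a dense (stride 1) ramp maps to a
        // single aligned vector load; stride -1 loads the flipped range
        // densely and reverses it with a shuffle; any other ramp becomes one
        // scalar load per lane, stepping the pointer by the stride via GEP;
        // everything else is a general gather driven by a vector of indices.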
        const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;
        llvm::Type *load_type = llvm_type_of(op->type.element_of());
        if (ramp && stride && stride->value == 1) {
            value = codegen_dense_vector_load(op);
        } else if (ramp && stride && stride->value == -1) {
            // Load the vector and then flip it in-place
            Expr flipped_base = ramp->base - ramp->lanes + 1;
            Expr flipped_stride = make_one(flipped_base.type());
            Expr flipped_index = Ramp::make(flipped_base, flipped_stride, ramp->lanes);
            ModulusRemainder align = op->alignment;
            // Switch to the alignment of the last lane
            align = align - (ramp->lanes - 1);

            Expr flipped_load = Load::make(op->type, op->name, flipped_index,
                                           op->image, op->param, op->predicate, align);

            Value *flipped = codegen(flipped_load);

            vector<int> indices(ramp->lanes);
            for (int i = 0; i < ramp->lanes; i++) {
                indices[i] = ramp->lanes - 1 - i;
            }

            value = shuffle_vectors(flipped, indices);
        } else if (ramp) {
            // Gather without generating the indices as a vector
            Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), ramp->base);
            Value *stride = codegen(ramp->stride);
            value = PoisonValue::get(llvm_type_of(op->type));
            for (int i = 0; i < ramp->lanes; i++) {
                Value *lane = ConstantInt::get(i32_t, i);
                LoadInst *val = builder->CreateLoad(load_type, ptr);
                add_tbaa_metadata(val, op->name, op->index);
                value = builder->CreateInsertElement(value, val, lane);
                ptr = CreateInBoundsGEP(builder.get(), load_type, ptr, stride);
            }
        } else if ((false)) { /* should_scalarize(op->index) */
            // TODO: put something sensible in for
            // should_scalarize. Probably a good idea if there are no
            // loads in it, and it's all int32.

            // Compute the index as scalars, and then do a gather
            Value *vec = PoisonValue::get(llvm_type_of(op->type));
            for (int i = 0; i < op->type.lanes(); i++) {
                Expr idx = extract_lane(op->index, i);
                Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), idx);
                LoadInst *val = builder->CreateLoad(load_type, ptr);
                add_tbaa_metadata(val, op->name, op->index);
                vec = builder->CreateInsertElement(vec, val, ConstantInt::get(i32_t, i));
            }
            value = vec;
        } else {
            // General gathers
            Value *index = codegen(op->index);
            Value *vec = PoisonValue::get(llvm_type_of(op->type));
            for (int i = 0; i < op->type.lanes(); i++) {
                Value *idx = builder->CreateExtractElement(index, ConstantInt::get(i32_t, i));
                Value *ptr = codegen_buffer_pointer(op->name, op->type.element_of(), idx);
                LoadInst *val = builder->CreateLoad(load_type, ptr);
                add_tbaa_metadata(val, op->name, op->index);
                vec = builder->CreateInsertElement(vec, val, ConstantInt::get(i32_t, i));
            }
            value = vec;
        }
    }
}

void CodeGen_LLVM::visit(const Ramp *op) {
    if (is_const(op->stride) && !is_const(op->base)) {
        // If the stride is const and the base is not (e.g. ramp(x, 1,
        // 4)), we can lift out the stride and broadcast the base so
        // we can do a single vector broadcast and add instead of
        // repeated insertion
        Expr broadcast = Broadcast::make(op->base, op->lanes);
        Expr ramp = Ramp::make(make_zero(op->base.type()), op->stride, op->lanes);
        value = codegen(broadcast + ramp);
    } else if (!is_const(op->stride)) {
        Expr broadcast_base = Broadcast::make(op->base, op->lanes);
        Expr broadcast_stride = Broadcast::make(op->stride, op->lanes);
        Expr ramp = Ramp::make(make_zero(op->base.type()), make_one(op->base.type()), op->lanes);
        value = codegen(broadcast_base + broadcast_stride * ramp);
    } else {
        internal_assert(is_const(op->base) && is_const(op->stride));
        // At this point base and stride should be constant. Generate
        // an insert-element sequence. The code will be lifted to a
        // constant vector stored in .rodata or similar.
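        // [Added commentary, not in the original source] For example,
        // ramp(0, 2, 4) reaches this branch and becomes the constant vector
        // <0, 2, 4, 6>, which LLVM typically folds into a single
        // constant-pool entry.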
        Value *base = codegen(op->base);
        Value *stride = codegen(op->stride);

        value = PoisonValue::get(llvm_type_of(op->type));
        for (int i = 0; i < op->type.lanes(); i++) {
            if (i > 0) {
                if (op->type.is_float()) {
                    base = builder->CreateFAdd(base, stride);
                } else if (op->type.is_int() && op->type.bits() >= 32) {
                    base = builder->CreateNSWAdd(base, stride);
                } else {
                    base = builder->CreateAdd(base, stride);
                }
            }
            value = builder->CreateInsertElement(value, base, ConstantInt::get(i32_t, i));
        }
    }
}

llvm::Value *CodeGen_LLVM::create_broadcast(llvm::Value *v, int lanes) {
    Constant *poison = PoisonValue::get(get_vector_type(v->getType(), lanes));
    Constant *zero = ConstantInt::get(i32_t, 0);
    v = builder->CreateInsertElement(poison, v, zero);
    Constant *zeros = get_splat(lanes, zero);
    return builder->CreateShuffleVector(v, poison, zeros);
}

void CodeGen_LLVM::visit(const Broadcast *op) {
    Value *v = codegen(op->value);
    value = create_broadcast(v, op->lanes);
}

Value *CodeGen_LLVM::interleave_vectors(const std::vector<Value *> &vecs) {
    internal_assert(!vecs.empty());
    for (size_t i = 1; i < vecs.size(); i++) {
        internal_assert(vecs[0]->getType() == vecs[i]->getType());
    }
    int vec_elements = get_vector_num_elements(vecs[0]->getType());
    if (vecs.size() == 1) {
        return vecs[0];
    } else if (vecs.size() == 2) {
        Value *a = vecs[0];
        Value *b = vecs[1];
        vector<int> indices(vec_elements * 2);
        for (int i = 0; i < vec_elements * 2; i++) {
            indices[i] = i % 2 == 0 ? i / 2 : i / 2 + vec_elements;
        }
        return shuffle_vectors(a, b, indices);
    } else {
        // Grab the even and odd elements of vecs.
        vector<Value *> even_vecs;
        vector<Value *> odd_vecs;
        for (size_t i = 0; i < vecs.size(); i++) {
            if (i % 2 == 0) {
                even_vecs.push_back(vecs[i]);
            } else {
                odd_vecs.push_back(vecs[i]);
            }
        }

        // If the number of vecs is odd, save the last one for later.
        Value *last = nullptr;
        if (even_vecs.size() > odd_vecs.size()) {
            last = even_vecs.back();
            even_vecs.pop_back();
        }
        internal_assert(even_vecs.size() == odd_vecs.size());

        // Interleave the even and odd parts.
        Value *even = interleave_vectors(even_vecs);
        Value *odd = interleave_vectors(odd_vecs);

        if (last) {
            int result_elements = vec_elements * vecs.size();

            // Interleave even and odd, leaving a space for the last element.
            vector<int> indices(result_elements, -1);
            for (int i = 0, idx = 0; i < result_elements; i++) {
                if (i % vecs.size() < vecs.size() - 1) {
                    indices[i] = idx % 2 == 0 ? idx / 2 : idx / 2 + vec_elements * even_vecs.size();
                    idx++;
                }
            }
            Value *even_odd = shuffle_vectors(even, odd, indices);

            // Interleave the last vector into the result.
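            // [Added commentary, not in the original source] Worked example:
            // for three 4-lane vectors a, b, c, the shuffle above produces
            // <a0, b0, _, a1, b1, _, a2, b2, _, a3, b3, _>, and the shuffle
            // below drops each lane of c into the reserved "_" slots.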
            last = slice_vector(last, 0, result_elements);
            for (int i = 0; i < result_elements; i++) {
                if (i % vecs.size() < vecs.size() - 1) {
                    indices[i] = i;
                } else {
                    indices[i] = i / vecs.size() + result_elements;
                }
            }

            return shuffle_vectors(even_odd, last, indices);
        } else {
            return interleave_vectors({even, odd});
        }
    }
}

void CodeGen_LLVM::scalarize(const Expr &e) {
    llvm::Type *result_type = llvm_type_of(e.type());

    Value *result = PoisonValue::get(result_type);

    for (int i = 0; i < e.type().lanes(); i++) {
        Value *v = codegen(extract_lane(e, i));
        result = builder->CreateInsertElement(result, v, ConstantInt::get(i32_t, i));
    }
    value = result;
}

void CodeGen_LLVM::codegen_predicated_store(const Store *op) {
    const Ramp *ramp = op->index.as<Ramp>();
    if (ramp && is_const_one(ramp->stride) && !emit_atomic_stores) {
        // Dense vector store
        debug(4) << "Predicated dense vector store\n\t" << Stmt(op) << "\n";
        Value *vpred = codegen(op->predicate);
        Halide::Type value_type = op->value.type();
        Value *val = codegen(op->value);
        int alignment = value_type.bytes();
        int native_bytes = native_vector_bits() / 8;

        // Boost the alignment if possible, up to the native vector width.
        ModulusRemainder mod_rem = op->alignment;
        while ((mod_rem.remainder & 1) == 0 &&
               (mod_rem.modulus & 1) == 0 &&
               alignment < native_bytes) {
            mod_rem.modulus /= 2;
            mod_rem.remainder /= 2;
            alignment *= 2;
        }

        // If it is an external buffer, then we cannot assume that the host pointer
        // is aligned to at least the native vector width. However, we may be able to do
        // better than just assuming that it is unaligned.
        if (op->param.defined()) {
            int host_alignment = op->param.host_alignment();
            alignment = gcd(alignment, host_alignment);
        }

        // For dense vector stores wider than the native vector
        // width, bust them up into native vectors.
        int store_lanes = value_type.lanes();
        int native_lanes = maximum_vector_bits() / value_type.bits();

        for (int i = 0; i < store_lanes; i += native_lanes) {
            int slice_lanes = std::min(native_lanes, store_lanes - i);
            Expr slice_base = simplify(ramp->base + i);
            Expr slice_stride = make_one(slice_base.type());
            Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes);
            Value *slice_val = slice_vector(val, i, slice_lanes);
            Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base);
            Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo());
            Value *slice_mask = slice_vector(vpred, i, slice_lanes);
            Instruction *store;
            if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, slice_mask,
                                                 {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) {
                store = dyn_cast<Instruction>(value);
            } else {
                store = builder->CreateMaskedStore(slice_val, vec_ptr, llvm::Align(alignment), slice_mask);
            }
            add_tbaa_metadata(store, op->name, slice_index);
        }
    } else {
        // It's not a dense vector store; we need to scalarize it.
        debug(4) << "Scalarize predicated vector store\n";
        Type value_type = op->value.type().element_of();
        Value *vpred = codegen(op->predicate);
        Value *vval = codegen(op->value);
        Value *vindex = codegen(op->index);
        for (int i = 0; i < op->index.type().lanes(); i++) {
            Constant *lane = ConstantInt::get(i32_t, i);
            Value *p = vpred;
            Value *v = vval;
            Value *idx = vindex;
            if (op->index.type().lanes() > 1) {
                p = builder->CreateExtractElement(p, lane);
                v = builder->CreateExtractElement(v, lane);
                idx = builder->CreateExtractElement(idx, lane);
            }
            internal_assert(p && v && idx);

            if (p->getType() != i1_t) {
                p = builder->CreateIsNotNull(p);
            }

            BasicBlock *true_bb = BasicBlock::Create(*context, "true_bb", function);
            BasicBlock *after_bb = BasicBlock::Create(*context, "after_bb", function);
            builder->CreateCondBr(p, true_bb, after_bb);

            builder->SetInsertPoint(true_bb);

            // Scalar
            Value *ptr = codegen_buffer_pointer(op->name, value_type, idx);
            StoreInst *store = builder->CreateAlignedStore(v, ptr, llvm::Align(value_type.bytes()));
            if (emit_atomic_stores) {
                store->setAtomic(AtomicOrdering::Monotonic);
            }

            builder->CreateBr(after_bb);
            builder->SetInsertPoint(after_bb);
        }
    }
}

llvm::Value *CodeGen_LLVM::codegen_vector_load(const Type &type, const std::string &name, const Expr &base,
                                               const Buffer<> &image, const Parameter &param,
                                               const ModulusRemainder &alignment, llvm::Value *vpred,
                                               bool slice_to_native, llvm::Value *stride) {
    debug(4) << "Vectorize predicated dense vector load:\n\t"
             << "(" << type << ")" << name << "[ramp(base, 1, " << type.lanes() << ")]\n";

    int align_bytes = type.bytes();  // The size of a single element

    int native_bits = native_vector_bits();
    int native_bytes = native_bits / 8;

    // We assume halide_malloc for the platform returns buffers
    // aligned to at least the native vector width. So this is the
    // maximum alignment we can infer based on the index alone.

    // Boost the alignment if possible, up to the native vector width.
    ModulusRemainder mod_rem = alignment;
    while ((mod_rem.remainder & 1) == 0 &&
           (mod_rem.modulus & 1) == 0 &&
           align_bytes < native_bytes) {
        mod_rem.modulus /= 2;
        mod_rem.remainder /= 2;
        align_bytes *= 2;
    }

    // If it is an external buffer, then we cannot assume that the host pointer
    // is aligned to at least native vector width. However, we may be able to do
    // better than just assuming that it is unaligned.
    if (param.defined()) {
        int host_alignment = param.host_alignment();
        align_bytes = gcd(align_bytes, host_alignment);
    } else if (get_target().has_feature(Target::JIT) && image.defined()) {
        // If we're JITting, use the actual pointer value to determine alignment for embedded buffers.
        align_bytes = gcd(align_bytes, (int)(((uintptr_t)image.data()) & std::numeric_limits<int>::max()));
    }

    // For dense vector loads wider than the native vector
    // width, bust them up into native vectors
    int load_lanes = type.lanes();
    int native_lanes = slice_to_native ? std::max(1, maximum_vector_bits() / type.bits()) : load_lanes;
    vector<Value *> slices;
    for (int i = 0; i < load_lanes; i += native_lanes) {
        int slice_lanes = std::min(native_lanes, load_lanes - i);
        Expr slice_base = simplify(base + i);
        Expr slice_stride = make_one(slice_base.type());
        Expr slice_index = slice_lanes == 1 ? slice_base : Ramp::make(slice_base, slice_stride, slice_lanes);
        llvm::Type *slice_type = get_vector_type(llvm_type_of(type.element_of()), slice_lanes);
        Value *elt_ptr = codegen_buffer_pointer(name, type.element_of(), slice_base);
        Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_type->getPointerTo());

        Value *slice_mask = (vpred != nullptr) ? slice_vector(vpred, i, slice_lanes) : nullptr;
        MaskVariant vp_slice_mask = slice_mask ? MaskVariant(slice_mask) : AllEnabledMask();

        Instruction *load_inst = nullptr;
        // In this path, strided predicated loads are only handled if vector
        // predication is enabled. Otherwise this would be scalarized at a higher
        // level. Assume that if stride is passed, this is not dense, though
        // LLVM should codegen the same thing for a constant 1 strided load as
        // for a non-strided load.
        if (stride) {
            if (get_target().bits == 64 && !stride->getType()->isIntegerTy(64)) {
                stride = builder->CreateIntCast(stride, i64_t, true);
            }
            if (try_vector_predication_intrinsic("llvm.experimental.vp.strided.load",
                                                 VPResultType(slice_type, 0), slice_lanes, vp_slice_mask,
                                                 {VPArg(vec_ptr, 1, align_bytes), VPArg(stride, 2)})) {
                load_inst = dyn_cast<Instruction>(value);
            } else {
                internal_error << "Vector predicated strided load should not be requested if not supported.\n";
            }
        } else {
            if (try_vector_predication_intrinsic("llvm.vp.load", VPResultType(slice_type, 0),
                                                 slice_lanes, vp_slice_mask,
                                                 {VPArg(vec_ptr, 1, align_bytes)})) {
                load_inst = dyn_cast<Instruction>(value);
            } else {
                if (slice_mask != nullptr) {
                    load_inst = builder->CreateMaskedLoad(slice_type, vec_ptr, llvm::Align(align_bytes), slice_mask);
                } else {
                    load_inst = builder->CreateAlignedLoad(slice_type, vec_ptr, llvm::Align(align_bytes));
                }
            }
        }
        add_tbaa_metadata(load_inst, name, slice_index);
        slices.push_back(load_inst);
    }
    value = concat_vectors(slices);
    return value;
}

Value *CodeGen_LLVM::codegen_dense_vector_load(const Load *load, Value *vpred, bool slice_to_native) {
    const Ramp *ramp = load->index.as<Ramp>();
    internal_assert(ramp && is_const_one(ramp->stride)) << "Should be dense vector load\n";
    return codegen_vector_load(load->type, load->name, ramp->base, load->image, load->param,
                               load->alignment, vpred, slice_to_native, nullptr);
}
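// [Added commentary, not in the original source] A predicated load is
// dispatched to one of four strategies below: a dense (stride 1) ramp uses
// a masked or llvm.vp.load directly; other constant strides can use
// llvm.experimental.vp.strided.load when vector predication is available;
// without vector predication, stride -1 is handled as a flipped dense load;
// anything else is rewritten as if_then_else(predicate, load) and scalarized.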
void CodeGen_LLVM::codegen_predicated_load(const Load *op) {
    const Ramp *ramp = op->index.as<Ramp>();
    const IntImm *stride = ramp ? ramp->stride.as<IntImm>() : nullptr;

    if (ramp && is_const_one(ramp->stride)) {
        // Dense vector load
        Value *vpred = codegen(op->predicate);
        value = codegen_dense_vector_load(op, vpred);
    } else if (use_llvm_vp_intrinsics && stride) {
        // Case only handled by vector predication, otherwise must scalarize.
        Value *vpred = codegen(op->predicate);
        Value *llvm_stride = codegen(stride);  // Not 1 (dense) as that was caught above.
        value = codegen_vector_load(op->type, op->name, ramp->base, op->image, op->param,
                                    op->alignment, vpred, true, llvm_stride);
    } else if (ramp && stride && stride->value == -1) {
        debug(4) << "Predicated dense vector load with stride -1\n\t" << Expr(op) << "\n";
        vector<int> indices(ramp->lanes);
        for (int i = 0; i < ramp->lanes; i++) {
            indices[i] = ramp->lanes - 1 - i;
        }

        // Flip the predicate
        Value *vpred = codegen(op->predicate);
        vpred = shuffle_vectors(vpred, indices);

        // Load the vector and then flip it in-place
        Expr flipped_base = ramp->base - ramp->lanes + 1;
        Expr flipped_stride = make_one(flipped_base.type());
        Expr flipped_index = Ramp::make(flipped_base, flipped_stride, ramp->lanes);
        ModulusRemainder align = op->alignment;
        align = align - (ramp->lanes - 1);

        Expr flipped_load = Load::make(op->type, op->name, flipped_index,
                                       op->image, op->param, const_true(op->type.lanes()), align);

        Value *flipped = codegen_dense_vector_load(flipped_load.as<Load>(), vpred);
        value = shuffle_vectors(flipped, indices);
    } else {
        // It's not a dense vector load; we need to scalarize it.
        Expr load_expr = Load::make(op->type, op->name, op->index, op->image,
                                    op->param, const_true(op->type.lanes()), op->alignment);
        debug(4) << "Scalarize predicated vector load\n\t" << load_expr << "\n";
        Expr pred_load = Call::make(load_expr.type(),
                                    Call::if_then_else,
                                    {op->predicate, load_expr},
                                    Internal::Call::PureIntrinsic);
        value = codegen(pred_load);
    }
}

void CodeGen_LLVM::codegen_atomic_rmw(const Store *op) {
    // TODO: predicated store (see https://github.com/halide/Halide/issues/4298).
    user_assert(is_const_one(op->predicate)) << "Atomic predicated store is not supported.\n";

    // Detect whether we can describe this as an atomic-read-modify-write;
    // otherwise fall back to a compare-and-swap loop.
    // Currently we only test for atomicAdd.
    Expr val_expr = op->value;
    Halide::Type value_type = op->value.type();

    // For atomicAdd, we check if op->value - store[index] is independent of store.
    // For llvm version < 9, the atomicRMW operations only support integers, so we also check that.
    Expr equiv_load = Load::make(value_type, op->name,
                                 op->index,
                                 Buffer<>(),
                                 op->param,
                                 op->predicate,
                                 op->alignment);
    Expr delta = simplify(common_subexpression_elimination(op->value - equiv_load));
    bool is_atomic_add = supports_atomic_add(value_type) && !expr_uses_var(delta, op->name);
    if (is_atomic_add) {
        Value *val = codegen(delta);
        if (value_type.is_scalar()) {
            Value *ptr = codegen_buffer_pointer(op->name, op->value.type(), op->index);
            if (value_type.is_float()) {
                builder->CreateAtomicRMW(AtomicRMWInst::FAdd, ptr, val, llvm::MaybeAlign(), AtomicOrdering::Monotonic);
            } else {
                builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, val, llvm::MaybeAlign(), AtomicOrdering::Monotonic);
            }
        } else {
            Value *index = codegen(op->index);
            // Scalarize vector store.
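            // [Added commentary, not in the original source] LLVM's atomicrmw
            // instruction operates on scalars, so a vector atomic add is
            // emitted as one atomicrmw (Add or FAdd) per lane.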
            for (int i = 0; i < value_type.lanes(); i++) {
                Value *lane = ConstantInt::get(i32_t, i);
                Value *idx = builder->CreateExtractElement(index, lane);
                Value *v = builder->CreateExtractElement(val, lane);
                Value *ptr = codegen_buffer_pointer(op->name, value_type.element_of(), idx);
                if (value_type.is_float()) {
                    builder->CreateAtomicRMW(AtomicRMWInst::FAdd, ptr, v, llvm::MaybeAlign(), AtomicOrdering::Monotonic);
                } else {
                    builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, v, llvm::MaybeAlign(), AtomicOrdering::Monotonic);
                }
            }
        }
    } else {
        // We want to create the following CAS loop:
        // entry:
        //   %orig = load atomic op->name[op->index]
        //   br label %casloop.start
        // casloop.start:
        //   %cmp = phi [%orig, %entry], [%val_loaded, %casloop.start]
        //   %val = ...
        //   %val_success = cmpxchg %ptr, %cmp, %val, monotonic
        //   %val_loaded = extractvalue %val_success, 0
        //   %success = extractvalue %val_success, 1
        //   br %success, label %casloop.end, label %casloop.start
        // casloop.end:
        Value *vec_index = nullptr;
        if (!value_type.is_scalar()) {
            // Precompute index for vector store.
            vec_index = codegen(op->index);
        }
        // Scalarize vector store.
        for (int lane_id = 0; lane_id < value_type.lanes(); lane_id++) {
            LLVMContext &ctx = builder->getContext();
            BasicBlock *bb = builder->GetInsertBlock();
            llvm::Function *f = bb->getParent();
            BasicBlock *loop_bb = BasicBlock::Create(ctx, "casloop.start", f);
            // Load the old value for the compare-and-swap test.
            Value *ptr = nullptr;
            if (value_type.is_scalar()) {
                ptr = codegen_buffer_pointer(op->name, value_type, op->index);
            } else {
                Value *idx = builder->CreateExtractElement(vec_index, ConstantInt::get(i32_t, lane_id));
                ptr = codegen_buffer_pointer(op->name, value_type.element_of(), idx);
            }
            llvm::Type *load_type = llvm_type_of(value_type.element_of());
            LoadInst *orig = builder->CreateAlignedLoad(load_type, ptr, llvm::Align(value_type.bytes()));
            orig->setOrdering(AtomicOrdering::Monotonic);
            add_tbaa_metadata(orig, op->name, op->index);
            // Explicit fall-through from the current block to the CAS loop body.
            builder->CreateBr(loop_bb);

            // CAS loop body:
            builder->SetInsertPoint(loop_bb);
            PHINode *cmp = builder->CreatePHI(load_type, 2, "loaded");
            Value *cmp_val = cmp;
            cmp->addIncoming(orig, bb);
            Value *val = nullptr;
            if (value_type.is_scalar()) {
                val = codegen(op->value);
            } else {
                val = codegen(extract_lane(op->value, lane_id));
            }
            llvm::Type *val_type = val->getType();
            bool need_bit_cast = val_type->isFloatingPointTy();
            if (need_bit_cast) {
                IntegerType *int_type = builder->getIntNTy(val_type->getPrimitiveSizeInBits());
                unsigned int addr_space = ptr->getType()->getPointerAddressSpace();
                ptr = builder->CreateBitCast(ptr, int_type->getPointerTo(addr_space));
                val = builder->CreateBitCast(val, int_type);
                cmp_val = builder->CreateBitCast(cmp_val, int_type);
            }
            Value *cmpxchg_pair = builder->CreateAtomicCmpXchg(
                ptr, cmp_val, val, llvm::MaybeAlign(),
                AtomicOrdering::Monotonic, AtomicOrdering::Monotonic);
            Value *val_loaded = builder->CreateExtractValue(cmpxchg_pair, 0, "val_loaded");
            Value *success = builder->CreateExtractValue(cmpxchg_pair, 1, "success");
            if (need_bit_cast) {
                val_loaded = builder->CreateBitCast(val_loaded, val_type);
            }
            cmp->addIncoming(val_loaded, loop_bb);
            BasicBlock *exit_bb = BasicBlock::Create(ctx, "casloop.end", f);
            builder->CreateCondBr(success, exit_bb, loop_bb);
            builder->SetInsertPoint(exit_bb);
        }
    }
}

void CodeGen_LLVM::visit(const Call *op) {
    internal_assert(op->is_extern() || op->is_intrinsic())
        << "Can only codegen extern calls and intrinsics\n";

    value = call_overloaded_intrin(op->type, op->name, op->args);
    if (value) {
        return;
    }

    // Some call nodes are actually injected at various stages as a
    // cue for llvm to generate particular ops. In general these are
    // handled in the standard library, but ones with e.g. varying
    // types are handled here.
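    // [Added commentary, not in the original source] call_overloaded_intrin
    // above gives target backends a first chance to map the call onto a
    // declared, type-overloaded intrinsic; only when it returns nullptr do
    // we fall through to the generic lowering chain below.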
    if (op->is_intrinsic(Call::debug_to_file)) {
        internal_assert(op->args.size() == 3);
        const StringImm *filename = op->args[0].as<StringImm>();
        internal_assert(filename) << "Malformed debug_to_file node\n";
        // Grab the function from the initial module
        llvm::Function *debug_to_file = module->getFunction("halide_debug_to_file");
        internal_assert(debug_to_file) << "Could not find halide_debug_to_file function in initial module\n";

        // Make the filename a global string constant
        Value *user_context = get_user_context();
        Value *char_ptr = codegen(Expr(filename));
        vector<Value *> args = {user_context, char_ptr, codegen(op->args[1])};

        Value *buffer = codegen(op->args[2]);
        buffer = builder->CreatePointerCast(buffer, debug_to_file->getFunctionType()->getParamType(3));
        args.push_back(buffer);

        value = builder->CreateCall(debug_to_file, args);
    } else if (op->is_intrinsic(Call::bitwise_and)) {
        internal_assert(op->args.size() == 2);
        Value *a = codegen(op->args[0]);
        Value *b = codegen(op->args[1]);
        if (!try_vector_predication_intrinsic("llvm.vp.and", llvm_type_of(op->type), op->type.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateAnd(a, b);
        }
    } else if (op->is_intrinsic(Call::bitwise_xor)) {
        internal_assert(op->args.size() == 2);
        Value *a = codegen(op->args[0]);
        Value *b = codegen(op->args[1]);
        if (!try_vector_predication_intrinsic("llvm.vp.xor", llvm_type_of(op->type), op->type.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateXor(a, b);
        }
    } else if (op->is_intrinsic(Call::bitwise_or)) {
        internal_assert(op->args.size() == 2);
        Value *a = codegen(op->args[0]);
        Value *b = codegen(op->args[1]);
        if (!try_vector_predication_intrinsic("llvm.vp.or", llvm_type_of(op->type), op->type.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
            value = builder->CreateOr(a, b);
        }
    } else if (op->is_intrinsic(Call::bitwise_not)) {
        internal_assert(op->args.size() == 1);
        Value *a = codegen(op->args[0]);
        if (!try_vector_predication_intrinsic("llvm.vp.not", llvm_type_of(op->type), op->type.lanes(),
                                              AllEnabledMask(), {VPArg(a, 0)})) {
            value = builder->CreateNot(a);
        }
    } else if (op->is_intrinsic(Call::shift_left)) {
        internal_assert(op->args.size() == 2);
        if (op->args[1].type().is_uint()) {
            Value *a = codegen(op->args[0]);
            Value *b = codegen(op->args[1]);
            if (!try_vector_predication_intrinsic("llvm.vp.shl", llvm_type_of(op->type), op->type.lanes(),
                                                  AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
                value = builder->CreateShl(a, b);
            }
        } else {
            value = codegen(lower_signed_shift_left(op->args[0], op->args[1]));
        }
    } else if (op->is_intrinsic(Call::shift_right)) {
        internal_assert(op->args.size() == 2);
        if (op->args[1].type().is_uint()) {
            Value *a = codegen(op->args[0]);
            Value *b = codegen(op->args[1]);
            if (op->type.is_int()) {
                if (!try_vector_predication_intrinsic("llvm.vp.ashr", llvm_type_of(op->type), op->type.lanes(),
                                                      AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
                    value = builder->CreateAShr(a, b);
                }
            } else {
                if (!try_vector_predication_intrinsic("llvm.vp.lshr", llvm_type_of(op->type), op->type.lanes(),
                                                      AllEnabledMask(), {VPArg(a, 0), VPArg(b)})) {
                    value = builder->CreateLShr(a, b);
                }
            }
        } else {
            value = codegen(lower_signed_shift_right(op->args[0], op->args[1]));
        }
    } else if (op->is_intrinsic(Call::abs)) {
        internal_assert(op->args.size() == 1);
        // Generate select(x >= 0, x, -x) instead
        string x_name = unique_name('x');
        Expr x = Variable::make(op->args[0].type(), x_name);
        value = codegen(Let::make(x_name, op->args[0], select(x >= 0, x, -x)));
    } else if (op->is_intrinsic(Call::absd)) {
        internal_assert(op->args.size() == 2);

        Expr a = op->args[0];
        Expr b = op->args[1];

        string a_name = unique_name('a');
        string b_name = unique_name('b');
        Expr a_var = Variable::make(op->args[0].type(), a_name);
        Expr b_var = Variable::make(op->args[1].type(), b_name);
        Expr cond = a_var < b_var;
        // Cast to unsigned because we want wrapping semantics on the subtract
        // in the signed case.
        a_var = cast(op->type, a_var);
        b_var = cast(op->type, b_var);
        codegen(Let::make(a_name, op->args[0],
                          Let::make(b_name, op->args[1],
                                    Select::make(cond, b_var - a_var, a_var - b_var))));
    } else if (op->is_intrinsic(Call::div_round_to_zero)) {
        // See if we can rewrite it to something faster (e.g. a shift)
        Expr e = lower_int_uint_div(op->args[0], op->args[1], /* round to zero */ true);
        if (!e.as<Call>()) {
            codegen(e);
            return;
        }
        internal_assert(op->args.size() == 2);
        Value *a = codegen(op->args[0]);
        Value *b = codegen(op->args[1]);
        if (op->type.is_int()) {
            value = builder->CreateSDiv(a, b);
        } else if (op->type.is_uint()) {
            value = builder->CreateUDiv(a, b);
        } else {
            internal_error << "div_round_to_zero of non-integer type.\n";
        }
    } else if (op->is_intrinsic(Call::mod_round_to_zero)) {
        internal_assert(op->args.size() == 2);
        Value *a = codegen(op->args[0]);
        Value *b = codegen(op->args[1]);
        if (op->type.is_int()) {
            value = builder->CreateSRem(a, b);
        } else if (op->type.is_uint()) {
            value = builder->CreateURem(a, b);
        } else {
            internal_error << "mod_round_to_zero of non-integer type.\n";
        }
    } else if (op->is_intrinsic(Call::lerp)) {
        internal_assert(op->args.size() == 3);
        // If we need to upgrade the type, do the entire lerp in the
        // upgraded type for better precision.
        // TODO: This might be surprising behavior?
        Type t = upgrade_type_for_arithmetic(op->type);
        Type wt = upgrade_type_for_arithmetic(op->args[2].type());
        Expr e = lower_lerp(op->type,
                            cast(t, op->args[0]),
                            cast(t, op->args[1]),
                            cast(wt, op->args[2]),
                            target);
        codegen(e);
    } else if (op->is_intrinsic(Call::popcount)) {
        internal_assert(op->args.size() == 1);
        std::vector<llvm::Type *> arg_type(1);
        arg_type[0] = llvm_type_of(op->args[0].type());
        llvm::Function *fn = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::ctpop, arg_type);
        Value *a = codegen(op->args[0]);
        CallInst *call = builder->CreateCall(fn, a);
        value = call;
    } else if (op->is_intrinsic(Call::count_leading_zeros) ||
               op->is_intrinsic(Call::count_trailing_zeros)) {
        internal_assert(op->args.size() == 1);
        std::vector<llvm::Type *> arg_type(1);
        arg_type[0] = llvm_type_of(op->args[0].type());
        llvm::Function *fn = llvm::Intrinsic::getDeclaration(module.get(),
                                                             op->is_intrinsic(Call::count_leading_zeros) ?
                                                                 llvm::Intrinsic::ctlz :
                                                                 llvm::Intrinsic::cttz,
                                                             arg_type);
        llvm::Value *is_const_zero_poison = llvm::ConstantInt::getFalse(*context);
        llvm::Value *args[2] = {codegen(op->args[0]), is_const_zero_poison};
        CallInst *call = builder->CreateCall(fn, args);
        value = call;
    } else if (op->is_intrinsic(Call::return_second)) {
        internal_assert(op->args.size() == 2);
        codegen(op->args[0]);
        value = codegen(op->args[1]);
    } else if (op->is_intrinsic(Call::if_then_else)) {
        Expr cond = op->args[0];
        if (const Broadcast *b = cond.as<Broadcast>()) {
            cond = b->value;
        }
        if (cond.type().is_vector()) {
            scalarize(op);
        } else {
            internal_assert(op->args.size() == 2 || op->args.size() == 3);

            BasicBlock *true_bb = BasicBlock::Create(*context, "true_bb", function);
            BasicBlock *false_bb = BasicBlock::Create(*context, "false_bb", function);
            BasicBlock *after_bb = BasicBlock::Create(*context, "after_bb", function);
            Value *c = codegen(cond);
            if (c->getType() != i1_t) {
                c = builder->CreateIsNotNull(c);
            }
            builder->CreateCondBr(c, true_bb, false_bb);

            builder->SetInsertPoint(true_bb);
            Value *true_value = codegen(op->args[1]);
            builder->CreateBr(after_bb);
            BasicBlock *true_pred = builder->GetInsertBlock();

            builder->SetInsertPoint(false_bb);
            Value *false_value = codegen(op->args.size() == 3 ? op->args[2] : make_zero(op->type));
            builder->CreateBr(after_bb);
            BasicBlock *false_pred = builder->GetInsertBlock();

            builder->SetInsertPoint(after_bb);
            PHINode *phi = builder->CreatePHI(true_value->getType(), 2);
            phi->addIncoming(true_value, true_pred);
            phi->addIncoming(false_value, false_pred);

            value = phi;
        }
    } else if (op->is_intrinsic(Call::round)) {
        value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
    } else if (op->is_intrinsic(Call::require)) {
        internal_assert(op->args.size() == 3);
        Expr cond = op->args[0];
        if (cond.type().is_vector()) {
            scalarize(op);
        } else {
            Value *c = codegen(cond);
            create_assertion(c, op->args[2]);
            value = codegen(op->args[1]);
        }
    } else if (op->is_intrinsic(Call::make_struct)) {
        if (op->type.is_vector()) {
            // Make a vector of pointers to distinct structs
            scalarize(op);
        } else if (op->args.empty()) {
            // Empty structs can be emitted for arrays of size zero
            // (e.g. the shape of a zero-dimensional buffer). We
            // generate a null in this situation.
            value = ConstantPointerNull::get(dyn_cast<llvm::PointerType>(llvm_type_of(op->type)));
        } else {
            // Codegen each element.
            bool all_same_type = true;
            vector<llvm::Value *> args(op->args.size());
            vector<llvm::Type *> types(op->args.size());
            for (size_t i = 0; i < op->args.size(); i++) {
                args[i] = codegen(op->args[i]);
                types[i] = args[i]->getType();
                all_same_type &= (types[0] == types[i]);
            }

            // Use either a single scalar, a fixed-size array, or a
            // struct. The struct type would always be correct, but
            // the array or scalar type produce slightly simpler IR.
            if (args.size() == 1) {
                value = create_alloca_at_entry(types[0], 1);
                builder->CreateStore(args[0], value);
            } else {
                llvm::Type *aggregate_t = (all_same_type ?
                                               (llvm::Type *)ArrayType::get(types[0], types.size()) :
                                               (llvm::Type *)llvm::StructType::get(*context, types));

                value = create_alloca_at_entry(aggregate_t, 1);
                struct_type_recovery[value] = aggregate_t;
                for (size_t i = 0; i < args.size(); i++) {
                    Value *elem_ptr = builder->CreateConstInBoundsGEP2_32(aggregate_t, value, 0, i);
                    builder->CreateStore(args[i], elem_ptr);
                }
            }
        }
    } else if (op->is_intrinsic(Call::load_typed_struct_member)) {
        // Given a void * instance of a typed struct, an in-scope prototype
        // struct of the same type, and the index of a slot, load the value of
        // that slot.
        //
        // It is assumed that the slot index is valid for the given typed struct.
        //
        // TODO: this comment is replicated in CodeGen_LLVM and should be updated there too.
        // TODO: https://github.com/halide/Halide/issues/6468
        internal_assert(op->args.size() == 3);
        llvm::Value *struct_instance = codegen(op->args[0]);
        llvm::Value *struct_prototype = codegen(op->args[1]);
        llvm::Value *typed_struct_instance = builder->CreatePointerCast(struct_instance, struct_prototype->getType());
        const int64_t *index = as_const_int(op->args[2]);

        // make_struct can use a fixed-size struct, an array type, or a scalar
        llvm::Type *pointee_type;
        auto iter = struct_type_recovery.find(struct_prototype);
        if (iter != struct_type_recovery.end()) {
            pointee_type = iter->second;
        } else {
            pointee_type = llvm_type_of(op->type);
        }
        llvm::StructType *struct_type = llvm::dyn_cast<llvm::StructType>(pointee_type);
        llvm::ArrayType *array_type = llvm::dyn_cast<llvm::ArrayType>(pointee_type);
        if (struct_type || array_type) {
            internal_assert(index != nullptr);
            llvm::Value *gep = CreateInBoundsGEP(builder.get(), pointee_type, typed_struct_instance,
                                                 {ConstantInt::get(i32_t, 0), ConstantInt::get(i32_t, (int)*index)});
            llvm::Type *result_type = struct_type ?
                                          struct_type->getElementType(*index) :
                                          array_type->getArrayElementType();
            value = builder->CreateLoad(result_type, gep);
        } else {
            // The struct is actually just a scalar
            internal_assert(index == nullptr || *index == 0);
            value = builder->CreateLoad(pointee_type, typed_struct_instance);
        }
    } else if (op->is_intrinsic(Call::get_user_context)) {
        internal_assert(op->args.empty());
        value = get_user_context();
    } else if (op->is_intrinsic(Call::saturating_add) || op->is_intrinsic(Call::saturating_sub)) {
        internal_assert(op->args.size() == 2);

        // Try to fold the vector reduce for a call to saturating_add
        const bool folded = op->is_intrinsic(Call::saturating_add) &&
                            try_to_fold_vector_reduce<Call>(op->args[0], op->args[1]);

        if (!folded) {
            std::string intrin;
            if (op->type.is_int()) {
                intrin = "llvm.s";
            } else {
                internal_assert(op->type.is_uint());
                intrin = "llvm.u";
            }
            if (op->is_intrinsic(Call::saturating_add)) {
                intrin += "add.sat.";
            } else {
                internal_assert(op->is_intrinsic(Call::saturating_sub));
                intrin += "sub.sat.";
            }
            if (op->type.lanes() > 1) {
                int lanes = op->type.lanes();
                llvm::Type *llvm_type = llvm_type_of(op->type);
                if (isa<ScalableVectorType>(llvm_type)) {
                    internal_assert((effective_vscale != 0) && ((lanes % effective_vscale) == 0));
                    intrin += "nx";
                    lanes /= effective_vscale;
                }
                intrin += "v" + std::to_string(lanes);
            }
            intrin += "i" + std::to_string(op->type.bits());
            value = call_intrin(op->type, op->type.lanes(), intrin, op->args);
        }
    } else if (op->is_intrinsic(Call::stringify)) {
        internal_assert(!op->args.empty());

        if (op->type.is_vector()) {
            scalarize(op);
        } else {

            // Compute the maximum possible size of the message.
            int buf_size = 1;  // One for the terminating zero.
            for (const auto &arg : op->args) {
                Type t = arg.type();
                if (arg.as<StringImm>()) {
                    buf_size += arg.as<StringImm>()->value.size();
                } else if (t.is_int() || t.is_uint()) {
                    buf_size += 19;  // 2^64 = 18446744073709551616
                } else if (t.is_float()) {
                    if (t.bits() == 32) {
                        buf_size += 47;  // %f format of max negative float
                    } else {
                        buf_size += 14;  // Scientific notation with 6 decimal places.
                    }
                } else if (t == type_of<halide_buffer_t *>()) {
                    // Not a strict upper bound (there isn't one), but ought to be enough for most buffers.
                    buf_size += 512;
                } else {
                    internal_assert(t.is_handle());
                    buf_size += 18;  // 0x0123456789abcdef
                }
            }
            // Round up to a multiple of 16 bytes.
            buf_size = ((buf_size + 15) / 16) * 16;

            // Clamp to at most 8k.
            buf_size = std::min(8 * 1024, buf_size);

            // Allocate a stack array to hold the message.
            llvm::Value *buf = create_alloca_at_entry(i8_t, buf_size);

            llvm::Value *dst = buf;
            llvm::Value *buf_end = CreateConstGEP1_32(builder.get(), i8_t, buf, buf_size);

            llvm::Function *append_string = module->getFunction("halide_string_to_string");
            llvm::Function *append_int64 = module->getFunction("halide_int64_to_string");
            llvm::Function *append_uint64 = module->getFunction("halide_uint64_to_string");
            llvm::Function *append_double = module->getFunction("halide_double_to_string");
            llvm::Function *append_pointer = module->getFunction("halide_pointer_to_string");
            llvm::Function *append_buffer = module->getFunction("halide_buffer_to_string");

            internal_assert(append_string);
            internal_assert(append_int64);
            internal_assert(append_uint64);
            internal_assert(append_double);
            internal_assert(append_pointer);
            internal_assert(append_buffer);

            for (const auto &arg : op->args) {
                const StringImm *s = arg.as<StringImm>();
                Type t = arg.type();
                internal_assert(t.lanes() == 1);
                vector<Value *> call_args(2);
                call_args[0] = dst;
                call_args[1] = buf_end;

                if (s) {
                    call_args.push_back(codegen(arg));
                    dst = builder->CreateCall(append_string, call_args);
                } else if (t.is_bool()) {
                    Value *a = codegen(arg);
                    Value *t = codegen(StringImm::make("true"));
                    Value *f = codegen(StringImm::make("false"));
                    call_args.push_back(builder->CreateSelect(a, t, f));
                    dst = builder->CreateCall(append_string, call_args);
                } else if (t.is_int()) {
                    call_args.push_back(codegen(Cast::make(Int(64), arg)));
                    call_args.push_back(ConstantInt::get(i32_t, 1));
                    dst = builder->CreateCall(append_int64, call_args);
                } else if (t.is_uint()) {
                    call_args.push_back(codegen(Cast::make(UInt(64), arg)));
                    call_args.push_back(ConstantInt::get(i32_t, 1));
                    dst = builder->CreateCall(append_uint64, call_args);
                } else if (t.is_float()) {
                    call_args.push_back(codegen(Cast::make(Float(64), arg)));
                    // Use scientific notation for doubles
                    call_args.push_back(ConstantInt::get(i32_t, t.bits() == 64 ? 1 : 0));
                    dst = builder->CreateCall(append_double, call_args);
                } else if (t == type_of<halide_buffer_t *>()) {
                    Value *buf = codegen(arg);
                    buf = builder->CreatePointerCast(buf, append_buffer->getFunctionType()->getParamType(2));
                    call_args.push_back(buf);
                    dst = builder->CreateCall(append_buffer, call_args);
                } else {
                    internal_assert(t.is_handle());
                    Value *ptr = codegen(arg);
                    ptr = builder->CreatePointerCast(ptr, i8_t->getPointerTo());
                    call_args.push_back(ptr);
                    dst = builder->CreateCall(append_pointer, call_args);
                }
            }
            if (get_target().has_feature(Target::MSAN)) {
                // Note that we mark the entire buffer as initialized;
                // it would be more accurate to just mark (dst - buf)
                llvm::Function *annotate = module->getFunction("halide_msan_annotate_memory_is_initialized");
                vector<Value *> annotate_args(3);
                annotate_args[0] = get_user_context();
                annotate_args[1] = buf;
                annotate_args[2] = codegen(Cast::make(Int(64), buf_size));
                builder->CreateCall(annotate, annotate_args);
            }
            value = buf;
        }
    } else if (op->is_intrinsic(Call::memoize_expr)) {
        // Used as an annotation for caching, should be invisible to
        // codegen. Ignore arguments beyond the first as they are only
        // used in the cache key.
        internal_assert(!op->args.empty());
        value = codegen(op->args[0]);
    } else if (op->is_intrinsic(Call::alloca)) {
        // The argument is the number of bytes. For now it must be
        // const, or a call to size_of_halide_buffer_t.
        internal_assert(op->args.size() == 1);

        // We can generate slightly cleaner IR with fewer alignment
        // restrictions if we recognize the most common types we
        // expect to get alloca'd.
        const Call *call = op->args[0].as<Call>();
        const int64_t *sz = as_const_int(op->args[0]);
        if (op->type == type_of<struct halide_buffer_t *>() &&
            call && call->is_intrinsic(Call::size_of_halide_buffer_t)) {
            value = create_alloca_at_entry(halide_buffer_t_type, 1);
        } else if (op->type == type_of<struct halide_semaphore_t *>() &&
                   semaphore_t_type != nullptr &&
                   sz && *sz == 16) {
            value = create_alloca_at_entry(semaphore_t_type, 1);
        } else {
            internal_assert(sz != nullptr);
            if (op->type == type_of<struct halide_dimension_t *>()) {
                value = create_alloca_at_entry(dimension_t_type, *sz / sizeof(halide_dimension_t));
            } else {
                // Just use an i8* and make the users bitcast it.
                value = create_alloca_at_entry(i8_t, *sz);
            }
        }
    } else if (op->is_intrinsic(Call::register_destructor)) {
        internal_assert(op->args.size() == 2);
        const StringImm *fn = op->args[0].as<StringImm>();
        internal_assert(fn);
        llvm::Function *f = module->getFunction(fn->value);
        if (!f) {
            llvm::Type *arg_types[] = {i8_t->getPointerTo(), i8_t->getPointerTo()};
            FunctionType *func_t = FunctionType::get(void_t, arg_types, false);
            f = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, fn->value, module.get());
            f->setCallingConv(CallingConv::C);
        }
        internal_assert(op->args[1].type().is_handle());
        Value *arg = codegen(op->args[1]);
        value = register_destructor(f, arg, Always);
    } else if (op->is_intrinsic(Call::call_cached_indirect_function)) {
        // Arguments to call_cached_indirect_function are of the form
        //
        //    cond_1, "sub_function_name_1",
        //    cond_2, "sub_function_name_2",
        //    ...
        //    cond_N, "sub_function_name_N"
        //
        // This will generate code that corresponds (roughly) to
        //
        //    static FunctionPtr f = []{
        //      if (cond_1) return sub_function_name_1;
        //      if (cond_2) return sub_function_name_2;
        //      ...
        //      if (cond_N) return sub_function_name_N;
        //    }
        //    return f(args)
        //
        // i.e.: the conditions will be evaluated *in order*; the first one
        // evaluating to true will have its corresponding function cached,
        // which will be used to complete this (and all subsequent) calls.
        //
        // The final condition (cond_N) must evaluate to a constant TRUE
        // value (so that the final function will be selected if all others
        // fail); failure to do so will cause unpredictable results.
        //
        // There is currently no way to clear the cached function pointer.
        //
        // It is assumed/required that all of the conditions are "pure"; each
        // must evaluate to the same value (within a given runtime environment)
        // across multiple evaluations.
        //
        // It is assumed/required that all of the sub-functions have arguments
        // (and return values) that are identical to those of this->function.
        //
        // Note that we require >= 4 arguments: fewer would imply
        // only one condition+function pair, which is pointless to use
        // (the function should always be called directly).
        //
        internal_assert(op->args.size() >= 4);
        internal_assert(!(op->args.size() & 1));

        // Gather information we need about each function.
        struct SubFn {
            llvm::Function *fn;
            llvm::GlobalValue *fn_ptr;
            Expr cond;
        };
        vector<SubFn> sub_fns;
        for (size_t i = 0; i < op->args.size(); i += 2) {
            const string sub_fn_name = op->args[i + 1].as<StringImm>()->value;
            string extern_sub_fn_name = sub_fn_name;
            llvm::Function *sub_fn = module->getFunction(sub_fn_name);
            if (!sub_fn) {
                extern_sub_fn_name = get_mangled_names(sub_fn_name,
                                                       LinkageType::External,
                                                       NameMangling::Default,
                                                       current_function_args,
                                                       get_target())
                                         .extern_name;
                debug(1) << "Did not find function " << sub_fn_name
                         << ", assuming extern \"C\" " << extern_sub_fn_name << "\n";
                vector<llvm::Type *> arg_types;
                for (const auto &arg : function->args()) {
                    arg_types.push_back(arg.getType());
                }
                llvm::Type *result_type = llvm_type_of(upgrade_type_for_argument_passing(op->type));
                FunctionType *func_t = FunctionType::get(result_type, arg_types, false);
                sub_fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage,
                                                extern_sub_fn_name, module.get());
                sub_fn->setCallingConv(CallingConv::C);
            }

            llvm::GlobalValue *sub_fn_ptr = module->getNamedValue(extern_sub_fn_name);
            if (!sub_fn_ptr) {
                debug(1) << "Did not find function ptr " << extern_sub_fn_name
                         << ", assuming extern \"C\".\n";
                sub_fn_ptr = new GlobalVariable(*module, sub_fn->getType(),
                                                /*isConstant*/ true,
                                                GlobalValue::ExternalLinkage,
                                                /*initializer*/ nullptr,
                                                extern_sub_fn_name);
            }
            auto cond = op->args[i];
            sub_fns.push_back({sub_fn, sub_fn_ptr, cond});
        }

        // Create a null-initialized global to track this object.
        auto *const base_fn = sub_fns.back().fn;
        const string global_name = unique_name(base_fn->getName().str() + "_indirect_fn_ptr");
        GlobalVariable *global = new GlobalVariable(
            *module,
            base_fn->getType(),
            /*isConstant*/ false,
            GlobalValue::PrivateLinkage,
            ConstantPointerNull::get(base_fn->getType()),
            global_name);
        LoadInst *loaded_value = builder->CreateLoad(base_fn->getType(), global);
        BasicBlock *global_inited_bb = BasicBlock::Create(*context, "global_inited_bb", function);
        BasicBlock *global_not_inited_bb = BasicBlock::Create(*context, "global_not_inited_bb", function);
        BasicBlock *call_fn_bb = BasicBlock::Create(*context, "call_fn_bb", function);

        // Only init the global if not already inited.
        //
        // Note that we deliberately do not attempt to make this threadsafe via (e.g.) mutexes;
        // the requirements of the conditions above mean that multiple writes *should* only
        // be able to re-write the same value, which is harmless for our purposes, and
        // avoiding such code simplifies and speeds the resulting code.
        //
        // (Note that if we ever need to add a way to clear the cached function pointer,
        // we may need to reconsider this, to avoid amusingly horrible race conditions.)
builder->CreateCondBr(builder->CreateIsNotNull(loaded_value), global_inited_bb, global_not_inited_bb, very_likely_branch); // Build the not-already-inited case builder->SetInsertPoint(global_not_inited_bb); llvm::Value *selected_value = nullptr; for (int i = sub_fns.size() - 1; i >= 0; i--) { const auto sub_fn = sub_fns[i]; if (!selected_value) { selected_value = sub_fn.fn_ptr; } else { Value *c = codegen(sub_fn.cond); selected_value = builder->CreateSelect(c, sub_fn.fn_ptr, selected_value); } } builder->CreateStore(selected_value, global); builder->CreateBr(call_fn_bb); // Just an incoming edge for the Phi node builder->SetInsertPoint(global_inited_bb); builder->CreateBr(call_fn_bb); builder->SetInsertPoint(call_fn_bb); PHINode *phi = builder->CreatePHI(selected_value->getType(), 2); phi->addIncoming(selected_value, global_not_inited_bb); phi->addIncoming(loaded_value, global_inited_bb); std::vector call_args; for (auto &arg : function->args()) { call_args.push_back(&arg); } llvm::CallInst *call = builder->CreateCall(base_fn->getFunctionType(), phi, call_args); value = call; } else if (op->is_intrinsic(Call::prefetch)) { user_assert((op->args.size() == 4) && is_const_one(op->args[2])) << "Only prefetch of 1 cache line is supported.\n"; const Expr &base_address = op->args[0]; const Expr &base_offset = op->args[1]; // const Expr &extent0 = op->args[2]; // unused // const Expr &stride0 = op->args[3]; // unused llvm::Function *prefetch_fn = module->getFunction("_halide_prefetch"); internal_assert(prefetch_fn); vector args; args.push_back(codegen_buffer_pointer(codegen(base_address), op->type, base_offset)); // The first argument is a pointer, which has type i8*. We // need to cast the argument, which might be a pointer to a // different type. llvm::Type *ptr_type = prefetch_fn->getFunctionType()->params()[0]; args[0] = builder->CreateBitCast(args[0], ptr_type); builder->CreateCall(prefetch_fn, args); // Prefetch evaluates to zero of the prefetched type. value = codegen(make_zero(op->type)); } else if (op->is_intrinsic(Call::signed_integer_overflow)) { user_error << "Signed integer overflow occurred during constant-folding. Signed" " integer overflow for int32 and int64 is undefined behavior in" " Halide.\n"; } else if (op->is_intrinsic(Call::undef)) { user_error << "undef not eliminated before code generation. 
Please report this as a Halide bug.\n"; } else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) { llvm::DataLayout d(module.get()); value = ConstantInt::get(i32_t, (int)d.getTypeAllocSize(halide_buffer_t_type)); } else if (op->is_intrinsic(Call::strict_float)) { IRBuilder::FastMathFlagGuard guard(*builder); llvm::FastMathFlags safe_flags; safe_flags.clear(); builder->setFastMathFlags(safe_flags); builder->setDefaultFPMathTag(strict_fp_math_md); value = codegen(op->args[0]); } else if (is_float16_transcendental(op) && !supports_call_as_float16(op)) { value = codegen(lower_float16_transcendental_to_float32_equivalent(op)); } else if (op->is_intrinsic(Call::mux)) { value = codegen(lower_mux(op)); } else if (op->is_intrinsic(Call::extract_bits)) { value = codegen(lower_extract_bits(op)); } else if (op->is_intrinsic(Call::concat_bits)) { value = codegen(lower_concat_bits(op)); } else if (op->is_intrinsic()) { Expr lowered = lower_intrinsic(op); if (!lowered.defined()) { internal_error << "Unknown intrinsic " << op->name; } value = codegen(lowered); } else if (op->call_type == Call::PureExtern && op->name == "pow_f32") { internal_assert(op->args.size() == 2); Expr x = op->args[0]; Expr y = op->args[1]; Halide::Expr abs_x_pow_y = Internal::halide_exp(Internal::halide_log(abs(x)) * y); Halide::Expr nan_expr = Call::make(x.type(), "nan_f32", {}, Call::PureExtern); Expr iy = floor(y); Expr one = make_one(x.type()); Expr zero = make_zero(x.type()); Expr e = select(x > 0, abs_x_pow_y, // Strictly positive x y == 0.0f, one, // x^0 == 1 x == 0.0f, zero, // 0^y == 0 y != iy, nan_expr, // negative x to a non-integer power iy % 2 == 0, abs_x_pow_y, // negative x to an even power -abs_x_pow_y); // negative x to an odd power e = common_subexpression_elimination(e); e.accept(this); } else if (op->call_type == Call::PureExtern && op->name == "log_f32") { internal_assert(op->args.size() == 1); Expr e = Internal::halide_log(op->args[0]); e.accept(this); } else if (op->call_type == Call::PureExtern && op->name == "exp_f32") { internal_assert(op->args.size() == 1); Expr e = Internal::halide_exp(op->args[0]); e.accept(this); } else if (op->call_type == Call::PureExtern && (op->name == "is_nan_f32" || op->name == "is_nan_f64" || op->name == "is_nan_f16")) { internal_assert(op->args.size() == 1); Value *a = codegen(op->args[0]); /* NaNs are not supposed to exist in "no NaNs" compilation * mode, but it appears llvm special cases the unordered * compare instruction when the global NoNaNsFPMath option is * set and still checks for a NaN. However if the nnan flag is * set on the instruction itself, llvm treats the comparison * as always false. Thus we always turn off the per-instruction * fast-math flags for this instruction. I.e. it is always * treated as strict. Note that compilation may still be in * fast-math mode due to global options, but that's ok due to * the aforementioned special casing. 
*/ IRBuilder::FastMathFlagGuard guard(*builder); llvm::FastMathFlags safe_flags; safe_flags.clear(); builder->setFastMathFlags(safe_flags); builder->setDefaultFPMathTag(strict_fp_math_md); value = builder->CreateFCmpUNO(a, a); } else if (op->call_type == Call::PureExtern && (op->name == "is_inf_f32" || op->name == "is_inf_f64" || op->name == "is_inf_f16")) { internal_assert(op->args.size() == 1); IRBuilder::FastMathFlagGuard guard(*builder); llvm::FastMathFlags safe_flags; safe_flags.clear(); builder->setFastMathFlags(safe_flags); builder->setDefaultFPMathTag(strict_fp_math_md); // isinf(e) -> (fabs(e) == infinity) Expr e = op->args[0]; internal_assert(e.type().is_float()); Expr inf = e.type().max(); codegen(abs(e) == inf); } else if (op->call_type == Call::PureExtern && (op->name == "is_finite_f32" || op->name == "is_finite_f64" || op->name == "is_finite_f16")) { internal_assert(op->args.size() == 1); internal_assert(op->args[0].type().is_float()); IRBuilder::FastMathFlagGuard guard(*builder); llvm::FastMathFlags safe_flags; safe_flags.clear(); builder->setFastMathFlags(safe_flags); builder->setDefaultFPMathTag(strict_fp_math_md); // isfinite(e) -> (fabs(e) != infinity && !isnan(e)) -> (fabs(e) != infinity && e == e) Expr e = op->args[0]; internal_assert(e.type().is_float()); Expr inf = e.type().max(); codegen(abs(e) != inf && e == e); } else { // It's an extern call. std::string name; if (op->call_type == Call::ExternCPlusPlus) { user_assert(get_target().has_feature(Target::CPlusPlusMangling)) << "Target must specify C++ name mangling (\"c_plus_plus_name_mangling\") in order to call C++ externs. (" << op->name << ")\n"; std::vector namespaces; name = extract_namespaces(op->name, namespaces); std::vector mangle_args; for (const auto &arg : op->args) { mangle_args.emplace_back(arg); } name = cplusplus_function_mangled_name(name, namespaces, op->type, mangle_args, get_target()); } else { name = op->name; } // Codegen the args vector args(op->args.size()); for (size_t i = 0; i < op->args.size(); i++) { args[i] = codegen(op->args[i]); } llvm::Function *fn = module->getFunction(name); llvm::Type *result_type = llvm_type_of(upgrade_type_for_argument_passing(op->type)); // Add a user context arg as needed. It's never a vector. bool takes_user_context = function_takes_user_context(op->name); if (takes_user_context) { internal_assert(fn) << "External function " << op->name << " is marked as taking user_context, but is not in the runtime module. Check if runtime_api.cpp needs to be rebuilt.\n"; debug(4) << "Adding user_context to " << op->name << " args\n"; args.insert(args.begin(), get_user_context()); } // If we can't find it, declare it extern "C" if (!fn) { vector arg_types(args.size()); for (size_t i = 0; i < args.size(); i++) { arg_types[i] = args[i]->getType(); if (arg_types[i]->isVectorTy()) { VectorType *vt = dyn_cast(arg_types[i]); arg_types[i] = vt->getElementType(); } } llvm::Type *scalar_result_type = result_type; if (result_type->isVectorTy()) { VectorType *vt = dyn_cast(result_type); scalar_result_type = vt->getElementType(); } FunctionType *func_t = FunctionType::get(scalar_result_type, arg_types, false); fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get()); fn->setCallingConv(CallingConv::C); debug(4) << "Did not find " << op->name << ". 
Declared it extern \"C\".\n"; } else { debug(4) << "Found " << op->name << "\n"; // TODO: Say something more accurate here as there is now // partial information in the handle_type field, but it is // not clear it can be matched to the LLVM types and it is // not always there. // Halide's type system doesn't preserve pointer types // correctly (they just get called "Handle()"), so we may // need to pointer cast to the appropriate type. Only look at // fixed params (not varags) in llvm function. FunctionType *func_t = fn->getFunctionType(); for (size_t i = takes_user_context ? 1 : 0; i < std::min(args.size(), (size_t)(func_t->getNumParams())); i++) { Expr halide_arg = takes_user_context ? op->args[i - 1] : op->args[i]; if (halide_arg.type().is_handle()) { llvm::Type *t = func_t->getParamType(i); // Widen to vector-width as needed. If the // function doesn't actually take a vector, // individual lanes will be extracted below. if (halide_arg.type().is_vector() && !t->isVectorTy()) { t = get_vector_type(t, halide_arg.type().lanes()); } if (t != args[i]->getType()) { debug(4) << "Pointer casting argument to extern call: " << halide_arg << "\n"; args[i] = builder->CreatePointerCast(args[i], t); } } } } if (op->type.is_scalar()) { CallInst *call = builder->CreateCall(fn, args); if (op->is_pure()) { call->setDoesNotAccessMemory(); } value = call; } else { // Check if a vector version of the function already // exists at some useful width. pair vec = find_vector_runtime_function(name, op->type.lanes()); llvm::Function *vec_fn = vec.first; int w = vec.second; if (vec_fn) { value = call_intrin(llvm_type_of(op->type), w, get_llvm_function_name(vec_fn), args); } else { // No vector version found. Scalarize. Extract each simd // lane in turn and do one scalar call to the function. value = PoisonValue::get(result_type); for (int i = 0; i < op->type.lanes(); i++) { Value *idx = ConstantInt::get(i32_t, i); vector arg_lane(args.size()); for (size_t j = 0; j < args.size(); j++) { if (args[j]->getType()->isVectorTy()) { arg_lane[j] = builder->CreateExtractElement(args[j], idx); } else { arg_lane[j] = args[j]; } } CallInst *call = builder->CreateCall(fn, arg_lane); if (op->is_pure()) { call->setDoesNotAccessMemory(); } if (!call->getType()->isVoidTy()) { value = builder->CreateInsertElement(value, call, idx); } // otherwise leave it as undef. 
} } } } } void CodeGen_LLVM::visit(const Prefetch *op) { internal_error << "Prefetch encountered during codegen\n"; } void CodeGen_LLVM::visit(const Let *op) { sym_push(op->name, codegen(op->value)); value = codegen(op->body); sym_pop(op->name); } void CodeGen_LLVM::visit(const LetStmt *op) { sym_push(op->name, codegen(op->value)); codegen(op->body); sym_pop(op->name); } void CodeGen_LLVM::visit(const AssertStmt *op) { create_assertion(codegen(op->condition), op->message); } Constant *CodeGen_LLVM::create_string_constant(const string &s) { map::iterator iter = string_constants.find(s); if (iter == string_constants.end()) { vector data; data.reserve(s.size() + 1); data.insert(data.end(), s.begin(), s.end()); data.push_back(0); Constant *val = create_binary_blob(data, "str"); string_constants[s] = val; return val; } else { return iter->second; } } Constant *CodeGen_LLVM::create_binary_blob(const vector &data, const string &name, bool constant) { internal_assert(!data.empty()); llvm::Type *type = ArrayType::get(i8_t, data.size()); GlobalVariable *global = new GlobalVariable(*module, type, constant, GlobalValue::PrivateLinkage, nullptr, name); ArrayRef data_array((const unsigned char *)&data[0], data.size()); global->setInitializer(ConstantDataArray::get(*context, data_array)); size_t alignment = 32; size_t native_vector_bytes = (size_t)(native_vector_bits() / 8); if (data.size() > alignment && native_vector_bytes > alignment) { alignment = native_vector_bytes; } global->setAlignment(llvm::Align(alignment)); Constant *zero = ConstantInt::get(i32_t, 0); Constant *zeros[] = {zero, zero}; Constant *ptr = ConstantExpr::getInBoundsGetElementPtr(type, global, zeros); return ptr; } void CodeGen_LLVM::create_assertion(Value *cond, const Expr &message, llvm::Value *error_code) { internal_assert(!message.defined() || message.type() == Int(32)) << "Assertion result is not an int: " << message; if (target.has_feature(Target::NoAsserts)) { return; } // If the condition is a vector, fold it down to a scalar VectorType *vt = dyn_cast(cond->getType()); if (vt) { Value *scalar_cond = builder->CreateExtractElement(cond, ConstantInt::get(i32_t, 0)); for (int i = 1; i < get_vector_num_elements(vt); i++) { Value *lane = builder->CreateExtractElement(cond, ConstantInt::get(i32_t, i)); scalar_cond = builder->CreateAnd(scalar_cond, lane); } cond = scalar_cond; } // Make a new basic block for the assert BasicBlock *assert_fails_bb = BasicBlock::Create(*context, "assert failed", function); BasicBlock *assert_succeeds_bb = BasicBlock::Create(*context, "assert succeeded", function); // If the condition fails, enter the assert body, otherwise, enter the block after builder->CreateCondBr(cond, assert_succeeds_bb, assert_fails_bb, very_likely_branch); // Build the failure case builder->SetInsertPoint(assert_fails_bb); // Call the error handler if (!error_code) { error_code = codegen(message); } return_with_error_code(error_code); // Continue on using the success case builder->SetInsertPoint(assert_succeeds_bb); } void CodeGen_LLVM::return_with_error_code(llvm::Value *error_code) { // Branch to the destructor block, which cleans up and then bails out. BasicBlock *dtors = get_destructor_block(); // Hook up our error code to the phi node that the destructor block starts with. 
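// (Each call site of return_with_error_code becomes one more incoming
// edge on that phi; the destructor block runs the registered cleanups
// and then returns whichever error code flowed in.)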
PHINode *phi = dyn_cast<PHINode>(dtors->begin());
internal_assert(phi) << "The destructor block is supposed to start with a phi node\n";
phi->addIncoming(error_code, builder->GetInsertBlock());
builder->CreateBr(get_destructor_block());
}

void CodeGen_LLVM::visit(const ProducerConsumer *op) {
    producer_consumer_id++;
    string name;
    if (op->is_producer) {
        name = std::to_string(producer_consumer_id) + std::string("_produce_") + op->name;
    } else {
        name = std::to_string(producer_consumer_id) + std::string("_consume_") + op->name;
    }
    BasicBlock *produce = BasicBlock::Create(*context, name, function);
    builder->CreateBr(produce);
    builder->SetInsertPoint(produce);
    codegen(op->body);
}

void CodeGen_LLVM::visit(const For *op) {
    Value *min = codegen(op->min);
    Value *extent = codegen(op->extent);
    const Acquire *acquire = op->body.as<Acquire>();

    // TODO(zvookin): remove this after validating it doesn't happen
    internal_assert(!(op->for_type == ForType::Parallel ||
                      (op->for_type == ForType::Serial &&
                       acquire &&
                       !expr_uses_var(acquire->count, op->name))));

    if (op->for_type == ForType::Serial) {
        Value *max = builder->CreateNSWAdd(min, extent);

        BasicBlock *preheader_bb = builder->GetInsertBlock();

        // Make a new basic block for the loop
        for_loop_id++;
        BasicBlock *loop_bb = BasicBlock::Create(
            *context, std::to_string(for_loop_id) + std::string("_for_") + op->name, function);
        // Create the block that comes after the loop
        BasicBlock *after_bb = BasicBlock::Create(
            *context, std::to_string(for_loop_id) + std::string("_end_for_") + op->name, function);

        // If min < max, fall through to the loop bb
        Value *enter_condition = builder->CreateICmpSLT(min, max);
        builder->CreateCondBr(enter_condition, loop_bb, after_bb, very_likely_branch);
        builder->SetInsertPoint(loop_bb);

        // Make our phi node.
        PHINode *phi = builder->CreatePHI(i32_t, 2);
        phi->addIncoming(min, preheader_bb);

        // Within the loop, the variable is equal to the phi value
        sym_push(op->name, phi);

        // Emit the loop body
        codegen(op->body);

        // Update the counter
        Value *next_var = builder->CreateNSWAdd(phi, ConstantInt::get(i32_t, 1));

        // Add the back-edge to the phi node
        phi->addIncoming(next_var, builder->GetInsertBlock());

        // Maybe exit the loop
        Value *end_condition = builder->CreateICmpNE(next_var, max);
        builder->CreateCondBr(end_condition, loop_bb, after_bb);

        builder->SetInsertPoint(after_bb);

        // Pop the loop variable from the scope
        sym_pop(op->name);
    } else {
        internal_error << "Unknown type of For node. Only Serial and Parallel For nodes should survive down to codegen.\n";
    }
}

void CodeGen_LLVM::visit(const Store *op) {
    if (!emit_atomic_stores) {
        // Peel lets off the index to make us more likely to pattern
        // match a ramp.
        if (const Let *let = op->index.as<Let>()) {
            Stmt s = Store::make(op->name, op->value, let->body, op->param, op->predicate, op->alignment);
            codegen(LetStmt::make(let->name, let->value, s));
            return;
        }
    }

    // Fix up the type
    Halide::Type value_type = op->value.type();
    Halide::Type storage_type = upgrade_type_for_storage(value_type);
    if (value_type != storage_type) {
        Expr v = reinterpret(storage_type, op->value);
        codegen(Store::make(op->name, v, op->index, op->param, op->predicate, op->alignment));
        return;
    }

    if (inside_atomic_mutex_node) {
        user_assert(value_type.is_scalar())
            << "The vectorized atomic operation for the store " << op->name
            << " is lowered into a mutex lock, which does not support vectorization.\n";
    }

    bool recursive = (expr_uses_var(op->index, op->name) ||
                      expr_uses_var(op->value, op->name));
    // Issue atomic store if we are inside an atomic node.
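    // ("recursive" here means the value or index of the store refers to the
    // buffer being stored to, i.e. a genuine read-modify-write; plain
    // atomic stores that don't read the destination fall through to the
    // normal paths below with monotonic ordering attached.)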
if (emit_atomic_stores && recursive) { codegen_atomic_rmw(op); return; } // Predicated store. if (!is_const_one(op->predicate)) { codegen_predicated_store(op); return; } auto annotate_store = [&](StoreInst *store, const Expr &index) { add_tbaa_metadata(store, op->name, index); if (emit_atomic_stores) { store->setAtomic(AtomicOrdering::Monotonic); } }; Value *val = codegen(op->value); if (value_type.is_scalar()) { // Scalar Value *ptr = codegen_buffer_pointer(op->name, value_type, op->index); StoreInst *store = builder->CreateAlignedStore(val, ptr, llvm::Align(value_type.bytes())); annotate_store(store, op->index); } else if (const Let *let = op->index.as()) { Stmt s = Store::make(op->name, op->value, let->body, op->param, op->predicate, op->alignment); codegen(LetStmt::make(let->name, let->value, s)); } else { int alignment = value_type.bytes(); const Ramp *ramp = op->index.as(); // TODO(zvookin): consider splitting out vector predication path. Current // code shows how vector predication would simplify things as the // following scalarization cases would go away. bool is_dense = ramp && is_const_one(ramp->stride); if (use_llvm_vp_intrinsics || is_dense) { int native_bits = native_vector_bits(); int native_bytes = native_bits / 8; // Boost the alignment if possible, up to the native vector width. ModulusRemainder mod_rem = op->alignment; while ((mod_rem.remainder & 1) == 0 && (mod_rem.modulus & 1) == 0 && alignment < native_bytes) { mod_rem.modulus /= 2; mod_rem.remainder /= 2; alignment *= 2; } // If it is an external buffer, then we cannot assume that the host pointer // is aligned to at least the native vector width. However, we may be able to do // better than just assuming that it is unaligned. if (op->param.defined()) { int host_alignment = op->param.host_alignment(); alignment = gcd(alignment, host_alignment); } // For dense vector stores wider than the native vector // width, bust them up into native vectors. int store_lanes = value_type.lanes(); int native_lanes = maximum_vector_bits() / value_type.bits(); Expr base = (ramp != nullptr) ? ramp->base : 0; Expr stride = (ramp != nullptr) ? ramp->stride : 0; Value *stride_val = (!is_dense && ramp != nullptr) ? codegen(stride) : nullptr; Value *index = (ramp == nullptr) ? codegen(op->index) : nullptr; for (int i = 0; i < store_lanes; i += native_lanes) { int slice_lanes = std::min(native_lanes, store_lanes - i); Expr slice_base = simplify(base + i * stride); Expr slice_stride = make_one(slice_base.type()); Expr slice_index = slice_lanes == 1 ? 
slice_base : Ramp::make(slice_base, slice_stride, slice_lanes); Value *slice_val = slice_vector(val, i, slice_lanes); Value *elt_ptr = codegen_buffer_pointer(op->name, value_type.element_of(), slice_base); Value *vec_ptr = builder->CreatePointerCast(elt_ptr, slice_val->getType()->getPointerTo()); if (is_dense || slice_lanes == 1) { if (try_vector_predication_intrinsic("llvm.vp.store", void_t, slice_lanes, AllEnabledMask(), {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment)})) { add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { StoreInst *store = builder->CreateAlignedStore(slice_val, vec_ptr, llvm::Align(alignment)); annotate_store(store, slice_index); } } else if (ramp != nullptr) { if (get_target().bits == 64 && !stride_val->getType()->isIntegerTy(64)) { stride_val = builder->CreateIntCast(stride_val, i64_t, true); } bool generated = try_vector_predication_intrinsic("llvm.experimental.vp.strided.store", void_t, slice_lanes, AllEnabledMask(), {VPArg(slice_val, 0), VPArg(vec_ptr, 1, alignment), VPArg(stride_val, 2)}); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for strided store.\n"; add_tbaa_metadata(dyn_cast(value), op->name, slice_index); } else { Value *slice_index = slice_vector(index, i, slice_lanes); Value *vec_ptrs = codegen_buffer_pointer(op->name, value_type, slice_index); bool generated = try_vector_predication_intrinsic("llvm.vp.scatter", void_t, slice_lanes, AllEnabledMask(), {VPArg(slice_val, 0), VPArg(vec_ptrs, 1, alignment)}); internal_assert(generated) << "Using vector predicated intrinsics, but code generation was not successful for gathering store.\n"; } } } else if (ramp) { Type ptr_type = value_type.element_of(); Value *ptr = codegen_buffer_pointer(op->name, ptr_type, ramp->base); const IntImm *const_stride = ramp->stride.as(); Value *stride = codegen(ramp->stride); llvm::Type *load_type = llvm_type_of(ptr_type); // Scatter without generating the indices as a vector for (int i = 0; i < ramp->lanes; i++) { Constant *lane = ConstantInt::get(i32_t, i); Value *v = builder->CreateExtractElement(val, lane); if (const_stride) { // Use a constant offset from the base pointer Value *p = builder->CreateConstInBoundsGEP1_32( load_type, ptr, const_stride->value * i); StoreInst *store = builder->CreateStore(v, p); annotate_store(store, op->index); } else { // Increment the pointer by the stride for each element StoreInst *store = builder->CreateStore(v, ptr); annotate_store(store, op->index); ptr = CreateInBoundsGEP(builder.get(), load_type, ptr, stride); } } } else { // Scatter Value *index = codegen(op->index); for (int i = 0; i < value_type.lanes(); i++) { Value *lane = ConstantInt::get(i32_t, i); Value *idx = builder->CreateExtractElement(index, lane); Value *v = builder->CreateExtractElement(val, lane); Value *ptr = codegen_buffer_pointer(op->name, value_type.element_of(), idx); StoreInst *store = builder->CreateStore(v, ptr); annotate_store(store, op->index); } } } } void CodeGen_LLVM::codegen_asserts(const vector &asserts) { if (target.has_feature(Target::NoAsserts)) { return; } if (asserts.size() < 4) { for (const auto *a : asserts) { codegen(Stmt(a)); } return; } internal_assert(asserts.size() <= 63); // Mix all the conditions together into a bitmask Expr bitmask = cast(1) << 63; for (size_t i = 0; i < asserts.size(); i++) { bitmask = bitmask | (cast(!asserts[i]->condition) << i); } Expr switch_case = count_trailing_zeros(bitmask); BasicBlock *no_errors_bb = BasicBlock::Create(*context, 
"no_errors_bb", function); // Now switch on the bitmask to the correct failure Expr case_idx = cast(count_trailing_zeros(bitmask)); llvm::SmallVector weights; weights.push_back(1 << 30); for (int i = 0; i < (int)asserts.size(); i++) { weights.push_back(0); } llvm::MDBuilder md_builder(*context); llvm::MDNode *switch_very_likely_branch = md_builder.createBranchWeights(weights); auto *switch_inst = builder->CreateSwitch(codegen(case_idx), no_errors_bb, asserts.size(), switch_very_likely_branch); for (int i = 0; i < (int)asserts.size(); i++) { BasicBlock *fail_bb = BasicBlock::Create(*context, "assert_failed", function); switch_inst->addCase(ConstantInt::get(IntegerType::get(*context, 32), i), fail_bb); builder->SetInsertPoint(fail_bb); Value *v = codegen(asserts[i]->message); builder->CreateRet(v); } builder->SetInsertPoint(no_errors_bb); } void CodeGen_LLVM::visit(const Block *op) { // Peel blocks of assertions with pure conditions const AssertStmt *a = op->first.as(); if (a && is_pure(a->condition)) { vector asserts; asserts.push_back(a); Stmt s = op->rest; while ((op = s.as()) && (a = op->first.as()) && is_pure(a->condition) && asserts.size() < 63) { asserts.push_back(a); s = op->rest; } codegen_asserts(asserts); codegen(s); } else { codegen(op->first); codegen(op->rest); } } void CodeGen_LLVM::visit(const Realize *op) { internal_error << "Realize encountered during codegen\n"; } void CodeGen_LLVM::visit(const Provide *op) { internal_error << "Provide encountered during codegen\n"; } void CodeGen_LLVM::visit(const IfThenElse *op) { // Gather the conditions and values in an if-else chain vector> blocks; Stmt final_else; const IfThenElse *next_if = op; do { blocks.emplace_back(next_if->condition, next_if->then_case); final_else = next_if->else_case; next_if = final_else.defined() ? final_else.as() : nullptr; } while (next_if); // Check if we should use a switch statement or an if-else tree Expr lhs; bool use_switch = blocks.size() > 1; vector rhs; for (auto &block : blocks) { const EQ *eq = block.first.as(); const int64_t *r = eq ? as_const_int(eq->b) : nullptr; if (eq && r && Int(32).can_represent(*r) && is_pure(eq->a) && is_const(eq->b) && (!lhs.defined() || equal(lhs, eq->a))) { lhs = eq->a; rhs.push_back((int)*r); } else { use_switch = false; } } if (use_switch) { // Conditions are all of the form expr == constant for a // consistent expr and different constants. Use a switch // statement. BasicBlock *after_bb = BasicBlock::Create(*context, "after_bb", function); BasicBlock *default_bb = BasicBlock::Create(*context, "default_bb", function); auto *switch_inst = builder->CreateSwitch(codegen(lhs), default_bb, blocks.size()); for (int i = 0; i < (int)blocks.size(); i++) { string name = "case_" + std::to_string(rhs[i]) + "_bb"; BasicBlock *case_bb = BasicBlock::Create(*context, name, function); switch_inst->addCase(ConstantInt::get(IntegerType::get(*context, 32), rhs[i]), case_bb); builder->SetInsertPoint(case_bb); codegen(blocks[i].second); builder->CreateBr(after_bb); } builder->SetInsertPoint(default_bb); if (final_else.defined()) { codegen(final_else); } builder->CreateBr(after_bb); builder->SetInsertPoint(after_bb); } else { // Codegen an regular if-else chain using branches. 
BasicBlock *after_bb = BasicBlock::Create(*context, "after_bb", function); for (const auto &p : blocks) { BasicBlock *then_bb = BasicBlock::Create(*context, "then_bb", function); BasicBlock *next_bb = BasicBlock::Create(*context, "next_bb", function); builder->CreateCondBr(codegen(p.first), then_bb, next_bb); builder->SetInsertPoint(then_bb); codegen(p.second); builder->CreateBr(after_bb); builder->SetInsertPoint(next_bb); } if (final_else.defined()) { codegen(final_else); } builder->CreateBr(after_bb); builder->SetInsertPoint(after_bb); } } void CodeGen_LLVM::visit(const Evaluate *op) { codegen(op->value); // Discard result value = nullptr; } void CodeGen_LLVM::visit(const Shuffle *op) { vector vecs; for (const Expr &e : op->vectors) { vecs.push_back(codegen(e)); } if (op->is_interleave()) { value = interleave_vectors(vecs); } else if (op->is_concat()) { value = concat_vectors(vecs); } else { // If the even-numbered indices equal the odd-numbered // indices, only generate one and then do a self-interleave. for (int f : {4, 3, 2}) { bool self_interleave = (op->indices.size() % f) == 0; for (size_t i = 0; i < op->indices.size(); i++) { self_interleave &= (op->indices[i] == op->indices[i - (i % f)]); } if (self_interleave) { vector sub_indices; for (size_t i = 0; i < op->indices.size(); i += f) { sub_indices.push_back(op->indices[i]); } Expr equiv = Shuffle::make(op->vectors, sub_indices); value = codegen(equiv); value = interleave_vectors(std::vector(f, value)); return; } // Check for an interleave of slices (i.e. an in-vector transpose) bool interleave_of_slices = op->vectors.size() == 1 && (op->indices.size() % f) == 0; int step = op->type.lanes() / f; for (int i = 0; i < step; i++) { for (int j = 0; j < f; j++) { interleave_of_slices &= (op->indices[i * f + j] == j * step + i); } } if (interleave_of_slices) { value = codegen(op->vectors[0]); vector slices; for (int i = 0; i < f; i++) { slices.push_back(slice_vector(value, i * step, step)); } value = interleave_vectors(slices); } } // If the indices form contiguous aligned runs, do the shuffle // on entire sub-vectors by reinterpreting them as a wider // type. 
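        // E.g. shuffling u8x16 by {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}
        // is the same as shuffling the bytes reinterpreted as u32x4 by
        // {1, 0, 3, 2} (illustrative values; f = 4 in the loop below).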
for (int f : {8, 4, 2}) { if (op->type.lanes() % f != 0) { continue; } if (op->type.bits() * f > 64) { continue; } bool contiguous = true; for (const Expr &vec : op->vectors) { contiguous &= ((vec.type().lanes() % f) == 0); } for (size_t i = 0; i < op->indices.size(); i += f) { contiguous &= (op->indices[i] % f) == 0; for (int j = 0; j < f; j++) { contiguous &= (op->indices[i + j] == op->indices[i] + j); } } if (contiguous) { vector equiv_args; for (const Expr &vec : op->vectors) { Type t = UInt(vec.type().bits() * f, vec.type().lanes() / f); equiv_args.push_back(reinterpret(t, vec)); } vector equiv_indices; for (size_t i = 0; i < op->indices.size(); i += f) { equiv_indices.push_back(op->indices[i] / f); } Expr equiv = Shuffle::make(equiv_args, equiv_indices); equiv = reinterpret(op->type, equiv); codegen(equiv); return; } } // Do a concat and then a single shuffle value = concat_vectors(vecs); if (op->is_slice() && op->slice_stride() == 1) { value = slice_vector(value, op->indices[0], op->indices.size()); } else { value = shuffle_vectors(value, op->indices); } } if (op->type.is_scalar() && value->getType()->isVectorTy()) { value = builder->CreateExtractElement(value, ConstantInt::get(i32_t, 0)); } } void CodeGen_LLVM::visit(const VectorReduce *op) { codegen_vector_reduce(op, Expr()); } void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &init) { Expr val = op->value; const int output_lanes = op->type.lanes(); const int native_lanes = maximum_vector_bits() / op->type.bits(); const int factor = val.type().lanes() / output_lanes; Type elt = op->type.element_of(); Expr (*binop)(Expr, Expr) = nullptr; switch (op->op) { case VectorReduce::Add: binop = Add::make; break; case VectorReduce::Mul: binop = Mul::make; break; case VectorReduce::Min: binop = Min::make; break; case VectorReduce::Max: binop = Max::make; break; case VectorReduce::And: binop = And::make; break; case VectorReduce::Or: binop = Or::make; break; case VectorReduce::SaturatingAdd: binop = Halide::saturating_add; break; } if (op->type.is_bool() && op->op == VectorReduce::Or) { // Cast to u8, use max, cast back to bool. Expr equiv = cast(op->value.type().with_bits(8), op->value); equiv = VectorReduce::make(VectorReduce::Max, equiv, op->type.lanes()); if (init.defined()) { equiv = max(equiv, init); } equiv = cast(op->type, equiv); equiv.accept(this); return; } if (op->type.is_bool() && op->op == VectorReduce::And) { // Cast to u8, use min, cast back to bool. 
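        // E.g. an And-reduction over 8 bools becomes: widen each lane to u8,
        // Min-reduce to a single u8, then cast the result back to bool.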
Expr equiv = cast(op->value.type().with_bits(8), op->value);
equiv = VectorReduce::make(VectorReduce::Min, equiv, op->type.lanes());
equiv = cast(op->type, equiv);
if (init.defined()) {
    equiv = min(equiv, init);
}
equiv.accept(this);
return;
}

if (elt == Float(16) && upgrade_type_for_arithmetic(elt) != elt) {
    Expr equiv = cast(op->value.type().with_bits(32), op->value);
    equiv = VectorReduce::make(op->op, equiv, op->type.lanes());
    if (init.defined()) {
        equiv = binop(equiv, init);
    }
    equiv = cast(op->type, equiv);
    equiv.accept(this);
    return;
}

if (output_lanes == 1) {
    const int input_lanes = val.type().lanes();
    const int input_bytes = input_lanes * val.type().bytes();
    const bool llvm_has_intrinsic =
        // Must be one of these ops
        ((op->op == VectorReduce::Add ||
          op->op == VectorReduce::Mul ||
          op->op == VectorReduce::Min ||
          op->op == VectorReduce::Max) &&
         (use_llvm_vp_intrinsics ||
          // Must be a power of two lanes
          ((input_lanes >= 2) &&
           ((input_lanes & (input_lanes - 1)) == 0) &&
           // int versions exist up to 1024 bits
           ((!op->type.is_float() && input_bytes <= 1024) ||
            // float versions exist up to 16 lanes
            input_lanes <= 16) &&
           // As of the release of llvm 10, the 64-bit experimental total
           // reductions don't seem to be done yet on arm.
           (val.type().bits() != 64 ||
            target.arch != Target::ARM))));

    if (llvm_has_intrinsic) {
        const char *name = "";
        const int bits = op->type.bits();
        bool takes_initial_value = use_llvm_vp_intrinsics;
        Expr initial_value = init;
        if (op->type.is_float()) {
            switch (op->op) {
            case VectorReduce::Add:
                name = "fadd";
                takes_initial_value = true;
                if (!initial_value.defined()) {
                    initial_value = make_zero(op->type);
                }
                break;
            case VectorReduce::Mul:
                name = "fmul";
                takes_initial_value = true;
                if (!initial_value.defined()) {
                    initial_value = make_one(op->type);
                }
                break;
            case VectorReduce::Min:
                name = "fmin";
                // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118
                if (takes_initial_value && !initial_value.defined()) {
                    initial_value = op->type.max();
                }
                break;
            case VectorReduce::Max:
                name = "fmax";
                // TODO(zvookin): Not correct for strict_float. See: https://github.com/halide/Halide/issues/7118
                if (takes_initial_value && !initial_value.defined()) {
                    initial_value = op->type.min();
                }
                break;
            default:
                break;
            }
        } else if (op->type.is_int() || op->type.is_uint()) {
            switch (op->op) {
            case VectorReduce::Add:
                name = "add";
                if (takes_initial_value && !initial_value.defined()) {
                    initial_value = make_zero(op->type);
                }
                break;
            case VectorReduce::Mul:
                name = "mul";
                if (takes_initial_value && !initial_value.defined()) {
                    initial_value = make_one(op->type);
                }
                break;
            case VectorReduce::Min:
                name = op->type.is_int() ? "smin" : "umin";
                if (takes_initial_value && !initial_value.defined()) {
                    initial_value = op->type.max();
                }
                break;
            case VectorReduce::Max:
                name = op->type.is_int() ? "smax" : "umax";
                if (takes_initial_value && !initial_value.defined()) {
                    initial_value = op->type.min();
                }
                break;
            default:
                break;
            }
        }

        if (use_llvm_vp_intrinsics) {
            string vp_name = "llvm.vp.reduce."
+ std::string(name); codegen(initial_value); llvm::Value *init = value; codegen(op->value); llvm::Value *val = value; bool generated = try_vector_predication_intrinsic(vp_name, llvm_type_of(op->type), op->value.type().lanes(), AllEnabledMask(), {VPArg(init), VPArg(val, 0)}); internal_assert(generated) << "Vector predication intrinsic generation failed for vector reduction " << name << "\n"; } else { std::stringstream build_name; build_name << "llvm.vector.reduce."; build_name << name; build_name << ".v" << val.type().lanes() << (op->type.is_float() ? 'f' : 'i') << bits; string intrin_name = build_name.str(); vector args; if (takes_initial_value) { args.push_back(initial_value); initial_value = Expr(); } args.push_back(op->value); // Make sure the declaration exists, or the codegen for // call will assume that the args should scalarize. if (!module->getFunction(intrin_name)) { vector arg_types; for (const Expr &e : args) { arg_types.push_back(llvm_type_of(e.type())); } FunctionType *func_t = FunctionType::get(llvm_type_of(op->type), arg_types, false); llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin_name, module.get()); } Expr equiv = Call::make(op->type, intrin_name, args, Call::PureExtern); if (initial_value.defined()) { equiv = binop(initial_value, equiv); } equiv.accept(this); } return; } } if (output_lanes == 1 && factor > native_lanes && (use_llvm_vp_intrinsics || (factor % native_lanes == 0))) { // It's a total reduction of multiple native // vectors. Start by adding the vectors together. Expr equiv; for (int i = 0; i < factor / native_lanes; i++) { Expr next = Shuffle::make_slice(val, i * native_lanes, 1, native_lanes); if (equiv.defined()) { equiv = binop(equiv, next); } else { equiv = next; } } equiv = VectorReduce::make(op->op, equiv, 1); if (init.defined()) { equiv = binop(equiv, init); } equiv = common_subexpression_elimination(equiv); equiv.accept(this); return; } if (factor > 2 && ((factor & 1) == 0)) { // Factor the reduce into multiple stages. If we're going to // be widening the type by 4x or more we should also factor the // widening into multiple stages. Type intermediate_type = op->value.type().with_lanes(op->value.type().lanes() / 2); Expr equiv = VectorReduce::make(op->op, op->value, intermediate_type.lanes()); if (op->op == VectorReduce::Add && (op->type.is_int() || op->type.is_uint()) && op->type.bits() >= 32) { Type narrower_type = op->value.type().narrow().narrow(); Expr narrower = lossless_cast(narrower_type, op->value); if (!narrower.defined() && narrower_type.is_int()) { // Maybe we can narrow to an unsigned int instead. 
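// (The idea: if the input vector is really made of narrow values that
// were widened, do the first 2x reduction stage at the narrower width
// and only widen the partial sums afterwards; lossless_cast only
// succeeds when this is exact.)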
narrower_type = narrower_type.with_code(Type::UInt);
narrower = lossless_cast(narrower_type, op->value);
}
if (narrower.defined()) {
    // Widen it by 2x before the horizontal add
    narrower = cast(narrower.type().widen(), narrower);
    equiv = VectorReduce::make(op->op, narrower, intermediate_type.lanes());
    // Then widen it by 2x again afterwards
    equiv = cast(intermediate_type, equiv);
}
}
equiv = VectorReduce::make(op->op, equiv, op->type.lanes());
if (init.defined()) {
    equiv = binop(equiv, init);
}
equiv = common_subexpression_elimination(equiv);
codegen(equiv);
return;
}

// Extract each slice and combine
Expr equiv = init;
for (int i = 0; i < factor; i++) {
    Expr next = Shuffle::make_slice(val, i, factor, val.type().lanes() / factor);
    if (equiv.defined()) {
        equiv = binop(equiv, next);
    } else {
        equiv = next;
    }
}
equiv = common_subexpression_elimination(equiv);
codegen(equiv);
}

void CodeGen_LLVM::visit(const Atomic *op) {
    if (!op->mutex_name.empty()) {
        internal_assert(!inside_atomic_mutex_node)
            << "Nested atomic mutex locks detected. This might cause a deadlock.\n";
        ScopedValue<bool> old_inside_atomic_mutex_node(inside_atomic_mutex_node, true);
        // Mutex locking & unlocking are handled by function calls generated by previous lowering passes.
        codegen(op->body);
    } else {
        // Issue atomic stores.
        ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
        codegen(op->body);
    }
}

Value *CodeGen_LLVM::create_alloca_at_entry(llvm::Type *t, int n, bool zero_initialize, const string &name) {
    IRBuilderBase::InsertPoint here = builder->saveIP();
    BasicBlock *entry = &builder->GetInsertBlock()->getParent()->getEntryBlock();
    if (entry->empty()) {
        builder->SetInsertPoint(entry);
    } else {
        builder->SetInsertPoint(entry, entry->getFirstInsertionPt());
    }
    Value *size = ConstantInt::get(i32_t, n);
    AllocaInst *ptr = builder->CreateAlloca(t, size, name);
    int align = native_vector_bits() / 8;
    llvm::DataLayout d(module.get());
    int allocated_size = n * (int)d.getTypeAllocSize(t);
    if (t->isVectorTy() || n > 1) {
        ptr->setAlignment(llvm::Align(align));
    }
    requested_alloca_total += allocated_size;

    if (zero_initialize) {
        if (n == 1) {
            builder->CreateStore(Constant::getNullValue(t), ptr);
        } else {
            builder->CreateMemSet(ptr, Constant::getNullValue(t), n, llvm::Align(align));
        }
    }
    builder->restoreIP(here);
    return ptr;
}

Value *CodeGen_LLVM::get_user_context() const {
    Value *ctx = sym_get("__user_context", false);
    if (!ctx) {
        ctx = ConstantPointerNull::get(i8_t->getPointerTo());  // void*
    }
    return ctx;
}

llvm::Function *CodeGen_LLVM::get_llvm_intrin(llvm::Type *ret_type, const std::string &name, const std::vector<llvm::Type *> &arg_types) {
    llvm::Function *intrin = module->getFunction(name);
    if (!intrin) {
        FunctionType *func_t = FunctionType::get(ret_type, arg_types, false);
        intrin = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get());
        intrin->setCallingConv(CallingConv::C);
    }
    return intrin;
}

llvm::Function *CodeGen_LLVM::get_llvm_intrin(const Type &ret_type, const std::string &name, const std::vector<Type> &arg_types, bool scalars_are_vectors) {
    llvm::Function *intrin = module->getFunction(name);
    if (intrin) {
        return intrin;
    }
    vector<llvm::Type *> llvm_arg_types(arg_types.size());
    for (size_t i = 0; i < arg_types.size(); i++) {
        llvm_arg_types[i] = llvm_type_of(arg_types[i]);
        if (arg_types[i].is_scalar() && scalars_are_vectors) {
            llvm_arg_types[i] = get_vector_type(llvm_arg_types[i], 1);
        }
    }
    llvm::Type *llvm_ret_type = llvm_type_of(ret_type);
    if (ret_type.is_scalar() && scalars_are_vectors) {
        llvm_ret_type =
get_vector_type(llvm_ret_type, 1); } return get_llvm_intrin(llvm_ret_type, name, llvm_arg_types); } llvm::Function *CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector arg_types, bool scalars_are_vectors) { llvm::Function *intrin = get_llvm_intrin(ret_type, impl_name, arg_types, scalars_are_vectors); internal_assert(intrin); intrinsics[name].emplace_back(ret_type, std::move(arg_types), intrin); return intrin; } void CodeGen_LLVM::declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector arg_types) { internal_assert(impl); intrinsics[name].emplace_back(ret_type, std::move(arg_types), impl); } Value *CodeGen_LLVM::call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector &args) { constexpr int debug_level = 4; debug(debug_level) << "call_overloaded_intrin: " << result_type << " " << name << "("; const char *comma = ""; for (const Expr &i : args) { debug(debug_level) << comma << i; comma = ", "; } debug(debug_level) << ")\n"; auto impls_i = intrinsics.find(name); if (impls_i == intrinsics.end()) { debug(debug_level) << "No intrinsic " << name << "\n"; return nullptr; } const Intrinsic *resolved = nullptr; for (const Intrinsic &overload : impls_i->second) { debug(debug_level) << "Considering candidate " << overload.result_type << "("; const char *comma = ""; for (const auto &i : overload.arg_types) { debug(debug_level) << comma << i; comma = ", "; } debug(debug_level) << ")\n"; if (overload.arg_types.size() != args.size()) { debug(debug_level) << "Wrong number of arguments\n"; continue; } if (overload.result_type.element_of() != result_type.element_of()) { debug(debug_level) << "Wrong result type\n"; continue; } bool match = true; for (int i = 0; i < (int)overload.arg_types.size(); i++) { if (args[i].type().is_scalar()) { // Allow lossless casting for scalar arguments, and // allow broadcasting to vector arguments. if (!lossless_cast(overload.arg_types[i].element_of(), args[i]).defined()) { match = false; debug(debug_level) << "Cannot promote scalar argument " << i << "\n"; break; } } else { int required_lanes = result_type.lanes() * overload.arg_types[i].lanes() / overload.result_type.lanes(); if (required_lanes != args[i].type().lanes()) { match = false; debug(debug_level) << "Need " << required_lanes << " lanes for argument " << i << "\n"; break; } // Vector arguments must be exact. if (overload.arg_types[i].element_of() != args[i].type().element_of()) { match = false; debug(debug_level) << "Vector types not equal " << i << "\n"; break; } } } if (!match) { continue; } if (!resolved) { debug(debug_level) << "Resolved!\n"; resolved = &overload; } else { if (resolved->result_type.lanes() < result_type.lanes()) { // The current match is smaller than the result type. Take the bigger intrinsic. if (overload.result_type.lanes() > resolved->result_type.lanes()) { debug(debug_level) << "Replaced with bigger intrinsic\n"; resolved = &overload; } } else { // The current match is bigger than the result type. If the current candidate is also bigger, // but smaller than the current match, take it instead. 
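                // (Net effect: prefer the narrowest overload whose lane count
                // still covers result_type; among too-narrow candidates,
                // prefer the widest.)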
if (overload.result_type.lanes() >= result_type.lanes() && overload.result_type.lanes() < resolved->result_type.lanes()) { debug(debug_level) << "Replaced with smaller intrinsic\n"; resolved = &overload; } } } } if (resolved) { std::vector promoted_args; promoted_args.reserve(args.size()); for (size_t i = 0; i < args.size(); i++) { Expr promoted_arg = args[i]; if (args[i].type().is_scalar()) { promoted_arg = lossless_cast(resolved->arg_types[i].element_of(), promoted_arg); } if (resolved->arg_types[i].is_vector() && args[i].type().is_scalar() && result_type.lanes() > 1) { // We're passing a scalar to a vector argument, broadcast it. promoted_args.emplace_back(Broadcast::make(promoted_arg, result_type.lanes())); } else { promoted_args.emplace_back(promoted_arg); } internal_assert(promoted_args.back().defined()); } return call_intrin(result_type, resolved->result_type.lanes(), resolved->impl, promoted_args); } else { debug(debug_level) << "Unresolved intrinsic " << name << "\n"; } return nullptr; } Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, const string &name, vector args) { vector arg_values(args.size()); for (size_t i = 0; i < args.size(); i++) { arg_values[i] = codegen(args[i]); } llvm::Type *t = llvm_type_of(result_type); return call_intrin(t, intrin_lanes, name, arg_values, isa(t)); } Value *CodeGen_LLVM::call_intrin(const Type &result_type, int intrin_lanes, llvm::Function *intrin, vector args) { vector arg_values(args.size()); for (size_t i = 0; i < args.size(); i++) { arg_values[i] = codegen(args[i]); } llvm::Type *t = llvm_type_of(result_type); return call_intrin(t, intrin_lanes, intrin, arg_values); } Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, const string &name, vector arg_values, bool scalable_vector_result, bool is_reduction) { llvm::Function *fn = module->getFunction(name); if (!fn) { vector arg_types(arg_values.size()); for (size_t i = 0; i < arg_values.size(); i++) { arg_types[i] = arg_values[i]->getType(); } llvm::Type *intrinsic_result_type = result_type->getScalarType(); if (intrin_lanes > 1 && !is_reduction) { if (scalable_vector_result && effective_vscale != 0) { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes / effective_vscale, VectorTypeConstraint::VScale); } else { intrinsic_result_type = get_vector_type(result_type->getScalarType(), intrin_lanes, VectorTypeConstraint::Fixed); } } FunctionType *func_t = FunctionType::get(intrinsic_result_type, arg_types, false); fn = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, name, module.get()); fn->setCallingConv(CallingConv::C); } return call_intrin(result_type, intrin_lanes, fn, arg_values, is_reduction); } Value *CodeGen_LLVM::call_intrin(const llvm::Type *result_type, int intrin_lanes, llvm::Function *intrin, vector arg_values, bool is_reduction) { internal_assert(intrin); int arg_lanes = 1; if (result_type->isVoidTy()) { arg_lanes = intrin_lanes; } else if (result_type->isVectorTy()) { arg_lanes = get_vector_num_elements(result_type); } if (!is_reduction && intrin_lanes != arg_lanes) { // Cut up each arg into appropriately-sized pieces, call the // intrinsic on each, then splice together the results. 
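        // E.g. a 4-lane intrinsic applied to 16-lane operands is emitted as
        // four calls, on lanes [0,4), [4,8), [8,12) and [12,16), and the four
        // partial results are concatenated back together (a sketch; the actual
        // slicing is done with slice_vector / concat_vectors below).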
vector<Value *> results;
for (int start = 0; start < arg_lanes; start += intrin_lanes) {
    vector<Value *> args;
    for (size_t i = 0; i < arg_values.size(); i++) {
        int arg_i_lanes = 1;
        if (arg_values[i]->getType()->isVectorTy()) {
            arg_i_lanes = get_vector_num_elements(arg_values[i]->getType());
        }
        if (arg_i_lanes >= arg_lanes) {
            // Horizontally reducing intrinsics may have
            // arguments that have more lanes than the
            // result. Assume that they horizontally reduce
            // neighboring elements...
            int reduce = arg_i_lanes / arg_lanes;
            args.push_back(slice_vector(arg_values[i], start * reduce, intrin_lanes * reduce));
        } else if (arg_i_lanes == 1) {
            if (intrin->getFunctionType()->getParamType(i)->isVectorTy()) {
                // It's a scalar argument to a vector parameter. Broadcast it.
                // Overwriting the parameter means this only happens once.
                arg_values[i] = create_broadcast(arg_values[i], intrin_lanes);
            } else {
                // It's a scalar arg to an intrinsic that returns
                // a vector. Replicate it over the slices.
            }
            args.push_back(arg_values[i]);
        } else {
            internal_error << "Argument in call_intrin has " << arg_i_lanes
                           << " lanes, but the result type has " << arg_lanes << "\n";
        }
    }
    llvm::Type *result_slice_type = get_vector_type(result_type->getScalarType(), intrin_lanes);
    results.push_back(call_intrin(result_slice_type, intrin_lanes, intrin, args));
}
Value *result = concat_vectors(results);
return slice_vector(result, 0, arg_lanes);
}

llvm::FunctionType *intrin_type = intrin->getFunctionType();
for (int i = 0; i < (int)arg_values.size(); i++) {
    if (arg_values[i]->getType() != intrin_type->getParamType(i)) {
        // TODO: Change this to call convert_fixed_or_scalable_vector_type and
        // remove normalize_fixed_scalable_vector_type, fixed_to_scalable_vector_type,
        // and scalable_to_fixed_vector_type
        arg_values[i] = normalize_fixed_scalable_vector_type(intrin_type->getParamType(i), arg_values[i]);
    }
    if (arg_values[i]->getType() != intrin_type->getParamType(i)) {
        // There can be some mismatches in types, such as when passing scalar Halide type T
        // to LLVM vector type <1 x T>.
        arg_values[i] = builder->CreateBitCast(arg_values[i], intrin_type->getParamType(i));
    }
}
CallInst *call = builder->CreateCall(intrin, arg_values);
return call;
}

Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) {
    // Force the arg to be an actual vector
    if (!vec->getType()->isVectorTy()) {
        vec = create_broadcast(vec, 1);
    }

    int vec_lanes = get_vector_num_elements(vec->getType());

    if (start == 0 && size == vec_lanes) {
        return vec;
    }

    if (size == 1) {
        return builder->CreateExtractElement(vec, (uint64_t)start);
    }

    vector<int> indices(size);
    for (int i = 0; i < size; i++) {
        int idx = start + i;
        if (idx >= 0 && idx < vec_lanes) {
            indices[i] = idx;
        } else {
            indices[i] = -1;
        }
    }
    return shuffle_vectors(vec, indices);
}

Value *CodeGen_LLVM::concat_vectors(const vector<Value *> &v) {
    if (v.size() == 1) {
        return v[0];
    }

    internal_assert(!v.empty());

    vector<Value *> vecs = v;

    // Force them all to be actual vectors
    for (Value *&val : vecs) {
        if (!val->getType()->isVectorTy()) {
            val = create_broadcast(val, 1);
        }
    }

    while (vecs.size() > 1) {
        vector<Value *> new_vecs;

        for (size_t i = 0; i < vecs.size() - 1; i += 2) {
            Value *v1 = vecs[i];
            Value *v2 = vecs[i + 1];

            int w1 = get_vector_num_elements(v1->getType());
            int w2 = get_vector_num_elements(v2->getType());

            // Possibly pad one of the vectors to match widths.
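            // (slice_vector pads with poison lanes when asked to slice past
            // the end, so the shorter vector is widened with don't-care
            // values; the index list built below never selects those lanes.)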
if (w1 < w2) { v1 = slice_vector(v1, 0, w2); } else if (w2 < w1) { v2 = slice_vector(v2, 0, w1); } int w_matched = std::max(w1, w2); internal_assert(v1->getType() == v2->getType()); vector indices(w1 + w2); for (int i = 0; i < w1; i++) { indices[i] = i; } for (int i = 0; i < w2; i++) { indices[w1 + i] = w_matched + i; } Value *merged = shuffle_vectors(v1, v2, indices); new_vecs.push_back(merged); } // If there were an odd number of them, we need to also push // the one that didn't get merged. if (vecs.size() & 1) { new_vecs.push_back(vecs.back()); } vecs.swap(new_vecs); } return vecs[0]; } Value *CodeGen_LLVM::shuffle_vectors(Value *a, Value *b, const std::vector &indices) { internal_assert(a->getType() == b->getType()); if (!a->getType()->isVectorTy()) { a = create_broadcast(a, 1); b = create_broadcast(b, 1); } vector llvm_indices(indices.size()); for (size_t i = 0; i < llvm_indices.size(); i++) { if (indices[i] >= 0) { internal_assert(indices[i] < get_vector_num_elements(a->getType()) * 2); llvm_indices[i] = ConstantInt::get(i32_t, indices[i]); } else { // Only let -1 be undef. internal_assert(indices[i] == -1); llvm_indices[i] = PoisonValue::get(i32_t); } } if (isa(a->getType())) { a = scalable_to_fixed_vector_type(a); } if (isa(b->getType())) { b = scalable_to_fixed_vector_type(b); } return builder->CreateShuffleVector(a, b, ConstantVector::get(llvm_indices)); } Value *CodeGen_LLVM::shuffle_vectors(Value *a, const std::vector &indices) { Value *b = PoisonValue::get(a->getType()); return shuffle_vectors(a, b, indices); } std::pair CodeGen_LLVM::find_vector_runtime_function(const std::string &name, int lanes) { // Check if a vector version of the function already // exists at some useful width. We use the naming // convention that a N-wide version of a function foo is // called fooxN. All of our intrinsics are power-of-two // sized, so starting at the first power of two >= the // vector width, we'll try all powers of two in decreasing // order. vector sizes_to_try; int l = 1; while (l < lanes) { l *= 2; } for (int i = l; i > 1; i /= 2) { sizes_to_try.push_back(i); } // If none of those match, we'll also try doubling // the lanes up to the next power of two (this is to catch // cases where we're a 64-bit vector and have a 128-bit // vector implementation). 
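    // E.g. for lanes == 6 this tries (hypothetical names following the
    // fooxN convention described above) foox8, foox4, foox2 and finally
    // foox16.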
sizes_to_try.push_back(l * 2);

for (int l : sizes_to_try) {
    llvm::Function *vec_fn = module->getFunction(name + "x" + std::to_string(l));
    if (vec_fn) {
        return {vec_fn, l};
    }
}

return {nullptr, 0};
}

bool CodeGen_LLVM::supports_atomic_add(const Type &t) const {
    return t.is_int_or_uint();
}

bool CodeGen_LLVM::use_pic() const {
    return true;
}

std::string CodeGen_LLVM::mabi() const {
    return "";
}

bool CodeGen_LLVM::supports_call_as_float16(const Call *op) const {
    return false;
}

llvm::Value *CodeGen_LLVM::simple_call_intrin(const std::string &intrin,
                                              const std::vector<llvm::Value *> &args,
                                              llvm::Type *result_type) {
    llvm::Function *function = module->getFunction(intrin);
    if (!function) {
        vector<llvm::Type *> arg_types(args.size());
        for (size_t i = 0; i < args.size(); i++) {
            arg_types[i] = args[i]->getType();
        }

        FunctionType *func_t = FunctionType::get(result_type, arg_types, false);
        function = llvm::Function::Create(func_t, llvm::Function::ExternalLinkage, intrin, module.get());
        function->setCallingConv(CallingConv::C);
    }

    return builder->CreateCall(function, args);
}

// TODO: Change the one remaining call to this method to use convert_fixed_or_scalable_vector_type and
// remove this method, fixed_to_scalable_vector_type, and scalable_to_fixed_vector_type
llvm::Value *CodeGen_LLVM::normalize_fixed_scalable_vector_type(llvm::Type *desired_type, llvm::Value *result) {
    llvm::Type *actual_type = result->getType();

    if (isa<llvm::FixedVectorType>(actual_type) && isa<llvm::ScalableVectorType>(desired_type)) {
        const llvm::FixedVectorType *fixed = cast<llvm::FixedVectorType>(actual_type);
        const llvm::ScalableVectorType *scalable = cast<llvm::ScalableVectorType>(desired_type);
        if (fixed->getElementType() == scalable->getElementType()) {
            return fixed_to_scalable_vector_type(result);
        }
    } else if (isa<llvm::FixedVectorType>(desired_type) && isa<llvm::ScalableVectorType>(actual_type)) {
        const llvm::ScalableVectorType *scalable = cast<llvm::ScalableVectorType>(actual_type);
        const llvm::FixedVectorType *fixed = cast<llvm::FixedVectorType>(desired_type);
        if (fixed->getElementType() == scalable->getElementType()) {
            return scalable_to_fixed_vector_type(result);
        }
    }

    return result;
}

llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg, llvm::Type *desired_type) {
    llvm::Type *arg_type = arg->getType();

    // If types are already equal or neither is a vector type, do nothing.
    if (arg_type == desired_type ||
        !(arg_type->isVectorTy() || desired_type->isVectorTy())) {
        return arg;
    }

    internal_assert(arg_type->getScalarType() == desired_type->getScalarType());

    if (!arg_type->isVectorTy()) {
        arg = create_broadcast(arg, 1);
        arg_type = arg->getType();
    }
    llvm::Type *result_type = desired_type;
    if (!result_type->isVectorTy()) {
        result_type = get_vector_type(result_type, 1);
    }

    int arg_elements = get_vector_num_elements(arg_type);
    int result_elements = get_vector_num_elements(result_type);

    bool use_insert;
    if (isa<llvm::FixedVectorType>(arg_type) && isa<llvm::ScalableVectorType>(result_type)) {
        use_insert = true;
    } else if (isa<llvm::FixedVectorType>(result_type) && isa<llvm::ScalableVectorType>(arg_type)) {
        use_insert = false;
    } else {
        // Use extract to make smaller, insert to make bigger.
        // A somewhat arbitrary decision.
        use_insert = (arg_elements > result_elements);
    }

    std::string intrin_name = "llvm.vector.";
    intrin_name += use_insert ? "insert" : "extract";
    intrin_name += mangle_llvm_type(result_type);
    intrin_name += mangle_llvm_type(arg_type);

    std::vector<llvm::Value *> args;
    args.reserve(3);
    // Vector insert takes an argument which is being inserted into as well
    // as a value and index. Extract only takes the value and index.
if (use_insert) { Constant *poison = PoisonValue::get(result_type->getScalarType()); llvm::ElementCount element_count; if (isa(result_type)) { element_count = cast(result_type)->getElementCount(); } else { element_count = ElementCount::getFixed(1); } llvm::Value *result_vec = ConstantVector::getSplat(element_count, poison); args.push_back(result_vec); } args.push_back(arg); args.push_back(ConstantInt::get(i64_t, 0)); llvm::Value *result = simple_call_intrin(intrin_name, args, result_type); if (result_type != desired_type) { internal_assert(!desired_type->isVectorTy()) << "Type mismatch should not happen unless result is scalar and requires conversion of single element vector.\n"; result = builder->CreateExtractElement(result, ConstantInt::get(i32_t, 0)); } return result; } llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg) { internal_assert(effective_vscale != 0); internal_assert(isa(fixed_arg->getType())); const llvm::FixedVectorType *fixed_type = cast(fixed_arg->getType()); internal_assert(fixed_type != nullptr); auto lanes = fixed_type->getNumElements(); llvm::ScalableVectorType *scalable_type = cast(get_vector_type(fixed_type->getElementType(), lanes / effective_vscale, VectorTypeConstraint::VScale)); internal_assert(fixed_type != nullptr); internal_assert(fixed_type->getElementType() == scalable_type->getElementType()); internal_assert(lanes == (scalable_type->getMinNumElements() * effective_vscale)); // E.g. llvm.vector.insert.nxv2i64.v4i64(, <4 x i64>, i64) const char *type_designator; if (fixed_type->getElementType()->isIntegerTy()) { type_designator = "i"; } else { type_designator = "f"; } std::string intrin = "llvm.vector.insert.nxv" + std::to_string(scalable_type->getMinNumElements()); intrin += type_designator; std::string bits_designator = std::to_string(fixed_type->getScalarSizeInBits()); intrin += bits_designator; intrin += ".v" + std::to_string(lanes) + type_designator + bits_designator; Constant *poison = PoisonValue::get(scalable_type->getElementType()); llvm::Value *result_vec = ConstantVector::getSplat(scalable_type->getElementCount(), poison); std::vector args; args.push_back(result_vec); args.push_back(value); args.push_back(ConstantInt::get(i64_t, 0)); return simple_call_intrin(intrin, args, scalable_type); } llvm::Value *CodeGen_LLVM::scalable_to_fixed_vector_type(llvm::Value *scalable_arg) { internal_assert(effective_vscale != 0); internal_assert(isa(scalable_arg->getType())); const llvm::ScalableVectorType *scalable_type = cast(scalable_arg->getType()); internal_assert(scalable_type != nullptr); llvm::FixedVectorType *fixed_type = cast(get_vector_type(scalable_type->getElementType(), scalable_type->getMinNumElements() * effective_vscale, VectorTypeConstraint::Fixed)); internal_assert(fixed_type != nullptr); internal_assert(fixed_type->getElementType() == scalable_type->getElementType()); internal_assert(fixed_type->getNumElements() == (scalable_type->getMinNumElements() * effective_vscale)); // E.g. 
    const char *type_designator;
    if (scalable_type->getElementType()->isIntegerTy()) {
        type_designator = "i";
    } else {
        type_designator = "f";
    }
    std::string bits_designator = std::to_string(fixed_type->getScalarSizeInBits());
    std::string intrin = "llvm.vector.extract.v" + std::to_string(fixed_type->getNumElements()) + type_designator + bits_designator;
    intrin += ".nxv" + std::to_string(scalable_type->getMinNumElements()) + type_designator + bits_designator;

    std::vector<llvm::Value *> args;
    args.push_back(scalable_arg);
    args.push_back(ConstantInt::get(i64_t, 0));
    return simple_call_intrin(intrin, args, fixed_type);
}

int CodeGen_LLVM::get_vector_num_elements(const llvm::Type *t) {
    if (isa<llvm::FixedVectorType>(t)) {
        const auto *vt = cast<llvm::FixedVectorType>(t);
        return vt->getNumElements();
    } else if (isa<llvm::ScalableVectorType>(t)) {
        internal_assert(effective_vscale != 0) << "Scalable vector type encountered without vector_bits being set.\n";
        const auto *vt = cast<llvm::ScalableVectorType>(t);
        return vt->getMinNumElements() * effective_vscale;
    } else {
        return 1;
    }
}

llvm::Type *CodeGen_LLVM::llvm_type_of(LLVMContext *c, Halide::Type t, int effective_vscale) const {
    if (t.lanes() == 1) {
        if (t.is_float() && !t.is_bfloat()) {
            switch (t.bits()) {
            case 16:
                return llvm::Type::getHalfTy(*c);
            case 32:
                return llvm::Type::getFloatTy(*c);
            case 64:
                return llvm::Type::getDoubleTy(*c);
            default:
                internal_error << "There is no llvm type matching this floating-point bit width: " << t << "\n";
                return nullptr;
            }
        } else if (t.is_handle()) {
            return llvm::Type::getInt8PtrTy(*c);
        } else {
            // Integer types, plus bfloat16, which is represented as i16.
            return llvm::Type::getIntNTy(*c, t.bits());
        }
    } else {
        llvm::Type *element_type = llvm_type_of(c, t.element_of(), 0);
        return get_vector_type(element_type, t.lanes());
    }
}

llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n,
                                          VectorTypeConstraint type_constraint) const {
    if (t->isVoidTy()) {
        return t;
    }
    bool scalable = false;
    switch (type_constraint) {
    case VectorTypeConstraint::None:
        // Prefer a scalable type when the lane count is a multiple of the
        // effective vscale.
        scalable = effective_vscale != 0 && ((n % effective_vscale) == 0);
        if (scalable) {
            n = n / effective_vscale;
        }
        break;
    case VectorTypeConstraint::Fixed:
        scalable = false;
        break;
    case VectorTypeConstraint::VScale:
        scalable = true;
        break;
    default:
        internal_error << "Impossible";
        break;
    }

    return VectorType::get(t, n, scalable);
}
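// For example (illustrative numbers only): with effective_vscale == 4 and
// VectorTypeConstraint::None, asking for 16 lanes of i32 yields
// <vscale x 4 x i32>, while 6 lanes of i32 (not a multiple of the effective
// vscale) stays fixed as <6 x i32>.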
llvm::Constant *CodeGen_LLVM::get_splat(int lanes, llvm::Constant *value,
                                        VectorTypeConstraint type_constraint) const {
    bool scalable = false;
    switch (type_constraint) {
    case VectorTypeConstraint::None:
        scalable = effective_vscale != 0 && ((lanes % effective_vscale) == 0);
        if (scalable) {
            lanes = lanes / effective_vscale;
        }
        break;
    case VectorTypeConstraint::Fixed:
        scalable = false;
        break;
    case VectorTypeConstraint::VScale:
        scalable = true;
        break;
    }

    llvm::ElementCount ec = scalable ? llvm::ElementCount::getScalable(lanes) :
                                       llvm::ElementCount::getFixed(lanes);
    return ConstantVector::getSplat(ec, value);
}

std::string CodeGen_LLVM::mangle_llvm_type(llvm::Type *type) {
    std::string type_string = ".";
    if (isa<llvm::ScalableVectorType>(type)) {
        const auto *vt = cast<llvm::ScalableVectorType>(type);
        type_string += "nxv" + std::to_string(vt->getMinNumElements());
        type = type->getScalarType();
    } else if (isa<llvm::FixedVectorType>(type)) {
        const auto *vt = cast<llvm::FixedVectorType>(type);
        type_string += "v" + std::to_string(vt->getNumElements());
        type = type->getScalarType();
    }
    if (isa<llvm::PointerType>(type)) {
        const auto *pt = cast<llvm::PointerType>(type);
        type_string += "p" + std::to_string(pt->getAddressSpace());
    } else if (type->isIntegerTy()) {
        type_string += "i" + std::to_string(type->getScalarSizeInBits());
    } else if (type->isFloatingPointTy()) {
        type_string += "f" + std::to_string(type->getScalarSizeInBits());
    } else {
        std::string type_name;
        llvm::raw_string_ostream type_name_stream(type_name);
        type->print(type_name_stream, /*IsForDebug=*/true);
        internal_error << "Attempt to mangle unknown LLVM type " << type_name << "\n";
    }
    return type_string;
}

bool CodeGen_LLVM::try_vector_predication_intrinsic(const std::string &name, VPResultType result_type,
                                                    int32_t length, MaskVariant mask,
                                                    std::vector<VPArg> vp_args) {
    if (!use_llvm_vp_intrinsics) {
        return false;
    }

    llvm::Type *llvm_result_type = result_type.type;
    bool any_scalable = isa<llvm::ScalableVectorType>(llvm_result_type);
    bool any_fixed = isa<llvm::FixedVectorType>(llvm_result_type);
    bool result_is_vector_type = any_scalable || any_fixed;

    llvm::Type *base_vector_type = nullptr;
    for (const VPArg &arg : vp_args) {
        llvm::Type *arg_type = arg.value->getType();
        bool scalable = isa<llvm::ScalableVectorType>(arg_type);
        bool fixed = isa<llvm::FixedVectorType>(arg_type);
        if (base_vector_type == nullptr && (fixed || scalable)) {
            base_vector_type = arg_type;
        }
        any_scalable |= scalable;
        any_fixed |= fixed;
    }

    if (!any_fixed && !any_scalable) {
        return false;
    }
    internal_assert(!(any_scalable && any_fixed)) << "Cannot combine fixed and scalable vectors to vector predication intrinsic.\n";

    if (base_vector_type == nullptr && result_is_vector_type) {
        base_vector_type = llvm_result_type;
    }
    bool is_scalable = any_scalable;

    std::vector<llvm::Value *> args;
    args.reserve(2 + vp_args.size());
    std::vector<std::string> mangled_types(vp_args.size() + 1);
    for (const VPArg &arg : vp_args) {
        args.push_back(arg.value);
        if (arg.mangle_index) {
            llvm::Type *llvm_type = arg.value->getType();
            mangled_types[arg.mangle_index.value()] = mangle_llvm_type(llvm_type);
        }
    }
    if (result_type.mangle_index) {
        mangled_types[result_type.mangle_index.value()] = mangle_llvm_type(llvm_result_type);
    }

    std::string full_name = name;
    for (const std::string &mangle : mangled_types) {
        full_name += mangle;
    }

    if (!std::holds_alternative<NoMask>(mask)) {
        if (std::holds_alternative<AllEnabledMask>(mask)) {
            internal_assert(base_vector_type != nullptr) << "Requested all enabled mask without any vector type to use for type/length.\n";
            llvm::ElementCount llvm_vector_ec;
            if (is_scalable) {
                const auto *vt = cast<llvm::ScalableVectorType>(base_vector_type);
                llvm_vector_ec = vt->getElementCount();
            } else {
                const auto *vt = cast<llvm::FixedVectorType>(base_vector_type);
                llvm_vector_ec = vt->getElementCount();
            }
            args.push_back(ConstantVector::getSplat(llvm_vector_ec, ConstantInt::get(i1_t, 1)));
        } else {
            args.push_back(std::get<llvm::Value *>(mask));
        }
    }

    args.push_back(ConstantInt::get(i32_t, length));

    value = simple_call_intrin(full_name, args, llvm_result_type);

    llvm::CallInst *call = dyn_cast<llvm::CallInst>(value);
    for (size_t i = 0; i < vp_args.size(); i++) {
        if (vp_args[i].alignment != 0) {
            call->addParamAttr(i, Attribute::getWithAlignment(*context, llvm::Align(vp_args[i].alignment)));
        }
    }

    return true;
}
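// A sketch of the assembled call, assuming a masked add built from the base
// name "llvm.vp.add" with a single mangled vector type (types are an example
// only):
//   <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b,
//                                           <vscale x 4 x i1> %mask, i32 %evl)
// where %evl is the explicit vector length passed as the final i32 argument.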
bool CodeGen_LLVM::try_vector_predication_comparison(const std::string &name, const Type &result_type,
                                                     MaskVariant mask, llvm::Value *a, llvm::Value *b,
                                                     const char *cmp_op) {
    // Early out to prevent creating useless metadata.
    if (!use_llvm_vp_intrinsics || result_type.is_scalar()) {
        return false;
    }

    internal_assert(result_type.is_bool()) << "Vector predicated comparisons must return bool type.\n";
    llvm::MDBuilder md_builder(*context);
    llvm::Value *md_val = llvm::MetadataAsValue::get(*context, md_builder.createString(cmp_op));
    return try_vector_predication_intrinsic(name, llvm_type_of(result_type), result_type.lanes(),
                                            mask, {VPArg(a, 0), VPArg(b), VPArg(md_val)});
}

}  // namespace Internal
}  // namespace Halide