https://github.com/halide/Halide
Tip revision: f9e4c7878385f43cf88cca23d5bd663233e9e7da authored by Steven Johnson on 27 April 2021, 19:14:54 UTC
Add support for dynamic tensors to hannk (#5942)
Add support for dynamic tensors to hannk (#5942)
Tip revision: f9e4c78
Target.cpp
#include <array>
#include <iostream>
#include <string>
#include "Target.h"
#include "Debug.h"
#include "DeviceInterface.h"
#include "Error.h"
#include "Util.h"
#include "WasmExecutor.h"
#if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
#if defined(__FreeBSD__)
#include <machine/cpu.h>
#include <sys/elf_common.h>
#endif
// This uses elf.h and must be included after "LLVM_Headers.h", which
// uses llvm/support/Elf.h.
#include <sys/auxv.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>
#endif // _MSC_VER
namespace Halide {
using std::string;
using std::vector;
namespace {
#ifdef _MSC_VER
static void cpuid(int info[4], int infoType, int extra) {
__cpuidex(info, infoType, extra);
}
#else
#if defined(__x86_64__) || defined(__i386__)
// CPU feature detection code taken from ispc
// (https://github.com/ispc/ispc/blob/master/builtins/dispatch.ll)
#ifdef _LP64
void cpuid(int info[4], int infoType, int extra) {
__asm__ __volatile__(
"cpuid \n\t"
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
: "0"(infoType), "2"(extra));
}
#else
static void cpuid(int info[4], int infoType, int extra) {
// We save %ebx in case it's the PIC register
__asm__ __volatile__(
"mov{l}\t{%%}ebx, %1 \n\t"
"cpuid \n\t"
"xchg{l}\t{%%}ebx, %1 \n\t"
: "=a"(info[0]), "=r"(info[1]), "=c"(info[2]), "=d"(info[3])
: "0"(infoType), "2"(extra));
}
#endif
#endif
#endif
Target calculate_host_target() {
Target::OS os = Target::OSUnknown;
#ifdef __linux__
os = Target::Linux;
#endif
#ifdef _WIN32
os = Target::Windows;
#endif
#ifdef __APPLE__
os = Target::OSX;
#endif
bool use_64_bits = (sizeof(size_t) == 8);
int bits = use_64_bits ? 64 : 32;
std::vector<Target::Feature> initial_features;
#if __riscv__
Target::Arch arch = Target::RISCV;
#else
#if __mips__ || __mips || __MIPS__
Target::Arch arch = Target::MIPS;
#else
#if defined(__arm__) || defined(__aarch64__)
Target::Arch arch = Target::ARM;
#else
#if defined(__powerpc__) && (defined(__FreeBSD__) || defined(__linux__))
Target::Arch arch = Target::POWERPC;
#if defined(__linux__)
unsigned long hwcap = getauxval(AT_HWCAP);
unsigned long hwcap2 = getauxval(AT_HWCAP2);
#elif defined(__FreeBSD__)
unsigned long hwcap, hwcap2;
elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap));
elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
#endif
bool have_altivec = (hwcap & PPC_FEATURE_HAS_ALTIVEC) != 0;
bool have_vsx = (hwcap & PPC_FEATURE_HAS_VSX) != 0;
bool arch_2_07 = (hwcap2 & PPC_FEATURE2_ARCH_2_07) != 0;
user_assert(have_altivec)
<< "The POWERPC backend assumes at least AltiVec support. This machine does not appear to have AltiVec.\n";
if (have_vsx) initial_features.push_back(Target::VSX);
if (arch_2_07) initial_features.push_back(Target::POWER_ARCH_2_07);
#else
Target::Arch arch = Target::X86;
int info[4];
cpuid(info, 1, 0);
bool have_sse41 = (info[2] & (1 << 19)) != 0;
bool have_sse2 = (info[3] & (1 << 26)) != 0;
bool have_avx = (info[2] & (1 << 28)) != 0;
bool have_f16c = (info[2] & (1 << 29)) != 0;
bool have_rdrand = (info[2] & (1 << 30)) != 0;
bool have_fma = (info[2] & (1 << 12)) != 0;
user_assert(have_sse2)
<< "The x86 backend assumes at least sse2 support. This machine does not appear to have sse2.\n"
<< "cpuid returned: "
<< std::hex << info[0]
<< ", " << info[1]
<< ", " << info[2]
<< ", " << info[3]
<< std::dec << "\n";
if (have_sse41) {
initial_features.push_back(Target::SSE41);
}
if (have_avx) {
initial_features.push_back(Target::AVX);
}
if (have_f16c) {
initial_features.push_back(Target::F16C);
}
if (have_fma) {
initial_features.push_back(Target::FMA);
}
if (use_64_bits && have_avx && have_f16c && have_rdrand) {
// So far, so good. AVX2/512?
// Call cpuid with eax=7, ecx=0
int info2[4];
cpuid(info2, 7, 0);
const uint32_t avx2 = 1U << 5;
const uint32_t avx512f = 1U << 16;
const uint32_t avx512dq = 1U << 17;
const uint32_t avx512pf = 1U << 26;
const uint32_t avx512er = 1U << 27;
const uint32_t avx512cd = 1U << 28;
const uint32_t avx512bw = 1U << 30;
const uint32_t avx512vl = 1U << 31;
const uint32_t avx512ifma = 1U << 21;
const uint32_t avx512 = avx512f | avx512cd;
const uint32_t avx512_knl = avx512 | avx512pf | avx512er;
const uint32_t avx512_skylake = avx512 | avx512vl | avx512bw | avx512dq;
const uint32_t avx512_cannonlake = avx512_skylake | avx512ifma; // Assume ifma => vbmi
if ((info2[1] & avx2) == avx2) {
initial_features.push_back(Target::AVX2);
}
if ((info2[1] & avx512) == avx512) {
initial_features.push_back(Target::AVX512);
if ((info2[1] & avx512_knl) == avx512_knl) {
initial_features.push_back(Target::AVX512_KNL);
}
if ((info2[1] & avx512_skylake) == avx512_skylake) {
initial_features.push_back(Target::AVX512_Skylake);
}
if ((info2[1] & avx512_cannonlake) == avx512_cannonlake) {
initial_features.push_back(Target::AVX512_Cannonlake);
#if LLVM_VERSION >= 120
// Sapphire Rapids support was added in LLVM 12, so earlier versions cannot support this CPU's features.
const uint32_t avx512vnni = 1U << 11; // vnni result in ecx
const uint32_t avx512bf16 = 1U << 5; // bf16 result in eax, with cpuid(eax=7, ecx=1)
int info3[4];
cpuid(info3, 7, 1);
if ((info2[2] & avx512vnni) == avx512vnni &&
(info3[0] & avx512bf16) == avx512bf16) {
initial_features.push_back(Target::AVX512_SapphireRapids);
}
#endif
}
}
}
#endif
#endif
#endif
#endif
return {os, arch, bits, initial_features};
}
bool is_using_hexagon(const Target &t) {
return (t.has_feature(Target::HVX) ||
t.has_feature(Target::HVX_v62) ||
t.has_feature(Target::HVX_v65) ||
t.has_feature(Target::HVX_v66) ||
t.has_feature(Target::HexagonDma) ||
t.has_feature(Target::HVX_shared_object) ||
t.arch == Target::Hexagon);
}
int get_hvx_lower_bound(const Target &t) {
if (!is_using_hexagon(t)) {
return -1;
}
if (t.has_feature(Target::HVX_v62)) {
return 62;
}
if (t.has_feature(Target::HVX_v65)) {
return 65;
}
if (t.has_feature(Target::HVX_v66)) {
return 66;
}
return 60;
}
} // namespace
Target get_host_target() {
// Calculating the host target isn't slow but it isn't free,
// and it's pointless to recalculate it every time we (e.g.) parse
// an arbitrary Target string. It won't ever change, so cache on first
// use.
static Target host_target = calculate_host_target();
return host_target;
}
namespace {
Target::Feature calculate_host_cuda_capability(Target t) {
const auto *interface = get_device_interface_for_device_api(DeviceAPI::CUDA, t);
internal_assert(interface->compute_capability);
int major, minor;
int err = interface->compute_capability(nullptr, &major, &minor);
internal_assert(err == 0) << "Failed to query cuda compute capability\n";
int ver = major * 10 + minor;
if (ver < 30) {
return Target::FeatureEnd;
} else if (ver < 32) {
return Target::CUDACapability30;
} else if (ver < 35) {
return Target::CUDACapability32;
} else if (ver < 50) {
return Target::CUDACapability35;
} else if (ver < 61) {
return Target::CUDACapability50;
} else if (ver < 70) {
return Target::CUDACapability61;
} else if (ver < 75) {
return Target::CUDACapability70;
} else if (ver < 80) {
return Target::CUDACapability75;
} else {
return Target::CUDACapability80;
}
}
Target::Feature get_host_cuda_capability(Target t) {
static Target::Feature cap = calculate_host_cuda_capability(t);
return cap;
}
const std::map<std::string, Target::OS> os_name_map = {
{"os_unknown", Target::OSUnknown},
{"linux", Target::Linux},
{"windows", Target::Windows},
{"osx", Target::OSX},
{"android", Target::Android},
{"ios", Target::IOS},
{"qurt", Target::QuRT},
{"noos", Target::NoOS},
{"fuchsia", Target::Fuchsia},
{"wasmrt", Target::WebAssemblyRuntime}};
bool lookup_os(const std::string &tok, Target::OS &result) {
auto os_iter = os_name_map.find(tok);
if (os_iter != os_name_map.end()) {
result = os_iter->second;
return true;
}
return false;
}
const std::map<std::string, Target::Arch> arch_name_map = {
{"arch_unknown", Target::ArchUnknown},
{"x86", Target::X86},
{"arm", Target::ARM},
{"mips", Target::MIPS},
{"powerpc", Target::POWERPC},
{"hexagon", Target::Hexagon},
{"wasm", Target::WebAssembly},
{"riscv", Target::RISCV},
};
bool lookup_arch(const std::string &tok, Target::Arch &result) {
auto arch_iter = arch_name_map.find(tok);
if (arch_iter != arch_name_map.end()) {
result = arch_iter->second;
return true;
}
return false;
}
const std::map<std::string, Target::Feature> feature_name_map = {
{"jit", Target::JIT},
{"debug", Target::Debug},
{"no_asserts", Target::NoAsserts},
{"no_bounds_query", Target::NoBoundsQuery},
{"sse41", Target::SSE41},
{"avx", Target::AVX},
{"avx2", Target::AVX2},
{"fma", Target::FMA},
{"fma4", Target::FMA4},
{"f16c", Target::F16C},
{"armv7s", Target::ARMv7s},
{"no_neon", Target::NoNEON},
{"vsx", Target::VSX},
{"power_arch_2_07", Target::POWER_ARCH_2_07},
{"cuda", Target::CUDA},
{"cuda_capability_30", Target::CUDACapability30},
{"cuda_capability_32", Target::CUDACapability32},
{"cuda_capability_35", Target::CUDACapability35},
{"cuda_capability_50", Target::CUDACapability50},
{"cuda_capability_61", Target::CUDACapability61},
{"cuda_capability_70", Target::CUDACapability70},
{"cuda_capability_75", Target::CUDACapability75},
{"cuda_capability_80", Target::CUDACapability80},
{"opencl", Target::OpenCL},
{"cl_doubles", Target::CLDoubles},
{"cl_half", Target::CLHalf},
{"cl_atomics64", Target::CLAtomics64},
{"openglcompute", Target::OpenGLCompute},
{"egl", Target::EGL},
{"user_context", Target::UserContext},
{"matlab", Target::Matlab},
{"profile", Target::Profile},
{"no_runtime", Target::NoRuntime},
{"metal", Target::Metal},
{"c_plus_plus_name_mangling", Target::CPlusPlusMangling},
{"large_buffers", Target::LargeBuffers},
{"hvx", Target::HVX_128},
{"hvx_128", Target::HVX_128},
{"hvx_v62", Target::HVX_v62},
{"hvx_v65", Target::HVX_v65},
{"hvx_v66", Target::HVX_v66},
{"hvx_shared_object", Target::HVX_shared_object},
{"fuzz_float_stores", Target::FuzzFloatStores},
{"soft_float_abi", Target::SoftFloatABI},
{"msan", Target::MSAN},
{"avx512", Target::AVX512},
{"avx512_knl", Target::AVX512_KNL},
{"avx512_skylake", Target::AVX512_Skylake},
{"avx512_cannonlake", Target::AVX512_Cannonlake},
{"avx512_sapphirerapids", Target::AVX512_SapphireRapids},
{"trace_loads", Target::TraceLoads},
{"trace_stores", Target::TraceStores},
{"trace_realizations", Target::TraceRealizations},
{"trace_pipeline", Target::TracePipeline},
{"d3d12compute", Target::D3D12Compute},
{"strict_float", Target::StrictFloat},
{"tsan", Target::TSAN},
{"asan", Target::ASAN},
{"check_unsafe_promises", Target::CheckUnsafePromises},
{"hexagon_dma", Target::HexagonDma},
{"embed_bitcode", Target::EmbedBitcode},
{"disable_llvm_loop_opt", Target::DisableLLVMLoopOpt},
{"enable_llvm_loop_opt", Target::EnableLLVMLoopOpt},
{"wasm_simd128", Target::WasmSimd128},
{"wasm_signext", Target::WasmSignExt},
{"wasm_sat_float_to_int", Target::WasmSatFloatToInt},
{"wasm_threads", Target::WasmThreads},
{"wasm_bulk_memory", Target::WasmBulkMemory},
{"sve", Target::SVE},
{"sve2", Target::SVE2},
{"arm_dot_prod", Target::ARMDotProd},
{"llvm_large_code_model", Target::LLVMLargeCodeModel},
{"rvv", Target::RVV},
{"armv81a", Target::ARMv81a},
// NOTE: When adding features to this map, be sure to update PyEnums.cpp as well.
};
bool lookup_feature(const std::string &tok, Target::Feature &result) {
auto feature_iter = feature_name_map.find(tok);
if (feature_iter != feature_name_map.end()) {
result = feature_iter->second;
return true;
}
return false;
}
} // End anonymous namespace
Target get_target_from_environment() {
string target = Internal::get_env_variable("HL_TARGET");
if (target.empty()) {
return get_host_target();
} else {
return Target(target);
}
}
Target get_jit_target_from_environment() {
Target host = get_host_target();
host.set_feature(Target::JIT);
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
host.set_feature(Target::ASAN);
#endif
#if __has_feature(memory_sanitizer)
host.set_feature(Target::MSAN);
#endif
#if __has_feature(thread_sanitizer)
host.set_feature(Target::TSAN);
#endif
#endif
string target = Internal::get_env_variable("HL_JIT_TARGET");
if (target.empty()) {
return host;
} else {
Target t(target);
t.set_feature(Target::JIT);
user_assert((t.os == host.os && t.arch == host.arch && t.bits == host.bits) || Internal::WasmModule::can_jit_target(t))
<< "HL_JIT_TARGET must match the host OS, architecture, and bit width.\n"
<< "HL_JIT_TARGET was " << target << ". "
<< "Host is " << host.to_string() << ".\n";
return t;
}
}
namespace {
bool merge_string(Target &t, const std::string &target) {
string rest = target;
vector<string> tokens;
size_t first_dash;
while ((first_dash = rest.find('-')) != string::npos) {
//Internal::debug(0) << first_dash << ", " << rest << "\n";
tokens.push_back(rest.substr(0, first_dash));
rest = rest.substr(first_dash + 1);
}
tokens.push_back(rest);
bool os_specified = false, arch_specified = false, bits_specified = false, features_specified = false;
bool is_host = false;
for (size_t i = 0; i < tokens.size(); i++) {
const string &tok = tokens[i];
Target::Feature feature;
if (tok == "host") {
if (i > 0) {
// "host" is now only allowed as the first token.
return false;
}
is_host = true;
t = get_host_target();
} else if (tok == "32" || tok == "64" || tok == "0") {
if (bits_specified) {
return false;
}
bits_specified = true;
t.bits = std::stoi(tok);
} else if (lookup_arch(tok, t.arch)) {
if (arch_specified) {
return false;
}
arch_specified = true;
} else if (lookup_os(tok, t.os)) {
if (os_specified) {
return false;
}
os_specified = true;
} else if (lookup_feature(tok, feature)) {
t.set_feature(feature);
features_specified = true;
} else if (tok == "trace_all") {
t.set_features({Target::TraceLoads, Target::TraceStores, Target::TraceRealizations});
features_specified = true;
} else {
return false;
}
}
if (is_host &&
t.has_feature(Target::CUDA) &&
!t.has_feature(Target::CUDACapability30) &&
!t.has_feature(Target::CUDACapability32) &&
!t.has_feature(Target::CUDACapability35) &&
!t.has_feature(Target::CUDACapability50) &&
!t.has_feature(Target::CUDACapability61) &&
!t.has_feature(Target::CUDACapability70) &&
!t.has_feature(Target::CUDACapability75) &&
!t.has_feature(Target::CUDACapability80)) {
// Detect host cuda capability
t.set_feature(get_host_cuda_capability(t));
}
if (arch_specified && !bits_specified) {
return false;
}
if (bits_specified && t.bits == 0) {
// bits == 0 is allowed iff arch and os are "unknown" and no features are set,
// to allow for roundtripping the string for default Target() ctor.
if (!(arch_specified && t.arch == Target::ArchUnknown) ||
!(os_specified && t.os == Target::OSUnknown) ||
features_specified) {
return false;
}
}
return true;
}
void bad_target_string(const std::string &target) {
const char *separator = "";
std::string architectures;
for (const auto &arch_entry : arch_name_map) {
architectures += separator + arch_entry.first;
separator = ", ";
}
separator = "";
std::string oses;
for (const auto &os_entry : os_name_map) {
oses += separator + os_entry.first;
separator = ", ";
}
separator = "";
// Format the features to go one feature over 70 characters per line,
// assume the first line starts with "Features are ".
int line_char_start = -(int)sizeof("Features are");
std::string features;
for (const auto &feature_entry : feature_name_map) {
features += separator + feature_entry.first;
if (features.length() - line_char_start > 70) {
separator = "\n";
line_char_start = features.length();
} else {
separator = ", ";
}
}
user_error << "Did not understand Halide target " << target << "\n"
<< "Expected format is arch-bits-os-feature1-feature2-...\n"
<< "Where arch is: " << architectures << ".\n"
<< "bits is either 32 or 64.\n"
<< "os is: " << oses << ".\n"
<< "\n"
<< "If arch, bits, or os are omitted, they default to the host.\n"
<< "\n"
<< "Features are: " << features << ".\n"
<< "\n"
<< "The target can also begin with \"host\", which sets the "
<< "host's architecture, os, and feature set, with the "
<< "exception of the GPU runtimes, which default to off.\n"
<< "\n"
<< "On this platform, the host target is: " << get_host_target().to_string() << "\n";
}
} // namespace
Target::Target(const std::string &target) {
Target host = get_host_target();
if (target.empty()) {
// If nothing is specified, use the full host target.
*this = host;
} else {
if (!merge_string(*this, target) || has_unknowns()) {
bad_target_string(target);
}
}
}
Target::Target(const char *s)
: Target(std::string(s)) {
}
bool Target::validate_target_string(const std::string &s) {
Target t;
return merge_string(t, s) && !t.has_unknowns();
}
std::string Target::feature_to_name(Target::Feature feature) {
for (const auto &feature_entry : feature_name_map) {
if (feature == feature_entry.second) {
return feature_entry.first;
}
}
internal_error;
return "";
}
Target::Feature Target::feature_from_name(const std::string &name) {
Target::Feature feature;
if (lookup_feature(name, feature)) {
return feature;
}
return Target::FeatureEnd;
}
std::string Target::to_string() const {
string result;
for (const auto &arch_entry : arch_name_map) {
if (arch_entry.second == arch) {
result += arch_entry.first;
break;
}
}
result += "-" + std::to_string(bits);
for (const auto &os_entry : os_name_map) {
if (os_entry.second == os) {
result += "-" + os_entry.first;
break;
}
}
for (const auto &feature_entry : feature_name_map) {
if (has_feature(feature_entry.second)) {
result += "-" + feature_entry.first;
}
}
// Use has_feature() multiple times (rather than features_any_of())
// to avoid constructing a temporary vector for this rather-common call.
if (has_feature(Target::TraceLoads) && has_feature(Target::TraceStores) && has_feature(Target::TraceRealizations)) {
result = Internal::replace_all(result, "trace_loads-trace_realizations-trace_stores", "trace_all");
}
return result;
}
/** Was libHalide compiled with support for this target? */
bool Target::supported() const {
bool bad = false;
#if !defined(WITH_ARM)
bad |= arch == Target::ARM && bits == 32;
#endif
#if !defined(WITH_AARCH64)
bad |= arch == Target::ARM && bits == 64;
#endif
#if !defined(WITH_X86)
bad |= arch == Target::X86;
#endif
#if !defined(WITH_MIPS)
bad |= arch == Target::MIPS;
#endif
#if !defined(WITH_POWERPC)
bad |= arch == Target::POWERPC;
#endif
#if !defined(WITH_HEXAGON)
bad |= arch == Target::Hexagon;
#endif
#if !defined(WITH_WEBASSEMBLY)
bad |= arch == Target::WebAssembly;
#endif
#if !defined(WITH_RISCV)
bad |= arch == Target::RISCV;
#endif
#if !defined(WITH_NVPTX)
bad |= has_feature(Target::CUDA);
#endif
#if !defined(WITH_OPENCL)
bad |= has_feature(Target::OpenCL);
#endif
#if !defined(WITH_METAL)
bad |= has_feature(Target::Metal);
#endif
#if !defined(WITH_OPENGLCOMPUTE)
bad |= has_feature(Target::OpenGLCompute);
#endif
#if !defined(WITH_D3D12)
bad |= has_feature(Target::D3D12Compute);
#endif
return !bad;
}
bool Target::has_unknowns() const {
return os == OSUnknown || arch == ArchUnknown || bits == 0;
}
void Target::set_feature(Feature f, bool value) {
if (f == FeatureEnd) {
return;
}
user_assert(f < FeatureEnd) << "Invalid Target feature.\n";
features.set(f, value);
}
void Target::set_features(const std::vector<Feature> &features_to_set, bool value) {
for (Feature f : features_to_set) {
set_feature(f, value);
}
}
bool Target::has_feature(Feature f) const {
if (f == FeatureEnd) {
return true;
}
user_assert(f < FeatureEnd) << "Invalid Target feature.\n";
return features[f];
}
bool Target::features_any_of(const std::vector<Feature> &test_features) const {
for (Feature f : test_features) {
if (has_feature(f)) {
return true;
}
}
return false;
}
bool Target::features_all_of(const std::vector<Feature> &test_features) const {
for (Feature f : test_features) {
if (!has_feature(f)) {
return false;
}
}
return true;
}
Target Target::with_feature(Feature f) const {
Target copy = *this;
copy.set_feature(f);
return copy;
}
Target Target::without_feature(Feature f) const {
Target copy = *this;
copy.set_feature(f, false);
return copy;
}
bool Target::has_gpu_feature() const {
return (has_feature(CUDA) ||
has_feature(OpenCL) ||
has_feature(Metal) ||
has_feature(D3D12Compute) ||
has_feature(OpenGLCompute));
}
int Target::get_cuda_capability_lower_bound() const {
if (!has_feature(Target::CUDA)) {
return -1;
}
if (has_feature(Target::CUDACapability30)) {
return 30;
}
if (has_feature(Target::CUDACapability32)) {
return 32;
}
if (has_feature(Target::CUDACapability35)) {
return 35;
}
if (has_feature(Target::CUDACapability50)) {
return 50;
}
if (has_feature(Target::CUDACapability61)) {
return 61;
}
if (has_feature(Target::CUDACapability70)) {
return 70;
}
if (has_feature(Target::CUDACapability75)) {
return 75;
}
if (has_feature(Target::CUDACapability80)) {
return 80;
}
return 20;
}
bool Target::supports_type(const Type &t) const {
if (t.bits() == 64) {
if (t.is_float()) {
return !has_feature(Metal) &&
!has_feature(OpenGLCompute) &&
!has_feature(D3D12Compute) &&
(!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles));
} else {
return (!has_feature(Metal) &&
!has_feature(OpenGLCompute) &&
!has_feature(D3D12Compute));
}
}
return true;
}
bool Target::supports_type(const Type &t, DeviceAPI device) const {
if (device == DeviceAPI::Default_GPU) {
device = get_default_device_api_for_target(*this);
}
if (device == DeviceAPI::Hexagon) {
// HVX supports doubles and long long in the scalar unit only.
if (t.is_float() || t.bits() == 64) {
return t.lanes() == 1;
}
} else if (device == DeviceAPI::Metal) {
// Metal spec says no double or long long.
if (t.bits() == 64) {
return false;
}
} else if (device == DeviceAPI::OpenCL) {
if (t.is_float() && t.bits() == 64) {
return has_feature(Target::CLDoubles);
}
} else if (device == DeviceAPI::D3D12Compute) {
// Shader Model 5.x can optionally support double-precision; 64-bit int
// types are not supported.
return t.bits() < 64;
} else if (device == DeviceAPI::OpenGLCompute) {
return t.bits() < 64;
}
return true;
}
bool Target::supports_device_api(DeviceAPI api) const {
switch (api) {
case DeviceAPI::None:
return true;
case DeviceAPI::Host:
return true;
case DeviceAPI::Default_GPU:
return has_gpu_feature();
case DeviceAPI::Hexagon:
return has_feature(Target::HVX);
case DeviceAPI::HexagonDma:
return has_feature(Target::HexagonDma);
default:
return has_feature(target_feature_for_device_api(api));
}
}
DeviceAPI Target::get_required_device_api() const {
if (has_feature(Target::CUDA)) {
return DeviceAPI::CUDA;
}
if (has_feature(Target::D3D12Compute)) {
return DeviceAPI::D3D12Compute;
}
if (has_feature(Target::HVX)) {
return DeviceAPI::Hexagon;
}
if (has_feature(Target::HexagonDma)) {
return DeviceAPI::HexagonDma;
}
if (has_feature(Target::Metal)) {
return DeviceAPI::Metal;
}
if (has_feature(Target::OpenCL)) {
return DeviceAPI::OpenCL;
}
if (has_feature(Target::OpenGLCompute)) {
return DeviceAPI::OpenGLCompute;
}
return DeviceAPI::None;
}
Target::Feature target_feature_for_device_api(DeviceAPI api) {
switch (api) {
case DeviceAPI::CUDA:
return Target::CUDA;
case DeviceAPI::OpenCL:
return Target::OpenCL;
case DeviceAPI::OpenGLCompute:
return Target::OpenGLCompute;
case DeviceAPI::Metal:
return Target::Metal;
case DeviceAPI::Hexagon:
return Target::HVX;
case DeviceAPI::D3D12Compute:
return Target::D3D12Compute;
default:
return Target::FeatureEnd;
}
}
int Target::natural_vector_size(const Halide::Type &t) const {
user_assert(!has_unknowns())
<< "natural_vector_size cannot be used on a Target with Unknown values.\n";
const bool is_integer = t.is_int() || t.is_uint();
const int data_size = t.bytes();
if (arch == Target::Hexagon) {
if (is_integer) {
if (has_feature(Halide::Target::HVX)) {
return 128 / data_size;
} else {
user_error << "Target uses hexagon arch without target feature hvx set.\n";
return 0;
}
} else {
// HVX does not have vector float instructions.
return 1;
}
} else if (arch == Target::X86) {
if (is_integer && (has_feature(Halide::Target::AVX512_Skylake) ||
has_feature(Halide::Target::AVX512_Cannonlake))) {
// AVX512BW exists on Skylake and Cannonlake
return 64 / data_size;
} else if (t.is_float() && (has_feature(Halide::Target::AVX512) ||
has_feature(Halide::Target::AVX512_KNL) ||
has_feature(Halide::Target::AVX512_Skylake) ||
has_feature(Halide::Target::AVX512_Cannonlake))) {
// AVX512F is on all AVX512 architectures
return 64 / data_size;
} else if (has_feature(Halide::Target::AVX2)) {
// AVX2 uses 256-bit vectors for everything.
return 32 / data_size;
} else if (!is_integer && has_feature(Halide::Target::AVX)) {
// AVX 1 has 256-bit vectors for float, but not for
// integer instructions.
return 32 / data_size;
} else {
// SSE was all 128-bit. We ignore MMX.
return 16 / data_size;
}
} else if (arch == Target::WebAssembly) {
if (has_feature(Halide::Target::WasmSimd128)) {
// 128-bit vectors for other types.
return 16 / data_size;
} else {
// No vectors, sorry.
return 1;
}
} else {
// Assume 128-bit vectors on other targets.
return 16 / data_size;
}
}
bool Target::get_runtime_compatible_target(const Target &other, Target &result) {
// Create mask to select features that:
// (a) must be included if either target has the feature (union)
// (b) must be included if both targets have the feature (intersection)
// (c) must match across both targets; it is an error if one target has the feature and the other doesn't
// clang-format off
const std::array<Feature, 18> union_features = {{
// These are true union features.
CUDA,
D3D12Compute,
Metal,
NoNEON,
OpenCL,
OpenGLCompute,
// These features are actually intersection-y, but because targets only record the _highest_,
// we have to put their union in the result and then take a lower bound.
CUDACapability30,
CUDACapability32,
CUDACapability35,
CUDACapability50,
CUDACapability61,
CUDACapability70,
CUDACapability75,
CUDACapability80,
HVX_v62,
HVX_v65,
HVX_v66,
}};
// clang-format on
// clang-format off
const std::array<Feature, 14> intersection_features = {{
ARMv7s,
ARMv81a,
AVX,
AVX2,
AVX512,
AVX512_Cannonlake,
AVX512_KNL,
AVX512_SapphireRapids,
AVX512_Skylake,
F16C,
FMA,
FMA4,
SSE41,
VSX,
}};
// clang-format on
// clang-format off
const std::array<Feature, 12> matching_features = {{
ASAN,
Debug,
HexagonDma,
HVX,
HVX_shared_object,
MSAN,
SoftFloatABI,
TSAN,
WasmThreads,
}};
// clang-format on
// bitsets need to be the same width.
decltype(result.features) union_mask;
decltype(result.features) intersection_mask;
decltype(result.features) matching_mask;
for (const auto &feature : union_features) {
union_mask.set(feature);
}
for (const auto &feature : intersection_features) {
intersection_mask.set(feature);
}
for (const auto &feature : matching_features) {
matching_mask.set(feature);
}
if (arch != other.arch || bits != other.bits || os != other.os) {
Internal::debug(1) << "runtime targets must agree on platform (arch-bits-os)\n"
<< " this: " << *this << "\n"
<< " other: " << other << "\n";
return false;
}
if ((features & matching_mask) != (other.features & matching_mask)) {
Internal::debug(1) << "runtime targets must agree on SoftFloatABI, Debug, TSAN, ASAN, MSAN, HVX, HexagonDma, and HVX_shared_object\n"
<< " this: " << *this << "\n"
<< " other: " << other << "\n";
return false;
}
// Union of features is computed through bitwise-or, and masked away by the features we care about
// Intersection of features is computed through bitwise-and and masked away, too.
// We merge the bits via bitwise or.
Target output = Target{os, arch, bits};
output.features = ((features | other.features) & union_mask) | ((features | other.features) & matching_mask) | ((features & other.features) & intersection_mask);
#if LLVM_VERSION < 120
// We require LLVM 12+ to compile SapphireRapids features.
output.features.reset(AVX512_SapphireRapids);
#endif
// Pick tight lower bound for CUDA capability. Use fall-through to clear redundant features
int cuda_a = get_cuda_capability_lower_bound();
int cuda_b = other.get_cuda_capability_lower_bound();
// get_cuda_capability_lower_bound returns -1 when unused. Casting to unsigned makes this
// large, so min selects the true lower bound when one target doesn't specify a capability,
// and the other doesn't use CUDA at all.
int cuda_capability = std::min((unsigned)cuda_a, (unsigned)cuda_b);
if (cuda_capability < 30) {
output.features.reset(CUDACapability30);
}
if (cuda_capability < 32) {
output.features.reset(CUDACapability32);
}
if (cuda_capability < 35) {
output.features.reset(CUDACapability35);
}
if (cuda_capability < 50) {
output.features.reset(CUDACapability50);
}
if (cuda_capability < 61) {
output.features.reset(CUDACapability61);
}
if (cuda_capability < 70) {
output.features.reset(CUDACapability70);
}
if (cuda_capability < 75) {
output.features.reset(CUDACapability75);
}
if (cuda_capability < 80) {
output.features.reset(CUDACapability80);
}
// Pick tight lower bound for HVX version. Use fall-through to clear redundant features
int hvx_a = get_hvx_lower_bound(*this);
int hvx_b = get_hvx_lower_bound(other);
// Same trick as above for CUDA
int hvx_version = std::min((unsigned)hvx_a, (unsigned)hvx_b);
if (hvx_version < 62) {
output.features.reset(HVX_v62);
}
if (hvx_version < 65) {
output.features.reset(HVX_v65);
}
if (hvx_version < 66) {
output.features.reset(HVX_v66);
}
result = output;
return true;
}
namespace Internal {
void target_test() {
Target t;
for (const auto &feature : feature_name_map) {
t.set_feature(feature.second);
}
for (int i = 0; i < (int)(Target::FeatureEnd); i++) {
internal_assert(t.has_feature((Target::Feature)i)) << "Feature " << i << " not in feature_names_map.\n";
}
// 3 targets: {A,B,C}. Want gcd(A,B)=C
std::vector<std::array<std::string, 3>> gcd_tests = {
{{"x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}},
{{"x86-64-linux-sse41-fma-no_asserts-no_runtime", "x86-64-linux-sse41-fma", "x86-64-linux-sse41-fma"}},
{{"x86-64-linux-avx2-sse41", "x86-64-linux-sse41-fma", "x86-64-linux-sse41"}},
{{"x86-64-linux-avx2-sse41", "x86-32-linux-sse41-fma", ""}},
{{"x86-64-linux-cuda", "x86-64-linux", "x86-64-linux-cuda"}},
{{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda", "x86-64-linux-cuda"}},
{{"x86-64-linux-cuda-cuda_capability_50", "x86-64-linux-cuda-cuda_capability_30", "x86-64-linux-cuda-cuda_capability_30"}},
{{"hexagon-32-qurt-hvx_v65", "hexagon-32-qurt-hvx_v62", "hexagon-32-qurt-hvx_v62"}},
{{"hexagon-32-qurt-hvx_v62", "hexagon-32-qurt", "hexagon-32-qurt"}},
{{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt", ""}},
{{"hexagon-32-qurt-hvx_v62-hvx", "hexagon-32-qurt-hvx", "hexagon-32-qurt-hvx"}},
};
for (const auto &test : gcd_tests) {
Target result{};
Target a{test[0]};
Target b{test[1]};
if (a.get_runtime_compatible_target(b, result)) {
internal_assert(!test[2].empty() && result == Target{test[2]})
<< "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have gcd "
<< result.to_string() << " but expected '" << test[2] << "'\n";
} else {
internal_assert(test[2].empty())
<< "Targets " << a.to_string() << " and " << b.to_string() << " were computed to have no gcd "
<< "but " << test[2] << " was expected.";
}
}
std::cout << "Target test passed" << std::endl;
}
} // namespace Internal
} // namespace Halide