Content - 8678bfefbb906c751fcbe29a917f7b79951e2655 - 995dd8a/Target.h

Target.h
#ifndef HALIDE_TARGET_H
#define HALIDE_TARGET_H

/** \file
 * Defines the structure that describes a Halide target.
 */

#include <bitset>
#include <cstdint>
#include <string>

#include "DeviceAPI.h"
#include "Type.h"
#include "runtime/HalideRuntime.h"

namespace Halide {

/** A struct representing a target machine and os to generate code for. */
struct Target {
    /** The operating system used by the target. Determines which
     * system calls to generate.
     * Corresponds to os_name_map in Target.cpp. */
    enum OS {
        OSUnknown = 0,
        Linux,
        Windows,
        OSX,
        Android,
        IOS,
        QuRT,
        NoOS,
        Fuchsia,
        WebAssemblyRuntime
    } os = OSUnknown;

    /** The architecture used by the target. Determines the
     * instruction set to use.
     * Corresponds to arch_name_map in Target.cpp. */
    enum Arch {
        ArchUnknown = 0,
        X86,
        ARM,
        MIPS,
        Hexagon,
        POWERPC,
        WebAssembly,
        RISCV
    } arch = ArchUnknown;

    /** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */
    int bits = 0;

    /** The bit-width of a vector register for targets where this is configurable and
     * targeting a fixed size is desired. The default of 0 indicates no assumption of
     * fixed size is allowed. */
    int vector_bits = 0;

    /** The specific processor to be targeted, tuned for.
     * Corresponds to processor_name_map in Target.cpp.
     *
     * New entries should be added to the end. */
    enum Processor {
        /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features.
        ProcessorGeneric = 0,
        K8,        /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003).
        K8_SSE3,   /// Tune for later versions of AMD K8 CPU, with SSE3 support.
        AMDFam10,  /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007).
        BtVer1,    /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011).
        BdVer1,    /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011).
        BdVer2,    /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012).
        BdVer3,    /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014).
        BdVer4,    /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015).
        BtVer2,    /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2013).
        ZnVer1,    /// Tune for AMD Zen   CPU (AMD Family 17h, launched 2017).
        ZnVer2,    /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019).
        ZnVer3,    /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020).
    } processor_tune = ProcessorGeneric;

    /** Optional features a target can have.
     * Corresponds to feature_name_map in Target.cpp.
     * See definitions in HalideRuntime.h for full information.
     */
    enum Feature {
        JIT = halide_target_feature_jit,
        Debug = halide_target_feature_debug,
        NoAsserts = halide_target_feature_no_asserts,
        NoBoundsQuery = halide_target_feature_no_bounds_query,
        SSE41 = halide_target_feature_sse41,
        AVX = halide_target_feature_avx,
        AVX2 = halide_target_feature_avx2,
        FMA = halide_target_feature_fma,
        FMA4 = halide_target_feature_fma4,
        F16C = halide_target_feature_f16c,
        ARMv7s = halide_target_feature_armv7s,
        NoNEON = halide_target_feature_no_neon,
        VSX = halide_target_feature_vsx,
        POWER_ARCH_2_07 = halide_target_feature_power_arch_2_07,
        CUDA = halide_target_feature_cuda,
        CUDACapability30 = halide_target_feature_cuda_capability30,
        CUDACapability32 = halide_target_feature_cuda_capability32,
        CUDACapability35 = halide_target_feature_cuda_capability35,
        CUDACapability50 = halide_target_feature_cuda_capability50,
        CUDACapability61 = halide_target_feature_cuda_capability61,
        CUDACapability70 = halide_target_feature_cuda_capability70,
        CUDACapability75 = halide_target_feature_cuda_capability75,
        CUDACapability80 = halide_target_feature_cuda_capability80,
        CUDACapability86 = halide_target_feature_cuda_capability86,
        OpenCL = halide_target_feature_opencl,
        CLDoubles = halide_target_feature_cl_doubles,
        CLHalf = halide_target_feature_cl_half,
        CLAtomics64 = halide_target_feature_cl_atomic64,
        OpenGLCompute = halide_target_feature_openglcompute,
        EGL = halide_target_feature_egl,
        UserContext = halide_target_feature_user_context,
        Profile = halide_target_feature_profile,
        NoRuntime = halide_target_feature_no_runtime,
        Metal = halide_target_feature_metal,
        CPlusPlusMangling = halide_target_feature_c_plus_plus_mangling,
        LargeBuffers = halide_target_feature_large_buffers,
        HexagonDma = halide_target_feature_hexagon_dma,
        HVX_128 = halide_target_feature_hvx_128,
        HVX = HVX_128,
        HVX_v62 = halide_target_feature_hvx_v62,
        HVX_v65 = halide_target_feature_hvx_v65,
        HVX_v66 = halide_target_feature_hvx_v66,
        HVX_shared_object = halide_target_feature_hvx_use_shared_object,
        FuzzFloatStores = halide_target_feature_fuzz_float_stores,
        SoftFloatABI = halide_target_feature_soft_float_abi,
        MSAN = halide_target_feature_msan,
        AVX512 = halide_target_feature_avx512,
        AVX512_KNL = halide_target_feature_avx512_knl,
        AVX512_Skylake = halide_target_feature_avx512_skylake,
        AVX512_Cannonlake = halide_target_feature_avx512_cannonlake,
        AVX512_SapphireRapids = halide_target_feature_avx512_sapphirerapids,
        TraceLoads = halide_target_feature_trace_loads,
        TraceStores = halide_target_feature_trace_stores,
        TraceRealizations = halide_target_feature_trace_realizations,
        TracePipeline = halide_target_feature_trace_pipeline,
        D3D12Compute = halide_target_feature_d3d12compute,
        StrictFloat = halide_target_feature_strict_float,
        TSAN = halide_target_feature_tsan,
        ASAN = halide_target_feature_asan,
        CheckUnsafePromises = halide_target_feature_check_unsafe_promises,
        EmbedBitcode = halide_target_feature_embed_bitcode,
        EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt,
        // halide_target_feature_disable_llvm_loop_opt is deprecated in Halide 15
        // (and will be removed in Halide 16). Halide 15 now defaults to disabling
        // LLVM loop optimization, unless halide_target_feature_enable_llvm_loop_opt is set.
        DisableLLVMLoopOpt = halide_target_feature_disable_llvm_loop_opt,
        WasmSimd128 = halide_target_feature_wasm_simd128,
        WasmSignExt = halide_target_feature_wasm_signext,
        WasmSatFloatToInt = halide_target_feature_wasm_sat_float_to_int,
        WasmThreads = halide_target_feature_wasm_threads,
        WasmBulkMemory = halide_target_feature_wasm_bulk_memory,
        SVE = halide_target_feature_sve,
        SVE2 = halide_target_feature_sve2,
        ARMDotProd = halide_target_feature_arm_dot_prod,
        ARMFp16 = halide_target_feature_arm_fp16,
        LLVMLargeCodeModel = halide_llvm_large_code_model,
        RVV = halide_target_feature_rvv,
        ARMv81a = halide_target_feature_armv81a,
        SanitizerCoverage = halide_target_feature_sanitizer_coverage,
        ProfileByTimer = halide_target_feature_profile_by_timer,
        SPIRV = halide_target_feature_spirv,
        FeatureEnd = halide_target_feature_end
    };
    Target() = default;
    Target(OS o, Arch a, int b, Processor pt, const std::vector<Feature> &initial_features = std::vector<Feature>(),
           int vb = 0)
        : os(o), arch(a), bits(b), vector_bits(vb), processor_tune(pt) {
        for (const auto &f : initial_features) {
            set_feature(f);
        }
    }

    Target(OS o, Arch a, int b, const std::vector<Feature> &initial_features = std::vector<Feature>())
        : Target(o, a, b, ProcessorGeneric, initial_features) {
    }

    /** Given a string of the form used in HL_TARGET
     * (e.g. "x86-64-avx"), construct the Target it specifies. Note
     * that this always starts with the result of get_host_target(),
     * replacing only the parts found in the target string, so if you
     * omit (say) an OS specification, the host OS will be used
     * instead. An empty string is exactly equivalent to
     * get_host_target().
     *
     * Invalid target strings will fail with a user_error.
     */
    // @{
    explicit Target(const std::string &s);
    explicit Target(const char *s);
    // @}

    /** Check if a target string is valid. */
    static bool validate_target_string(const std::string &s);

    /** Return true if any of the arch/bits/os fields are "unknown"/0;
        return false otherwise. */
    bool has_unknowns() const;

    void set_feature(Feature f, bool value = true);

    void set_features(const std::vector<Feature> &features_to_set, bool value = true);

    bool has_feature(Feature f) const;

    inline bool has_feature(halide_target_feature_t f) const {
        return has_feature((Feature)f);
    }

    bool features_any_of(const std::vector<Feature> &test_features) const;

    bool features_all_of(const std::vector<Feature> &test_features) const;

    /** Return a copy of the target with the given feature set.
     * This is convenient when enabling certain features (e.g. NoBoundsQuery)
     * in an initialization list, where the target to be mutated may be
     * a const reference. */
    Target with_feature(Feature f) const;

    /** Return a copy of the target with the given feature cleared.
     * This is convenient when disabling certain features (e.g. NoBoundsQuery)
     * in an initialization list, where the target to be mutated may be
     * a const reference. */
    Target without_feature(Feature f) const;

    /** Is a fully feature GPU compute runtime enabled? I.e. is
     * Func::gpu_tile and similar going to work? Currently includes
     * CUDA, OpenCL, Metal and D3D12Compute. We do not include OpenGL,
     * because it is not capable of gpgpu, and is not scheduled via
     * Func::gpu_tile.
     * TODO: Should OpenGLCompute be included here? */
    bool has_gpu_feature() const;

    /** Does this target allow using a certain type. Generally all
     * types except 64-bit float and int/uint should be supported by
     * all backends.
     *
     * It is likely better to call the version below which takes a DeviceAPI.
     */
    bool supports_type(const Type &t) const;

    /** Does this target allow using a certain type on a certain device.
     * This is the prefered version of this routine.
     */
    bool supports_type(const Type &t, DeviceAPI device) const;

    /** Returns whether a particular device API can be used with this
     * Target. */
    bool supports_device_api(DeviceAPI api) const;

    /** If this Target (including all Features) requires a specific DeviceAPI,
     * return it. If it doesn't, return DeviceAPI::None.  If the Target has
     * features with multiple (different) DeviceAPI requirements, the result
     * will be an arbitrary DeviceAPI. */
    DeviceAPI get_required_device_api() const;

    bool operator==(const Target &other) const {
        return os == other.os &&
               arch == other.arch &&
               bits == other.bits &&
               processor_tune == other.processor_tune &&
               features == other.features;
    }

    bool operator!=(const Target &other) const {
        return !(*this == other);
    }

    /**
     * Create a "greatest common denominator" runtime target that is compatible with
     * both this target and \p other. Used by generators to conveniently select a suitable
     * runtime when linking together multiple functions.
     *
     * @param other The other target from which we compute the gcd target.
     * @param[out] result The gcd target if we return true, otherwise unmodified. Can be the same as *this.
     * @return Whether it was possible to find a compatible target (true) or not.
     */
    bool get_runtime_compatible_target(const Target &other, Target &result);

    /** Convert the Target into a string form that can be reconstituted
     * by merge_string(), which will always be of the form
     *
     *   arch-bits-os-processor-feature1-feature2...featureN.
     *
     * Note that is guaranteed that Target(t1.to_string()) == t1,
     * but not that Target(s).to_string() == s (since there can be
     * multiple strings that parse to the same Target)...
     * *unless* t1 contains 'unknown' fields (in which case you'll get a string
     * that can't be parsed, which is intentional).
     */
    std::string to_string() const;

    /** Given a data type, return an estimate of the "natural" vector size
     * for that data type when compiling for this Target. */
    int natural_vector_size(const Halide::Type &t) const;

    /** Given a data type, return an estimate of the "natural" vector size
     * for that data type when compiling for this Target. */
    template<typename data_t>
    int natural_vector_size() const {
        return natural_vector_size(type_of<data_t>());
    }

    /** Return true iff 64 bits and has_feature(LargeBuffers). */
    bool has_large_buffers() const {
        return bits == 64 && has_feature(LargeBuffers);
    }

    /** Return the maximum buffer size in bytes supported on this
     * Target. This is 2^31 - 1 except on 64-bit targets when the LargeBuffers
     * feature is enabled, which expands the maximum to 2^63 - 1. */
    int64_t maximum_buffer_size() const {
        if (has_large_buffers()) {
            return (((uint64_t)1) << 63) - 1;
        } else {
            return (((uint64_t)1) << 31) - 1;
        }
    }

    /** Get the minimum cuda capability found as an integer. Returns
     * 20 (our minimum supported cuda compute capability) if no cuda
     * features are set. */
    int get_cuda_capability_lower_bound() const;

    /** Was libHalide compiled with support for this target? */
    bool supported() const;

    /** Return a bitset of the Featuress set in this Target (set = 1).
     * Note that while this happens to be the current internal representation,
     * that might not always be the case. */
    const std::bitset<FeatureEnd> &get_features_bitset() const {
        return features;
    }

    /** Return the name corresponding to a given Feature, in the form
     * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug"). */
    static std::string feature_to_name(Target::Feature feature);

    /** Return the feature corresponding to a given name, in the form
     * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug").
     * If the string is not a known feature name, return FeatureEnd. */
    static Target::Feature feature_from_name(const std::string &name);

private:
    /** A bitmask that stores the active features. */
    std::bitset<FeatureEnd> features;
};

/** Return the target corresponding to the host machine. */
Target get_host_target();

/** Return the target that Halide will use. If HL_TARGET is set it
 * uses that. Otherwise calls \ref get_host_target */
Target get_target_from_environment();

/** Return the target that Halide will use for jit-compilation. If
 * HL_JIT_TARGET is set it uses that. Otherwise calls \ref
 * get_host_target. Throws an error if the architecture, bit width,
 * and OS of the target do not match the host target, so this is only
 * useful for controlling the feature set. */
Target get_jit_target_from_environment();

/** Get the Target feature corresponding to a DeviceAPI. For device
 * apis that do not correspond to any single target feature, returns
 * Target::FeatureEnd */
Target::Feature target_feature_for_device_api(DeviceAPI api);

namespace Internal {

void target_test();
}

}  // namespace Halide

#endif