// https://github.com/shader-slang/slang
// Raw File
// Tip revision: d7ba60c993366b4aaf6ef8ee7d8eab940d61eac8 authored by Yong He on 03 April 2023, 03:43:09 UTC
// Fix type legalization pass. (#2768)
// Tip revision: d7ba60c
// hlsl.meta.slang
// Slang HLSL compatibility library

// `UINT` is an alias for `uint`; it is used by the `Interlocked*` declarations in this file.
typedef uint UINT;

// Generic `AppendStructuredBuffer` type, bound to the
// `HLSLAppendStructuredBufferType` magic type and its IR opcode.
// No target-specific intrinsic mappings are declared for its methods here.
__generic<T>
__magic_type(HLSLAppendStructuredBufferType)
__intrinsic_type($(kIROp_HLSLAppendStructuredBufferType))
struct AppendStructuredBuffer
{
    // Appends `value` to the buffer.
    void Append(T value);

    // Reports the buffer dimensions through the two `out` parameters.
    void GetDimensions(
        out uint numStructs,
        out uint stride);
};

// Read-only byte-addressed buffer type, bound to the `HLSLByteAddressBufferType`
// magic type. The GLSL mappings below access the buffer through a `_data` member
// indexed in 4-byte units (the byte location divided by 4).
__magic_type(HLSLByteAddressBufferType)
__intrinsic_type($(kIROp_HLSLByteAddressBufferType))
struct ByteAddressBuffer
{
    // Writes the buffer size to `dim`; on GLSL this is `_data.length() * 4`.
    __target_intrinsic(glsl, "$1 = $0._data.length() * 4")
    [__readNone]
    void GetDimensions(
        out uint dim);

    // Loads one `uint` starting at byte offset `location`.
    __target_intrinsic(glsl, "$0._data[$1/4]")
    [__readNone]
    uint Load(int location);

    // Overload with an extra `out uint status`; no GLSL mapping is declared for it here.
    [__readNone]
    uint Load(int location, out uint status);

    // Loads two consecutive `uint` values starting at byte offset `location`.
    __target_intrinsic(glsl, "uvec2($0._data[$1/4], $0._data[$1/4+1])")
    [__readNone]
    uint2 Load2(int location);

    // Overload with an extra `out uint status`; no GLSL mapping is declared for it here.
    [__readNone]
    uint2 Load2(int location, out uint status);

    // Loads three consecutive `uint` values starting at byte offset `location`.
    __target_intrinsic(glsl, "uvec3($0._data[$1/4], $0._data[$1/4+1], $0._data[$1/4+2])")
    [__readNone]
    uint3 Load3(int location);

    // Overload with an extra `out uint status`; no GLSL mapping is declared for it here.
    [__readNone]
    uint3 Load3(int location, out uint status);

    // Loads four consecutive `uint` values starting at byte offset `location`.
    __target_intrinsic(glsl, "uvec4($0._data[$1/4], $0._data[$1/4+1], $0._data[$1/4+2], $0._data[$1/4+3])")
    [__readNone]
    uint4 Load4(int location);

    // Overload with an extra `out uint status`; no GLSL mapping is declared for it here.
    [__readNone]
    uint4 Load4(int location, out uint status);

    // Generic load of a `T` at byte offset `location`; forwards to the
    // `__byteAddressBufferLoad` intrinsic.
    [__readNone]
    T Load<T>(int location)
    {
        return __byteAddressBufferLoad<T>(this, location);
    }
};

// AtomicAdd

// Make the GLSL atomicAdd available.
// We have separate int/float implementations, as the float version requires some specific extensions
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt

// Float atomic add for GLSL; emitted as `atomicAdd(value, amount)`.
// Declared with GLSL version 430 and the `GL_EXT_shader_atomic_float` extension.
// Returns a `float` (presumably the value before the add, as GLSL `atomicAdd` does -- confirm).
__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_float)
float __atomicAdd(__ref float value, float amount);

// Helper for hlsl, using NVAPI
// 64-bit add on a `RWByteAddressBuffer` for HLSL, emitted as a call to NVAPI's
// `NvInterlockedAddUint64(buf, offset, value)`. The 64-bit operand and result
// are carried as `uint2` (see `__asuint2` / `__asuint64`).
__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2);

// atomic add for hlsl using SM6.6
// Signed and unsigned 64-bit adds for HLSL, emitted as the buffer method
// `InterlockedAdd64(offset, value, originalValue)`; the result is delivered
// through the `out` parameter rather than a return value.
__target_intrinsic(hlsl, "$0.InterlockedAdd64($1, $2, $3)")
void __atomicAdd(RWByteAddressBuffer buf, uint offset, int64_t value, out int64_t originalValue);
__target_intrinsic(hlsl, "$0.InterlockedAdd64($1, $2, $3)")
void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint64_t originalValue);

// Int versions require glsl 4.30
// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml

// 32-bit signed add for GLSL (`atomicAdd`), declared with GLSL version 430.
__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
int __atomicAdd(__ref int value, int amount);

// 32-bit unsigned add for GLSL (`atomicAdd`), declared with GLSL version 430.
__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
uint __atomicAdd(__ref uint value, uint amount);

// 64-bit signed add for GLSL; additionally declares `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
int64_t __atomicAdd(__ref int64_t value, int64_t amount);

// 64-bit unsigned add for GLSL; additionally declares `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicAdd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount);

// Cas - Compare and swap

// Helper for HLSL, using NVAPI

// 64-bit compare-and-swap on a `RWByteAddressBuffer` for HLSL, emitted as a call to
// NVAPI's `NvInterlockedCompareExchangeUint64(buf, offset, compareValue, value)`.
// 64-bit operands and the result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)")
[__requiresNVAPI]
uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value);

// CAS using SM6.6
// Signed and unsigned 64-bit compare-and-swap for HLSL, emitted as the buffer method
// `InterlockedCompareExchange64(offset, compare_value, value, original_value)`;
// the result is delivered through the `out` parameter.
__target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)")
void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in int64_t value, out int64_t original_value);
__target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)")
void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value);

// Compare-and-swap on a signed 64-bit value for GLSL; emitted as
// `atomicCompSwap(ioValue, compareValue, newValue)`.
// Declared with GLSL version 430 and the `GL_EXT_shader_atomic_int64` extension.
// The result is declared as `int64_t` so that it has the same type as `ioValue`,
// matching the signed 64-bit `__atomicAdd` overload (it was previously declared
// as `uint64_t`, which forced a signed/unsigned round trip at call sites).
//   ioValue      - reference to the value operated on
//   compareValue - value that `ioValue` is compared against
//   newValue     - replacement value handed to `atomicCompSwap`
__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
int64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue);

// Compare-and-swap on an unsigned 64-bit value for GLSL; emitted as
// `atomicCompSwap(ioValue, compareValue, newValue)`.
// Declared with GLSL version 430 and the `GL_EXT_shader_atomic_int64` extension.
__target_intrinsic(glsl, "atomicCompSwap($0, $1, $2)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue);

// Max

// HLSL: 64-bit max on a `RWByteAddressBuffer` via NVAPI's `NvInterlockedMaxUint64`;
// the 64-bit operand and result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value);

// GLSL: unsigned 64-bit `atomicMax`; declared with GLSL version 430 and
// `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicMax($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value);

// Min

// HLSL: 64-bit min on a `RWByteAddressBuffer` via NVAPI's `NvInterlockedMinUint64`;
// the 64-bit operand and result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value);

// GLSL: unsigned 64-bit `atomicMin`; declared with GLSL version 430 and
// `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicMin($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value);

// And

// HLSL: 64-bit bitwise AND on a `RWByteAddressBuffer` via NVAPI's `NvInterlockedAndUint64`;
// the 64-bit operand and result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value);

// GLSL: unsigned 64-bit `atomicAnd`; declared with GLSL version 430 and
// `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicAnd($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value);

// Or

// HLSL: 64-bit bitwise OR on a `RWByteAddressBuffer` via NVAPI's `NvInterlockedOrUint64`;
// the 64-bit operand and result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value);

// GLSL: unsigned 64-bit `atomicOr`; declared with GLSL version 430 and
// `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicOr($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value);

// Xor

// HLSL: 64-bit bitwise XOR on a `RWByteAddressBuffer` via NVAPI's `NvInterlockedXorUint64`;
// the 64-bit operand and result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value);

// GLSL: unsigned 64-bit `atomicXor`; declared with GLSL version 430 and
// `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicXor($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value);

// Exchange

// HLSL: 64-bit exchange on a `RWByteAddressBuffer` via NVAPI's `NvInterlockedExchangeUint64`;
// the 64-bit operand and result are carried as `uint2`.
__target_intrinsic(hlsl, "NvInterlockedExchangeUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value);

// GLSL: unsigned 64-bit `atomicExchange`; declared with GLSL version 430 and
// `GL_EXT_shader_atomic_int64`.
__target_intrinsic(glsl, "atomicExchange($0, $1)")
__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value);

// Conversion between uint64_t and uint2

// Splits a 64-bit unsigned integer into a `uint2`:
// `.x` receives the low 32 bits and `.y` the high 32 bits.
uint2 __asuint2(uint64_t i)
{
    uint lowBits = uint(i);
    uint highBits = uint(i >> 32);
    return uint2(lowBits, highBits);
}

// Recombines a `uint2` into a 64-bit unsigned integer:
// `.y` supplies the high 32 bits and `.x` the low 32 bits.
uint64_t __asuint64(uint2 i)
{
    uint64_t highBits = uint64_t(i.y) << 32;
    return highBits | i.x;
}

//

// Generic load of a `T` at byte `offset`, one overload per byte-address buffer type.
// All three map directly to the `kIROp_ByteAddressBufferLoad` IR opcode.
__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);

__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset);

__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset);

// Generic store of `value` at byte `offset`, one overload per mutable byte-address
// buffer type. Both map directly to the `kIROp_ByteAddressBufferStore` IR opcode.
__intrinsic_op($(kIROp_ByteAddressBufferStore))
void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, T value);

__intrinsic_op($(kIROp_ByteAddressBufferStore))
void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, T value);

// Read-only generic structured buffer, bound to the `HLSLStructuredBufferType`
// magic type. The GLSL mappings access elements through a `_data` member.
__generic<T>
__magic_type(HLSLStructuredBufferType)
__intrinsic_type($(kIROp_HLSLStructuredBufferType))
struct StructuredBuffer
{
    // On GLSL, `numStructs` receives `_data.length()` and `stride` is set to 0.
    __target_intrinsic(glsl, "$1 = $0._data.length(); $2 = 0")
    [__readNone]
    void GetDimensions(
        out uint numStructs,
        out uint stride);

    // Loads the element at index `location`.
    // The `spirv_direct` mapping builds an access chain and then loads through it.
    __target_intrinsic(glsl, "$0._data[$1]")
    __target_intrinsic(spirv_direct, "%addr = OpAccessChain resultType*StorageBuffer resultId _0 const(int, 0) _1; OpLoad resultType resultId %addr;")
    [__readNone]
    T Load(int location);

    // Overload with an extra `out uint status`; no target mapping is declared for it here.
    [__readNone]
    T Load(int location, out uint status);

    // Read-only element access by index; uses the same mappings as `Load`.
    __subscript(uint index) -> T
    {
        __target_intrinsic(glsl, "$0._data[$1]")
        __target_intrinsic(spirv_direct, "%addr = OpAccessChain resultType*StorageBuffer resultId _0 const(int, 0) _1; OpLoad resultType resultId %addr;")
        [__readNone]
        get;
    };
};

// Generic `ConsumeStructuredBuffer` type, bound to the
// `HLSLConsumeStructuredBufferType` magic type and its IR opcode.
// No target-specific intrinsic mappings are declared for its methods here.
__generic<T>
__magic_type(HLSLConsumeStructuredBufferType)
__intrinsic_type($(kIROp_HLSLConsumeStructuredBufferType))
struct ConsumeStructuredBuffer
{
    // Returns a `T` taken from the buffer.
    T Consume();

    // Reports the buffer dimensions through the two `out` parameters.
    void GetDimensions(
        out uint numStructs,
        out uint stride);
};

// Generic `InputPatch` type parameterized on element type `T` and an integer `N`
// (presumably the number of control points -- confirm); bound to the
// `HLSLInputPatchType` magic type.
__generic<T, let N : int>
__magic_type(HLSLInputPatchType)
__intrinsic_type($(kIROp_HLSLInputPatchType))
struct InputPatch
{
    // Element access by index.
    __subscript(uint index) -> T;
};

// Generic `OutputPatch` type parameterized on element type `T` and an integer `N`
// (presumably the number of control points -- confirm); bound to the
// `HLSLOutputPatchType` magic type.
__generic<T, let N : int>
__magic_type(HLSLOutputPatchType)
__intrinsic_type($(kIROp_HLSLOutputPatchType))
struct OutputPatch
{
    // Element access by index.
    __subscript(uint index) -> T;
};

${{{{
// Table driving the generation below: one entry per mutable byte-address
// buffer type, pairing its IR opcode with the name of the struct to declare.
static const struct {
    IROp op;
    char const* name;
} kMutableByteAddressBufferCases[] =
{
    { kIROp_HLSLRWByteAddressBufferType,                "RWByteAddressBuffer" },
    { kIROp_HLSLRasterizerOrderedByteAddressBufferType, "RasterizerOrderedByteAddressBuffer" },
};
// The text up to the matching closing brace is emitted once per table entry.
for(auto item : kMutableByteAddressBufferCases) {
}}}}

// Mutable byte-addressed buffer type. The struct name, magic type and IR opcode
// are spliced in from the current table entry.
__magic_type(HLSL$(item.name)Type)
__intrinsic_type($(item.op))
struct $(item.name)
{
    // Note(tfoley): supports all operations from `ByteAddressBuffer`
    // TODO(tfoley): can this be made a sub-type?

    // Writes the buffer size to `dim`; on GLSL this is `_data.length() * 4`.
    __target_intrinsic(glsl, "$1 = $0._data.length() * 4")
    void GetDimensions(
        out uint dim);

    // Loads one `uint` starting at byte offset `location`.
    __target_intrinsic(glsl, "$0._data[$1/4]")
    uint Load(int location);

    uint Load(int location, out uint status);

    // Loads two consecutive `uint` values starting at byte offset `location`.
    __target_intrinsic(glsl, "uvec2($0._data[$1/4], $0._data[$1/4+1])")
    uint2 Load2(int location);

    uint2 Load2(int location, out uint status);

    // Loads three consecutive `uint` values starting at byte offset `location`.
    __target_intrinsic(glsl, "uvec3($0._data[$1/4], $0._data[$1/4+1], $0._data[$1/4+2])")
    uint3 Load3(int location);

    uint3 Load3(int location, out uint status);

    // Loads four consecutive `uint` values starting at byte offset `location`.
    __target_intrinsic(glsl, "uvec4($0._data[$1/4], $0._data[$1/4+1], $0._data[$1/4+2], $0._data[$1/4+3])")
    uint4 Load4(int location);

    uint4 Load4(int location, out uint status);

    // Generic load of a `T` at byte offset `location`; forwards to the
    // `__byteAddressBufferLoad` intrinsic.
    T Load<T>(int location)
    {
        return __byteAddressBufferLoad<T>(this, location);
    }
${{{{
    // The float/64-bit atomic extensions below are emitted for RWByteAddressBuffer only.
    if (item.op == kIROp_HLSLRWByteAddressBufferType)
    {
}}}}

    // float32 and int64 atomic support. This is a Slang specific extension, it uses
    // GL_EXT_shader_atomic_float on Vulkan
    // NvAPI support on DX
    // NOTE! To use this feature on HLSL based targets the path to 'nvHLSLExtns.h' from the NvAPI SDK must
    // be set. Note that this include will be added to the *output* that is passed to a downstream compiler.
    // Also note that you *can* include NVAPI headers in your Slang source, and directly use NVAPI functions
    // Directly using NVAPI functions does *not* add the #include on the output
    // Finally note you can *mix* NVAPI direct calls, and use of NVAPI intrinsics below. This doesn't cause
    // any clashes, as Slang will emit any NVAPI function it parsed (say via a include in Slang source) with
    // unique functions.
    //
    // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float
    // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html

    // F32 Add

    // HLSL maps to NVAPI's `NvInterlockedAddFp32`, CUDA to `atomicAdd` on a float
    // pointer obtained at the byte address; the result is stored in `originalValue`.
    __target_intrinsic(hlsl, "($3 = NvInterlockedAddFp32($0, $1, $2))")
    __cuda_sm_version(2.0)
    __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))")
    [__requiresNVAPI]
    void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue);

    // GLSL: views this buffer as a `RWStructuredBuffer<float>` and adds to the
    // element at index `byteAddress / 4`.
    __specialized_for_target(glsl)
    void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue)
    {
        RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
        originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
    }

    // Without returning original value

    __target_intrinsic(hlsl, "(NvInterlockedAddFp32($0, $1, $2))")
    [__requiresNVAPI]
    __cuda_sm_version(2.0)
    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<float>($1), $2)")
    void InterlockedAddF32(uint byteAddress, float valueToAdd);

    __specialized_for_target(glsl)
    void InterlockedAddF32(uint byteAddress, float valueToAdd)
    {
        RWStructuredBuffer<float> buf = __getEquivalentStructuredBuffer<float>(this);
        __atomicAdd(buf[byteAddress / 4], valueToAdd);
    }

    // Int64 Add
    // CUDA maps to `atomicAdd` on a `uint64_t` pointer obtained at the byte address.
    __cuda_sm_version(6.0)
    __target_intrinsic(cuda, "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))")
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue);

    // HLSL: goes through the NVAPI `uint2` helper, converting with `__asuint2` / `__asuint64`.
    __specialized_for_target(hlsl)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
    {
        outOriginalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
    }

    // GLSL: views this buffer as a `RWStructuredBuffer<int64_t>` and adds to the
    // element at index `byteAddress / 8`.
    __specialized_for_target(glsl)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
    {
        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }

    // Without returning original value
    __cuda_sm_version(6.0)
    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<uint64_t>($1), $2)")
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);

    __specialized_for_target(hlsl)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
    {
        __atomicAdd(this, byteAddress, __asuint2(valueToAdd));
    }

    __specialized_for_target(glsl)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
    {
        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
        __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }

    // Cas uint64_t

    // CUDA maps to `atomicCAS`; the HLSL and GLSL specializations below use the
    // `__cas` helpers (NVAPI `uint2` form on HLSL, structured-buffer view on GLSL).
    __target_intrinsic(cuda, "(*$4 = atomicCAS($0._getPtrAt<uint64_t>($1), $2, $3))")
    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);

    __specialized_for_target(hlsl)
    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value)));
    }

    __specialized_for_target(glsl)
    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
    }

    // The U64 operations below all follow the same three-way pattern:
    // a CUDA intrinsic mapping, an HLSL specialization using the NVAPI `uint2`
    // helper, and a GLSL specialization that indexes a `RWStructuredBuffer<uint64_t>`
    // view at `byteAddress / 8`.

    // Max

    __cuda_sm_version(3.5)
    __target_intrinsic(cuda, "atomicMax($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicMax(buf[byteAddress / 8], value);
    }

    // Min

    __cuda_sm_version(3.5)
    __target_intrinsic(cuda, "atomicMin($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicMin(buf[byteAddress / 8], value);
    }

    // And

    __target_intrinsic(cuda, "atomicAnd($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicAnd(buf[byteAddress / 8], value);
    }

    // Or

    __target_intrinsic(cuda, "atomicOr($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicOr(buf[byteAddress / 8], value);
    }

    // Xor

    __target_intrinsic(cuda, "atomicXor($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicXor(buf[byteAddress / 8], value);
    }

    // Exchange

    __target_intrinsic(cuda, "atomicExch($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicExchange(buf[byteAddress / 8], value);
    }

    // SM6.6 64-bit atomics.
    // The HLSL specializations forward to the `__atomicAdd` / `__cas` overloads that
    // take an `out` parameter; the GLSL ones use a structured-buffer view as above.
    __specialized_for_target(hlsl)
    void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
    {
        __atomicAdd(this, byteAddress, valueToAdd, outOriginalValue);
    }
    __specialized_for_target(glsl)
    void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
    {
        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }
    __specialized_for_target(hlsl)
    void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t outOriginalValue)
    {
        __atomicAdd(this, byteAddress, valueToAdd, outOriginalValue);
    }
    __specialized_for_target(glsl)
    void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t originalValue)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }
    __specialized_for_target(hlsl)
    void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue)
    {
        __cas(this, byteAddress, compareValue, value, outOriginalValue);
    }
    __specialized_for_target(glsl)
    void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue)
    {
        RWStructuredBuffer<int64_t> buf = __getEquivalentStructuredBuffer<int64_t>(this);
        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
    }
    __specialized_for_target(hlsl)
    void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        __cas(this, byteAddress, compareValue, value, outOriginalValue);
    }
    __specialized_for_target(glsl)
    void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        RWStructuredBuffer<uint64_t> buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
    }
${{{{
    }
}}}}

    // Added operations:
    // On GLSL each 32-bit `Interlocked*` method maps to the corresponding `atomic*`
    // built-in applied to `_data[dest/4]`; the forms with `original_value` also
    // assign the built-in's result to it.

    __target_intrinsic(glsl, "($3 = atomicAdd($0._data[$1/4], $2))")
    void InterlockedAdd(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicAdd($0._data[$1/4], $2)")
    void InterlockedAdd(
        UINT dest,
        UINT value);

    __target_intrinsic(glsl, "($3 = atomicAnd($0._data[$1/4], $2))")
    void InterlockedAnd(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicAnd($0._data[$1/4], $2)")
    void InterlockedAnd(
        UINT dest,
        UINT value);

    __target_intrinsic(glsl, "($4 = atomicCompSwap($0._data[$1/4], $2, $3))")
    void InterlockedCompareExchange(
        UINT dest,
        UINT compare_value,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicCompSwap($0._data[$1/4], $2, $3)")
    void InterlockedCompareStore(
        UINT dest,
        UINT compare_value,
        UINT value);

    __target_intrinsic(glsl, "($3 = atomicExchange($0._data[$1/4], $2))")
    void InterlockedExchange(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "($3 = atomicMax($0._data[$1/4], $2))")
    void InterlockedMax(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicMax($0._data[$1/4], $2)")
    void InterlockedMax(
        UINT dest,
        UINT value);

    __target_intrinsic(glsl, "($3 = atomicMin($0._data[$1/4], $2))")
    void InterlockedMin(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicMin($0._data[$1/4], $2)")
    void InterlockedMin(
        UINT dest,
        UINT value);

    __target_intrinsic(glsl, "($3 = atomicOr($0._data[$1/4], $2))")
    void InterlockedOr(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicOr($0._data[$1/4], $2)")
    void InterlockedOr(
        UINT dest,
        UINT value);

    __target_intrinsic(glsl, "($3 = atomicXor($0._data[$1/4], $2))")
    void InterlockedXor(
        UINT dest,
        UINT value,
        out UINT original_value);

    __target_intrinsic(glsl, "atomicXor($0._data[$1/4], $2)")
    void InterlockedXor(
        UINT dest,
        UINT value);

    // Stores of one to four `uint` values starting at byte `address`; on GLSL these
    // become component-wise assignments into consecutive `_data` elements.
    __target_intrinsic(glsl, "$0._data[$1/4] = $2")
    void Store(
        uint address,
        uint value);

    __target_intrinsic(glsl, "$0._data[$1/4] = $2.x, $0._data[$1/4+1] = $2.y")
    void Store2(
        uint address,
        uint2 value);

    __target_intrinsic(glsl, "$0._data[$1/4] = $2.x, $0._data[$1/4+1] = $2.y, $0._data[$1/4+2] = $2.z")
    void Store3(
        uint address,
        uint3 value);

    __target_intrinsic(glsl, "$0._data[$1/4] = $2.x, $0._data[$1/4+1] = $2.y, $0._data[$1/4+2] = $2.z, $0._data[$1/4+3] = $2.w")
    void Store4(
        uint address,
        uint4 value);

    // Generic store of a `T` at byte `offset`; forwards to the
    // `__byteAddressBufferStore` intrinsic.
    void Store<T>(int offset, T value)
    {
        __byteAddressBufferStore(this, offset, value);
    }
};

${{{{
// End of the loop over kMutableByteAddressBufferCases.
}
}}}}

${{{{
// Table driving the generation below: one entry per mutable structured
// buffer type, pairing its IR opcode with the name of the struct to declare.
static const struct {
    IROp op;
    char const* name;
} kMutableStructuredBufferCases[] =
{
    { kIROp_HLSLRWStructuredBufferType,                "RWStructuredBuffer" },
    { kIROp_HLSLRasterizerOrderedStructuredBufferType, "RasterizerOrderedStructuredBuffer" },
};
// The text up to the matching closing brace is emitted once per table entry.
for(auto item : kMutableStructuredBufferCases) {
}}}}


// Mutable generic structured buffer type. The struct name, magic type and IR
// opcode are spliced in from the current table entry.
__generic<T>
__magic_type(HLSL$(item.name)Type)
__intrinsic_type($(item.op))
struct $(item.name)
{
    // Counter operations; no target mapping is declared for them here.
    uint DecrementCounter();

    // On GLSL, `numStructs` receives `_data.length()` and `stride` is set to 0.
    __target_intrinsic(glsl, "$1 = $0._data.length(); $2 = 0")
    void GetDimensions(
        out uint numStructs,
        out uint stride);

    uint IncrementCounter();

    // Loads the element at index `location`.
    __target_intrinsic(glsl, "$0._data[$1]")
    __target_intrinsic(spirv_direct, "%addr = OpAccessChain resultType*StorageBuffer resultId _0 const(int, 0) _1; OpLoad resultType resultId %addr;")
    T Load(int location);
    T Load(int location, out uint status);

    // Element access by index through a `ref` accessor, so the element can be
    // used as a reference (e.g. as the `__ref` operand of the atomic helpers).
    __subscript(uint index) -> T
    {
        __target_intrinsic(glsl, "$0._data[$1]")
        __target_intrinsic(spirv_direct, "*StorageBuffer OpAccessChain resultType resultId _0 const(int, 0) _1")
        ref;
    }
};

${{{{
// End of the loop over kMutableStructuredBufferCases.
}
}}}}

// Generic `PointStream` type, bound to the `HLSLPointStreamType` magic type.
__generic<T>
__magic_type(HLSLPointStreamType)
__intrinsic_type($(kIROp_HLSLPointStreamType))
struct PointStream
{
    // On GLSL this is emitted as `EmitVertex()`; `value` does not appear in the mapping.
    __target_intrinsic(glsl, "EmitVertex()")
    void Append(T value);

    // On GLSL this is emitted as `EndPrimitive()`.
    __target_intrinsic(glsl, "EndPrimitive()")
    void RestartStrip();
};

// Generic `LineStream` type, bound to the `HLSLLineStreamType` magic type.
__generic<T>
__magic_type(HLSLLineStreamType)
__intrinsic_type($(kIROp_HLSLLineStreamType))
struct LineStream
{
    // On GLSL this is emitted as `EmitVertex()`; `value` does not appear in the mapping.
    __target_intrinsic(glsl, "EmitVertex()")
    void Append(T value);

    // On GLSL this is emitted as `EndPrimitive()`.
    __target_intrinsic(glsl, "EndPrimitive()")
    void RestartStrip();
};

// Generic `TriangleStream` type, bound to the `HLSLTriangleStreamType` magic type.
__generic<T>
__magic_type(HLSLTriangleStreamType)
__intrinsic_type($(kIROp_HLSLTriangleStreamType))
struct TriangleStream
{
    // On GLSL this is emitted as `EmitVertex()`; `value` does not appear in the mapping.
    __target_intrinsic(glsl, "EmitVertex()")
    void Append(T value);

    // On GLSL this is emitted as `EndPrimitive()`.
    __target_intrinsic(glsl, "EndPrimitive()")
    void RestartStrip();
};

// Helper macros that expand to the body of an element-wise function.
// Each one declares a local `result`, fills it in a loop over index `i`, and ends
// with `return result` (the terminating semicolon is supplied at the use site).
// The VECTOR_* forms iterate over COUNT elements; the MATRIX_* forms iterate over
// ROWS rows and apply FUNC to whole rows (COLS only appears in the result type).

// Applies FUNC to each element of VALUE.
#define VECTOR_MAP_UNARY(TYPE, COUNT, FUNC, VALUE) \
    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(VALUE[i]); } return result

// Applies FUNC to each row of VALUE.
#define MATRIX_MAP_UNARY(TYPE, ROWS, COLS, FUNC, VALUE) \
    matrix<TYPE,ROWS,COLS> result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(VALUE[i]); } return result

// Applies FUNC to corresponding elements of LEFT and RIGHT.
#define VECTOR_MAP_BINARY(TYPE, COUNT, FUNC, LEFT, RIGHT) \
    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result

// Applies FUNC to corresponding rows of LEFT and RIGHT.
#define MATRIX_MAP_BINARY(TYPE, ROWS, COLS, FUNC, LEFT, RIGHT) \
    matrix<TYPE,ROWS,COLS> result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result

// Applies FUNC to corresponding elements of A, B and C.
#define VECTOR_MAP_TRINARY(TYPE, COUNT, FUNC, A, B, C) \
    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result

// Applies FUNC to corresponding rows of A, B and C.
#define MATRIX_MAP_TRINARY(TYPE, ROWS, COLS, FUNC, A, B, C) \
    matrix<TYPE,ROWS,COLS> result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result

// Try to terminate the current draw or dispatch call (HLSL SM 4.0)
// Declared without a body or any target-specific intrinsic mapping.
void abort();

// Absolute value (HLSL SM 1.0)

// Scalar `abs` for built-in integer types.
// HLSL and GLSL use the target's own `abs`; CUDA and C++ are emitted through a
// `_abs` helper with a `$P` prefix (presumably type-dependent -- confirm).
// The SPIR-V mapping is a GLSL.std.450 extended instruction chosen through
// `fi(FAbs, SAbs)` (presumably float vs. integer selection -- confirm).
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_abs($0)")
__target_intrinsic(cpp, "$P_abs($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
T abs(T x);
/*{
    // Note: this simple definition may not be appropriate for floating-point inputs
    return x < 0 ? -x : x;
}*/

// Vector `abs` for built-in integer element types.
// HLSL, GLSL and SPIR-V have intrinsic mappings; the body below is the
// element-wise definition (via `VECTOR_MAP_UNARY`) for the remaining targets.
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
vector<T, N> abs(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, abs, x);
}

// Matrix `abs` for built-in integer element types.
// Only HLSL has an intrinsic mapping; elsewhere the body applies the vector
// `abs` to each row (via `MATRIX_MAP_UNARY`).
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> abs(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, abs, x);
}

// Scalar `abs` for built-in floating-point types; the target mappings are the
// same as for the integer scalar overload.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_abs($0)")
__target_intrinsic(cpp, "$P_abs($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
T abs(T x);

// Vector `abs` for built-in floating-point element types.
// HLSL, GLSL and SPIR-V have intrinsic mappings; the body below is the
// element-wise definition (via `VECTOR_MAP_UNARY`) for the remaining targets.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
vector<T, N> abs(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, abs, x);
}

// Matrix `abs` for built-in floating-point element types.
// Only HLSL has an intrinsic mapping; elsewhere the body applies the vector
// `abs` to each row (via `MATRIX_MAP_UNARY`).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> abs(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, abs, x);
}

// Inverse cosine (HLSL SM 1.0)

// Scalar `acos` for built-in floating-point types.
// HLSL and GLSL use the target's own `acos`; CUDA and C++ are emitted through a
// prefixed `_acos` helper; SPIR-V uses the GLSL.std.450 `Acos` instruction.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_acos($0)")
__target_intrinsic(cpp, "$P_acos($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Acos _0")
[__readNone]
T acos(T x);

// Vector `acos`. HLSL, GLSL and SPIR-V have intrinsic mappings; the body is the
// element-wise definition (via `VECTOR_MAP_UNARY`) for the remaining targets.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Acos _0")
[__readNone]
vector<T, N> acos(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, acos, x);
}

// Matrix `acos`. Only HLSL has an intrinsic mapping; elsewhere the body applies
// the vector `acos` to each row (via `MATRIX_MAP_UNARY`).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> acos(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, acos, x);
}

// Test if all components are non-zero (HLSL SM 1.0)
// Scalar `all`: on C++, CUDA and GLSL this is emitted as a conversion of the
// argument to `bool`.
__generic<T : __BuiltinType>
__target_intrinsic(cpp, "bool($0)")
__target_intrinsic(cuda, "bool($0)")
__target_intrinsic(glsl, "bool($0)")
[__readNone]
bool all(T x);

// Vector `all`: true only when every component is non-zero.
// HLSL uses its own `all`; GLSL converts the argument to a `bvec` of matching
// size and calls `all` on it. Other targets use the loop below, which ANDs
// together the scalar `all` of each component.
__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "all(bvec$N0($0))")
[__readNone]
bool all(vector<T,N> x)
{
    bool everySet = true;
    for(int lane = 0; lane < N; lane++)
    {
        everySet = everySet && all(x[lane]);
    }
    return everySet;
}

// Matrix `all`: true only when every row satisfies the vector `all`.
// Only HLSL has an intrinsic mapping; other targets use the row loop below.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
bool all(matrix<T,N,M> x)
{
    bool everySet = true;
    for(int row = 0; row < N; row++)
    {
        everySet = everySet && all(x[row]);
    }
    return everySet;
}

// Barrier for writes to all memory spaces (HLSL SM 5.0)
// GLSL: emitted as the comma-joined sequence `memoryBarrier()`, `groupMemoryBarrier()`,
// `memoryBarrierImage()`, `memoryBarrierBuffer()`. CUDA: emitted as `__threadfence()`.
__target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
__target_intrinsic(cuda, "__threadfence()")
void AllMemoryBarrier();

// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)
// GLSL: the same memory-barrier sequence as `AllMemoryBarrier`, followed by `barrier()`.
// CUDA: emitted as `__syncthreads()`.
__target_intrinsic(glsl, "memoryBarrier(), groupMemoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()")
__target_intrinsic(cuda, "__syncthreads()")
void AllMemoryBarrierWithGroupSync();

// Test if any components is non-zero (HLSL SM 1.0)

// Scalar `any`: on C++, CUDA and GLSL this is emitted as a conversion of the
// argument to `bool`.
__generic<T : __BuiltinType>
__target_intrinsic(cpp, "bool($0)")
__target_intrinsic(cuda, "bool($0)")
__target_intrinsic(glsl, "bool($0)")
[__readNone]
bool any(T x);

// Vector `any`: true when at least one component is non-zero.
// HLSL uses its own `any`; GLSL converts the argument to a `bvec` of matching
// size and calls `any` on it. Other targets use the loop below, which ORs
// together the scalar `any` of each component.
__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "any(bvec$N0($0))")
[__readNone]
bool any(vector<T, N> x)
{
    bool someSet = false;
    for(int lane = 0; lane < N; lane++)
    {
        someSet = someSet || any(x[lane]);
    }
    return someSet;
}

// Matrix `any`: true when at least one row satisfies the vector `any`.
// Only HLSL has an intrinsic mapping; other targets use the row loop below.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
bool any(matrix<T, N, M> x)
{
    bool someSet = false;
    for(int row = 0; row < N; row++)
    {
        someSet = someSet || any(x[row]);
    }
    return someSet;
}


// Reinterpret bits as a double (HLSL SM 5.0)

// Builds a `double` from two 32-bit halves, `lowbits` and `highbits`.
// GLSL packs them with `packDouble2x32(uvec2(lowbits, highbits))` and declares the
// `GL_ARB_gpu_shader5` extension; CUDA and C++ go through a prefixed `_asdouble`
// helper. The SPIR-V mapping constructs a `uint2` and passes it to GLSL.std.450
// instruction number 59 (presumably PackDouble2x32 -- confirm).
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "packDouble2x32(uvec2($0, $1))")
__target_intrinsic(cpp, "$P_asdouble($0, $1)")
__target_intrinsic(cuda, "$P_asdouble($0, $1)")
__target_intrinsic(spirv_direct, "%v = OpCompositeConstruct _type(uint2) resultId _0 _1; OpExtInst resultType resultId glsl450 59 %v")
__glsl_extension(GL_ARB_gpu_shader5)
[__readNone]
double asdouble(uint lowbits, uint highbits);

// Reinterpret bits as a float (HLSL SM 4.0)

// Reinterprets the bits of an `int` as a `float`.
// GLSL uses `intBitsToFloat`, SPIR-V uses `OpBitcast`, and CUDA/C++ go through a
// prefixed `_asfloat` helper.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "intBitsToFloat")
__target_intrinsic(cpp, "$P_asfloat($0)")
__target_intrinsic(cuda, "$P_asfloat($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
float asfloat(int x);

// Reinterprets the bits of a `uint` as a `float`.
// GLSL uses `uintBitsToFloat`, SPIR-V uses `OpBitcast`, and CUDA/C++ go through a
// prefixed `_asfloat` helper.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uintBitsToFloat")
__target_intrinsic(cpp, "$P_asfloat($0)")
__target_intrinsic(cuda, "$P_asfloat($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
float asfloat(uint x);

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "intBitsToFloat")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
vector<float, N> asfloat(vector< int, N> x)
{
    VECTOR_MAP_UNARY(float, N, asfloat, x);
}

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uintBitsToFloat")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
vector<float,N> asfloat(vector<uint,N> x)
{
    VECTOR_MAP_UNARY(float, N, asfloat, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<float,N,M> asfloat(matrix< int,N,M> x)
{
    MATRIX_MAP_UNARY(float, N, M, asfloat, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<float,N,M> asfloat(matrix<uint,N,M> x)
{
    MATRIX_MAP_UNARY(float, N, M, asfloat, x);
}

// No op
[__unsafeForceInlineEarly]
[__readNone]
float asfloat(float x)
{ return x; }

__generic<let N : int>
[__unsafeForceInlineEarly]
[__readNone]
vector<float,N> asfloat(vector<float,N> x)
{ return x; }

__generic<let N : int, let M : int>
[__unsafeForceInlineEarly]
[__readNone]
matrix<float,N,M> asfloat(matrix<float,N,M> x)
{ return x; }

// Inverse sine (HLSL SM 1.0)
// Scalar: intrinsic on hlsl/glsl, `asin` helper on cuda/cpp, GLSL.std.450
// `Asin` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_asin($0)")
__target_intrinsic(cpp, "$P_asin($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Asin _0")
[__readNone]
T asin(T x);

// Vector: intrinsic on hlsl/glsl/spirv_direct; the body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Asin _0")
[__readNone]
vector<T, N> asin(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T,N,asin,x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> asin(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T,N,M,asin,x);
}

// Reinterpret bits as an int (HLSL SM 4.0)

// float -> int: GLSL `floatBitsToInt`, SPIR-V `OpBitcast`.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToInt")
__target_intrinsic(cpp, "$P_asint($0)")
__target_intrinsic(cuda, "$P_asint($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
int asint(float x);

// uint -> int: GLSL uses a constructor-style `int(...)` conversion.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "int($0)")
__target_intrinsic(cpp, "$P_asint($0)")
__target_intrinsic(cuda, "$P_asint($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
int asint(uint x);

// Vector of float -> vector of int. The body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToInt")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
vector<int, N> asint(vector<float, N> x)
{
    VECTOR_MAP_UNARY(int, N, asint, x);
}

// Vector of uint -> vector of int: GLSL uses an `ivecN(...)` constructor.
__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "ivec$N0($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
vector<int, N> asint(vector<uint, N> x)
{
    VECTOR_MAP_UNARY(int, N, asint, x);
}

// Matrix of float -> matrix of int; intrinsic on hlsl only, other targets
// use the MATRIX_MAP_UNARY body.
__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<int, N, M> asint(matrix<float, N, M> x)
{
    MATRIX_MAP_UNARY(int, N, M, asint, x);
}

// Matrix of uint -> matrix of int.
__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<int, N, M> asint(matrix<uint, N, M> x)
{
    MATRIX_MAP_UNARY(int, N, M, asint, x);
}

// No op: the argument is already int, so it is returned unchanged.
[__unsafeForceInlineEarly]
[__readNone]
int asint(int x)
{ return x; }

// No op for int vectors.
__generic<let N : int>
[__unsafeForceInlineEarly]
[__readNone]
vector<int,N> asint(vector<int,N> x)
{ return x; }

// No op for int matrices.
__generic<let N : int, let M : int>
[__unsafeForceInlineEarly]
[__readNone]
matrix<int,N,M> asint(matrix<int,N,M> x)
{ return x; }

// Reinterpret bits of double as a uint (HLSL SM 5.0)
//
// Splits `value` into two 32-bit words written to the `lowbits` and
// `highbits` out-parameters. GLSL unpacks with `unpackDouble2x32` and
// assigns `.x` to `lowbits` and `.y` to `highbits`.

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "{ uvec2 v = unpackDouble2x32($0); $1 = v.x; $2 = v.y; }")
__glsl_extension(GL_ARB_gpu_shader5)
__target_intrinsic(cpp, "$P_asuint($0, $1, $2)")
__target_intrinsic(cuda, "$P_asuint($0, $1, $2)")
[__readNone]
void asuint(double value, out uint lowbits, out uint highbits);

// Reinterpret bits as a uint (HLSL SM 4.0)

// float -> uint: GLSL `floatBitsToUint`, SPIR-V `OpBitcast`.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToUint")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
__target_intrinsic(cpp, "$P_asuint($0)")
__target_intrinsic(cuda, "$P_asuint($0)")
[__readNone]
uint asuint(float x);

// int -> uint: GLSL uses a constructor-style `uint(...)` conversion.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uint($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
__target_intrinsic(cpp, "$P_asuint($0)")
__target_intrinsic(cuda, "$P_asuint($0)")
[__readNone]
uint asuint(int x);

// Vector of float -> vector of uint. The body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToUint")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
vector<uint,N> asuint(vector<float,N> x)
{
    VECTOR_MAP_UNARY(uint, N, asuint, x);
}

// Vector of int -> vector of uint: GLSL uses a `uvecN(...)` constructor.
__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uvec$N0($0)")
__target_intrinsic(spirv_direct, "OpBitcast resultType resultId _0")
[__readNone]
vector<uint, N> asuint(vector<int, N> x)
{
    VECTOR_MAP_UNARY(uint, N, asuint, x);
}

// Matrix of float -> matrix of uint; intrinsic on hlsl only, other targets
// use the MATRIX_MAP_UNARY body.
__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<uint,N,M> asuint(matrix<float,N,M> x)
{
    MATRIX_MAP_UNARY(uint, N, M, asuint, x);
}

// Matrix of int -> matrix of uint.
__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<uint, N, M> asuint(matrix<int, N, M> x)
{
    MATRIX_MAP_UNARY(uint, N, M, asuint, x);
}

// No op: the argument is already uint, so it is returned unchanged.
[__unsafeForceInlineEarly]
[__readNone]
uint asuint(uint x)
{ return x; }

// No op for uint vectors.
__generic<let N : int>
[__unsafeForceInlineEarly]
[__readNone]
vector<uint,N> asuint(vector<uint,N> x)
{ return x; }

// No op for uint matrices.
__generic<let N : int, let M : int>
[__unsafeForceInlineEarly]
[__readNone]
matrix<uint,N,M> asuint(matrix<uint,N,M> x)
{ return x; }


// 16-bit bitcast ops (HLSL SM 6.2)
//
// TODO: We need to map these to GLSL/SPIR-V
// operations that don't require an intermediate
// conversion to fp32.

// Identity cases: source and destination types match, so each overload
// (scalar, vector, matrix) simply returns its argument.

[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(float16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }

[__unsafeForceInlineEarly][__readNone] int16_t asint16(int16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }

[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(uint16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }

// Signed<->unsigned cases: the argument is returned directly, relying on
// the conversion between the 16-bit integer types at the return.

[__unsafeForceInlineEarly][__readNone] int16_t asint16(uint16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }

[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(int16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }

// Float->unsigned cases:
// GLSL goes through `packHalf2x16` on a vec2 whose second lane is 0.0 and
// narrows the result to uint16_t; CUDA uses `__half_as_ushort`.

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))")
__target_intrinsic(cuda, "__half_as_ushort")
[__readNone]
uint16_t asuint16(float16_t value);

// Vector/matrix overloads delegate to the VECTOR_MAP_UNARY / MATRIX_MAP_UNARY
// helper macros (defined elsewhere; presumably element-wise).
[__readNone]
vector<uint16_t,N> asuint16<let N : int>(vector<float16_t,N> value)
{ VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); }

[__readNone]
matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> value)
{ MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); }

// Unsigned->float cases:
// GLSL takes the `.x` lane of `unpackHalf2x16` and converts it to
// float16_t; CUDA uses `__ushort_as_half`.

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)")
__target_intrinsic(cuda, "__ushort_as_half")
[__readNone]
float16_t asfloat16(uint16_t value);

[__readNone]
vector<float16_t,N> asfloat16<let N : int>(vector<uint16_t,N> value)
{ VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); }

[__readNone]
matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> value)
{ MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); }

// Float<->signed cases:
// The bodies route through the unsigned variants: float16 -> int16 returns
// `asuint16(value)`, and int16 -> float16 returns `asfloat16(asuint16(value))`.

__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__half_as_short")
[__unsafeForceInlineEarly][__readNone] int16_t asint16(float16_t value) { return asuint16(value); }
__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }

__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__short_as_half")
[__readNone]
[__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }

__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }

// Inverse tangent (HLSL SM 1.0)
// Scalar: intrinsic on hlsl/glsl, `atan` helper on cuda/cpp, GLSL.std.450
// `Atan` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_atan($0)")
__target_intrinsic(cpp, "$P_atan($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan _0")
[__readNone]
T atan(T x);

// Vector: intrinsic on hlsl/glsl/spirv_direct; the body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan _0")
[__readNone]
vector<T, N> atan(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, atan, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> atan(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, atan, x);
}

// Two-argument inverse tangent; note the (y, x) parameter order.
// GLSL has no `atan2`, so it is emitted as the two-argument `atan($0,$1)`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"atan($0,$1)")
__target_intrinsic(cuda, "$P_atan2($0, $1)")
__target_intrinsic(cpp, "$P_atan2($0, $1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
[__readNone]
T atan2(T y, T x);

// Vector: the body delegates to the VECTOR_MAP_BINARY helper macro
// (defined elsewhere; presumably element-wise over both operands).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"atan($0,$1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
[__readNone]
vector<T, N> atan2(vector<T, N> y, vector<T, N> x)
{
    VECTOR_MAP_BINARY(T, N, atan2, y, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_BINARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> atan2(matrix<T,N,M> y, matrix<T,N,M> x)
{
    MATRIX_MAP_BINARY(T, N, M, atan2, y, x);
}

// Ceiling (HLSL SM 1.0)
// Scalar: intrinsic on hlsl/glsl, `ceil` helper on cuda/cpp, GLSL.std.450
// `Ceil` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_ceil($0)")
__target_intrinsic(cpp, "$P_ceil($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ceil _0")
[__readNone]
T ceil(T x);

// Vector: the body delegates to the VECTOR_MAP_UNARY helper macro
// (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ceil _0")
[__readNone]
vector<T, N> ceil(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ceil, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ceil(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ceil, x);
}


// Check access status to tiled resource
// Takes a `status` word (such as the `out uint status` produced by the
// status-returning `Load` overloads) and returns a bool. Declared without a
// body or any per-target mapping.
bool CheckAccessFullyMapped(uint status);

// Clamp (HLSL SM 1.0)
//
// Every overload first raises `x` to at least `minBound`, then lowers the
// result to at most `maxBound`. Scalar and vector forms are intrinsic on
// hlsl/glsl/spirv_direct; matrix forms are intrinsic on hlsl only.

// Integer scalar.
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
T clamp(T x, T minBound, T maxBound)
{
    T raised = max(x, minBound);
    return min(raised, maxBound);
}

// Integer vector.
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
{
    vector<T, N> raised = max(x, minBound);
    return min(raised, maxBound);
}

// Integer matrix.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
{
    matrix<T,N,M> raised = max(x, minBound);
    return min(raised, maxBound);
}

// Floating-point scalar.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
T clamp(T x, T minBound, T maxBound)
{
    T raised = max(x, minBound);
    return min(raised, maxBound);
}

// Floating-point vector.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
{
    vector<T, N> raised = max(x, minBound);
    return min(raised, maxBound);
}

// Floating-point matrix.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
{
    matrix<T,N,M> raised = max(x, minBound);
    return min(raised, maxBound);
}

// Clip (discard) fragment conditionally
//
// Each overload is intrinsic on hlsl; the body discards when the value
// (or any component of it) is less than zero.

// Scalar.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
void clip(T x)
{
    if(x < T(0))
    {
        discard;
    }
}

// Vector: discards when any element is negative.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
void clip(vector<T,N> x)
{
    if(any(x < T(0)))
    {
        discard;
    }
}

// Matrix: discards when any element is negative.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
void clip(matrix<T,N,M> x)
{
    if(any(x < T(0)))
    {
        discard;
    }
}

// Cosine
// Scalar: intrinsic on hlsl/glsl, `cos` helper on cuda/cpp, GLSL.std.450
// `Cos` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_cos($0)")
__target_intrinsic(cpp, "$P_cos($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cos _0")
[__readNone]
T cos(T x);

// Vector: the body delegates to the VECTOR_MAP_UNARY helper macro
// (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cos _0")
[__readNone]
vector<T, N> cos(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T,N, cos, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> cos(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, cos, x);
}

// Hyperbolic cosine
// Scalar: intrinsic on hlsl/glsl, `cosh` helper on cuda/cpp, GLSL.std.450
// `Cosh` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_cosh($0)")
__target_intrinsic(cpp, "$P_cosh($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cosh _0")
[__readNone]
T cosh(T x);

// Vector: the body delegates to the VECTOR_MAP_UNARY helper macro
// (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cosh _0")
[__readNone]
vector<T,N> cosh(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T,N, cosh, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> cosh(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, cosh, x);
}

// Population count
// Only the scalar `uint` overload is declared here. GLSL maps to
// `bitCount`; cuda/cpp call a `countbits` helper.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "bitCount")
__target_intrinsic(cuda, "$P_countbits($0)")
__target_intrinsic(cpp, "$P_countbits($0)")
[__readNone]
uint countbits(uint value);

// Cross product
// TODO: SPIRV does not support integer vectors.

// Floating-point 3-vectors: each output component is computed separately
// from the usual pairwise products, then packed into the result.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cross _0 _1")
[__readNone]
vector<T,3> cross(vector<T,3> left, vector<T,3> right)
{
    T cx = left.y * right.z - left.z * right.y;
    T cy = left.z * right.x - left.x * right.z;
    T cz = left.x * right.y - left.y * right.x;
    return vector<T,3>(cx, cy, cz);
}

// Integer 3-vectors: same formula as the floating-point overload.
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Cross _0 _1")
[__readNone]
vector<T, 3> cross(vector<T, 3> left, vector<T, 3> right)
{
    T cx = left.y * right.z - left.z * right.y;
    T cy = left.z * right.x - left.x * right.z;
    T cz = left.x * right.y - left.y * right.x;
    return vector<T, 3>(cx, cy, cz);
}

// Convert encoded color
// Swizzles the input to z,y,x,w order, scales it by 255.001999 and
// converts the result to int4. Intrinsic on hlsl.
__target_intrinsic(hlsl)
[__readNone]
int4 D3DCOLORtoUBYTE4(float4 color)
{
    return int4(color.zyxw * 255.001999f);
}

// Partial-difference derivatives
//
// Scalar `ddx`. Marked as an HLSL intrinsic like the vector/matrix overloads
// and the scalar `ddy`; GLSL maps to `dFdx`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, dFdx)
[__readNone]
T ddx(T x);

// Vector `ddx`: intrinsic on hlsl, `dFdx` on glsl; the body delegates to
// the VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, dFdx)
[__readNone]
vector<T, N> ddx(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ddx, x);
}

// Matrix `ddx`: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ddx(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ddx, x);
}

// Scalar `ddx_coarse`: GLSL maps to `dFdxCoarse` and requests the
// GL_ARB_derivative_control extension.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdxCoarse)
[__readNone]
T ddx_coarse(T x);

// Vector `ddx_coarse`: same mappings; the body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdxCoarse)
[__readNone]
vector<T, N> ddx_coarse(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ddx_coarse, x);
}

// Matrix `ddx_coarse`: intrinsic on hlsl only; the body delegates to
// MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ddx_coarse(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ddx_coarse, x);
}

// Scalar `ddx_fine`: GLSL maps to `dFdxFine` and requests the
// GL_ARB_derivative_control extension.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdxFine)
[__readNone]
T ddx_fine(T x);

// Vector `ddx_fine`: same mappings; the body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdxFine)
[__readNone]
vector<T, N> ddx_fine(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ddx_fine, x);
}

// Matrix `ddx_fine`: intrinsic on hlsl only; the body delegates to
// MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ddx_fine(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ddx_fine, x);
}

// Scalar `ddy`: intrinsic on hlsl, `dFdy` on glsl.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, dFdy)
[__readNone]
T ddy(T x);

// Vector `ddy`: same mappings; the body delegates to the VECTOR_MAP_UNARY
// helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, dFdy)
[__readNone]
vector<T, N> ddy(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ddy, x);
}

// Matrix `ddy`: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ddy(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ddy, x);
}

// Scalar `ddy_coarse`. Marked as an HLSL intrinsic like `ddx_coarse`,
// `ddy_fine` and its own vector/matrix overloads; GLSL maps to `dFdyCoarse`
// and requests the GL_ARB_derivative_control extension.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdyCoarse)
[__readNone]
T ddy_coarse(T x);

// Vector `ddy_coarse`: intrinsic on hlsl, `dFdyCoarse` on glsl (with the
// GL_ARB_derivative_control extension); the body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdyCoarse)
[__readNone]
vector<T, N> ddy_coarse(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ddy_coarse, x);
}

// Matrix `ddy_coarse`: intrinsic on hlsl only; the body delegates to
// MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ddy_coarse(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ddy_coarse, x);
}

// Scalar `ddy_fine`: GLSL maps to `dFdyFine` and requests the
// GL_ARB_derivative_control extension.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdyFine)
[__readNone]
T ddy_fine(T x);

// Vector `ddy_fine`: same mappings; the body delegates to the
// VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_ARB_derivative_control)
__target_intrinsic(glsl, dFdyFine)
[__readNone]
vector<T, N> ddy_fine(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ddy_fine, x);
}

// Matrix `ddy_fine`: intrinsic on hlsl only; the body delegates to
// MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ddy_fine(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ddy_fine, x);
}


// Radians to degrees

// Scalar: intrinsic on hlsl/glsl/spirv_direct; the body multiplies by the
// conversion factor 180 / pi.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Degrees _0")
[__readNone]
T degrees(T x)
{
    T degreesPerRadian = T(180) / T.getPi();
    return x * degreesPerRadian;
}

// Vector `degrees`: intrinsic on hlsl/glsl/spirv_direct; the body delegates
// to the VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably
// element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Degrees _0")
[__readNone]
vector<T, N> degrees(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, degrees, x);
}

// Matrix `degrees`: intrinsic on hlsl only; the body delegates to
// MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> degrees(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, degrees, x);
}

// Matrix determinant
// Declared only for square N x N floating-point matrices. Intrinsic on
// hlsl/glsl; direct SPIR-V uses GLSL.std.450 `Determinant`. No fallback body.

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Determinant _0")
[__readNone]
T determinant(matrix<T,N,N> m);

// Barrier for device memory
// GLSL expansion: `memoryBarrier()`, `memoryBarrierImage()` and
// `memoryBarrierBuffer()` in sequence. CUDA expansion: `__threadfence()`.
__target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer()")
__target_intrinsic(cuda, "__threadfence()")
void DeviceMemoryBarrier();

// Barrier for device memory, followed by a thread-group sync.
// GLSL expansion: the device-memory barriers followed by `barrier()`.
// CUDA expansion: `__syncthreads()`, the same mapping used by
// `AllMemoryBarrierWithGroupSync`. `__syncthreads()` is a CUDA built-in,
// so it must be registered for the `cuda` target rather than as a second
// `glsl` expansion.
__target_intrinsic(glsl, "memoryBarrier(), memoryBarrierImage(), memoryBarrierBuffer(), barrier()")
__target_intrinsic(cuda, "__syncthreads()")
void DeviceMemoryBarrierWithGroupSync();

// Vector distance

// Intrinsic on hlsl/glsl/spirv_direct; the body returns the length of the
// difference between the two vectors.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Distance _0 _1")
[__readNone]
T distance(vector<T, N> x, vector<T, N> y)
{
    vector<T, N> delta = x - y;
    return length(delta);
}

// Vector dot product

// Floating-point vectors: intrinsic on hlsl/glsl; the body accumulates the
// sum of the element-wise products.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
T dot(vector<T, N> x, vector<T, N> y)
{
    T sum = T(0);
    for(int k = 0; k < N; ++k)
    {
        sum += x[k] * y[k];
    }
    return sum;
}

// Integer vectors: intrinsic on hlsl only; all other targets use the
// accumulation loop.
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
T dot(vector<T, N> x, vector<T, N> y)
{
    T sum = T(0);
    for(int k = 0; k < N; ++k)
    {
        sum += x[k] * y[k];
    }
    return sum;
}


// Helper for computing distance terms for lighting (obsolete)
// Declared for 4-component floating-point vectors only, with no body and no
// per-target mapping.

__generic<T : __BuiltinFloatingPointType> vector<T,4> dst(vector<T,4> x, vector<T,4> y);

// Given a RWByteAddressBuffer, allow it to be interpreted as a RWStructuredBuffer<T>.
// Implemented directly as the GetEquivalentStructuredBuffer IR op rather than
// through a per-target expansion.
__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer))
RWStructuredBuffer<T> __getEquivalentStructuredBuffer<T>(RWByteAddressBuffer b);

// Error message

// void errorf( string format, ... );

// Attribute evaluation

// TODO: The matrix cases of these functions won't actually work
// when compiled to GLSL, since they only support scalar/vector

// TODO: Should these be constrained to `__BuiltinFloatingPointType`?
// TODO: SPIRV-direct does not support non-floating-point types.

// Scalar: GLSL `interpolateAtCentroid`, GLSL.std.450 `InterpolateAtCentroid`
// for direct SPIR-V.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(glsl, interpolateAtCentroid)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
[__readNone]
T EvaluateAttributeAtCentroid(T x);

// Vector: same mappings as the scalar overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(glsl, interpolateAtCentroid)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
[__readNone]
vector<T,N> EvaluateAttributeAtCentroid(vector<T,N> x);

// Matrix: GLSL mapping plus a body that delegates to the MATRIX_MAP_UNARY
// helper macro (defined elsewhere).
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(glsl, interpolateAtCentroid)
[__readNone]
matrix<T,N,M> EvaluateAttributeAtCentroid(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x);
}

// Scalar: GLSL `interpolateAtSample` with the sample index converted to
// `int`; GLSL.std.450 `InterpolateAtSample` for direct SPIR-V.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
[__readNone]
T EvaluateAttributeAtSample(T x, uint sampleindex);

// Vector: same mappings as the scalar overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
[__readNone]
vector<T,N> EvaluateAttributeAtSample(vector<T,N> x, uint sampleindex);

// Matrix: the body evaluates each of the N rows through the vector overload.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
[__readNone]
matrix<T,N,M> EvaluateAttributeAtSample(matrix<T,N,M> x, uint sampleindex)
{
    matrix<T,N,M> rows;
    for(int row = 0; row < N; ++row)
        rows[row] = EvaluateAttributeAtSample(x[row], sampleindex);
    return rows;
}

// Scalar: GLSL `interpolateAtOffset`, with the integer offset converted to
// vec2 and divided by 16; direct SPIR-V performs the same conversion and
// division before the extended instruction.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
__target_intrinsic(spirv_direct, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
[__readNone]
T EvaluateAttributeSnapped(T x, int2 offset);

// Vector: same mappings as the scalar overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
__target_intrinsic(spirv_direct, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
[__readNone]
vector<T,N> EvaluateAttributeSnapped(vector<T,N> x, int2 offset);

// Matrix: the body evaluates each of the N rows through the vector overload.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
[__readNone]
matrix<T,N,M> EvaluateAttributeSnapped(matrix<T,N,M> x, int2 offset)
{
    matrix<T,N,M> rows;
    for(int row = 0; row < N; ++row)
        rows[row] = EvaluateAttributeSnapped(x[row], offset);
    return rows;
}

// Base-e exponent
// Scalar: intrinsic on hlsl/glsl, `exp` helper on cuda/cpp, GLSL.std.450
// `Exp` for direct SPIR-V.

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_exp($0)")
__target_intrinsic(cpp, "$P_exp($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp _0")
[__readNone]
T exp(T x);

// Vector: the body delegates to the VECTOR_MAP_UNARY helper macro
// (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp _0")
[__readNone]
vector<T, N> exp(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, exp, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> exp(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, exp, x);
}

// Base-2 exponent
// Scalar: intrinsic on hlsl/glsl, `exp2` helper on cuda/cpp, GLSL.std.450
// `Exp2` for direct SPIR-V.

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_exp2($0)")
__target_intrinsic(cpp, "$P_exp2($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp2 _0")
[__readNone]
T exp2(T x);

// Vector: the body delegates to the VECTOR_MAP_UNARY helper macro
// (defined elsewhere; presumably element-wise).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Exp2 _0")
[__readNone]
vector<T,N> exp2(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, exp2, x);
}

// Matrix: intrinsic on hlsl only; the body delegates to MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> exp2(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, exp2, x);
}


// Convert 16-bit float stored in low bits of integer
// GLSL takes the `.x` lane of `unpackHalf2x16` (GLSL version 420 is
// requested); CUDA reinterprets the value as a half and widens it with
// `__half2float` (SM version 6.0 is requested).
__target_intrinsic(glsl, "unpackHalf2x16($0).x")
__glsl_version(420)
__target_intrinsic(hlsl)
__cuda_sm_version(6.0)
__target_intrinsic(cuda, "__half2float(__ushort_as_half($0))")
[__readNone]
float f16tof32(uint value);

// Vector: intrinsic on hlsl; the body delegates to the VECTOR_MAP_UNARY
// helper macro (defined elsewhere; presumably element-wise).
__generic<let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<float, N> f16tof32(vector<uint, N> value)
{
    VECTOR_MAP_UNARY(float, N, f16tof32, value);
}



// Convert to 16-bit float stored in low bits of integer
// GLSL packs `vec2(value, 0.0)` with `packHalf2x16` (GLSL version 420 is
// requested); CUDA narrows with `__float2half` and reinterprets the half as
// an unsigned short (SM version 6.0 is requested).
__target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))")
__glsl_version(420)
__target_intrinsic(hlsl)
__cuda_sm_version(6.0)
__target_intrinsic(cuda, "__half_as_ushort(__float2half($0))")
[__readNone]
uint f32tof16(float value);

// Vector: intrinsic on hlsl; the body delegates to the VECTOR_MAP_UNARY
// helper macro (defined elsewhere; presumably element-wise).
__generic<let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<uint, N> f32tof16(vector<float, N> value)
{
    VECTOR_MAP_UNARY(uint, N, f32tof16, value);
}

// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// The following is Slang-specific and NOT part of standard HLSL.
// It's not clear what happens with the float16 type in HLSL -> can a float16 coerce to uint, for example? If so, that would
// give the wrong result.

// Widen a `float16_t` to `float`. CUDA maps to `__half2float`; GLSL reuses
// the `unpackHalf2x16($0).x` expansion (GLSL version 420 is requested).
// NOTE(review): `unpackHalf2x16` takes a packed uint in GLSL, so confirm this
// expansion is valid for a float16_t argument.
__target_intrinsic(glsl, "unpackHalf2x16($0).x")
__target_intrinsic(cuda, "__half2float")
__glsl_version(420)
[__readNone]
float f16tof32(float16_t value);

// Vector: intrinsic on hlsl and `__half2float` on cuda; the body delegates
// to the VECTOR_MAP_UNARY helper macro (defined elsewhere; presumably
// element-wise).
__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__half2float")
[__readNone]
vector<float, N> f16tof32(vector<float16_t, N> value)
{
    VECTOR_MAP_UNARY(float, N, f16tof32, value);
}

// Convert to float16_t
// The trailing underscore distinguishes this from `f32tof16`, which returns
// the half bits in a `uint`. CUDA maps to `__float2half`; GLSL reuses the
// `packHalf2x16(vec2($0,0.0))` expansion (GLSL version 420 is requested).
// NOTE(review): `packHalf2x16` yields a uint in GLSL, so confirm this
// expansion is valid for a float16_t result.
__target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))")
__glsl_version(420)
__target_intrinsic(cuda, "__float2half")
[__readNone]
float16_t f32tof16_(float value);

// Vector form of `f32tof16_`: converts each `float` element to `float16_t`.
// CUDA maps to `__float2half`. The fallback body maps the scalar `f32tof16_`
// with a `float16_t` element type so that the result matches the declared
// `vector<float16_t, N>` return type (mapping `f32tof16` with `uint`
// elements would instead produce a vector of half bit patterns).
__generic<let N : int>
__target_intrinsic(cuda, "__float2half")
[__readNone]
vector<float16_t, N> f32tof16_(vector<float, N> value)
{
    VECTOR_MAP_UNARY(float16_t, N, f32tof16_, value);
}

// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

// Flip surface normal to face forward, if needed
// Intrinsic on hlsl/glsl. The body returns `n` when dot(ng, i) is negative
// and `-n` otherwise.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng)
{
    T facing = dot(ng, i);
    return facing < T(0.0f) ? n : -n;
}

// Find first set bit starting at high bit and working down
// Signed scalar: GLSL `findMSB`, GLSL.std.450 `FindSMsb` for direct SPIR-V.
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(cuda, "$P_firstbithigh($0)")
__target_intrinsic(cpp, "$P_firstbithigh($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindSMsb _0")
[__readNone]
int firstbithigh(int value);

// Signed vector: the body delegates to the VECTOR_MAP_UNARY helper macro
// (defined elsewhere; presumably element-wise).
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindSMsb _0")
__generic<let N : int>
[__readNone]
vector<int, N> firstbithigh(vector<int, N> value)
{
    VECTOR_MAP_UNARY(int, N, firstbithigh, value);
}

// Unsigned scalar: same GLSL mapping, but direct SPIR-V uses the unsigned
// `FindUMsb` instruction.
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(cuda, "$P_firstbithigh($0)")
__target_intrinsic(cpp, "$P_firstbithigh($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindUMsb _0")
[__readNone]
uint firstbithigh(uint value);

// Unsigned vector.
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindUMsb _0")
__generic<let N : int>
[__readNone]
vector<uint,N> firstbithigh(vector<uint,N> value)
{
    VECTOR_MAP_UNARY(uint, N, firstbithigh, value);
}

/// Find the first set bit, starting at the low bit and working up
/// (signed scalar overload).
///
/// Emitted as `findLSB` for GLSL, through the `$P_firstbitlow($0)` expansion
/// for CUDA and C++, and as GLSL.std.450 `FindILsb` for direct SPIR-V.
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__target_intrinsic(cuda, "$P_firstbitlow($0)")
__target_intrinsic(cpp, "$P_firstbitlow($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
[__readNone]
int firstbitlow(int value);

/// Signed vector overload of `firstbitlow`.
///
/// Declared intrinsic for HLSL, GLSL (`findLSB`) and direct SPIR-V (`FindILsb`).
/// The body expands `VECTOR_MAP_UNARY` over the scalar `firstbitlow`; the macro
/// is defined elsewhere (presumably a per-component map used by other targets).
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
__generic<let N : int>
[__readNone]
vector<int,N> firstbitlow(vector<int,N> value)
{
    VECTOR_MAP_UNARY(int, N, firstbitlow, value);
}

/// Unsigned scalar overload of `firstbitlow`.
///
/// Uses the same mappings as the signed overload: `findLSB` for GLSL,
/// `$P_firstbitlow($0)` for CUDA and C++, and `FindILsb` for direct SPIR-V.
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__target_intrinsic(cuda, "$P_firstbitlow($0)")
__target_intrinsic(cpp, "$P_firstbitlow($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
[__readNone]
uint firstbitlow(uint value);

/// Unsigned vector overload of `firstbitlow`.
///
/// Declared intrinsic for HLSL, GLSL (`findLSB`) and direct SPIR-V (`FindILsb`).
/// The body expands `VECTOR_MAP_UNARY` over the scalar `firstbitlow`; the macro
/// is defined elsewhere (presumably a per-component map used by other targets).
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__generic<let N : int>
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FindILsb _0")
[__readNone]
vector<uint,N> firstbitlow(vector<uint,N> value)
{
    VECTOR_MAP_UNARY(uint, N, firstbitlow, value);
}

// Floor (HLSL SM 1.0)

/// Scalar `floor`.
///
/// Declared intrinsic for HLSL and GLSL under the same name, emitted through
/// `$P_floor($0)` for CUDA and C++, and as GLSL.std.450 `Floor` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_floor($0)")
__target_intrinsic(cpp, "$P_floor($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Floor _0")
[__readNone]
T floor(T x);

/// Vector `floor`.
///
/// Declared intrinsic for HLSL, GLSL and direct SPIR-V (`Floor`). The body
/// expands `VECTOR_MAP_UNARY` over the scalar `floor` (macro defined elsewhere;
/// presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Floor _0")
[__readNone]
vector<T, N> floor(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, floor, x);
}

/// Matrix `floor`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `floor` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> floor(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, floor, x);
}

/// Fused multiply-add for doubles (scalar overload).
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_fma($0, $1, $2)`
/// for CUDA and C++, and as GLSL.std.450 `Fma` for direct SPIR-V.
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
double fma(double a, double b, double c);

/// Fused multiply-add for vectors of doubles.
///
/// Declared intrinsic for HLSL, GLSL and direct SPIR-V (`Fma`). The body expands
/// `VECTOR_MAP_TRINARY` over the scalar `fma` (macro defined elsewhere;
/// presumably a per-component map used by the remaining targets).
__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N> c)
{
    VECTOR_MAP_TRINARY(double, N, fma, a, b, c);
}

/// Fused multiply-add for matrices of doubles.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_TRINARY` over
/// `fma` (macro defined elsewhere; presumably an element-wise map).
__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<double, N, M> c)
{
    MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c);
}

/// Floating-point remainder of `x / y` (scalar overload).
///
/// Declared intrinsic for HLSL; CUDA and C++ go through `$P_fmod($0, $1)`.
/// The body computes the remainder by removing the truncated quotient.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "$P_fmod($0, $1)")
__target_intrinsic(cpp, "$P_fmod($0, $1)")
[__readNone]
T fmod(T x, T y)
{
    let quotient = x / y;
    let wholePart = trunc(quotient);
    return x - y * wholePart;
}

/// Vector `fmod`.
///
/// Declared intrinsic for HLSL only. The body expands `VECTOR_MAP_BINARY` over
/// the scalar `fmod` (macro defined elsewhere; presumably a per-component map).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> fmod(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, fmod, x, y);
}

/// Matrix `fmod`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `fmod` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> fmod(matrix<T, N, M> x, matrix<T, N, M> y)
{
    MATRIX_MAP_BINARY(T, N, M, fmod, x, y);
}

/// Fractional part (scalar overload).
///
/// Declared intrinsic for HLSL, emitted as `fract` for GLSL, through
/// `$P_frac($0)` for CUDA and C++, and as GLSL.std.450 `Fract` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fract)
__target_intrinsic(cuda, "$P_frac($0)")
__target_intrinsic(cpp, "$P_frac($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fract _0")
[__readNone]
T frac(T x);

/// Fractional part (vector overload).
///
/// Declared intrinsic for HLSL, GLSL (`fract`) and direct SPIR-V (`Fract`).
/// The body expands `VECTOR_MAP_UNARY` over the scalar `frac` (macro defined
/// elsewhere; presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fract)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fract _0")
[__readNone]
vector<T, N> frac(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, frac, x);
}

/// Fractional part (matrix overload).
///
/// HLSL's `frac` accepts matrix operands, so it is marked as an HLSL intrinsic
/// like the other matrix overloads in this file rather than being expanded.
/// The body expands `MATRIX_MAP_UNARY` over `frac` (macro defined elsewhere;
/// presumably an element-wise map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> frac(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, frac, x);
}

/// Split a float into mantissa and exponent (scalar overload).
///
/// The mantissa is the return value; the exponent is written to `exp`.
/// Declared intrinsic for HLSL and GLSL, and mapped to GLSL.std.450 `Frexp`
/// for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
[__readNone]
T frexp(T x, out T exp);

/// Split floats into mantissa and exponent (vector overload).
///
/// Declared intrinsic for HLSL, GLSL and direct SPIR-V (`Frexp`). The body
/// expands `VECTOR_MAP_BINARY` over the scalar `frexp`, passing the `out`
/// parameter `exp` as the second operand (macro defined elsewhere).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
[__readNone]
vector<T, N> frexp(vector<T, N> x, out vector<T, N> exp)
{
    VECTOR_MAP_BINARY(T, N, frexp, x, exp);
}

/// Split floats into mantissa and exponent (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `frexp`, passing the `out` parameter `exp` as the second operand
/// (macro defined elsewhere).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> frexp(matrix<T, N, M> x, out matrix<T, N, M> exp)
{
    MATRIX_MAP_BINARY(T, N, M, frexp, x, exp);
}

/// Texture filter width (scalar overload).
///
/// Declared without a body and without any `__target_intrinsic` modifier.
/// NOTE(review): the vector overload is tagged for hlsl and glsl — confirm
/// whether the missing tags here are intentional.
__generic<T : __BuiltinFloatingPointType>
[__readNone]
T fwidth(T x);

/// Texture filter width (vector overload).
///
/// Declared intrinsic for HLSL and GLSL. The body expands `VECTOR_MAP_UNARY`
/// over the scalar `fwidth` (macro defined elsewhere; presumably a
/// per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
vector<T, N> fwidth(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, fwidth, x);
}

/// Texture filter width (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `fwidth` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> fwidth(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, fwidth, x);
}

/// Get the value of a vertex attribute at a specific vertex.
///
/// The `GetAttributeAtVertex()` function can be used in a fragment shader
/// to get the value of the given `attribute` at the vertex of the primitive
/// that corresponds to the given `vertexIndex`.
///
/// Note that the `attribute` must have been a declared varying input to
/// the fragment shader with the `nointerpolation` modifier.
///
/// This function can be applied to scalars, vectors, and matrices of
/// built-in scalar types. This is the scalar overload.
///
/// For the two GLSL barycentric extensions the call is emitted as the
/// indexing expression `$0[$1]`.
///
/// Note: these functions are not currently implemented for Vulkan/SPIR-V output.
///
__generic<T : __BuiltinType>
[__readNone]
__target_intrinsic(hlsl)
__target_intrinsic(GL_NV_fragment_shader_barycentric, "$0[$1]")
__target_intrinsic(GL_EXT_fragment_shader_barycentric, "$0[$1]")
__glsl_version(450)
T GetAttributeAtVertex(T attribute, uint vertexIndex);

/// Get the value of a vertex attribute at a specific vertex.
///
/// The `GetAttributeAtVertex()` function can be used in a fragment shader
/// to get the value of the given `attribute` at the vertex of the primitive
/// that corresponds to the given `vertexIndex`.
///
/// Note that the `attribute` must have been a declared varying input to
/// the fragment shader with the `nointerpolation` modifier.
///
/// This function can be applied to scalars, vectors, and matrices of
/// built-in scalar types. This is the vector overload.
///
/// For the two GLSL barycentric extensions the call is emitted as the
/// indexing expression `$0[$1]`.
///
/// Note: these functions are not currently implemented for Vulkan/SPIR-V output.
///
__generic<T : __BuiltinType, let N : int>
[__readNone]
__target_intrinsic(hlsl)
__target_intrinsic(GL_NV_fragment_shader_barycentric, "$0[$1]")
__target_intrinsic(GL_EXT_fragment_shader_barycentric, "$0[$1]")
__glsl_version(450)
vector<T,N> GetAttributeAtVertex(vector<T,N> attribute, uint vertexIndex);

/// Get the value of a vertex attribute at a specific vertex.
///
/// The `GetAttributeAtVertex()` function can be used in a fragment shader
/// to get the value of the given `attribute` at the vertex of the primitive
/// that corresponds to the given `vertexIndex`.
///
/// Note that the `attribute` must have been a declared varying input to
/// the fragment shader with the `nointerpolation` modifier.
///
/// This function can be applied to scalars, vectors, and matrices of
/// built-in scalar types. This is the matrix overload.
///
/// For the two GLSL barycentric extensions the call is emitted as the
/// indexing expression `$0[$1]`.
///
/// Note: these functions are not currently implemented for Vulkan/SPIR-V output.
///
__generic<T : __BuiltinType, let N : int, let M : int>
[__readNone]
__target_intrinsic(hlsl)
__target_intrinsic(GL_NV_fragment_shader_barycentric, "$0[$1]")
__target_intrinsic(GL_EXT_fragment_shader_barycentric, "$0[$1]")
__glsl_version(450)
matrix<T,N,M> GetAttributeAtVertex(matrix<T,N,M> attribute, uint vertexIndex);


/// Get the number of samples in the render target.
///
/// Declared without a body and without target-specific mappings.
[__readNone]
uint GetRenderTargetSampleCount();

/// Get the position of the sample with the given `Index`.
///
/// Declared without a body and without target-specific mappings.
[__readNone]
float2 GetRenderTargetSamplePosition(int Index);

/// Group memory barrier.
///
/// Emitted as `groupMemoryBarrier` for GLSL and `__threadfence_block` for CUDA.
__target_intrinsic(glsl, "groupMemoryBarrier")
__target_intrinsic(cuda, "__threadfence_block")
void GroupMemoryBarrier();


/// Group memory barrier combined with a group-wide synchronization.
///
/// GLSL output issues `groupMemoryBarrier()` followed by `barrier()`;
/// CUDA output uses `__syncthreads()`.
__target_intrinsic(glsl, "groupMemoryBarrier(), barrier()")
__target_intrinsic(cuda, "__syncthreads()")
void GroupMemoryBarrierWithGroupSync();

// Atomics

/// Atomic add on `dest`, signed overload without a result.
/// GLSL uses the `$atomicAdd($A, $1)` expansion; CUDA calls `atomicAdd`.
__target_intrinsic(glsl, "$atomicAdd($A, $1)")
__target_intrinsic(cuda, "atomicAdd($0, $1)")
void InterlockedAdd(__ref  int dest,  int value);

/// Atomic add on `dest`, unsigned overload without a result.
/// The CUDA expansion casts the destination to `uint*`.
__target_intrinsic(glsl, "$atomicAdd($A, $1)")
__target_intrinsic(cuda, "atomicAdd((uint*)$0, $1)")
void InterlockedAdd(__ref uint dest, uint value);

/// Atomic add on `dest`, signed overload; the value returned by the target's
/// atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicAdd($0, $1))")
void InterlockedAdd(__ref  int dest,  int value, out  int original_value);

/// Atomic add on `dest`, unsigned overload; the value returned by the target's
/// atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicAdd($A, $1))")
__target_intrinsic(cuda, "(*$2 = (uint)atomicAdd((uint*)$0, $1))")
void InterlockedAdd(__ref uint dest, uint value, out uint original_value);

/// Atomic bitwise AND on `dest`, signed overload without a result.
/// GLSL uses the `$atomicAnd($A, $1)` expansion; CUDA calls `atomicAnd`.
__target_intrinsic(glsl, "$atomicAnd($A, $1)")
__target_intrinsic(cuda, "atomicAnd($0, $1)")
void InterlockedAnd(__ref  int dest,  int value);

/// Atomic bitwise AND on `dest`, unsigned overload without a result.
/// The CUDA expansion reinterprets the destination as `int*`.
__target_intrinsic(glsl, "$atomicAnd($A, $1)")
__target_intrinsic(cuda, "atomicAnd((int*)$0, $1)")
void InterlockedAnd(__ref uint dest, uint value);

/// Atomic bitwise AND on `dest`, signed overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicAnd($0, $1))")
void InterlockedAnd(__ref  int dest,  int value, out  int original_value);

/// Atomic bitwise AND on `dest`, unsigned overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicAnd($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicAnd((int*)$0, $1))")
void InterlockedAnd(__ref uint dest, uint value, out uint original_value);

/// Atomic compare-and-exchange on `dest`, signed overload.
/// GLSL uses the `$atomicCompSwap($A, $1, $2)` expansion and CUDA uses
/// `atomicCAS`; the value either returns is stored into `original_value`.
__target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))")
__target_intrinsic(cuda, "(*$3 = atomicCAS($0, $1, $2))")
void InterlockedCompareExchange(__ref  int dest,  int compare_value,  int value, out  int original_value);

/// Atomic compare-and-exchange on `dest`, unsigned overload.
/// The CUDA expansion reinterprets the destination as `int*` and casts the
/// result back to `uint`.
__target_intrinsic(glsl, "($3 = $atomicCompSwap($A, $1, $2))")
__target_intrinsic(cuda, "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))")
void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value);

/// Atomic compare-and-store on `dest`, signed overload.
/// Uses the same target operations as `InterlockedCompareExchange`
/// (`$atomicCompSwap` / `atomicCAS`) but discards the returned value.
__target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)")
__target_intrinsic(cuda, "atomicCAS($0, $1, $2)")
void InterlockedCompareStore(__ref int dest,  int compare_value,  int value);

/// Atomic compare-and-store on `dest`, unsigned overload.
/// The CUDA expansion reinterprets the destination as `int*`.
__target_intrinsic(glsl, "$atomicCompSwap($A, $1, $2)")
__target_intrinsic(cuda, "atomicCAS((int*)$0, $1, $2)")
void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value);

/// Atomic exchange on `dest`, signed overload.
/// GLSL uses the `$atomicExchange($A, $1)` expansion and CUDA uses `atomicExch`;
/// the value either returns is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicExch($0, $1))")
void InterlockedExchange(__ref  int dest,  int value, out  int original_value);

/// Atomic exchange on `dest`, unsigned overload.
/// The CUDA expansion reinterprets the destination as `int*` and casts the
/// result back to `uint`.
__target_intrinsic(glsl, "($2 = $atomicExchange($A, $1))")
__target_intrinsic(cuda, "(*$2 = (uint)atomicExch((int*)$0, $1))")
void InterlockedExchange(__ref uint dest, uint value, out uint original_value);

/// Atomic maximum on `dest`, signed overload without a result.
/// GLSL uses the `$atomicMax($A, $1)` expansion; CUDA calls `atomicMax`.
__target_intrinsic(glsl, "$atomicMax($A, $1)")
__target_intrinsic(cuda, "atomicMax($0, $1)")
void InterlockedMax(__ref  int dest,  int value);

/// Atomic maximum on `dest`, unsigned overload without a result.
/// The destination must stay unsigned for CUDA: casting it to `int*` would
/// select the signed `atomicMax` overload and order values >= 2^31 as negative.
__target_intrinsic(glsl, "$atomicMax($A, $1)")
__target_intrinsic(cuda, "atomicMax((uint*)$0, $1)")
void InterlockedMax(__ref uint dest, uint value);

/// Atomic maximum on `dest`, signed overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicMax($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicMax($0, $1))")
void InterlockedMax(__ref  int dest,  int value, out  int original_value);

/// Atomic maximum on `dest`, unsigned overload; the value returned by the
/// target's atomic is stored into `original_value`.
/// Uses the unsigned CUDA overload so the comparison is unsigned.
__target_intrinsic(glsl, "($2 = $atomicMax($A, $1))")
__target_intrinsic(cuda, "(*$2 = (uint)atomicMax((uint*)$0, $1))")
void InterlockedMax(__ref uint dest, uint value, out uint original_value);

/// Atomic minimum on `dest`, signed overload without a result.
/// GLSL uses the `$atomicMin($A, $1)` expansion; CUDA calls `atomicMin`.
__target_intrinsic(glsl, "$atomicMin($A, $1)")
__target_intrinsic(cuda, "atomicMin($0, $1)")
void InterlockedMin(__ref  int dest,  int value);

/// Atomic minimum on `dest`, unsigned overload without a result.
/// The destination must stay unsigned for CUDA: casting it to `int*` would
/// select the signed `atomicMin` overload and order values >= 2^31 as negative.
__target_intrinsic(glsl, "$atomicMin($A, $1)")
__target_intrinsic(cuda, "atomicMin((uint*)$0, $1)")
void InterlockedMin(__ref uint dest, uint value);

/// Atomic minimum on `dest`, signed overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicMin($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicMin($0, $1))")
void InterlockedMin(__ref  int dest,  int value, out  int original_value);

/// Atomic minimum on `dest`, unsigned overload; the value returned by the
/// target's atomic is stored into `original_value`.
/// Uses the unsigned CUDA overload so the comparison is unsigned.
__target_intrinsic(glsl, "($2 = $atomicMin($A, $1))")
__target_intrinsic(cuda, "(*$2 = (uint)atomicMin((uint*)$0, $1))")
void InterlockedMin(__ref uint dest, uint value, out uint original_value);

/// Atomic bitwise OR on `dest`, signed overload without a result.
/// GLSL uses the `$atomicOr($A, $1)` expansion; CUDA calls `atomicOr`.
__target_intrinsic(glsl, "$atomicOr($A, $1)")
__target_intrinsic(cuda, "atomicOr($0, $1)")
void InterlockedOr(__ref  int dest,  int value);

/// Atomic bitwise OR on `dest`, unsigned overload without a result.
/// The CUDA expansion reinterprets the destination as `int*`.
__target_intrinsic(glsl, "$atomicOr($A, $1)")
__target_intrinsic(cuda, "atomicOr((int*)$0, $1)")
void InterlockedOr(__ref uint dest, uint value);

/// Atomic bitwise OR on `dest`, signed overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicOr($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicOr($0, $1))")
void InterlockedOr(__ref  int dest,  int value, out  int original_value);

/// Atomic bitwise OR on `dest`, unsigned overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicOr($A, $1))")
__target_intrinsic(cuda, "(*$2 = (uint)atomicOr((int*)$0, $1))")
void InterlockedOr(__ref uint dest, uint value, out uint original_value);

/// Atomic bitwise XOR on `dest`, signed overload without a result.
/// GLSL uses the `$atomicXor($A, $1)` expansion; CUDA calls `atomicXor`.
__target_intrinsic(glsl, "$atomicXor($A, $1)")
__target_intrinsic(cuda, "atomicXor($0, $1)")
void InterlockedXor(__ref  int dest,  int value);

/// Atomic bitwise XOR on `dest`, unsigned overload without a result.
/// The CUDA expansion reinterprets the destination as `int*`.
__target_intrinsic(glsl, "$atomicXor($A, $1)")
__target_intrinsic(cuda, "atomicXor((int*)$0, $1)")
void InterlockedXor(__ref uint dest, uint value);

/// Atomic bitwise XOR on `dest`, signed overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicXor($A, $1))")
__target_intrinsic(cuda, "(*$2 = atomicXor($0, $1))")
void InterlockedXor(__ref  int dest,  int value, out  int original_value);

/// Atomic bitwise XOR on `dest`, unsigned overload; the value returned by the
/// target's atomic is stored into `original_value`.
__target_intrinsic(glsl, "($2 = $atomicXor($A, $1))")
__target_intrinsic(cuda, "(*$2 = (uint)atomicXor((int*)$0, $1))")
void InterlockedXor(__ref uint dest, uint value, out uint original_value);

// Is floating-point value finite?

/// Test whether a scalar floating-point value is finite.
///
/// Declared intrinsic for HLSL; CUDA and C++ go through `$P_isfinite($0)`.
/// The body reports a value as finite when it is neither infinite nor NaN.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "$P_isfinite($0)")
__target_intrinsic(cpp, "$P_isfinite($0)")
[__readNone]
bool isfinite(T x)
{
    if (isinf(x))
    {
        return false;
    }
    if (isnan(x))
    {
        return false;
    }
    return true;
}

/// Vector `isfinite`, producing a vector of `bool`.
///
/// Declared intrinsic for HLSL only. The body expands `VECTOR_MAP_UNARY` over
/// the scalar `isfinite` (macro defined elsewhere; presumably a per-component map).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<bool, N> isfinite(vector<T, N> x)
{
    VECTOR_MAP_UNARY(bool, N, isfinite, x);
}

/// Matrix `isfinite`, producing a matrix of `bool`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `isfinite` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<bool, N, M> isfinite(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(bool, N, M, isfinite, x);
}

/// Is a floating-point value infinite? (scalar overload)
///
/// Declared intrinsic for HLSL and GLSL; CUDA and C++ go through `$P_isinf($0)`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_isinf($0)")
__target_intrinsic(cpp, "$P_isinf($0)")
[__readNone]
bool isinf(T x);

/// Vector `isinf`, producing a vector of `bool`.
///
/// Declared intrinsic for HLSL and GLSL. The body expands `VECTOR_MAP_UNARY`
/// over the scalar `isinf` (macro defined elsewhere; presumably a
/// per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
vector<bool, N> isinf(vector<T, N> x)
{
    VECTOR_MAP_UNARY(bool, N, isinf, x);
}

/// Matrix `isinf`, producing a matrix of `bool`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `isinf` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<bool, N, M> isinf(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(bool, N, M, isinf, x);
}

/// Is a floating-point value not-a-number? (scalar overload)
///
/// Declared intrinsic for HLSL and GLSL; CUDA and C++ go through `$P_isnan($0)`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_isnan($0)")
__target_intrinsic(cpp, "$P_isnan($0)")
[__readNone]
bool isnan(T x);

/// Vector `isnan`, producing a vector of `bool`.
///
/// Declared intrinsic for HLSL and GLSL. The body expands `VECTOR_MAP_UNARY`
/// over the scalar `isnan` (macro defined elsewhere; presumably a
/// per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
vector<bool, N> isnan(vector<T, N> x)
{
    VECTOR_MAP_UNARY(bool, N, isnan, x);
}

/// Matrix `isnan`, producing a matrix of `bool`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `isnan` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<bool, N, M> isnan(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(bool, N, M, isnan, x);
}

// Construct float from mantissa and exponent

/// Build a float from a mantissa `x` and an exponent `exp` (scalar overload).
///
/// Declared intrinsic for HLSL and mapped to GLSL.std.450 `Ldexp` for direct
/// SPIR-V. The body scales `x` by `exp2(exp)`.
/// NOTE(review): `exp` is floating-point here, while GLSL.std.450 `Ldexp`
/// documents an integer exponent operand — confirm the SPIR-V mapping.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ldexp _0 _1")
[__readNone]
T ldexp(T x, T exp)
{
    let scale = exp2(exp);
    return x * scale;
}

/// Build floats from mantissas `x` and exponents `exp` (vector overload).
///
/// Declared intrinsic for HLSL and mapped to GLSL.std.450 `Ldexp` for direct
/// SPIR-V. The body scales `x` component-wise by `exp2(exp)`.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Ldexp _0 _1")
[__readNone]
vector<T, N> ldexp(vector<T, N> x, vector<T, N> exp)
{
    let scale = exp2(exp);
    return x * scale;
}

/// Matrix `ldexp`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `ldexp` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ldexp(matrix<T, N, M> x, matrix<T, N, M> exp)
{
    MATRIX_MAP_BINARY(T, N, M, ldexp, x, exp);
}

/// Length of a vector.
///
/// Declared intrinsic for HLSL and GLSL, and mapped to GLSL.std.450 `Length`
/// for direct SPIR-V. The body takes the square root of `dot(x, x)`.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Length _0")
[__readNone]
T length(vector<T, N> x)
{
    let squaredLength = dot(x, x);
    return sqrt(squaredLength);
}

/// Linear interpolation between `x` and `y` by factor `s` (scalar overload).
///
/// Declared intrinsic for HLSL, emitted as `mix` for GLSL, and mapped to
/// GLSL.std.450 `FMix` for direct SPIR-V. The body weights `x` by `1 - s`
/// and `y` by `s`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, mix)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
[__readNone]
T lerp(T x, T y, T s)
{
    let weightOfX = T(1.0f) - s;
    return x * weightOfX + y * s;
}

/// Linear interpolation between `x` and `y` by factor `s` (vector overload).
///
/// Declared intrinsic for HLSL, emitted as `mix` for GLSL, and mapped to
/// GLSL.std.450 `FMix` for direct SPIR-V. The body weights `x` by `1 - s`
/// and `y` by `s`, component-wise.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, mix)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
[__readNone]
vector<T, N> lerp(vector<T, N> x, vector<T, N> y, vector<T, N> s)
{
    let weightOfX = T(1.0f) - s;
    return x * weightOfX + y * s;
}

/// Matrix `lerp`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_TRINARY` over
/// `lerp` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> lerp(matrix<T,N,M> x, matrix<T,N,M> y, matrix<T,N,M> s)
{
    MATRIX_MAP_TRINARY(T, N, M, lerp, x, y, s);
}

/// Legacy lighting function (obsolete).
///
/// Returns `(ambient, diffuse, specular, 1)` where ambient is always 1,
/// diffuse is `n_dot_l` clamped below at 0, and specular is
/// `pow(n_dot_h, m)` clamped below at 0 and gated by `step(0, n_dot_l)`.
/// Declared intrinsic for HLSL.
__target_intrinsic(hlsl)
[__readNone]
float4 lit(float n_dot_l, float n_dot_h, float m)
{
    let diffuseTerm = max(n_dot_l, 0.0f);
    let litGate = step(0.0f, n_dot_l);
    let specularTerm = litGate * max(pow(n_dot_h, m), 0.0f);
    return float4(1.0f, diffuseTerm, specularTerm, 1.0f);
}

/// Base-e logarithm (scalar overload).
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_log($0)` for CUDA
/// and C++, and as GLSL.std.450 `Log` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_log($0)")
__target_intrinsic(cpp, "$P_log($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log _0")
[__readNone]
T log(T x);

/// Base-e logarithm (vector overload).
///
/// Declared intrinsic for HLSL, GLSL and direct SPIR-V (`Log`). The body expands
/// `VECTOR_MAP_UNARY` over the scalar `log` (macro defined elsewhere;
/// presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log _0")
[__readNone]
vector<T, N> log(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, log, x);
}

/// Base-e logarithm (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `log` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> log(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, log, x);
}

/// Base-10 logarithm (scalar overload).
///
/// Declared intrinsic for HLSL; CUDA and C++ go through `$P_log10($0)`.
/// GLSL has no mapping to a `log10` call here: the expansion multiplies
/// `log($0)` by the constant 0.4342944819... (i.e. 1 / ln(10)).
/// The direct SPIR-V snippet does the same with `Log` followed by `OpFMul`.
/// NOTE(review): the `OpFMul` in the spirv_direct snippet lists `_0`,
/// `%baseElog` and the constant — three operands — confirm against the
/// spirv_direct snippet syntax.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "(log( $0 ) * $S0( 0.43429448190325182765112891891661) )" )
__target_intrinsic(cuda, "$P_log10($0)")
__target_intrinsic(cpp, "$P_log10($0)")
__target_intrinsic(spirv_direct, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpFMul resultType resultId _0 %baseElog const(_p,0.43429448190325182765112891891661)")
[__readNone]
T log10(T x);

/// Base-10 logarithm (vector overload).
///
/// Declared intrinsic for HLSL. The GLSL expansion multiplies `log($0)` by the
/// constant 0.4342944819... (i.e. 1 / ln(10)); the direct SPIR-V snippet uses
/// `Log` followed by `OpVectorTimesScalar`. The body expands
/// `VECTOR_MAP_UNARY` over the scalar `log10` (macro defined elsewhere).
/// NOTE(review): the `OpVectorTimesScalar` in the spirv_direct snippet lists
/// three operands — confirm against the spirv_direct snippet syntax.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "(log( $0 ) * $S0(0.43429448190325182765112891891661) )" )
__target_intrinsic(spirv_direct, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpVectorTimesScalar resultType resultId _0 %baseElog const(_p,0.43429448190325182765112891891661)")
[__readNone]
vector<T,N> log10(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, log10, x);
}

/// Base-10 logarithm (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `log10` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> log10(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, log10, x);
}

/// Base-2 logarithm (scalar overload).
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_log2($0)` for CUDA
/// and C++, and as GLSL.std.450 `Log2` for direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_log2($0)")
__target_intrinsic(cpp, "$P_log2($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log2 _0")
[__readNone]
T log2(T x);

/// Base-2 logarithm (vector overload).
///
/// Declared intrinsic for HLSL, GLSL and direct SPIR-V (`Log2`). The body
/// expands `VECTOR_MAP_UNARY` over the scalar `log2` (macro defined elsewhere;
/// presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Log2 _0")
[__readNone]
vector<T,N> log2(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, log2, x);
}

/// Base-2 logarithm (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_UNARY` over
/// `log2` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> log2(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, log2, x);
}

// multiply-add

/// Floating-point multiply-add (scalar overload).
///
/// Declared intrinsic for HLSL, emitted as `fma` for GLSL, through
/// `$P_fma($0, $1, $2)` for CUDA and C++, and as GLSL.std.450 `Fma` for
/// direct SPIR-V.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fma)
__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
T mad(T mvalue, T avalue, T bvalue);

/// Floating-point multiply-add (vector overload).
///
/// Declared intrinsic for HLSL, GLSL (`fma`) and direct SPIR-V (`Fma`). The body
/// expands `VECTOR_MAP_TRINARY` over the scalar `mad` (macro defined elsewhere;
/// presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fma)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
{
    VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
}

/// Floating-point multiply-add (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_TRINARY` over
/// `mad` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
{
    MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
}

/// Integer multiply-add (scalar overload): `mvalue * avalue + bvalue`.
///
/// HLSL keeps its native `mad`, which accepts integer operands. No other
/// target mapping is declared: GLSL `fma` and GLSL.std.450 `Fma` are defined
/// for floating-point operands only, so every other target uses the plain
/// multiply-and-add body below.
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
[__readNone]
T mad(T mvalue, T avalue, T bvalue)
{
    return mvalue * avalue + bvalue;
}

/// Integer multiply-add (vector overload): `mvalue * avalue + bvalue`,
/// component-wise.
///
/// HLSL keeps its native `mad`. GLSL `fma` and GLSL.std.450 `Fma` only accept
/// floating-point operands, so they are not used for integer vectors; every
/// other target uses the component-wise arithmetic in the body.
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
{
    return mvalue * avalue + bvalue;
}

/// Integer multiply-add (matrix overload).
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_TRINARY` over
/// `mad` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
{
    MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
}


/// Maximum of two integer scalars.
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_max($0, $1)` for
/// CUDA and C++. For direct SPIR-V the `fus(...)` selector picks between
/// `FMax`, `UMax` and `SMax`; both operands (`_0` and `_1`) are passed
/// because these are two-operand instructions.
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_max($0, $1)")
__target_intrinsic(cpp, "$P_max($0, $1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
T max(T x, T y);
// Note: a stdlib implementation of `max` (or `min`) will require splitting
// floating-point and integer cases apart, because the floating-point
// version needs to correctly handle the case where one of the inputs
// is not-a-number.

/// Component-wise maximum of two integer vectors.
///
/// Declared intrinsic for HLSL and GLSL. The direct SPIR-V mapping passes both
/// operands (`_0` and `_1`) to the selected `FMax`/`UMax`/`SMax` instruction.
/// The body expands `VECTOR_MAP_BINARY` over the scalar `max` (macro defined
/// elsewhere; presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
vector<T, N> max(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, max, x, y);
}

/// Element-wise maximum of two integer matrices.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `max` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
{
    MATRIX_MAP_BINARY(T, N, M, max, x, y);
}

/// Maximum of two floating-point scalars.
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_max($0, $1)` for
/// CUDA and C++. The direct SPIR-V mapping passes both operands (`_0` and
/// `_1`) to the selected `FMax`/`UMax`/`SMax` instruction.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_max($0, $1)")
__target_intrinsic(cpp, "$P_max($0, $1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
T max(T x, T y);

/// Component-wise maximum of two floating-point vectors.
///
/// Declared intrinsic for HLSL and GLSL. The direct SPIR-V mapping passes both
/// operands (`_0` and `_1`) to the selected `FMax`/`UMax`/`SMax` instruction.
/// The body expands `VECTOR_MAP_BINARY` over the scalar `max` (macro defined
/// elsewhere; presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
vector<T, N> max(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, max, x, y);
}

/// Element-wise maximum of two floating-point matrices.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `max` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
{
    MATRIX_MAP_BINARY(T, N, M, max, x, y);
}

/// Minimum of two integer scalars.
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_min($0, $1)` for
/// CUDA and C++. For direct SPIR-V the `fus(...)` selector picks between
/// `FMin`, `UMin` and `SMin`; both operands (`_0` and `_1`) are passed
/// because these are two-operand instructions.
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_min($0, $1)")
__target_intrinsic(cpp, "$P_min($0, $1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
T min(T x, T y);

/// Component-wise minimum of two integer vectors.
///
/// Declared intrinsic for HLSL and GLSL. The direct SPIR-V mapping passes both
/// operands (`_0` and `_1`) to the selected `FMin`/`UMin`/`SMin` instruction.
/// The body expands `VECTOR_MAP_BINARY` over the scalar `min` (macro defined
/// elsewhere; presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
vector<T,N> min(vector<T,N> x, vector<T,N> y)
{
    VECTOR_MAP_BINARY(T, N, min, x, y);
}

/// Element-wise minimum of two integer matrices.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `min` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
{
    MATRIX_MAP_BINARY(T, N, M, min, x, y);
}

/// Minimum of two floating-point scalars.
///
/// Declared intrinsic for HLSL and GLSL, emitted through `$P_min($0, $1)` for
/// CUDA and C++. The direct SPIR-V mapping passes both operands (`_0` and
/// `_1`) to the selected `FMin`/`UMin`/`SMin` instruction.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_min($0, $1)")
__target_intrinsic(cpp, "$P_min($0, $1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
T min(T x, T y);

/// Component-wise minimum of two floating-point vectors.
///
/// Declared intrinsic for HLSL and GLSL. The direct SPIR-V mapping passes both
/// operands (`_0` and `_1`) to the selected `FMin`/`UMin`/`SMin` instruction.
/// The body expands `VECTOR_MAP_BINARY` over the scalar `min` (macro defined
/// elsewhere; presumably a per-component map used by the remaining targets).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
vector<T,N> min(vector<T,N> x, vector<T,N> y)
{
    VECTOR_MAP_BINARY(T, N, min, x, y);
}

/// Element-wise minimum of two floating-point matrices.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `min` (macro defined elsewhere; presumably an element-wise map).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
{
    MATRIX_MAP_BINARY(T, N, M, min, x, y);
}

/// Split into integer and fractional parts, both with the same sign
/// (scalar overload). The integer part is written to `ip`.
///
/// Declared without a body and without any `__target_intrinsic` modifier.
/// NOTE(review): the vector overload is tagged for hlsl and glsl — confirm
/// whether the missing tags here are intentional.
__generic<T : __BuiltinFloatingPointType>
[__readNone]
T modf(T x, out T ip);

/// Vector `modf`; the integer parts are written to `ip`.
///
/// Declared intrinsic for HLSL and GLSL. The body expands `VECTOR_MAP_BINARY`
/// over the scalar `modf`, passing the `out` parameter `ip` as the second
/// operand (macro defined elsewhere).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
vector<T,N> modf(vector<T,N> x, out vector<T,N> ip)
{
    VECTOR_MAP_BINARY(T, N, modf, x, ip);
}

/// Matrix `modf`; the integer parts are written to `ip`.
///
/// Declared intrinsic for HLSL only. The body expands `MATRIX_MAP_BINARY` over
/// `modf`, passing the `out` parameter `ip` as the second operand
/// (macro defined elsewhere).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> modf(matrix<T,N,M> x, out matrix<T,N,M> ip)
{
    MATRIX_MAP_BINARY(T, N, M, modf, x, ip);
}

/// msad4: masked sum of absolute differences, accumulated into four sums.
///
/// Declared intrinsic for HLSL. The body unpacks `reference` and both halves of
/// `source` into bytes, masks out lanes whose reference byte is zero, and adds
/// absolute byte differences onto `accum`.
/// NOTE(review): each accumulation line applies one mask component to all four
/// differences, and the byte windows always start at `bytesX.x` — confirm
/// against the HLSL `msad4` specification before relying on this fallback.
__target_intrinsic(hlsl)
[__readNone]
uint4 msad4(uint reference, uint2 source, uint4 accum)
{
    // Unpack each 32-bit word into four bytes, most significant byte first.
    int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF;
    int4 bytesX   = (source.x  >> uint4(24, 16, 8, 0)) & 0xFF;
    int4 bytesY   = (source.y  >> uint4(24, 16, 8, 0)) & 0xFF;

    // All-ones where the reference byte is non-zero, zero otherwise.
    uint4 mask = bytesRef == 0 ? 0 : 0xFFFFFFFFu;

    // Accumulate on top of the caller-supplied sums.
    uint4 result = accum;
    result += mask.x & abs(bytesRef - int4(bytesX.x,           bytesY.y, bytesY.z, bytesY.w));
    result += mask.y & abs(bytesRef - int4(bytesX.x, bytesX.y,           bytesY.z, bytesY.w));
    result += mask.z & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z,           bytesY.w));
    result += mask.w & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesX.w));
    return result;
}

// General inner products

/// `mul` for scalar-scalar operands.
/// Lowered directly to the IR multiply op (`kIROp_Mul`).
__generic<T : __BuiltinArithmeticType>
__intrinsic_op($(kIROp_Mul))
[__readNone]
T mul(T x, T y);

/// `mul` for vector-scalar operands (vector on the left).
/// Lowered directly to the IR multiply op (`kIROp_Mul`).
__generic<T : __BuiltinArithmeticType, let N : int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
vector<T, N> mul(vector<T, N> x, T y);

/// `mul` for scalar-vector operands (scalar on the left).
/// Lowered directly to the IR multiply op (`kIROp_Mul`).
__generic<T : __BuiltinArithmeticType, let N : int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
vector<T, N> mul(T x, vector<T, N> y);

/// `mul` for matrix-scalar operands (matrix on the left).
/// Lowered directly to the IR multiply op (`kIROp_Mul`).
__generic<T : __BuiltinArithmeticType, let N : int, let M :int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
matrix<T, N, M> mul(matrix<T, N, M> x, T y);

/// `mul` for scalar-matrix operands (scalar on the left).
/// Lowered directly to the IR multiply op (`kIROp_Mul`).
__generic<T : __BuiltinArithmeticType, let N : int, let M :int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
matrix<T, N, M> mul(T x, matrix<T, N, M> y);

/// `mul` for two floating-point vectors: the dot product.
///
/// Declared intrinsic for HLSL and emitted as `dot` for GLSL; the body
/// forwards to `dot`.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "dot")
[__readNone]
T mul(vector<T, N> x, vector<T, N> y)
{
    let innerProduct = dot(x, y);
    return innerProduct;
}
/// `mul` for two integer vectors: the dot product.
///
/// Declared intrinsic for HLSL; the body forwards to `dot`.
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
T mul(vector<T, N> x, vector<T, N> y)
{
    let innerProduct = dot(x, y);
    return innerProduct;
}

/// `mul` for a floating-point row vector times a matrix.
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`, i.e. with the operands
/// swapped. The body computes, for each column of `right`, the sum of products
/// of `left` with that column.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
{
    vector<T,M> product;
    for (int col = 0; col < M; col++)
    {
        // Accumulate left . column `col` of right.
        T acc = T(0);
        for (int row = 0; row < N; row++)
        {
            acc += left[row] * right[row][col];
        }
        product[col] = acc;
    }
    return product;
}
/// `mul` for an integer row vector times a matrix.
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`, i.e. with the operands
/// swapped. The body computes, for each column of `right`, the sum of products
/// of `left` with that column.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
{
    vector<T,M> product;
    for (int col = 0; col < M; col++)
    {
        // Accumulate left . column `col` of right.
        T acc = T(0);
        for (int row = 0; row < N; row++)
        {
            acc += left[row] * right[row][col];
        }
        product[col] = acc;
    }
    return product;
}
/// `mul` for a logical-typed row vector times a matrix.
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`. The body mirrors the
/// arithmetic overloads but combines terms with `&` and accumulates with `|`.
__generic<T : __BuiltinLogicalType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
{
    vector<T,M> product;
    for (int col = 0; col < M; col++)
    {
        // OR together the AND of each pair along column `col`.
        T acc = T(0);
        for (int row = 0; row < N; row++)
        {
            acc |= left[row] & right[row][col];
        }
        product[col] = acc;
    }
    return product;
}

/// `mul` for a floating-point matrix times a column vector.
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`, i.e. with the operands
/// swapped. The body computes, for each row of `left`, the sum of products of
/// that row with `right`.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
{
    vector<T,N> product;
    for (int row = 0; row < N; row++)
    {
        // Accumulate row `row` of left . right.
        T acc = T(0);
        for (int col = 0; col < M; col++)
        {
            acc += left[row][col] * right[col];
        }
        product[row] = acc;
    }
    return product;
}
/// `mul` for an integer matrix times a column vector.
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`, i.e. with the operands
/// swapped. The body computes, for each row of `left`, the sum of products of
/// that row with `right`.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
{
    vector<T,N> product;
    for (int row = 0; row < N; row++)
    {
        // Accumulate row `row` of left . right.
        T acc = T(0);
        for (int col = 0; col < M; col++)
        {
            acc += left[row][col] * right[col];
        }
        product[row] = acc;
    }
    return product;
}
/// `mul` for a logical-typed matrix times a column vector.
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`. The body mirrors the
/// arithmetic overloads but combines terms with `&` and accumulates with `|`.
__generic<T : __BuiltinLogicalType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
{
    vector<T,N> product;
    for (int row = 0; row < N; row++)
    {
        // OR together the AND of each pair along row `row`.
        T acc = T(0);
        for (int col = 0; col < M; col++)
        {
            acc |= left[row][col] & right[col];
        }
        product[row] = acc;
    }
    return product;
}

/// `mul` for two floating-point matrices: `left` (R x N) times `right` (N x C).
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`, i.e. with the operands
/// swapped. The first parameter is the left-hand operand, so that
/// `left[r][i]` and `right[i][c]` index within the declared R x N and N x C
/// shapes and the result is `left * right` rather than the reversed product.
__generic<T : __BuiltinFloatingPointType, let R : int, let N : int, let C : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
{
    matrix<T,R,C> result;
    for( int r = 0; r < R; ++r)
    for( int c = 0; c < C; ++c)
    {
        // Row `r` of left dotted with column `c` of right.
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum += left[r][i] * right[i][c];
        }
        result[r][c] = sum;
    }
    return result;
}
/// `mul` for two integer matrices: `left` (R x N) times `right` (N x C).
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`, i.e. with the operands
/// swapped. The first parameter is the left-hand operand, so that
/// `left[r][i]` and `right[i][c]` index within the declared R x N and N x C
/// shapes and the result is `left * right` rather than the reversed product.
__generic<T : __BuiltinIntegerType, let R : int, let N : int, let C : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
{
    matrix<T,R,C> result;
    for( int r = 0; r < R; ++r)
    for( int c = 0; c < C; ++c)
    {
        // Row `r` of left dotted with column `c` of right.
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum += left[r][i] * right[i][c];
        }
        result[r][c] = sum;
    }
    return result;
}
/// `mul` for two logical-typed matrices: `left` (R x N) with `right` (N x C).
///
/// Declared intrinsic for HLSL; GLSL emits `($1 * $0)`. The body mirrors the
/// arithmetic overloads but combines terms with `&` and accumulates with `|`.
/// The first parameter is the left-hand operand, so that `left[r][i]` and
/// `right[i][c]` index within the declared R x N and N x C shapes.
__generic<T : __BuiltinLogicalType, let R : int, let N : int, let C : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
{
    matrix<T,R,C> result;
    for( int r = 0; r < R; ++r)
    for( int c = 0; c < C; ++c)
    {
        // OR together the AND of row `r` of left with column `c` of right.
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum |= left[r][i] & right[i][c];
        }
        result[r][c] = sum;
    }
    return result;
}

// noise (deprecated)
//
// Both overloads are stubs: the argument is ignored and zero is returned.

[__readNone]
float noise(float x)
{
    return 0.0f;
}

[__readNone]
__generic<let N : int>
float noise(vector<float, N> x)
{
    return 0.0f;
}

/// Mark an index as possibly non-uniform at execution time.
///
/// Dynamic indexing into arrays of resources arrived with Shader Model 5.1
/// and 6.x, with the restriction that *by default* an implementation may
/// assume any value used as such an index is dynamically uniform across a
/// whole `Draw` or `Dispatch` (with instancing, the value must be uniform
/// across all instances; the restriction does not seem to extend to the
/// draws within a multi-draw).
///
/// A shader programmer who cannot guarantee that uniformity is required to
/// pass the index through `NonUniformResourceIndex` before using it, which
/// tells the implementation not to make the assumption. On the surface the
/// function behaves like an identity function.
///
/// Note: a future version of Slang may insert calls to this function itself
/// where output code needs them, instead of leaving that to the user, so that
/// the default behavior of the language is more semantically "correct."
__target_intrinsic(hlsl)
__target_intrinsic(glsl, nonuniformEXT)
__glsl_extension(GL_EXT_nonuniform_qualifier)
[__readNone]
uint NonUniformResourceIndex(uint index)
{
    // Fallback body: hand the index back unchanged.
    uint passthrough = index;
    return passthrough;
}

/// Signed-integer overload of `NonUniformResourceIndex`.
/// The fallback body returns the index unchanged.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, nonuniformEXT)
__glsl_extension(GL_EXT_nonuniform_qualifier)
[__readNone]
int NonUniformResourceIndex(int index)
{
    int passthrough = index;
    return passthrough;
}

// Normalize a vector
//
// The fallback body divides the vector by its own `length`.
// No guard is applied for a zero-length input.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Normalize _0")
[__readNone]
vector<T,N> normalize(vector<T,N> x)
{
    let magnitude = length(x);
    return x / magnitude;
}

// Raise to a power
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Pow`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_pow($0, $1)")
__target_intrinsic(cpp, "$P_pow($0, $1)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Pow _0 _1")
[__readNone]
T pow(T x, T y);

// Vector overload. The body passes `pow` and both operands to
// VECTOR_MAP_BINARY (defined outside this section; presumably applies the
// scalar `pow` component by component - confirm against the macro).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Pow _0 _1")
[__readNone]
vector<T, N> pow(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, pow, x, y);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_BINARY (defined outside this section).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> pow(matrix<T,N,M> x, matrix<T,N,M> y)
{
    MATRIX_MAP_BINARY(T, N, M, pow, x, y);
}

// Output message
// TODO: add check to ensure format is const literal.
//
// `printf` is declared as nine overloads: a format string followed by zero to
// eight extra arguments, each extra argument having its own generic type.
// Every overload is tagged as a target intrinsic for hlsl, cpp and cuda. On
// glsl each one expands to `debugPrintfEXT` with the same argument list and
// requests the GL_EXT_debug_printf extension.
// All overloads are declarations only; no fallback body is provided.

// Format string only.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0)")
void printf(NativeString format);

// Format string plus one argument.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1)")
void printf<T0>(NativeString format, T0 arg0);

// Format string plus two arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2)")
void printf<T0, T1>(NativeString format, T0 arg0, T1 arg1);

// Format string plus three arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2, $3)")
void printf<T0, T1, T2>(NativeString format, T0 arg0, T1 arg1, T2 arg2);

// Format string plus four arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2, $3, $4)")
void printf<T0, T1, T2, T3>(NativeString format, T0 arg0, T1 arg1, T2 arg2, T3 arg3);

// Format string plus five arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2, $3, $4, $5)")
void printf<T0, T1, T2, T3, T4>(NativeString format, T0 arg0, T1 arg1, T2 arg2, T3 arg3, T4 arg4);

// Format string plus six arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2, $3, $4, $5, $6)")
void printf<T0, T1, T2, T3, T4, T5>(NativeString format, T0 arg0, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5);

// Format string plus seven arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2, $3, $4, $5, $6, $7)")
void printf<T0, T1, T2, T3, T4, T5, T6>(NativeString format, T0 arg0, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6);

// Format string plus eight arguments.
__target_intrinsic(hlsl)
__target_intrinsic(cpp)
__target_intrinsic(cuda)
__glsl_extension(GL_EXT_debug_printf)
__target_intrinsic(glsl, "debugPrintfEXT($0, $1, $2, $3, $4, $5, $6, $7, $8)")
void printf<T0, T1, T2, T3, T4, T5, T6, T7>(NativeString format, T0 arg0, T1 arg1, T2 arg2, T3 arg3, T4 arg4, T5 arg5, T6 arg6, T7 arg7);

// Tessellation factor fixup routines
//
// Every routine below is a declaration only: this section gives it neither a
// body nor a target-intrinsic annotation. Each takes raw factors and a scale
// as `in` parameters and reports rounded (and, for most, unrounded) factors
// through `out` parameters.

// 2D quad variants: the inside scale and the inside factors are float2.
void Process2DQuadTessFactorsAvg(
    in  float4 RawEdgeFactors,
    in  float2 InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void Process2DQuadTessFactorsMax(
    in  float4 RawEdgeFactors,
    in  float2 InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void Process2DQuadTessFactorsMin(
    in  float4 RawEdgeFactors,
    in  float2 InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

// Isoline variant: scalar detail and density factors in, rounded values out.
void ProcessIsolineTessFactors(
    in  float RawDetailFactor,
    in  float RawDensityFactor,
    out float RoundedDetailFactor,
    out float RoundedDensityFactor);

// Quad variants: a single scalar inside scale, float2 inside factors out.
void ProcessQuadTessFactorsAvg(
    in  float4 RawEdgeFactors,
    in  float InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void ProcessQuadTessFactorsMax(
    in  float4 RawEdgeFactors,
    in  float InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void ProcessQuadTessFactorsMin(
    in  float4 RawEdgeFactors,
    in  float InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

// Triangle variants: float3 edge factors and a single scalar inside factor.
void ProcessTriTessFactorsAvg(
    in  float3 RawEdgeFactors,
    in  float InsideScale,
    out float3 RoundedEdgeTessFactors,
    out float RoundedInsideTessFactor,
    out float UnroundedInsideTessFactor);

void ProcessTriTessFactorsMax(
    in  float3 RawEdgeFactors,
    in  float InsideScale,
    out float3 RoundedEdgeTessFactors,
    out float RoundedInsideTessFactor,
    out float UnroundedInsideTessFactor);

// NOTE(review): the last two parameter names here are plural, unlike the
// Avg and Max triangle variants above, although the type is still a scalar
// float. Looks like a naming inconsistency - confirm before renaming.
void ProcessTriTessFactorsMin(
    in  float3 RawEdgeFactors,
    in  float InsideScale,
    out float3 RoundedEdgeTessFactors,
    out float RoundedInsideTessFactors,
    out float UnroundedInsideTessFactors);

// Degrees to radians
//
// Each fallback body scales the input by pi / 180. The scale factor is
// computed first as a scalar and then applied to the whole operand.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Radians _0")
[__readNone]
T radians(T x)
{
    let radiansPerDegree = T.getPi() / T(180.0f);
    return x * radiansPerDegree;
}

// Vector overload: every component is scaled by the same scalar factor.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Radians _0")
[__readNone]
vector<T, N> radians(vector<T, N> x)
{
    let radiansPerDegree = T.getPi() / T(180.0f);
    return x * radiansPerDegree;
}

// Matrix overload: only hlsl is tagged as a target intrinsic.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> radians(matrix<T, N, M> x)
{
    let radiansPerDegree = T.getPi() / T(180.0f);
    return x * radiansPerDegree;
}

// Approximate reciprocal
//
// Scalar overload. The fallback body is a plain division of one by `x`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
T rcp(T x)
{
    let one = T(1.0);
    return one / x;
}

// Vector overload of `rcp`. Tagged as a target intrinsic for hlsl only; the
// body passes `rcp` to VECTOR_MAP_UNARY (defined outside this section;
// presumably a per-component application - confirm against the macro).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> rcp(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, rcp, x);
}

// Matrix overload of `rcp`, built the same way through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> rcp(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, rcp, x);
}

// Reflect incident vector across plane with given normal
//
// Fallback body: i - 2 * dot(n, i) * n. The normal `n` is used as given;
// nothing here normalizes it.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Reflect _0 _1")
[__readNone]
vector<T,N> reflect(vector<T,N> i, vector<T,N> n)
{
    let twiceNDotI = T(2) * dot(n,i);
    return i - twiceNDotI * n;
}

// Refract incident vector given surface normal and index of refraction
//
// Fallback body: computes 1 - eta^2 * (1 - dot(n, i)^2). When that value is
// negative the zero vector is returned; otherwise the result is
// eta * i - (eta * dot(n, i) + sqrt(value)) * n.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2")
[__readNone]
vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
{
    let nDotI = dot(n,i);
    let radicand = T(1) - eta*eta*(T(1) - nDotI * nDotI);
    if(radicand < T(0))
    {
        // Negative radicand: no refracted direction, so return zero.
        return vector<T,N>(T(0));
    }
    let normalScale = eta * nDotI + sqrt(radicand);
    return eta * i - normalScale * n;
}

// Reverse order of bits
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl,
// expands to `bitfieldReverse` on glsl, and to the prefixed `_reversebits`
// helper on cuda and cpp.
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "bitfieldReverse")
__target_intrinsic(cuda, "$P_reversebits($0)")
__target_intrinsic(cpp, "$P_reversebits($0)")
[__readNone]
uint reversebits(uint value);

// Vector overload of `reversebits`.
// Tagged as a target intrinsic for hlsl, matching the scalar overload, so
// HLSL output uses its own `reversebits` instead of the per-component
// fallback below. On glsl it expands to `bitfieldReverse`.
// Other targets use the body, which passes `reversebits` to
// VECTOR_MAP_UNARY (defined outside this section).
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "bitfieldReverse")
__generic<let N : int>
[__readNone]
vector<uint, N> reversebits(vector<uint, N> value)
{
    VECTOR_MAP_UNARY(uint, N, reversebits, value);
}

// Round-to-nearest
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Round`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_round($0)")
__target_intrinsic(cpp, "$P_round($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Round _0")
[__readNone]
T round(T x);

// Vector overload. The body passes `round` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Round _0")
[__readNone]
vector<T, N> round(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, round, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> round(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, round, x);
}

// Reciprocal of square root
//
// Scalar overload. glsl expands to `inversesqrt`; spirv_direct uses
// `glsl450 InverseSqrt`. The fallback body divides one by `sqrt(x)`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "inversesqrt($0)")
__target_intrinsic(cuda, "$P_rsqrt($0)")
__target_intrinsic(cpp, "$P_rsqrt($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
[__readNone]
T rsqrt(T x)
{
    let root = sqrt(x);
    return T(1.0) / root;
}

// Vector overload of `rsqrt`. glsl expands to `inversesqrt`; the body passes
// `rsqrt` to VECTOR_MAP_UNARY (defined outside this section; presumably
// per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "inversesqrt($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
[__readNone]
vector<T, N> rsqrt(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, rsqrt, x);
}

// Matrix overload of `rsqrt`. Only hlsl is tagged as a target intrinsic; the
// body goes through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> rsqrt(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, rsqrt, x);
}

// Clamp value to [0,1] range

// Scalar overload: the fallback body is `clamp` with bounds zero and one.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
T saturate(T x)
{
    let lowerBound = T(0);
    let upperBound = T(1);
    return clamp<T>(x, lowerBound, upperBound);
}

// Vector overload: the same clamp, with the bounds splatted to vectors.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T,N> saturate(vector<T,N> x)
{
    let lowerBound = vector<T,N>(T(0));
    let upperBound = vector<T,N>(T(1));
    return clamp<T,N>(x, lowerBound, upperBound);
}

// Matrix overload of `saturate`. Tagged as a target intrinsic for hlsl; the
// body passes `saturate` to MATRIX_MAP_UNARY (defined outside this section).
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> saturate(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, saturate, x);
}

// Extract sign of value
//
// Scalar overload: declaration only. The result type is always `int`,
// whatever the signed arithmetic type T is; the glsl expansion wraps the
// call in an `int(...)` conversion accordingly.
__generic<T : __BuiltinSignedArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "int(sign($0))")
__target_intrinsic(cuda, "$P_sign($0)")
__target_intrinsic(cpp, "$P_sign($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FSign, SSign) _0")
[__readNone]
int sign(T x);

// Vector overload: returns a vector of `int`. The glsl expansion converts
// through an `ivec` constructor. The body passes `sign` to VECTOR_MAP_UNARY
// (defined outside this section) with `int` as the element type.
__generic<T : __BuiltinSignedArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "ivec$N0(sign($0))")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 fi(FSign, SSign) _0")
[__readNone]
vector<int, N> sign(vector<T, N> x)
{
    VECTOR_MAP_UNARY(int, N, sign, x);
}

// Matrix overload: returns a matrix of `int`. Only hlsl is tagged as a
// target intrinsic; the body goes through MATRIX_MAP_UNARY.
__generic<T : __BuiltinSignedArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<int, N, M> sign(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(int, N, M, sign, x);
}


// Sine

// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Sin`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_sin($0)")
__target_intrinsic(cpp, "$P_sin($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sin _0")
[__readNone]
T sin(T x);

// Vector overload. The body passes `sin` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sin _0")
[__readNone]
vector<T, N> sin(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, sin, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> sin(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, sin, x);
}

// Sine and cosine
//
// Writes sin(x) to `s` and cos(x) to `c`. The scalar overload also has a
// dedicated cuda expansion; the fallback bodies call `sin` and `cos`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "$P_sincos($0, $1, $2)")
[__readNone]
void sincos(T x, out T s, out T c)
{
    let sineValue = sin(x);
    let cosineValue = cos(x);
    s = sineValue;
    c = cosineValue;
}

// Vector overload.
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c)
{
    let sineValue = sin(x);
    let cosineValue = cos(x);
    s = sineValue;
    c = cosineValue;
}

// Matrix overload.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
void sincos(matrix<T,N,M> x, out matrix<T,N,M> s, out matrix<T,N,M> c)
{
    let sineValue = sin(x);
    let cosineValue = cos(x);
    s = sineValue;
    c = cosineValue;
}

// Hyperbolic Sine
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Sinh`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_sinh($0)")
__target_intrinsic(cpp, "$P_sinh($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sinh _0")
[__readNone]
T sinh(T x);

// Vector overload. The body passes `sinh` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sinh _0")
[__readNone]
vector<T, N> sinh(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, sinh, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> sinh(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, sinh, x);
}

// Smooth step (Hermite interpolation)
//
// Scalar overload. The fallback body rescales `x` to the [min, max] span,
// saturates it, and evaluates t * t * (3 - 2t), with 2t written as t + t.
// No guard is applied for `max` equal to `min`.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
[__readNone]
T smoothstep(T min, T max, T x)
{
    let span = max - min;
    let u = saturate((x - min) / span);
    let cubicWeight = T(3.0f) - (u + u);
    return u * u * cubicWeight;
}

// Vector overload of `smoothstep`. The body passes `smoothstep` and all
// three operands to VECTOR_MAP_TRINARY (defined outside this section;
// presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
[__readNone]
vector<T, N> smoothstep(vector<T, N> min, vector<T, N> max, vector<T, N> x)
{
    VECTOR_MAP_TRINARY(T, N, smoothstep, min, max, x);
}

// Matrix overload of `smoothstep`. Only hlsl is tagged as a target
// intrinsic; the body goes through MATRIX_MAP_TRINARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> smoothstep(matrix<T, N, M> min, matrix<T, N, M> max, matrix<T, N, M> x)
{
    MATRIX_MAP_TRINARY(T, N, M, smoothstep, min, max, x);
}

// Square root
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Sqrt`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_sqrt($0)")
__target_intrinsic(cpp, "$P_sqrt($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sqrt _0")
[__readNone]
T sqrt(T x);

// Vector overload. The body passes `sqrt` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Sqrt _0")
[__readNone]
vector<T, N> sqrt(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, sqrt, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> sqrt(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, sqrt, x);
}

// Step function
//
// Scalar overload. The fallback body yields zero when `x` is strictly below
// the edge `y`, and one in every other case.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Step _0 _1")
[__readNone]
T step(T y, T x)
{
    if(x < y)
    {
        return T(0.0f);
    }
    return T(1.0f);
}

// Vector overload of `step`. The body passes `step` and both operands, in
// (y, x) order, to VECTOR_MAP_BINARY (defined outside this section;
// presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Step _0 _1")
[__readNone]
vector<T,N> step(vector<T,N> y, vector<T,N> x)
{
    VECTOR_MAP_BINARY(T, N, step, y, x);
}

// Matrix overload of `step`. Only hlsl is tagged as a target intrinsic; the
// body goes through MATRIX_MAP_BINARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> step(matrix<T, N, M> y, matrix<T, N, M> x)
{
    MATRIX_MAP_BINARY(T, N, M, step, y, x);
}

// Tangent
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Tan`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_tan($0)")
__target_intrinsic(cpp, "$P_tan($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tan _0")
[__readNone]
T tan(T x);

// Vector overload. The body passes `tan` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tan _0")
[__readNone]
vector<T, N> tan(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, tan, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> tan(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, tan, x);
}

// Hyperbolic tangent
//
// Scalar overload: declaration only. Tagged as a target intrinsic for hlsl
// and glsl, with explicit expansions for cuda, cpp and spirv_direct
// (`glsl450 Tanh`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_tanh($0)")
__target_intrinsic(cpp, "$P_tanh($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tanh _0")
[__readNone]
T tanh(T x);

// Vector overload. The body passes `tanh` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Tanh _0")
[__readNone]
vector<T,N> tanh(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, tanh, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> tanh(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, tanh, x);
}

// Matrix transpose
//
// Takes an N x M matrix and returns the M x N matrix whose element
// (row, col) is element (col, row) of the input. The same body is repeated
// for floating-point, integer and logical element types.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
matrix<T, M, N> transpose(matrix<T, N, M> x)
{
    matrix<T, M, N> flipped;
    for (int outRow = 0; outRow < M; ++outRow)
    {
        for (int outCol = 0; outCol < N; ++outCol)
        {
            flipped[outRow][outCol] = x[outCol][outRow];
        }
    }
    return flipped;
}
// Integer element types.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
matrix<T, M, N> transpose(matrix<T, N, M> x)
{
    matrix<T, M, N> flipped;
    for (int outRow = 0; outRow < M; ++outRow)
    {
        for (int outCol = 0; outCol < N; ++outCol)
        {
            flipped[outRow][outCol] = x[outCol][outRow];
        }
    }
    return flipped;
}
// Logical element types.
__generic<T : __BuiltinLogicalType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
matrix<T, M, N> transpose(matrix<T, N, M> x)
{
    matrix<T, M, N> flipped;
    for (int outRow = 0; outRow < M; ++outRow)
    {
        for (int outCol = 0; outCol < N; ++outCol)
        {
            flipped[outRow][outCol] = x[outCol][outRow];
        }
    }
    return flipped;
}

// Truncate to integer
//
// Scalar overload: declaration only. The result keeps the floating-point
// type T. Tagged as a target intrinsic for hlsl and glsl, with explicit
// expansions for cuda, cpp and spirv_direct (`glsl450 Trunc`).
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_trunc($0)")
__target_intrinsic(cpp, "$P_trunc($0)")
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Trunc _0")
[__readNone]
T trunc(T x);

// Vector overload. The body passes `trunc` to VECTOR_MAP_UNARY (defined
// outside this section; presumably per-component - confirm).
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv_direct, "OpExtInst resultType resultId glsl450 Trunc _0")
[__readNone]
vector<T, N> trunc(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, trunc, x);
}

// Matrix overload. Only hlsl is tagged as a target intrinsic; the body goes
// through MATRIX_MAP_UNARY.
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> trunc(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, trunc, x);
}

// Slang Specific 'Mask' Wave Intrinsics

// Mask type taken as the first parameter by the `WaveMask*` functions below.
// It is a plain 32-bit `uint`.
typedef uint WaveMask;

// Declaration only. On glsl and hlsl this expands to the first component of
// a ballot over the constant `true`; on cuda it expands to `__activemask()`.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot(true).x")
__target_intrinsic(cuda, "__activemask()")
__target_intrinsic(hlsl, "WaveActiveBallot(true).x")
WaveMask WaveGetConvergedMask();

// Internal helper bound directly to the WaveGetActiveMask IR op.
// Called by the fallback body of `WaveGetActiveMask`.
__intrinsic_op($(kIROp_WaveGetActiveMask))
WaveMask __WaveGetActiveMask();

// Returns the active-lane mask. On glsl and hlsl this expands to the first
// component of a ballot over the constant `true`; other targets use the
// fallback body, which forwards to the `__WaveGetActiveMask` IR op.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot(true).x")
__target_intrinsic(hlsl, "WaveActiveBallot(true).x")
WaveMask WaveGetActiveMask()
{
    WaveMask activeLanes = __WaveGetActiveMask();
    return activeLanes;
}

// Declaration only. The glsl and hlsl expansions take no arguments, so
// `mask` is ignored there. The cuda expansion isolates the lowest set bit of
// `mask` and compares it against the bit for the current lane.
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupElect()")
__target_intrinsic(cuda, "(($0 & -$0) == (WarpMask(1) << _getLaneId()))")
__target_intrinsic(hlsl, "WaveIsFirstLane()")
bool WaveMaskIsFirstLane(WaveMask mask);

// Declaration only. The glsl and hlsl expansions pass `condition` alone, so
// `mask` is ignored there. The cuda expansion passes both to `__all_sync`
// and compares the result against zero.
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAll($1)")
__target_intrinsic(cuda, "(__all_sync($0, $1) != 0)")
__target_intrinsic(hlsl, "WaveActiveAllTrue($1)")
bool WaveMaskAllTrue(WaveMask mask, bool condition);

// Declaration only. The glsl and hlsl expansions pass `condition` alone, so
// `mask` is ignored there. The cuda expansion passes both to `__any_sync`
// and compares the result against zero.
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAny($1)")
__target_intrinsic(cuda, "(__any_sync($0, $1) != 0)")
__target_intrinsic(hlsl, "WaveActiveAnyTrue($1)")
bool WaveMaskAnyTrue(WaveMask mask, bool condition);

// Declaration only. Returns a ballot of `condition` as a scalar `WaveMask`.
// The glsl and hlsl ballots produce a four-component value, so both
// expansions select the first component (`.x`) to match the scalar return
// type, as `WaveGetConvergedMask` does. `mask` is only used by the cuda
// expansion.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot($1).x")
__target_intrinsic(cuda, "__ballot_sync($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBallot($1).x")
WaveMask WaveMaskBallot(WaveMask mask, bool condition);

// Counts the lanes for which `value` is true. cuda and hlsl have dedicated
// expansions; other targets use the fallback body, which takes a ballot of
// `value` and counts its set bits. The fallback does not use `mask`.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__target_intrinsic(cuda, "__popc(__ballot_sync($0, $1))")
__target_intrinsic(hlsl, "WaveActiveCountBits($1)")
uint WaveMaskCountBits(WaveMask mask, bool value)
{
    let ballot = WaveActiveBallot(value);
    return _WaveCountBits(ballot);
}

// Waits until all warp lanes named in mask have executed a WaveMaskSharedSync (with the same mask)
// before resuming execution. Guarantees memory ordering in shared memory among threads participating
// in the barrier.
//
// The CUDA intrinsic says it orders *all* memory accesses, which appears to match most closely subgroupBarrier.
//
// TODO(JS):
// For HLSL it's not clear what to do. There is no explicit mechanism to 'reconverge' threads. In the docs it describes
// behavior as
// "These intrinsics are dependent on active lanes and therefore flow control. In the model of this document, implementations
// must enforce that the number of active lanes exactly corresponds to the programmer’s view of flow control."
//
// It seems this can only mean the active threads are the "threads the program flow would lead to". This implies a lockstep
// "straight SIMD" style interpretation. That being the case this op on HLSL is just a memory barrier without any Sync.

// Declaration only. cuda passes `mask` to `__syncwarp`; glsl expands to
// `subgroupBarrier()` and hlsl to `AllMemoryBarrier()`, both ignoring `mask`.
__target_intrinsic(cuda, "__syncwarp($0)")
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBarrier()")
__target_intrinsic(hlsl, "AllMemoryBarrier()")
void AllMemoryBarrierWithWaveMaskSync(WaveMask mask);

// On GLSL, it appears we can't use subgroupMemoryBarrierShared, because it only implies a memory ordering, it does not
// imply convergence. For subgroupBarrier we have from the docs..
// "The function subgroupBarrier() enforces that all active invocations within a subgroup must execute this function before any
// are allowed to continue their execution"
// TODO(JS):
// It's not entirely clear what to do here on HLSL.
// Reading the dxc wiki (https://github.com/Microsoft/DirectXShaderCompiler/wiki/Wave-Intrinsics), we have statements like:
//    ... these intrinsics enable the elimination of barrier constructs when the scope of synchronization is within the width of the SIMD processor.
//    Wave: A set of lanes executed simultaneously in the processor. No explicit barriers are required to guarantee that they execute in parallel.
// Which seems to imply at least some memory barriers like Shared might not be needed.
//
// The barrier is left here though, because not only does the barrier make writes before the barrier across the wave appear to others afterwards, it's
// also there to inform the compiler on what order reads and writes can take place. This might seem silly, because the 'Active' lanes
// aspect of HLSL seems to make everything lock step - but that's not quite so; it only has to appear that way as far as the programmer's
// model is concerned - divergence could potentially still happen.
// Declaration only. cuda passes `mask` to `__syncwarp`; glsl expands to
// `subgroupBarrier()` and hlsl to `GroupMemoryBarrier()`, both ignoring
// `mask`.
__target_intrinsic(cuda, "__syncwarp($0)")
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBarrier()")
__target_intrinsic(hlsl, "GroupMemoryBarrier()")
void GroupMemoryBarrierWithWaveMaskSync(WaveMask mask);

// Mask-free form of `AllMemoryBarrierWithWaveMaskSync`. Declaration only.
// glsl expands to `subgroupBarrier()` and hlsl to `AllMemoryBarrier()`.
// cuda expands to `__syncwarp()` with no mask argument, the same mapping
// `GroupMemoryBarrierWithWaveSync` uses.
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBarrier()")
__target_intrinsic(hlsl, "AllMemoryBarrier()")
__target_intrinsic(cuda, "__syncwarp()")
void AllMemoryBarrierWithWaveSync();

// Mask-free form of `GroupMemoryBarrierWithWaveMaskSync`. Declaration only.
// glsl expands to `subgroupBarrier()`, hlsl to `GroupMemoryBarrier()` and
// cuda to `__syncwarp()` with no mask argument.
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBarrier()")
__target_intrinsic(hlsl, "GroupMemoryBarrier()")
__target_intrinsic(cuda, "__syncwarp()")
void GroupMemoryBarrierWithWaveSync();

// NOTE! WaveMaskBroadcastLaneAt is *NOT* standard HLSL
// It is provided as access to subgroupBroadcast which can only take a
// constexpr laneId.
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
// SPIR-V versions greater than 1.4 loosen this restriction, and allow a 'dynamically uniform' index.
// If that's the behavior required then client code should use WaveReadLaneAt which works this way.

// Scalar overload: declaration only. `lane` is `constexpr`. glsl expands to
// `subgroupBroadcast` and hlsl to `WaveReadLaneAt`, both ignoring `mask`;
// cuda passes all three arguments to `__shfl_sync`.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($1, $2)")
__target_intrinsic(cuda, "__shfl_sync($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
T WaveMaskBroadcastLaneAt(WaveMask mask, T value, constexpr int lane);
// Vector overload: same glsl and hlsl expansions; cuda uses the
// `_waveShuffleMultiple` helper.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($1, $2)")
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
vector<T,N> WaveMaskBroadcastLaneAt(WaveMask mask, vector<T,N> value, constexpr int lane);
// Matrix overload: only cuda and hlsl expansions are given; there is no
// glsl mapping here.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
matrix<T,N,M> WaveMaskBroadcastLaneAt(WaveMask mask, matrix<T,N,M> value, constexpr int lane);

// TODO(JS): If it can be determined that the `laneId` is constexpr, then subgroupBroadcast
// could be used on GLSL. For now we just use subgroupShuffle.
// Scalar overload: declaration only. `lane` is a plain `int`. glsl expands
// to `subgroupShuffle` and hlsl to `WaveReadLaneAt`, both ignoring `mask`;
// cuda passes all three arguments to `__shfl_sync`.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($1, $2)")
__target_intrinsic(cuda, "__shfl_sync($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
T WaveMaskReadLaneAt(WaveMask mask, T value, int lane);
// Vector overload: same glsl and hlsl expansions; cuda uses the
// `_waveShuffleMultiple` helper.
__generic<T : __BuiltinType, let N : int>
__spirv_version(1.3)
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__target_intrinsic(glsl, "subgroupShuffle($1, $2)")
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
vector<T,N> WaveMaskReadLaneAt(WaveMask mask, vector<T,N> value, int lane);
// Matrix overload: only cuda and hlsl expansions are given; there is no
// glsl mapping here.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
matrix<T,N,M> WaveMaskReadLaneAt(WaveMask mask, matrix<T,N,M> value, int lane);

// NOTE! WaveMaskShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL,
// which means it will only work on hardware which allows arbitrary laneIds. That is not true
// in general, because it breaks the HLSL standard, which requires that the lane index is 'dynamically uniform' across the Wave.
// Scalar overload: declaration only. The expansions are the same as for
// `WaveMaskReadLaneAt`: `subgroupShuffle` on glsl, `WaveReadLaneAt` on hlsl
// (both ignoring `mask`) and `__shfl_sync` on cuda.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($1, $2)")
__target_intrinsic(cuda, "__shfl_sync($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
T WaveMaskShuffle(WaveMask mask, T value, int lane);
// Vector overload: same glsl and hlsl expansions; cuda uses the
// `_waveShuffleMultiple` helper.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($1, $2)")
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
vector<T,N> WaveMaskShuffle(WaveMask mask, vector<T,N> value, int lane);
// Matrix overload: only cuda and hlsl expansions are given; there is no
// glsl mapping here.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
matrix<T,N,M> WaveMaskShuffle(WaveMask mask, matrix<T,N,M> value, int lane);

// Declaration only. glsl expands to an exclusive bit count over a ballot of
// `value`, and hlsl to `WavePrefixCountBits`; both ignore `mask`. cuda
// counts the bits of the ballot after masking with `_getLaneLtMask()`.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($1))")
__target_intrinsic(cuda, "__popc(__ballot_sync($0, $1)  & _getLaneLtMask())")
__target_intrinsic(hlsl, "WavePrefixCountBits($1)")
uint WaveMaskPrefixCountBits(WaveMask mask, bool value);

// Across lane ops

// WaveMaskBitAnd: bitwise AND reduction of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupAnd`, CUDA `_waveAnd`, HLSL `WaveActiveBitAnd`.
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAnd($1)")
__target_intrinsic(cuda, "_waveAnd($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitAnd($1)")
T WaveMaskBitAnd(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveAndMultiple`.
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAnd($1)")
__target_intrinsic(cuda, "_waveAndMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitAnd($1)")
vector<T,N> WaveMaskBitAnd(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveAndMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitAnd($1)")
matrix<T,N,M> WaveMaskBitAnd(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskBitOr: bitwise OR reduction of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupOr`, CUDA `_waveOr`, HLSL `WaveActiveBitOr`.
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupOr($1)")
__target_intrinsic(cuda, "_waveOr($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitOr($1)")
T WaveMaskBitOr(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveOrMultiple`.
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupOr($1)")
__target_intrinsic(cuda, "_waveOrMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitOr($1)")
vector<T,N> WaveMaskBitOr(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveOrMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitOr($1)")
matrix<T,N,M> WaveMaskBitOr(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskBitXor: bitwise XOR reduction of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupXor`, CUDA `_waveXor`, HLSL `WaveActiveBitXor`.
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupXor($1)")
__target_intrinsic(cuda, "_waveXor($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitXor($1)")
T WaveMaskBitXor(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveXorMultiple`.
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupXor($1)")
__target_intrinsic(cuda, "_waveXorMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitXor($1)")
vector<T,N> WaveMaskBitXor(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveXorMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitXor($1)")
matrix<T,N,M> WaveMaskBitXor(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskMax: maximum of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupMax`, CUDA `_waveMax`, HLSL `WaveActiveMax`.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMax($1)")
__target_intrinsic(cuda, "_waveMax($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMax($1)")
T WaveMaskMax(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveMaxMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMax($1)")
__target_intrinsic(cuda, "_waveMaxMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMax($1)")
vector<T,N> WaveMaskMax(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveMaxMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMax($1)")
matrix<T,N,M> WaveMaskMax(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskMin: minimum of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupMin`, CUDA `_waveMin`, HLSL `WaveActiveMin`.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMin($1)")
__target_intrinsic(cuda, "_waveMin($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMin($1)")
T WaveMaskMin(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveMinMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMin($1)")
__target_intrinsic(cuda, "_waveMinMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMin($1)")
vector<T,N> WaveMaskMin(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveMinMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMin($1)")
matrix<T,N,M> WaveMaskMin(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskProduct: product of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupMul`, CUDA `_waveProduct`, HLSL `WaveActiveProduct`.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMul($1)")
__target_intrinsic(cuda, "_waveProduct($0, $1)")
__target_intrinsic(hlsl, "WaveActiveProduct($1)")
T WaveMaskProduct(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveProductMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMul($1)")
__target_intrinsic(cuda, "_waveProductMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveProduct($1)")
vector<T,N> WaveMaskProduct(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveProductMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveProduct($1)")
matrix<T,N,M> WaveMaskProduct(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskSum: sum of `expr` across lanes.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupAdd`, CUDA `_waveSum`, HLSL `WaveActiveSum`.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAdd($1)")
__target_intrinsic(cuda, "_waveSum($0, $1)")
__target_intrinsic(hlsl, "WaveActiveSum($1)")
T WaveMaskSum(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveSumMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAdd($1)")
__target_intrinsic(cuda, "_waveSumMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveSum($1)")
vector<T,N> WaveMaskSum(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveSumMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveSum($1)")
matrix<T,N,M> WaveMaskSum(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskAllEqual: true when `value` is equal across lanes.
// `$0` is `mask`, `$1` is `value`; only the CUDA expansions forward `mask`.
// Every overload is tagged `__cuda_sm_version(7.0)`.

// Scalar overload: GLSL `subgroupAllEqual`, CUDA `_waveAllEqual`, HLSL `WaveActiveAllEqual`.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($1)")
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveAllEqual($0, $1)")
__target_intrinsic(hlsl, "WaveActiveAllEqual($1)")
bool WaveMaskAllEqual(WaveMask mask, T value);
// Vector overload: CUDA switches to `_waveAllEqualMultiple`.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($1)")
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveAllEqualMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveAllEqual($1)")
bool WaveMaskAllEqual(WaveMask mask, vector<T,N> value);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinType, let N : int, let M : int>
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveAllEqualMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveAllEqual($1)")
bool WaveMaskAllEqual(WaveMask mask, matrix<T,N,M> value);

// Prefix

// WaveMaskPrefixProduct: exclusive prefix product of `expr` (GLSL uses `subgroupExclusiveMul`).
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupExclusiveMul`, CUDA `_wavePrefixProduct`, HLSL `WavePrefixProduct`.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveMul($1)")
__target_intrinsic(cuda, "_wavePrefixProduct($0, $1)")
__target_intrinsic(hlsl, "WavePrefixProduct($1)")
T WaveMaskPrefixProduct(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_wavePrefixProductMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveMul($1)")
__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, $1)")
__target_intrinsic(hlsl, "WavePrefixProduct($1)")
vector<T,N> WaveMaskPrefixProduct(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, $1)")
__target_intrinsic(hlsl, "WavePrefixProduct($1)")
matrix<T,N,M> WaveMaskPrefixProduct(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskPrefixSum: exclusive prefix sum of `expr` (GLSL uses `subgroupExclusiveAdd`).
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.

// Scalar overload: GLSL `subgroupExclusiveAdd`, CUDA `_wavePrefixSum`, HLSL `WavePrefixSum`.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAdd($1)")
__target_intrinsic(cuda, "_wavePrefixSum($0, $1)")
__target_intrinsic(hlsl, "WavePrefixSum($1)")
T WaveMaskPrefixSum(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_wavePrefixSumMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAdd($1)")
__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, $1)")
__target_intrinsic(hlsl, "WavePrefixSum($1)")
vector<T,N> WaveMaskPrefixSum(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, $1)")
__target_intrinsic(hlsl, "WavePrefixSum($1)")
matrix<T,N,M> WaveMaskPrefixSum(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskReadLaneFirst: broadcasts `expr` from the first lane.
// `$0` is `mask`, `$1` is `expr`; only the CUDA expansions forward `mask`.
//
// An explicit HLSL expansion to `WaveReadLaneFirst` is declared so that, like
// every other WaveMask* intrinsic in this file, HLSL output drops the mask and
// calls the standard builtin rather than a function HLSL does not provide.

// Scalar overload: GLSL `subgroupBroadcastFirst`, CUDA `_waveReadFirst`.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($1)")
__target_intrinsic(cuda, "_waveReadFirst($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneFirst($1)")
T WaveMaskReadLaneFirst(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_waveReadFirstMultiple`.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($1)")
__target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneFirst($1)")
vector<T,N> WaveMaskReadLaneFirst(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneFirst($1)")
matrix<T,N,M> WaveMaskReadLaneFirst(WaveMask mask, matrix<T,N,M> expr);

// WaveMask SM6.5 like intrinsics

// TODO(JS): On HLSL it only works for 32 bits or less

// WaveMaskMatch: returns a mask of the lanes holding the same `value`.
// `$0` is `mask`, `$1` is `value`. HLSL takes the `.x` component of `WaveMatch`
// and does not forward `mask`. Every overload is tagged `__cuda_sm_version(7.0)`.
// NOTE(review): the scalar CUDA expansion selects `.x` from `_waveMatchScalar`
// while the vector/matrix expansions use `_waveMatchMultiple` without `.x`;
// confirm against the CUDA prelude that both yield a `WaveMask`.

// Scalar overload.
__generic<T : __BuiltinType>
__target_intrinsic(hlsl, "WaveMatch($1).x")
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveMatchScalar($0, $1).x")
WaveMask WaveMaskMatch(WaveMask mask, T value);
// Vector overload.
__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl, "WaveMatch($1).x")
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveMatchMultiple($0, $1)")
WaveMask WaveMaskMatch(WaveMask mask, vector<T,N> value);
// Matrix overload.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMatch($1).x")
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveMatchMultiple($0, $1)")
WaveMask WaveMaskMatch(WaveMask mask, matrix<T,N,M> value);

// WaveMaskPrefixBitAnd: exclusive prefix bitwise AND of `expr`.
// `$0` is `mask`, `$1` is `expr`. HLSL expands to `WaveMultiPrefixBitAnd`,
// packing `mask` into the `.x` component of a `uint4`; GLSL ignores `mask`.
// NOTE(review): the constraint is `__BuiltinArithmeticType` although the
// non-prefix WaveMaskBitAnd uses `__BuiltinIntegerType`; confirm intent.

// Scalar overload: GLSL `subgroupExclusiveAnd`, CUDA `_wavePrefixAnd`.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))")
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($1)")
__target_intrinsic(cuda, "_wavePrefixAnd($0, $1)")
T WaveMaskPrefixBitAnd(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_wavePrefixAndMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))")
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($1)")
__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, $1)")
vector<T,N> WaveMaskPrefixBitAnd(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
// The CUDA expansion passes `mask` and `expr` directly, matching the
// Or/Xor matrix overloads (`mask` here is already a WaveMask, not a uint4).
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))")
__target_intrinsic(cuda, "_wavePrefixAndMultiple($0, $1)")
matrix<T,N,M> WaveMaskPrefixBitAnd(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskPrefixBitOr: exclusive prefix bitwise OR of `expr`.
// `$0` is `mask`, `$1` is `expr`. HLSL expands to `WaveMultiPrefixBitOr`,
// packing `mask` into the `.x` component of a `uint4`; GLSL ignores `mask`.
// The GLSL expansion to `subgroupExclusiveOr` is enabled so that the declared
// GL_KHR_shader_subgroup_arithmetic requirement has a matching mapping, as it
// does for the And/Xor siblings.

// Scalar overload: GLSL `subgroupExclusiveOr`, CUDA `_wavePrefixOr`.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveOr($1)")
__target_intrinsic(cuda, "_wavePrefixOr($0, $1)")
T WaveMaskPrefixBitOr(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_wavePrefixOrMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveOr($1)")
__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, $1)")
vector<T,N> WaveMaskPrefixBitOr(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, $1)")
matrix<T,N,M> WaveMaskPrefixBitOr(WaveMask mask, matrix<T,N,M> expr);

// WaveMaskPrefixBitXor: exclusive prefix bitwise XOR of `expr`.
// `$0` is `mask`, `$1` is `expr`. HLSL expands to `WaveMultiPrefixBitXor`,
// packing `mask` into the `.x` component of a `uint4`; GLSL ignores `mask`.

// Scalar overload: GLSL `subgroupExclusiveXor`, CUDA `_wavePrefixXor`.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))")
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($1)")
__target_intrinsic(cuda, "_wavePrefixXor($0, $1)")
T WaveMaskPrefixBitXor(WaveMask mask, T expr);
// Vector overload: CUDA switches to `_wavePrefixXorMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))")
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($1)")
__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, $1)")
vector<T,N> WaveMaskPrefixBitXor(WaveMask mask, vector<T,N> expr);
// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))")
__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, $1)")
matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr);

// Shader model 6.0 stuff

// Information for GLSL wave/subgroup support
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt

// QuadReadLaneAt: scalar, vector and matrix overloads taking a source value and
// a `quadLaneID`. Declared without any target expansion or body here.
__generic<T : __BuiltinType> T QuadReadLaneAt(T sourceValue, uint quadLaneID);
__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID);

// QuadReadAcrossX: scalar, vector and matrix overloads taking the lane's local value.
// Declared without any target expansion or body here.
__generic<T : __BuiltinType> T QuadReadAcrossX(T localValue);
__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossX(vector<T,N> localValue);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossX(matrix<T,N,M> localValue);

// QuadReadAcrossY: scalar, vector and matrix overloads taking the lane's local value.
// Declared without any target expansion or body here.
__generic<T : __BuiltinType> T QuadReadAcrossY(T localValue);
__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossY(vector<T,N> localValue);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossY(matrix<T,N,M> localValue);

// QuadReadAcrossDiagonal: scalar, vector and matrix overloads taking the lane's local value.
// Declared without any target expansion or body here.
__generic<T : __BuiltinType> T QuadReadAcrossDiagonal(T localValue);
__generic<T : __BuiltinType, let N : int> vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue);
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);


// WaveActiveBitAnd: bitwise AND of `expr` over the active lanes.
// GLSL expands to `subgroupAnd`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskBitAnd and the active mask.

// Scalar overload.
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAnd($0)")
__target_intrinsic(hlsl)
T WaveActiveBitAnd(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitAnd(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAnd($0)")
__target_intrinsic(hlsl)
vector<T, N> WaveActiveBitAnd(vector<T, N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitAnd(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActiveBitAnd(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitAnd(activeLanes, expr);
}

// WaveActiveBitOr: bitwise OR of `expr` over the active lanes.
// GLSL expands to `subgroupOr`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskBitOr and the active mask.

// Scalar overload.
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupOr($0)")
__target_intrinsic(hlsl)
T WaveActiveBitOr(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitOr(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupOr($0)")
__target_intrinsic(hlsl)
vector<T,N> WaveActiveBitOr(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitOr(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActiveBitOr(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitOr(activeLanes, expr);
}

// WaveActiveBitXor: bitwise XOR of `expr` over the active lanes.
// GLSL expands to `subgroupXor`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskBitXor and the active mask.

// Scalar overload.
__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupXor($0)")
__target_intrinsic(hlsl)
T WaveActiveBitXor(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitXor(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupXor($0)")
__target_intrinsic(hlsl)
vector<T,N> WaveActiveBitXor(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitXor(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActiveBitXor(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBitXor(activeLanes, expr);
}

// WaveActiveMax: maximum of `expr` over the active lanes.
// GLSL expands to `subgroupMax`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskMax and the active mask.

// Scalar overload.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMax($0)")
__target_intrinsic(hlsl)
T WaveActiveMax(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskMax(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMax($0)")
__target_intrinsic(hlsl)
vector<T, N> WaveActiveMax(vector<T, N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskMax(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActiveMax(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskMax(activeLanes, expr);
}

// WaveActiveMin: minimum of `expr` over the active lanes.
// GLSL expands to `subgroupMin`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskMin and the active mask.

// Scalar overload.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMin($0)")
__target_intrinsic(hlsl)
T WaveActiveMin(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskMin(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMin($0)")
__target_intrinsic(hlsl)
vector<T, N> WaveActiveMin(vector<T, N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskMin(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActiveMin(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskMin(activeLanes, expr);
}

// WaveActiveProduct: product of `expr` over the active lanes.
// GLSL expands to `subgroupMul`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskProduct and the active mask.

// Scalar overload.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMul($0)")
__target_intrinsic(hlsl)
T WaveActiveProduct(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskProduct(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupMul($0)")
__target_intrinsic(hlsl)
vector<T,N> WaveActiveProduct(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskProduct(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActiveProduct(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskProduct(activeLanes, expr);
}

// WaveActiveSum: sum of `expr` over the active lanes.
// GLSL expands to `subgroupAdd`; `__target_intrinsic(hlsl)` carries no expansion
// string (presumably the same-named HLSL builtin is used). The body supplies
// the implementation in terms of WaveMaskSum and the active mask.

// Scalar overload.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAdd($0)")
__target_intrinsic(hlsl)
T WaveActiveSum(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskSum(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAdd($0)")
__target_intrinsic(hlsl)
vector<T,N> WaveActiveSum(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskSum(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T,N,M> WaveActiveSum(matrix<T,N,M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskSum(activeLanes, expr);
}

// WaveActiveAllEqual: true when `value` is equal over the active lanes.
// GLSL expands to `subgroupAllEqual`; `__target_intrinsic(hlsl)` carries no
// expansion string (presumably the same-named HLSL builtin is used). The body
// supplies the implementation in terms of WaveMaskAllEqual and the active mask.

// Scalar overload.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($0)")
__target_intrinsic(hlsl)
bool WaveActiveAllEqual(T value)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskAllEqual(activeLanes, value);
}

// Vector overload.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAllEqual($0)")
__target_intrinsic(hlsl)
bool WaveActiveAllEqual(vector<T,N> value)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskAllEqual(activeLanes, value);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
bool WaveActiveAllEqual(matrix<T, N, M> value)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskAllEqual(activeLanes, value);
}

// WaveActiveAllTrue: vote that `condition` holds on all active lanes.
// GLSL expands to `subgroupAll`; the body forwards to WaveMaskAllTrue with the
// active mask.
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAll($0)")
__target_intrinsic(hlsl)
bool WaveActiveAllTrue(bool condition)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskAllTrue(activeLanes, condition);
}

// WaveActiveAnyTrue: vote that `condition` holds on any active lane.
// GLSL expands to `subgroupAny`; the body forwards to WaveMaskAnyTrue with the
// active mask.
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupAny($0)")
__target_intrinsic(hlsl)
bool WaveActiveAnyTrue(bool condition)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskAnyTrue(activeLanes, condition);
}

// WaveActiveBallot: ballot of `condition`, returned as a uint4.
// GLSL expands to `subgroupBallot`; the body forwards to WaveMaskBallot with
// the active mask.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot($0)")
__target_intrinsic(hlsl)
uint4 WaveActiveBallot(bool condition)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBallot(activeLanes, condition);
}

// WaveActiveCountBits: count of `value` over the active lanes, implemented by
// forwarding to WaveMaskCountBits with the active mask. Only an HLSL intrinsic
// marker is declared; other targets use the body.
__target_intrinsic(hlsl)
uint WaveActiveCountBits(bool value)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskCountBits(activeLanes, value);
}

// WaveGetLaneCount: number of lanes in a wave.
// GLSL expands to `gl_SubgroupSize`, CUDA to `warpSize`; no HLSL expansion
// string is declared here.
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "(gl_SubgroupSize)")
__target_intrinsic(cuda, "(warpSize)")
uint WaveGetLaneCount();

// WaveGetLaneIndex: index of the calling lane within its wave.
// GLSL expands to `gl_SubgroupInvocationID`, CUDA to `_getLaneId()`; no HLSL
// expansion string is declared here.
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "(gl_SubgroupInvocationID)")
__target_intrinsic(cuda, "_getLaneId()")
uint WaveGetLaneIndex();

// WaveIsFirstLane: GLSL expands to `subgroupElect()`; the body forwards to
// WaveMaskIsFirstLane with the active mask.
__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupElect()")
__target_intrinsic(hlsl)
bool WaveIsFirstLane()
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskIsFirstLane(activeLanes);
}

// Population count over a uint4 wave value (some wave functions return uint4).
// Only as many 32-bit components as the wave's lane count requires are counted:
// `.x` always, then `.y`, `.z`, `.w` as the lane count crosses 32, 64 and 96.
uint _WaveCountBits(uint4 value)
{
    // WaveGetLaneCount should be known at compile time, so these tests will
    // hopefully fold away.
    const uint lastWord = (WaveGetLaneCount() - 1) / 32;

    uint bitTotal = countbits(value.x);

    // A word index outside 1..3 counts only `value.x`.
    if (lastWord >= 1 && lastWord <= 3)
    {
        bitTotal += countbits(value.y);
        if (lastWord >= 2)
        {
            bitTotal += countbits(value.z);
            if (lastWord == 3)
            {
                bitTotal += countbits(value.w);
            }
        }
    }
    return bitTotal;
}


// Prefix

// WavePrefixProduct: exclusive prefix product of `expr` over the active lanes.
// GLSL expands to `subgroupExclusiveMul`; `__target_intrinsic(hlsl)` carries no
// expansion string (presumably the same-named HLSL builtin is used). The body
// forwards to WaveMaskPrefixProduct with the active mask.

// Scalar overload.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveMul($0)")
__target_intrinsic(hlsl)
T WavePrefixProduct(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixProduct(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveMul($0)")
__target_intrinsic(hlsl)
vector<T,N> WavePrefixProduct(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixProduct(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WavePrefixProduct(matrix<T, N, M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixProduct(activeLanes, expr);
}

// WavePrefixSum: exclusive prefix sum of `expr` over the active lanes.
// GLSL expands to `subgroupExclusiveAdd`; `__target_intrinsic(hlsl)` carries no
// expansion string (presumably the same-named HLSL builtin is used). The body
// forwards to WaveMaskPrefixSum with the active mask.

// Scalar overload.
__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAdd($0)")
__target_intrinsic(hlsl)
T WavePrefixSum(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixSum(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAdd($0)")
__target_intrinsic(hlsl)
vector<T,N> WavePrefixSum(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixSum(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixSum(activeLanes, expr);
}

// WaveReadLaneFirst: broadcasts `expr` from the first active lane.
// GLSL expands to `subgroupBroadcastFirst`; `__target_intrinsic(hlsl)` carries
// no expansion string (presumably the same-named HLSL builtin is used). The
// body forwards to WaveMaskReadLaneFirst with the active mask.

// Scalar overload.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
__target_intrinsic(hlsl)
T WaveReadLaneFirst(T expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskReadLaneFirst(activeLanes, expr);
}

// Vector overload.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcastFirst($0)")
__target_intrinsic(hlsl)
vector<T,N> WaveReadLaneFirst(vector<T,N> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskReadLaneFirst(activeLanes, expr);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskReadLaneFirst(activeLanes, expr);
}

// NOTE! WaveBroadcastLaneAt is *NOT* standard HLSL
// It is provided as access to subgroupBroadcast which can only take a
// constexpr laneId.
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
// SPIR-V versions greater than 1.4 loosen this restriction, and allow a 'dynamically uniform' index.
// If that's the behavior required then client code should use WaveReadLaneAt which works this way.
// WaveBroadcastLaneAt: reads `value` from the lane given by the constexpr `lane`.
// GLSL expands to `subgroupBroadcast`, HLSL to `WaveReadLaneAt`; the body
// forwards to WaveMaskBroadcastLaneAt with the active mask.

// Scalar overload.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
T WaveBroadcastLaneAt(T value, constexpr int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBroadcastLaneAt(activeLanes, value, lane);
}

// Vector overload.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBroadcast($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBroadcastLaneAt(activeLanes, value, lane);
}

// Matrix overload: no GLSL expansion is declared; CUDA expands directly to
// `_waveShuffleMultiple` with `_getActiveMask()`.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
matrix<T, N, M> WaveBroadcastLaneAt(matrix<T, N, M> value, constexpr int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskBroadcastLaneAt(activeLanes, value, lane);
}

// TODO(JS): If it can be determined that the `laneId` is constexpr, then subgroupBroadcast
// could be used on GLSL. For now we just use subgroupShuffle.
// WaveReadLaneAt: reads `value` from lane `lane`.
// GLSL expands to `subgroupShuffle`; `__target_intrinsic(hlsl)` carries no
// expansion string (presumably the same-named HLSL builtin is used). The body
// forwards to WaveMaskReadLaneAt with the active mask.

// Scalar overload.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
__target_intrinsic(hlsl)
T WaveReadLaneAt(T value, int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskReadLaneAt(activeLanes, value, lane);
}

// Vector overload.
__generic<T : __BuiltinType, let N : int>
__spirv_version(1.3)
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
__target_intrinsic(hlsl)
vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskReadLaneAt(activeLanes, value, lane);
}

// Matrix overload: no GLSL expansion is declared; CUDA expands directly to
// `_waveShuffleMultiple` with `_getActiveMask()`.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl)
matrix<T, N, M> WaveReadLaneAt(matrix<T, N, M> value, int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskReadLaneAt(activeLanes, value, lane);
}

// NOTE! WaveShuffle is a NON STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL,
// which means it will only work on hardware which allows arbitrary laneIds. That is not true
// in general, because the HLSL standard requires the lane index to be 'dynamically uniform' across the Wave.
// WaveShuffle: reads `value` from lane `lane`.
// GLSL expands to `subgroupShuffle`, HLSL to `WaveReadLaneAt`; the body
// forwards to WaveMaskShuffle with the active mask.

// Scalar overload.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
T WaveShuffle(T value, int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskShuffle(activeLanes, value, lane);
}

// Vector overload.
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupShuffle($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
vector<T,N> WaveShuffle(vector<T,N> value, int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskShuffle(activeLanes, value, lane);
}

// Matrix overload: no GLSL expansion is declared.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveReadLaneAt")
matrix<T, N, M> WaveShuffle(matrix<T, N, M> value, int lane)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskShuffle(activeLanes, value, lane);
}

// WavePrefixCountBits: exclusive prefix count of lanes for which `value` is true.
// GLSL expands to `subgroupBallotExclusiveBitCount` over a ballot of `value`;
// the body forwards to WaveMaskPrefixCountBits with the active mask.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallotExclusiveBitCount(subgroupBallot($0))")
__target_intrinsic(hlsl)
uint WavePrefixCountBits(bool value)
{
    const WaveMask activeLanes = WaveGetActiveMask();
    return WaveMaskPrefixCountBits(activeLanes, value);
}

// WaveGetConvergedMulti: uint4 lane mask.
// GLSL: `subgroupBallot(true)`. HLSL: `WaveActiveBallot(true)`.
// CUDA: `__activemask()` placed in `.x` with the other components zero.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot(true)")
__target_intrinsic(cuda, "make_uint4(__activemask(), 0, 0, 0)")
__target_intrinsic(hlsl, "WaveActiveBallot(true)")
uint4 WaveGetConvergedMulti();

// WaveGetActiveMulti: uint4 lane mask.
// GLSL: `subgroupBallot(true)`. HLSL: `WaveActiveBallot(true)`.
// NOTE(review): unlike WaveGetConvergedMulti, no CUDA expansion is declared
// here; confirm whether that is intentional.
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupBallot(true)")
__target_intrinsic(hlsl, "WaveActiveBallot(true)")
uint4 WaveGetActiveMulti();

// Shader model 6.5 stuff
// https://github.com/microsoft/DirectX-Specs/blob/master/d3d/HLSL_ShaderModel6_5.md

// WaveMatch: returns, as a uint4, the mask of active lanes holding the same `value`.
// `__target_intrinsic(hlsl)` carries no expansion string (presumably the
// same-named HLSL builtin is used). The body computes the single-word mask with
// WaveMaskMatch and places it in `.x`, leaving the other components zero. This
// mirrors how the WaveMask* intrinsics build a uint4 from a mask
// (`uint4(mask, 0, 0, 0)`) and read one back (`WaveMatch(...).x`), and avoids
// the scalar mask being implicitly replicated into all four components.

// Scalar overload.
__generic<T : __BuiltinType>
__target_intrinsic(hlsl)
uint4 WaveMatch(T value)
{
    return uint4(WaveMaskMatch(WaveGetActiveMask(), value), 0, 0, 0);
}

// Vector overload.
__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl)
uint4 WaveMatch(vector<T,N> value)
{
    return uint4(WaveMaskMatch(WaveGetActiveMask(), value), 0, 0, 0);
}

// Matrix overload.
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
uint4 WaveMatch(matrix<T,N,M> value)
{
    return uint4(WaveMaskMatch(WaveGetActiveMask(), value), 0, 0, 0);
}

// WaveMultiPrefixCountBits: prefix count of lanes for which `value` is true,
// restricted by `mask`. `$0` is `value`, `$1` is `mask`.
// CUDA: population count (`__popc`, the CUDA integer intrinsic also used by
// WaveMaskPrefixCountBits) of the ballot taken with `mask.x`, masked by
// `_getLaneLtMask()`.
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())")
uint WaveMultiPrefixCountBits(bool value, uint4 mask);

// WaveMultiPrefixBitAnd: prefix bitwise AND of `expr`, partitioned by `mask`.
// `$0` is `expr`, `$1` is `mask`. The GLSL expansions do not use `mask`; the
// CUDA expansions pass `_getMultiPrefixMask(mask.x)` as the lane mask.

// Scalar overload: GLSL `subgroupExclusiveAnd`, CUDA `_wavePrefixAnd`.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
__target_intrinsic(cuda, "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitAnd(T expr, uint4 mask);

// Vector overload: CUDA switches to `_wavePrefixAndMultiple`.
// (The modifiers precede `__generic` here, unlike the sibling overloads.)
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)")
__generic<T : __BuiltinArithmeticType, let N : int>
vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask);

// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr, uint4 mask);

// WaveMultiPrefixBitOr: prefix bitwise OR of `expr`, partitioned by `mask`.
// `$0` is `expr`, `$1` is `mask`. The GLSL expansions do not use `mask`; the
// CUDA expansions pass `_getMultiPrefixMask(mask.x)` as the lane mask.
// The GLSL expansion to `subgroupExclusiveOr` is enabled so that the declared
// GL_KHR_shader_subgroup_arithmetic requirement has a matching mapping, as it
// does for the And/Xor siblings.

// Scalar overload: GLSL `subgroupExclusiveOr`, CUDA `_wavePrefixOr`.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
__target_intrinsic(cuda, "_wavePrefixOr(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitOr(T expr, uint4 mask);

// Vector overload: CUDA switches to `_wavePrefixOrMultiple`.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask);

// Matrix overload: CUDA and HLSL only; no GLSL expansion is declared.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr, uint4 mask);

// WaveMultiPrefixBitXor: scalar, vector and matrix overloads taking an
// explicit `uint4 mask`.
//
// Target mappings visible here:
//  - HLSL: target intrinsic under the same name.
//  - GLSL (scalar and vector only): `subgroupExclusiveXor(expr)`. The `mask`
//    argument is not referenced by the GLSL expansion.
//  - CUDA: `_wavePrefixXor` / `_wavePrefixXorMultiple`; only `mask.x` is
//    passed on, through `_getMultiPrefixMask`.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
__target_intrinsic(cuda, "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitXor(T expr, uint4 mask);

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask);

// Matrix overload. No GLSL mapping is declared for it.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr, uint4 mask);

// WaveMultiPrefixProduct: scalar, vector and matrix overloads taking an
// explicit `uint4 mask`.
//
// Only HLSL (same-name intrinsic) and CUDA mappings are declared. The CUDA
// expansions call `_wavePrefixProduct` / `_wavePrefixProductMultiple` with
// `_getMultiPrefixMask(mask.x)`; the other mask components are unused there.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixProduct(T value, uint4 mask);

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask);

// Matrix overload.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask);

// WaveMultiPrefixSum: scalar, vector and matrix overloads taking an
// explicit `uint4 mask`.
//
// Only HLSL (same-name intrinsic) and CUDA mappings are declared. The CUDA
// expansions call `_wavePrefixSum` / `_wavePrefixSumMultiple` with
// `_getMultiPrefixMask(mask.x)`; the other mask components are unused there.
__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixSum(T value, uint4 mask);

// Vector overload.
__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )")
vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);

// Matrix overload.
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);

// Lower-case alias for `Texture2D`.
// HLSL has been more or less case-insensitive about this type name at
// various points, so this `typedef` lets such code keep compiling.
typedef Texture2D texture2D;



${{{{

// Buffer types
//
// Meta-code: for every entry in `kBaseBufferAccessLevels` this appends the
// Slang declaration of a generic `<prefix>Buffer<T>` struct to `sb`
// (`Buffer`, `RWBuffer`, `RasterizerOrderedBuffer`).

static const struct {
    char const*         name;
    SlangResourceAccess access;
} kBaseBufferAccessLevels[] = {
    { "",                   SLANG_RESOURCE_ACCESS_READ },
    { "RW",                 SLANG_RESOURCE_ACCESS_READ_WRITE },
    { "RasterizerOrdered",  SLANG_RESOURCE_ACCESS_RASTER_ORDERED },
};
static const int kBaseBufferAccessLevelCount = sizeof(kBaseBufferAccessLevels) / sizeof(kBaseBufferAccessLevels[0]);

for (int levelIndex = 0; levelIndex < kBaseBufferAccessLevelCount; ++levelIndex)
{
    auto const& level = kBaseBufferAccessLevels[levelIndex];

    // Read-only buffers get `[__readNone]` on their loads and no setter.
    const bool readOnly = (level.access == SLANG_RESOURCE_ACCESS_READ);

    // Flavor bits are shared by the magic type and the intrinsic type opcode.
    const int flavorBits = int(TextureFlavor::create(TextureFlavor::Shape::ShapeBuffer, level.access).flavor);

    // GLSL function used for loads: `texelFetch` for read-only access,
    // `imageLoad` for everything else.
    char const* glslFetch = readOnly ? "texelFetch" : "imageLoad";

    // Emits the `[__readNone]` attribute line, but only for read-only buffers.
    auto emitReadNoneIfReadOnly = [&]()
    {
        if (readOnly)
            sb << "[__readNone]\n";
    };

    // Type header.
    sb << "__generic<T>\n"
       << "__magic_type(Texture," << flavorBits << ")\n"
       << "__intrinsic_type(" << (kIROp_TextureType + (flavorBits << kIROpMeta_OtherShift)) << ")\n"
       << "struct " << level.name << "Buffer {\n";

    // `GetDimensions` is tagged `[__readNone]` for every access level.
    sb << "[__readNone]\n"
       << "void GetDimensions(out uint dim);\n";

    // `Load(location)`: the only `Load` overload with a GLSL mapping.
    sb << "__glsl_extension(GL_EXT_samplerless_texture_functions)"
       << "__target_intrinsic(glsl, \"" << glslFetch << "($0, $1)$z\")\n";
    emitReadNoneIfReadOnly();
    sb << "T Load(int location);\n";

    // `Load(location, status)`: no target mapping is emitted for it.
    emitReadNoneIfReadOnly();
    sb << "T Load(int location, out uint status);\n";

    // Subscript: always a getter; setter and `ref` accessor only when writable.
    sb << "__subscript(uint index) -> T {\n";

    emitReadNoneIfReadOnly();
    sb << "__glsl_extension(GL_EXT_samplerless_texture_functions)"
       << "__target_intrinsic(glsl, \"" << glslFetch << "($0, int($1))$z\") get;\n";

    if (!readOnly)
    {
        sb << "__target_intrinsic(glsl, \"imageStore($0, int($1), $V2)\") [nonmutating] set;\n";
        sb << "__intrinsic_op(" << int(kIROp_ImageSubscript) << ") ref;\n";
    }

    // Close the subscript, then the struct.
    sb << "}\n"
       << "};\n";
}
}}}}


// DirectX Raytracing (DXR) Support
//
// The following is based on the experimental DXR SDK v0.09.01.
//
// Numbering follows the sections in the "D3D12 Raytracing Functional Spec" v0.09 (2018-03-12)
//

// 10.1.1 - Ray Flags

// Ray flags are plain `uint` bit masks. Every non-zero constant below is a
// distinct single bit, so flags can be combined with bitwise OR.
typedef uint RAY_FLAG;

static const RAY_FLAG RAY_FLAG_NONE                             = 0x00;
static const RAY_FLAG RAY_FLAG_FORCE_OPAQUE                     = 0x01;
static const RAY_FLAG RAY_FLAG_FORCE_NON_OPAQUE                 = 0x02;
static const RAY_FLAG RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH  = 0x04;
static const RAY_FLAG RAY_FLAG_SKIP_CLOSEST_HIT_SHADER          = 0x08;
static const RAY_FLAG RAY_FLAG_CULL_BACK_FACING_TRIANGLES       = 0x10;
static const RAY_FLAG RAY_FLAG_CULL_FRONT_FACING_TRIANGLES      = 0x20;
static const RAY_FLAG RAY_FLAG_CULL_OPAQUE                      = 0x40;
static const RAY_FLAG RAY_FLAG_CULL_NON_OPAQUE                  = 0x80;
static const RAY_FLAG RAY_FLAG_SKIP_TRIANGLES                   = 0x100;
static const RAY_FLAG RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES       = 0x200;

// 10.1.2 - Ray Description Structure

// Description of a ray: origin, direction and the [TMin, TMax] interval.
//
// The struct and each of its fields are mapped to identically named
// entities on the HLSL and CUDA targets. No GLSL mapping is declared; the
// GLSL code paths in this file unpack the fields into separate arguments.
__target_intrinsic(hlsl, RayDesc)
__target_intrinsic(cuda, RayDesc)
struct RayDesc
{
    // Ray origin.
    __target_intrinsic(hlsl, Origin)
    __target_intrinsic(cuda, Origin)
    float3 Origin;

    // Minimum ray parameter.
    __target_intrinsic(hlsl, TMin)
    __target_intrinsic(cuda, TMin)
    float  TMin;

    // Ray direction.
    __target_intrinsic(hlsl, Direction)
    __target_intrinsic(cuda, Direction)
    float3 Direction;

    // Maximum ray parameter.
    __target_intrinsic(hlsl, TMax)
    __target_intrinsic(cuda, TMax)
    float  TMax;
};

// 10.1.3 - Ray Acceleration Structure

// Opaque handle type for a ray-tracing acceleration structure.
// It has no members in Slang; the compiler recognizes it through the magic
// type and the intrinsic IR opcode attached below.
__builtin
__magic_type(RaytracingAccelerationStructureType)
__intrinsic_type($(kIROp_RaytracingAccelerationStructureType))
struct RaytracingAccelerationStructure {};

// 10.1.4 - Subobject Definitions

// TODO: We may decide to support these, but their reliance on C++ implicit
// constructor call syntax (`SomeType someVar(arg0, arg1);`) makes them
// annoying for the current Slang parsing strategy, and using global variables
// for this stuff comes across as a kludge rather than the best possible design.

// 10.1.5 - Intersection Attributes Structure

// Attribute struct for built-in triangle intersections.
// The struct and its single `barycentrics` field map to the identically
// named HLSL entities; no other target mapping is declared here.
__target_intrinsic(hlsl, BuiltInTriangleIntersectionAttributes)
struct BuiltInTriangleIntersectionAttributes
{
    __target_intrinsic(hlsl, barycentrics)
    float2 barycentrics;
};

// 10.2 Shaders

// Right now new shader stages need to be added directly to the compiler
// implementation, rather than being something that can be declared in the stdlib.

// 10.3 - Intrinsics

// 10.3.1

// HLSL-facing declaration of `CallShader`: a same-name intrinsic on HLSL.
// GLSL-based targets use the specialized definition further down.
__target_intrinsic(hlsl)
void CallShader<Payload>(uint shaderIndex, inout Payload payload);

// `executeCallableNV` / `executeCallableEXT` are the GLSL intrinsics used to
// implement `CallShader()` for GLSL-based targets. Which name is emitted
// depends on the ray-tracing extension in use (NV vs. EXT).
//
// `payloadLocation` is the integer location of the callable payload, as
// produced by `__callablePayloadLocation`.
//
__target_intrinsic(GL_NV_ray_tracing, "executeCallableNV")
__target_intrinsic(GL_EXT_ray_tracing, "executeCallableEXT")
void __executeCallable(uint shaderIndex, int payloadLocation);

// Next is the custom intrinsic that will compute the payload location
// for a type being used in a `CallShader()` call for GLSL-based targets.
//
// It expands to the special `$XC` sequence under `__glslRayTracing`, and is
// marked so that the result is always folded into its use site rather than
// being stored in a temporary.
//
__generic<Payload>
__target_intrinsic(__glslRayTracing, "$XC")
[__readNone]
[__AlwaysFoldIntoUseSiteAttribute]
int __callablePayloadLocation(__ref Payload payload);

// Now we provide a hard-coded definition of `CallShader()` for GLSL-based
// targets, which maps the generic HLSL operation into the non-generic
// GLSL equivalent.
//
// The payload is staged through a static variable tagged with
// `[__vulkanCallablePayload]`: copy in, execute the callable using that
// variable's payload location, then copy the result back out.
//
__generic<Payload>
__specialized_for_target(glsl)
void CallShader(uint shaderIndex, inout Payload payload)
{
    [__vulkanCallablePayload]
    static Payload p;

    // Copy the caller's payload into the staging variable.
    p = payload;
    __executeCallable(shaderIndex, __callablePayloadLocation(p));
    // Propagate whatever the callable shader wrote back to the caller.
    payload = p;
}

// 10.3.2
// `TraceRay` declaration for HLSL (same-name intrinsic) and CUDA, where it
// maps to `traceOptiXRay`. GLSL-based targets use the specialized
// definition further down.
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "traceOptiXRay")
void TraceRay<payload_t>(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    RayDesc                         Ray,
    inout payload_t                 Payload);

// Non-generic GLSL trace call: `traceNV` or `traceRayEXT`, depending on the
// ray-tracing extension in use. Compared with `TraceRay`, the `RayDesc` is
// unpacked into four separate arguments and the payload is identified by
// an integer location instead of being passed by reference.
__target_intrinsic(GL_NV_ray_tracing, "traceNV")
__target_intrinsic(GL_EXT_ray_tracing, "traceRayEXT")
void __traceRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    float3                          Origin,
    float                           TMin,
    float3                          Direction,
    float                           TMax,
    int                             PayloadLocation);

// TODO: Slang's parsing logic currently puts modifiers on
// the `GenericDecl` rather than the inner decl when
// using our default syntax, which seems wrong. We need
// to fix this, but for now using the expanded `__generic`
// syntax works in a pinch.
//
// Computes the ray payload location for `payload` on GLSL-based targets.
// It expands to the special `$XP` sequence under `__glslRayTracing`, and is
// marked so that the result is always folded into its use site.
//
__generic<Payload>
__target_intrinsic(__glslRayTracing, "$XP")
[__readNone]
[__AlwaysFoldIntoUseSiteAttribute]
int __rayPayloadLocation(__ref Payload payload);

// GLSL-specialized definition of `TraceRay`.
//
// The payload is staged through a static variable tagged with
// `[__vulkanRayPayload]`: copy in, call `__traceRay` with the `RayDesc`
// fields unpacked and that variable's payload location, then copy back out.
__generic<payload_t>
__specialized_for_target(glsl)
void TraceRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    RayDesc                         Ray,
    inout payload_t                 Payload)
{
    [__vulkanRayPayload]
    static payload_t p;

    // Copy the caller's payload into the staging variable.
    p = Payload;
    __traceRay(
        AccelerationStructure,
        RayFlags,
        InstanceInclusionMask,
        RayContributionToHitGroupIndex,
        MultiplierForGeometryContributionToHitGroupIndex,
        MissShaderIndex,
        Ray.Origin,
        Ray.TMin,
        Ray.Direction,
        Ray.TMax,
        __rayPayloadLocation(p));
    // Propagate whatever the invoked shaders wrote back to the caller.
    Payload = p;
}


// NOTE!
// The name of the following functions may change when DXR supports
// a feature similar to the `GL_NV_ray_tracing_motion_blur` extension
//
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GLSL_NV_ray_tracing_motion_blur.txt

// Same parameter list as `TraceRay` plus a `CurrentTime` value. No target
// intrinsic is attached to this declaration; the only implementation
// visible in this section is the GLSL-specialized definition further down.
void TraceMotionRay<payload_t>(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    RayDesc                         Ray,
    float                           CurrentTime,
    inout payload_t                 Payload);

// Non-generic GLSL motion-blur trace call, emitted as `traceRayMotionNV`.
// Requires GLSL 460 plus both the `GL_NV_ray_tracing_motion_blur` and
// `GL_EXT_ray_tracing` extensions. The `RayDesc` is passed unpacked and the
// payload is identified by an integer location.
__target_intrinsic(glsl, "traceRayMotionNV")
__glsl_version(460)
__glsl_extension(GL_NV_ray_tracing_motion_blur)
__glsl_extension(GL_EXT_ray_tracing)
void __traceMotionRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    float3                          Origin,
    float                           TMin,
    float3                          Direction,
    float                           TMax,
    float                           CurrentTime,
    int                             PayloadLocation);

// GLSL-specialized definition of `TraceMotionRay`.
//
// Mirrors the GLSL `TraceRay` definition: the payload is staged through a
// static variable tagged with `[__vulkanRayPayload]`, and `CurrentTime` is
// forwarded to `__traceMotionRay` just before the payload location.
__generic<payload_t>
__specialized_for_target(glsl)
void TraceMotionRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    RayDesc                         Ray,
    float                           CurrentTime,
    inout payload_t                 Payload)
{
    [__vulkanRayPayload]
    static payload_t p;

    // Copy the caller's payload into the staging variable.
    p = Payload;
    __traceMotionRay(
        AccelerationStructure,
        RayFlags,
        InstanceInclusionMask,
        RayContributionToHitGroupIndex,
        MultiplierForGeometryContributionToHitGroupIndex,
        MissShaderIndex,
        Ray.Origin,
        Ray.TMin,
        Ray.Direction,
        Ray.TMax,
        CurrentTime,
        __rayPayloadLocation(p));
    // Propagate whatever the invoked shaders wrote back to the caller.
    Payload = p;
}

// 10.3.3
// HLSL-facing declaration of `ReportHit`: a same-name intrinsic on HLSL.
// GLSL-based targets use the specialized definition further down.
__target_intrinsic(hlsl)
bool ReportHit<A>(float tHit, uint hitKind, A attributes);

// GLSL intersection report: `reportIntersectionNV` or
// `reportIntersectionEXT`, depending on the ray-tracing extension in use.
// Unlike `ReportHit`, it takes no attributes argument.
__target_intrinsic(GL_NV_ray_tracing, "reportIntersectionNV")
__target_intrinsic(GL_EXT_ray_tracing, "reportIntersectionEXT")
bool __reportIntersection(float tHit, uint hitKind);

// GLSL-specialized definition of `ReportHit`.
//
// The attributes are written to a static variable tagged with
// `[__vulkanHitAttributes]` before `__reportIntersection` is called, whose
// result is returned unchanged. Nothing is copied back afterwards.
__generic<A>
__specialized_for_target(glsl)
bool ReportHit(float tHit, uint hitKind, A attributes)
{
    [__vulkanHitAttributes]
    static A a;

    // Publish the attributes before reporting the intersection.
    a = attributes;
    return __reportIntersection(tHit, hitKind);
}

// 10.3.4
// `IgnoreHit` target mappings:
//  - HLSL: same-name intrinsic.
//  - GLSL NV: `ignoreIntersectionNV` (given as an identifier).
//  - GLSL EXT: the literal text `ignoreIntersectionEXT;`. Presumably the
//    EXT form is a statement rather than a function call, hence the
//    explicit `;` in the expansion.
//  - CUDA: `optixIgnoreIntersection`.
__target_intrinsic(hlsl)
__target_intrinsic(GL_NV_ray_tracing, ignoreIntersectionNV)
__target_intrinsic(GL_EXT_ray_tracing, "ignoreIntersectionEXT;")
__target_intrinsic(cuda, "optixIgnoreIntersection")
void IgnoreHit();

// 10.3.5
// `AcceptHitAndEndSearch` target mappings:
//  - HLSL: same-name intrinsic.
//  - GLSL NV: `terminateRayNV` (given as an identifier).
//  - GLSL EXT: the literal text `terminateRayEXT;`. Presumably the EXT form
//    is a statement rather than a function call, hence the explicit `;`.
//  - CUDA: `optixTerminateRay`.
__target_intrinsic(hlsl)
__target_intrinsic(GL_NV_ray_tracing, terminateRayNV)
__target_intrinsic(GL_EXT_ray_tracing, "terminateRayEXT;")
__target_intrinsic(cuda, "optixTerminateRay")
void AcceptHitAndEndSearch();

// 10.4 - System Values and Special Semantics

// TODO: Many of these functions need to be restricted so that
// they can only be accessed from specific stages.

// 10.4.1 - Ray Dispatch System Values

// Maps to `gl_LaunchIDNV` / `gl_LaunchIDEXT` on GLSL and to
// `optixGetLaunchIndex` on CUDA. No explicit HLSL mapping is declared.
__target_intrinsic(GL_NV_ray_tracing, "(gl_LaunchIDNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_LaunchIDEXT)")
__target_intrinsic(cuda, "optixGetLaunchIndex")
uint3 DispatchRaysIndex();

// Maps to `gl_LaunchSizeNV` / `gl_LaunchSizeEXT` on GLSL and to
// `optixGetLaunchDimensions` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_LaunchSizeNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_LaunchSizeEXT)")
__target_intrinsic(cuda, "optixGetLaunchDimensions")
uint3 DispatchRaysDimensions();

// 10.4.2 - Ray System Values

// Maps to `gl_WorldRayOriginNV` / `gl_WorldRayOriginEXT` on GLSL and to
// `optixGetWorldRayOrigin` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_WorldRayOriginNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_WorldRayOriginEXT)")
__target_intrinsic(cuda, "optixGetWorldRayOrigin")
float3 WorldRayOrigin();

// Maps to `gl_WorldRayDirectionNV` / `gl_WorldRayDirectionEXT` on GLSL and
// to `optixGetWorldRayDirection` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_WorldRayDirectionNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_WorldRayDirectionEXT)")
__target_intrinsic(cuda, "optixGetWorldRayDirection")
float3 WorldRayDirection();

// Maps to `gl_RayTminNV` / `gl_RayTminEXT` on GLSL and to `optixGetRayTmin`
// on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_RayTminNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_RayTminEXT)")
__target_intrinsic(cuda, "optixGetRayTmin")
float RayTMin();

// Note: The `RayTCurrent()` intrinsic should translate to
// either `gl_HitTNV` (for hit shaders) or `gl_RayTmaxNV`
// (for intersection shaders). Right now we are handling this
// during code emission, for simplicity.
//
// TODO: Once the compiler supports a more refined concept
// of profiles/capabilities and overloading based on them,
// we should simply provide two overloads here, specialized
// to the appropriate Vulkan stages.
//
// The mappings declared here are therefore only the `gl_RayTmax*` forms
// (plus `optixGetRayTmax` on CUDA).
//
__target_intrinsic(GL_NV_ray_tracing, "(gl_RayTmaxNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_RayTmaxEXT)")
__target_intrinsic(cuda, "optixGetRayTmax")
float RayTCurrent();

// Maps to `gl_IncomingRayFlagsNV` / `gl_IncomingRayFlagsEXT` on GLSL and to
// `optixGetRayFlags` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_IncomingRayFlagsNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_IncomingRayFlagsEXT)")
__target_intrinsic(cuda, "optixGetRayFlags")
uint RayFlags();

// 10.4.3 - Primitive/Object Space System Values

// Maps to `gl_InstanceID` for any GLSL ray-tracing flavor and to
// `optixGetInstanceIndex` on CUDA. Not to be confused with `InstanceID()`,
// which maps to the custom instance index.
__target_intrinsic(__glslRayTracing, "(gl_InstanceID)")
__target_intrinsic(cuda, "optixGetInstanceIndex")
uint InstanceIndex();

// Maps to `gl_InstanceCustomIndexNV` / `gl_InstanceCustomIndexEXT` on GLSL
// and to `optixGetInstanceId` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_InstanceCustomIndexNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_InstanceCustomIndexEXT)")
__target_intrinsic(cuda, "optixGetInstanceId")
uint InstanceID();

// Maps to `gl_PrimitiveID` for any GLSL ray-tracing flavor and to
// `optixGetPrimitiveIndex` on CUDA.
__target_intrinsic(__glslRayTracing, "(gl_PrimitiveID)")
__target_intrinsic(cuda, "optixGetPrimitiveIndex")
uint PrimitiveIndex();

// Maps to `gl_ObjectRayOriginNV` / `gl_ObjectRayOriginEXT` on GLSL and to
// `optixGetObjectRayOrigin` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_ObjectRayOriginNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_ObjectRayOriginEXT)")
__target_intrinsic(cuda, "optixGetObjectRayOrigin")
float3 ObjectRayOrigin();

// Maps to `gl_ObjectRayDirectionNV` / `gl_ObjectRayDirectionEXT` on GLSL and
// to `optixGetObjectRayDirection` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_ObjectRayDirectionNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_ObjectRayDirectionEXT)")
__target_intrinsic(cuda, "optixGetObjectRayDirection")
float3 ObjectRayDirection();

// TODO: optix has an optixGetObjectToWorldTransformMatrix function that returns 12
// floats by reference.
//
// On GLSL the 3x4 form is the transpose of the `gl_ObjectToWorld*` built-in.
// No CUDA mapping is declared (see the TODO above).
__target_intrinsic(GL_NV_ray_tracing, "transpose(gl_ObjectToWorldNV)")
__target_intrinsic(GL_EXT_ray_tracing, "transpose(gl_ObjectToWorldEXT)")
float3x4 ObjectToWorld3x4();

// On GLSL the 3x4 form is the transpose of the `gl_WorldToObject*` built-in.
// No CUDA mapping is declared.
__target_intrinsic(GL_NV_ray_tracing, "transpose(gl_WorldToObjectNV)")
__target_intrinsic(GL_EXT_ray_tracing, "transpose(gl_WorldToObjectEXT)")
float3x4 WorldToObject3x4();

// On GLSL the 4x3 form is the `gl_ObjectToWorld*` built-in used directly,
// i.e. the same built-in that `ObjectToWorld3x4()` transposes.
//
// The EXT mapping must use `gl_ObjectToWorldEXT` (the counterpart of
// `gl_ObjectToWorldNV`), not `gl_ObjectToWorld3x4EXT`, which is the
// already-transposed matrix and has the wrong shape for this return type.
// No CUDA mapping is declared.
__target_intrinsic(GL_NV_ray_tracing, "(gl_ObjectToWorldNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_ObjectToWorldEXT)")
float4x3 ObjectToWorld4x3();

// On GLSL the 4x3 form is the `gl_WorldToObject*` built-in used directly,
// i.e. the same built-in that `WorldToObject3x4()` transposes.
//
// The EXT mapping must use `gl_WorldToObjectEXT` (the counterpart of
// `gl_WorldToObjectNV`), not `gl_WorldToObject3x4EXT`, which is the
// already-transposed matrix and has the wrong shape for this return type.
// No CUDA mapping is declared.
__target_intrinsic(GL_NV_ray_tracing, "(gl_WorldToObjectNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_WorldToObjectEXT)")
float4x3 WorldToObject4x3();

// NOTE!
// The name of the following functions may change when DXR supports
// a feature similar to the `GL_NV_ray_tracing_motion_blur` extension

// Maps to `gl_CurrentRayTimeNV` on GLSL. Requires GLSL 460 plus both the
// `GL_NV_ray_tracing_motion_blur` and `GL_EXT_ray_tracing` extensions.
// No mapping for other targets is declared here.
__target_intrinsic(glsl, "(gl_CurrentRayTimeNV)")
__glsl_version(460)
__glsl_extension(GL_NV_ray_tracing_motion_blur)
__glsl_extension(GL_EXT_ray_tracing)
float RayCurrentTime();

// Note: The provisional DXR spec included these unadorned
// `ObjectToWorld()` and `WorldToObject()` functions, so
// we will forward them to the new names as a convenience
// for users who are porting their code.
//
// TODO: Should we provide a deprecation warning on these
// declarations, so that users can know they aren't coding
// against the final spec?
//
// Legacy spelling from the provisional DXR spec; simply returns
// `ObjectToWorld3x4()`.
float3x4 ObjectToWorld()
{
    return ObjectToWorld3x4();
}
// Legacy spelling from the provisional DXR spec; simply returns
// `WorldToObject3x4()`.
float3x4 WorldToObject()
{
    return WorldToObject3x4();
}

// 10.4.4 - Hit Specific System values
// `HitKind` maps to `gl_HitKindNV` / `gl_HitKindEXT` on GLSL and to
// `optixGetHitKind` on CUDA.
__target_intrinsic(GL_NV_ray_tracing, "(gl_HitKindNV)")
__target_intrinsic(GL_EXT_ray_tracing, "(gl_HitKindEXT)")
__target_intrinsic(cuda, "optixGetHitKind")
uint HitKind();

// Pre-defined hit kinds (not documented explicitly)
// These are `uint` values of the same type that `HitKind()` returns.
static const uint HIT_KIND_TRIANGLE_FRONT_FACE  = 254;
static const uint HIT_KIND_TRIANGLE_BACK_FACE   = 255;

//
// Shader Model 6.4
//

// Treats `left` and `right` as 4-component vectors of `UInt8` and computes `dot(left, right) + acc`
// No target-specific mapping is attached to this declaration.
uint dot4add_u8packed(uint left, uint right, uint acc);

// Treats `left` and `right` as 4-component vectors of `Int8` and computes `dot(left, right) + acc`
// Note that the packed operands are `uint` while `acc` and the result are `int`.
// No target-specific mapping is attached to this declaration.
int dot4add_i8packed(uint left, uint right, int acc);

// Computes `dot(left, right) + acc`.
//
// May not produce infinities or NaNs for intermediate results that overflow the range of `half`
//
// NOTE(review): the operands are declared as `float2` here even though the
// note above talks about the range of `half` -- confirm this is intended.
float dot2add(float2 left, float2 right, float acc);

//
// Shader Model 6.5
//

//
// Mesh Shaders
//

// Set the number of output vertices and primitives for a mesh shader invocation.
// On GLSL this maps to `SetMeshOutputsEXT` and requires the
// `GL_EXT_mesh_shader` extension and GLSL 450.
__target_intrinsic(glsl, "SetMeshOutputsEXT")
__glsl_extension(GL_EXT_mesh_shader)
__glsl_version(450)
void SetMeshOutputCounts(uint vertexCount, uint primitiveCount);

// Specify the number of downstream mesh shader thread groups to invoke from an amplification shader,
// and provide the values for per-mesh payload parameters.
//
// No target-specific mapping is attached to this declaration.
//
void DispatchMesh<P>(uint threadGroupCountX, uint threadGroupCountY, uint threadGroupCountZ, P meshPayload);

//
// "Sampler feedback" types `FeedbackTexture2D` and `FeedbackTexture2DArray`.
//

// https://microsoft.github.io/DirectX-Specs/d3d/SamplerFeedback.html

// The docs describe these as 'types' but their syntax makes them seem enum like, and enum is a simpler way to implement them
// But slang enums are always 'enum class like', so I use an empty struct type here

// Marker interface implemented by the sampler-feedback "kind" structs below.
// It is used as the generic constraint on `FeedbackTexture2D` and
// `FeedbackTexture2DArray`.
[sealed]
[builtin]
interface __BuiltinSamplerFeedbackType {};

// Empty tag type selecting the `MinMip` feedback kind; emitted under the
// same name on HLSL.
[sealed]
__magic_type(FeedbackType, $(int(FeedbackType::Kind::MinMip)))
__target_intrinsic(hlsl, SAMPLER_FEEDBACK_MIN_MIP)
struct SAMPLER_FEEDBACK_MIN_MIP : __BuiltinSamplerFeedbackType {};

// Empty tag type selecting the `MipRegionUsed` feedback kind; emitted under
// the same name on HLSL.
[sealed]
__magic_type(FeedbackType, $(int(FeedbackType::Kind::MipRegionUsed)))
__target_intrinsic(hlsl, SAMPLER_FEEDBACK_MIP_REGION_USED)
struct SAMPLER_FEEDBACK_MIP_REGION_USED : __BuiltinSamplerFeedbackType {};

// All of these objects are write-only resources that point to a special kind of unordered access view meant for sampler feedback.

// Calculate the flavor constants
${{{{
// Texture flavor values for the two feedback texture types declared below.
// Both use the 2D shape with write access and the feedback flag; the array
// variant additionally sets the array flag.
static const int feedbackTexture2DFlavor = int(TextureFlavor::create(TextureFlavor::Shape::Shape2D, SLANG_RESOURCE_ACCESS_WRITE, SLANG_TEXTURE_FEEDBACK_FLAG).flavor);
static const int feedbackTexture2DArrayFlavor = int(TextureFlavor::create(TextureFlavor::Shape::Shape2D, SLANG_RESOURCE_ACCESS_WRITE, SLANG_TEXTURE_FEEDBACK_FLAG | SLANG_TEXTURE_ARRAY_FLAG).flavor);
}}}}

// Sampler-feedback texture for 2D textures. `T` selects the feedback kind
// (one of the `__BuiltinSamplerFeedbackType` tag structs).
//
// The `WriteSamplerFeedback*` methods are mapped for HLSL and C++ only;
// each expansion is a member call on `this` (`$0`) that forwards the
// arguments in declaration order.
__magic_type(Texture, $(feedbackTexture2DFlavor))
__intrinsic_type($(kIROp_TextureType + (feedbackTexture2DFlavor << kIROpMeta_OtherShift)))
struct FeedbackTexture2D<T : __BuiltinSamplerFeedbackType>
{
    // Dimension queries, in `uint` and `float` flavors, with and without an
    // explicit mip level.
    __target_intrinsic
    void GetDimensions(out uint width, out uint height);

    __target_intrinsic
    void GetDimensions(uint mipLevel, out uint width, out uint height, out uint numberOfLevels);

    __target_intrinsic
    void GetDimensions(out float width,out float height);

    __target_intrinsic
    void GetDimensions(uint mipLevel, out float width,out float height, out float numberOfLevels);

    // With Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    void WriteSamplerFeedback<S>(Texture2D<S> tex, SamplerState samp, float2 location, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackBias<S>(Texture2D<S> tex, SamplerState samp, float2 location, float bias, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    void WriteSamplerFeedbackGrad<S>(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy, float clamp);

    // Level

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    void WriteSamplerFeedbackLevel<S>(Texture2D<S> tex, SamplerState samp, float2 location, float lod);

    // Without Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3)")
    void WriteSamplerFeedback<S>(Texture2D<S> tex, SamplerState samp, float2 location);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    void WriteSamplerFeedbackBias<S>(Texture2D<S> tex, SamplerState samp, float2 location, float bias);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackGrad<S>(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy);
};



// Sampler-feedback texture for 2D texture arrays. `T` selects the feedback
// kind (one of the `__BuiltinSamplerFeedbackType` tag structs).
//
// This mirrors `FeedbackTexture2D`, except that it pairs with
// `Texture2DArray`, locations and gradients are `float3`, and
// `GetDimensions` also reports the element count. The
// `WriteSamplerFeedback*` methods are mapped for HLSL and C++ only.
__magic_type(Texture, $(feedbackTexture2DArrayFlavor))
__intrinsic_type($(kIROp_TextureType + (feedbackTexture2DArrayFlavor << kIROpMeta_OtherShift)))
struct FeedbackTexture2DArray<T : __BuiltinSamplerFeedbackType>
{
    // Dimension queries, in `uint` and `float` flavors, with and without an
    // explicit mip level.
    __target_intrinsic
    void GetDimensions(out uint width,out uint height, out uint elements);

    __target_intrinsic
    void GetDimensions(uint mipLevel, out uint width,out uint height, out uint elements, out uint numberOfLevels);

    __target_intrinsic
    void GetDimensions(out float width,out float height, out float elements);

    __target_intrinsic
    void GetDimensions(uint mipLevel, out float width,out float height, out float elements, out float numberOfLevels);

    // With Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    void WriteSamplerFeedback<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackBias<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    void WriteSamplerFeedbackGrad<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy, float clamp);

    // Level

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    void WriteSamplerFeedbackLevel<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float lod);

    // Without Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3)")
    void WriteSamplerFeedback<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    void WriteSamplerFeedbackBias<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackGrad<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy);
};

//
// DXR 1.1 and `TraceRayInline` support
//

// Get the index of the geometry that was hit in an intersection, any-hit, or closest-hit shader
// Only a `GL_EXT_ray_tracing` mapping (`gl_GeometryIndexEXT`) is declared
// here; there is no NV or CUDA mapping.
__target_intrinsic(GL_EXT_ray_tracing, "(gl_GeometryIndexEXT)")
uint GeometryIndex();

// Status of whether a (closest) hit has been committed in a `RayQuery`.
// Represented as a plain `uint`; the valid values are the constants below.
typedef uint COMMITTED_STATUS;

// No hit committed.
static const COMMITTED_STATUS COMMITTED_NOTHING = 0;

// Closest hit is a triangle.
//
// This could be an opaque triangle hit found by the fixed-function
// traversal and intersection implementation, or a non-opaque
// triangle hit committed by user code with `RayQuery.CommitNonOpaqueTriangleHit`
//
static const COMMITTED_STATUS COMMITTED_TRIANGLE_HIT = 1;

// Closest hit is a procedural primitive.
//
// A procedural hit primitive is committed using `RayQuery.CommitProceduralPrimitiveHit`.
static const COMMITTED_STATUS COMMITTED_PROCEDURAL_PRIMITIVE_HIT = 2;

// Type of candidate hit that a `RayQuery` is pausing at.
//
// A `RayQuery` can automatically commit hits with opaque triangles,
// but yields to user code for other hits to allow them to be
// dismissed or committed.
//
// Represented as a plain `uint`; the valid values are the constants below.
//
typedef uint CANDIDATE_TYPE;

// Candidate hit is a non-opaque triangle.
static const CANDIDATE_TYPE CANDIDATE_NON_OPAQUE_TRIANGLE = 0;

// Candidate hit is a procedural primitive.
static const CANDIDATE_TYPE CANDIDATE_PROCEDURAL_PRIMITIVE = 1;

// Handle to state of an in-progress ray-tracing query.
//
// The ray query is effectively a coroutine that user shader
// code can resume to continue tracing the ray, and which yields
// back to the user code at interesting events along the ray.
//
// The `rayFlagsGeneric` parameter supplies ray flags that are baked
// into the type; see `TraceRayInline` for how they are combined with
// the per-call flags.
//
// For GLSL the type maps to `rayQueryEXT` and the methods map to the
// `rayQuery*EXT` functions of `GL_EXT_ray_query`. Where such a function
// takes a trailing boolean, `true` selects the committed intersection
// and `false` selects the current candidate.
//
__target_intrinsic(hlsl, RayQuery)
__target_intrinsic(glsl, rayQueryEXT)
__glsl_extension(GL_EXT_ray_query)
__glsl_version(460)
struct RayQuery <let rayFlagsGeneric : RAY_FLAG = RAY_FLAG_NONE>
{
    // Initialize the query object in a "fresh" state.
    //
    __intrinsic_op($(kIROp_DefaultConstruct))
    __init();

    // Initialize a ray-tracing query.
    //
    // This method may be called on a "fresh" ray query, or
    // on one that is already tracing a ray. In the latter
    // case any state related to the ray previously being
    // traced is overwritten.
    //
    // The `rayFlags` here will be bitwise ORed with
    // the `rayFlags` passed as a generic argument to
    // `RayQuery` to get the effective ray flags, which
    // must obey any API-imposed restrictions.
    //
    __target_intrinsic(hlsl)
    void TraceRayInline(
        RaytracingAccelerationStructure accelerationStructure,
        RAY_FLAG                        rayFlags,
        uint                            instanceInclusionMask,
        RayDesc                         ray);

    // GLSL helper behind `TraceRayInline`.
    //
    // Unlike the HLSL entry point it takes the ray as separate
    // origin/tMin/direction/tMax values, and the flags it receives
    // must already include `rayFlagsGeneric`.
    //
    __target_intrinsic(glsl, "rayQueryInitializeEXT($0, $1, $2, $3, $4, $5, $6, $7)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    void __rayQueryInitializeEXT(
        RaytracingAccelerationStructure accelerationStructure,
        RAY_FLAG                        rayFlags,
        uint                            instanceInclusionMask,
        float3                          origin,
        float                           tMin,
        float3                          direction,
        float                           tMax);

    // GLSL implementation of `TraceRayInline`: ORs in the generic
    // ray flags and unpacks `ray` for `__rayQueryInitializeEXT`.
    //
    [__unsafeForceInlineEarly]
    __specialized_for_target(glsl)
    void TraceRayInline(
        RaytracingAccelerationStructure accelerationStructure,
        RAY_FLAG                        rayFlags,
        uint                            instanceInclusionMask,
        RayDesc                         ray)
    {
        __rayQueryInitializeEXT(
            accelerationStructure,
            rayFlags | rayFlagsGeneric,
            instanceInclusionMask,
            ray.Origin,
            ray.TMin,
            ray.Direction,
            ray.TMax);
    }

    // Resume the ray query coroutine.
    //
    // If the coroutine suspends because of encountering
    // a candidate hit that cannot be resolved with fixed-function
    // logic, this function returns `true`, and the `Candidate*()`
    // functions should be used by application code to resolve
    // the candidate hit (by either committing or ignoring it).
    //
    // If the coroutine terminates because traversal is
    // complete (or has been aborted), this function returns
    // `false`, and application code should use the `Committed*()`
    // functions to appropriately handle the closest hit (if any)
    // that was found.
    //
    __target_intrinsic(glsl, rayQueryProceedEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    bool Proceed();

    // Causes the ray query to terminate.
    //
    // This function causes the ray query to act as if
    // traversal has terminated, so that subsequent
    // `Proceed()` calls will return `false`.
    //
    __target_intrinsic(glsl, rayQueryTerminateEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    void Abort();

    // Get the type of candidate hit being considered.
    //
    // The ray query coroutine will suspend when it encounters
    // a hit that cannot be resolved with fixed-function logic
    // (either a non-opaque triangle or a procedural primitive).
    // In either of those cases, `CandidateType()` will return
    // the kind of candidate hit that must be resolved by
    // user code.
    //
    __target_intrinsic(glsl, "rayQueryGetIntersectionTypeEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    CANDIDATE_TYPE CandidateType();

    // Access properties of a candidate hit.
    //
    // The `3x4` matrix accessors transpose the result of the GLSL
    // query, while the `4x3` accessors return it unchanged.

    __target_intrinsic(glsl, "transpose(rayQueryGetIntersectionObjectToWorldEXT($0, false))")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3x4 CandidateObjectToWorld3x4();

    __target_intrinsic(glsl, "rayQueryGetIntersectionObjectToWorldEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float4x3 CandidateObjectToWorld4x3();

    __target_intrinsic(glsl, "transpose(rayQueryGetIntersectionWorldToObjectEXT($0, false))")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3x4 CandidateWorldToObject3x4();

    __target_intrinsic(glsl, "rayQueryGetIntersectionWorldToObjectEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float4x3 CandidateWorldToObject4x3();

    // Note the naming difference between the two APIs: the
    // `InstanceIndex` accessor maps to the GLSL `InstanceId` query,
    // and the `InstanceID` accessor maps to the GLSL
    // `InstanceCustomIndex` query.

    __target_intrinsic(glsl, "rayQueryGetIntersectionInstanceIdEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CandidateInstanceIndex();

    __target_intrinsic(glsl, "rayQueryGetIntersectionInstanceCustomIndexEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CandidateInstanceID();

    __target_intrinsic(glsl, "rayQueryGetIntersectionGeometryIndexEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CandidateGeometryIndex();

    __target_intrinsic(glsl, "rayQueryGetIntersectionPrimitiveIndexEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CandidatePrimitiveIndex();

    __target_intrinsic(glsl, "rayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CandidateInstanceContributionToHitGroupIndex();

    // Access properties of the ray being traced
    // in the object space of a candidate hit.

    __target_intrinsic(glsl, "rayQueryGetIntersectionObjectRayOriginEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3 CandidateObjectRayOrigin();

    __target_intrinsic(glsl, "rayQueryGetIntersectionObjectRayDirectionEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3 CandidateObjectRayDirection();

    // Access properties of a candidate procedural primitive hit.
    //
    // Returns `true` if the candidate procedural primitive is
    // non-opaque. The GLSL query used here takes only the ray query
    // (it has no committed/candidate selector) and reports whether
    // the candidate AABB *is* opaque, so its result is negated to
    // match the meaning of this method's name.

    __target_intrinsic(glsl, "(!rayQueryGetIntersectionCandidateAABBOpaqueEXT($0))")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    bool CandidateProceduralPrimitiveNonOpaque();

    // Access properties of a candidate non-opaque triangle hit.

    __target_intrinsic(glsl, "rayQueryGetIntersectionFrontFaceEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    bool CandidateTriangleFrontFace();

    __target_intrinsic(glsl, "rayQueryGetIntersectionBarycentricsEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float2 CandidateTriangleBarycentrics();

    __target_intrinsic(glsl, "rayQueryGetIntersectionTEXT($0, false)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float CandidateTriangleRayT();

    // Commit the current non-opaque triangle hit.
    __target_intrinsic(glsl, rayQueryConfirmIntersectionEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    void CommitNonOpaqueTriangleHit();

    // Commit the current procedural primitive hit, with hit time `t`.
    __target_intrinsic(glsl, rayQueryGenerateIntersectionEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    void CommitProceduralPrimitiveHit(float t);

    // Get the status of the committed (closest) hit, if any.
    __target_intrinsic(glsl, "rayQueryGetIntersectionTypeEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    COMMITTED_STATUS CommittedStatus();

    // Access properties of the committed hit.
    //
    // These mirror the `Candidate*()` accessors above, but select
    // the committed intersection (`true`) in the GLSL query.
    //
    __target_intrinsic(glsl, "transpose(rayQueryGetIntersectionObjectToWorldEXT($0, true))")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3x4 CommittedObjectToWorld3x4();

    __target_intrinsic(glsl, "rayQueryGetIntersectionObjectToWorldEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float4x3 CommittedObjectToWorld4x3();

    __target_intrinsic(glsl, "transpose(rayQueryGetIntersectionWorldToObjectEXT($0, true))")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3x4 CommittedWorldToObject3x4();

    __target_intrinsic(glsl, "rayQueryGetIntersectionWorldToObjectEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float4x3 CommittedWorldToObject4x3();

    __target_intrinsic(glsl, "rayQueryGetIntersectionTEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float CommittedRayT();

    __target_intrinsic(glsl, "rayQueryGetIntersectionInstanceIdEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CommittedInstanceIndex();

    __target_intrinsic(glsl, "rayQueryGetIntersectionInstanceCustomIndexEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CommittedInstanceID();

    __target_intrinsic(glsl, "rayQueryGetIntersectionGeometryIndexEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CommittedGeometryIndex();

    __target_intrinsic(glsl, "rayQueryGetIntersectionPrimitiveIndexEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CommittedPrimitiveIndex();

    __target_intrinsic(glsl, "rayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint CommittedInstanceContributionToHitGroupIndex();

    // Access properties of the ray being traced
    // in the object space of a committed hit.

    __target_intrinsic(glsl, "rayQueryGetIntersectionObjectRayOriginEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3 CommittedObjectRayOrigin();

    __target_intrinsic(glsl, "rayQueryGetIntersectionObjectRayDirectionEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3 CommittedObjectRayDirection();

    // Access properties of a committed triangle hit.

    __target_intrinsic(glsl, "rayQueryGetIntersectionFrontFaceEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    bool CommittedTriangleFrontFace();

    __target_intrinsic(glsl, "rayQueryGetIntersectionBarycentricsEXT($0, true)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float2 CommittedTriangleBarycentrics();

    // Access properties of the ray being traced.

    __target_intrinsic(glsl, rayQueryGetRayFlagsEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    uint RayFlags();

    __target_intrinsic(glsl, rayQueryGetWorldRayOriginEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3 WorldRayOrigin();

    __target_intrinsic(glsl, rayQueryGetWorldRayDirectionEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float3 WorldRayDirection();

    __target_intrinsic(glsl, rayQueryGetRayTMinEXT)
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    float RayTMin();
}

//
// Vulkan/SPIR-V specific features
//

// Generic type exposing a single `SubpassLoad()` that yields a `T`.
//
// NOTE(review): presumably represents a Vulkan subpass input attachment.
// No target intrinsic is declared here, so confirm how it is lowered.
struct VkSubpassInput<T>
{
    T SubpassLoad();
}

// Variant of `VkSubpassInput` whose `SubpassLoad` takes a `sampleIndex`.
//
// NOTE(review): presumably the multisampled subpass input; `sampleIndex`
// would then select the sample to read. Confirm against the lowering code.
struct VkSubpassInputMS<T>
{
    T SubpassLoad(int sampleIndex);
}


///
/// Shader Execution Reordering (SER)
///
/// NOTE! This API is currently experimental and may change in the future as SER is made available
/// in different APIs and downstream compilers.
///
/// Based on the NVAPI on D3D12 only currently.
///
/// White paper on SER on NVAPI https://developer.nvidia.com/sites/default/files/akamai/gameworks/ser-whitepaper.pdf
///
/// The NVAPI headers (R520) required for this functionality to work can be found here...
///
/// https://developer.nvidia.com/rtx/path-tracing/nvapi/get-started
///
/// For VK the specification is currently in this PR
///
/// https://github.com/KhronosGroup/GLSL/pull/196/files

/// Internal helper functions

// This is a bit of a hack for GLSL HitObjectAttributes
// It relies on [ForceInline] removing the surrounding function and just inserting the *contained* `t` as a global
// The __ref should indicate the desire for the returned value to not be a copy of t, but *t*.
// In practice, however, __ref doesn't have this effect.
//
// We need this to be able to access the attributes outside of a function (which is all that MakeHit, for example, needs)
// We access the HitObjectAttributes via this function for the desired type, and it acts *as if* it's just an access
// to the global t.
//
// One `t` exists per attribute type `T`; it is tagged `[__vulkanHitObjectAttributes]`.
[ForceInline]
Ref<T> __hitObjectAttributes<T>()
{
    [__vulkanHitObjectAttributes]   
    static T t;
    return t;
}

// Next is the custom intrinsic that will compute the hitObjectAttributes location
// for GLSL-based targets.
//
// The `$XH` expansion is handled by the GLSL emitter. Callers in this file
// always pass `__hitObjectAttributes<T>()` as the argument, and the
// `[__AlwaysFoldIntoUseSiteAttribute]` keeps the result folded into the
// call that consumes it.
//
__generic<Attributes>
__target_intrinsic(__glslRayTracing, "$XH")
[__readNone]
[__AlwaysFoldIntoUseSiteAttribute]
int __hitObjectAttributesLocation(__ref Attributes attributes);

    /// Immutable data type representing a ray hit or a miss. Can be used to invoke hit or miss shading,
    /// or as a key in ReorderThread. Created by one of several methods described below. HitObject
    /// and its related functions are available in raytracing shader types only.
[__requiresNVAPI]
__target_intrinsic(hlsl, NvHitObject)
__glsl_extension(GL_NV_shader_invocation_reorder)
__glsl_extension(GL_EXT_ray_tracing)
__target_intrinsic(glsl, hitObjectNV)
struct HitObject
{
        /// Runs ray traversal (any-hit and intersection shaders included) in the same way as TraceRay,
        /// but hands the outcome back as a HitObject instead of triggering the closesthit or miss
        /// shader. This is the HLSL implementation, which forwards to the NVAPI helper.
    __specialized_for_target(hlsl)
    static HitObject TraceRay<payload_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        uint RayFlags,
        uint InstanceInclusionMask,
        uint RayContributionToHitGroupIndex,
        uint MultiplierForGeometryContributionToHitGroupIndex,
        uint MissShaderIndex,
        RayDesc Ray,
        inout payload_t Payload)
    {
        // The helper reports the hit object through its trailing `out` parameter.
        HitObject tracedHit;
        __hlslTraceRay(
            AccelerationStructure,
            RayFlags, InstanceInclusionMask,
            RayContributionToHitGroupIndex, MultiplierForGeometryContributionToHitGroupIndex,
            MissShaderIndex,
            Ray, Payload,
            tracedHit);
        return tracedHit;
    }

    /// GLSL implementation of TraceRay.
    ///
    /// The payload is staged in a function-local `static` tagged `[__vulkanRayPayload]`:
    /// it is copied in before the trace, the trace is issued with that variable's payload
    /// location, and the value is copied back to `Payload` afterwards. `Ray` is unpacked
    /// into its individual components for the GLSL helper.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject TraceRay<payload_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint RayFlags, 
        uint InstanceInclusionMask, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        uint MissShaderIndex, 
        RayDesc Ray, 
        inout payload_t Payload)
    {
        HitObject hitObj;

        [__vulkanRayPayload]
        static payload_t p;

        // Save the payload
        p = Payload;

        __glslTraceRay(
            hitObj,
            AccelerationStructure,
            RayFlags,                                           // Assumes D3D/VK have the same RayFlags values
            InstanceInclusionMask,                              // cullMask
            RayContributionToHitGroupIndex,                     // sbtRecordOffset
            MultiplierForGeometryContributionToHitGroupIndex,   // sbtRecordStride
            MissShaderIndex,
            Ray.Origin,
            Ray.TMin,
            Ray.Direction, 
            Ray.TMax,
            __rayPayloadLocation(p));
        
        // Write the payload out
        Payload = p;

        return hitObj;
    }

        /// Executes motion ray traversal (including anyhit and intersection shaders) like TraceRay, but returns the
        /// resulting hit information as a HitObject and does not trigger closesthit or miss shaders.
        ///
        /// Only a GLSL specialization is declared here. It differs from the GLSL TraceRay only in
        /// forwarding `CurrentTime` to `__glslTraceMotionRay`; the payload is staged through a
        /// `[__vulkanRayPayload]` static in the same way.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject TraceMotionRay<payload_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint RayFlags, 
        uint InstanceInclusionMask, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        uint MissShaderIndex, 
        RayDesc Ray,
        float CurrentTime,
        inout payload_t Payload)
    {
        HitObject hitObj;

        [__vulkanRayPayload]
        static payload_t p;

        // Save the payload
        p = Payload;

        __glslTraceMotionRay(
            hitObj,
            AccelerationStructure,
            RayFlags,                                           // Assumes D3D/VK have the same RayFlags values
            InstanceInclusionMask,                              // cullMask
            RayContributionToHitGroupIndex,                     // sbtRecordOffset
            MultiplierForGeometryContributionToHitGroupIndex,   // sbtRecordStride
            MissShaderIndex,
            Ray.Origin,
            Ray.TMin,
            Ray.Direction, 
            Ray.TMax,
            CurrentTime,
            __rayPayloadLocation(p));
        
        // Write the payload out
        Payload = p;

        return hitObj;
    }

        /// Builds a HitObject for a hit described entirely by the arguments; no ray is traced.
        /// The primitive identified by AccelerationStructure, InstanceIndex, GeometryIndex and
        /// PrimitiveIndex must exist. The shader table index is derived with the same formula
        /// TraceRay uses, and the result must reference a valid hit group record in the shader
        /// table. `attributes` must be either an attribute struct (for example
        /// BuiltInTriangleIntersectionAttributes) or another HitObject whose attributes are copied.
        /// This is the HLSL implementation, which forwards to the NVAPI helper.
    __specialized_for_target(hlsl)
    static HitObject MakeHit<attr_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        uint InstanceIndex,
        uint GeometryIndex,
        uint PrimitiveIndex,
        uint HitKind,
        uint RayContributionToHitGroupIndex,
        uint MultiplierForGeometryContributionToHitGroupIndex,
        RayDesc Ray,
        attr_t attributes)
    {
        // The helper reports the hit object through its trailing `out` parameter.
        HitObject madeHit;
        __hlslMakeHit(
            AccelerationStructure,
            InstanceIndex, GeometryIndex, PrimitiveIndex,
            HitKind,
            RayContributionToHitGroupIndex, MultiplierForGeometryContributionToHitGroupIndex,
            Ray,
            attributes,
            madeHit);
        return madeHit;
    }

    /// GLSL implementation of MakeHit (shader table index computed from offset/stride).
    ///
    /// The attributes are first written into the per-type hit-object-attributes variable
    /// obtained from `__hitObjectAttributes`, and that variable's location is then passed to
    /// the GLSL helper. Note that the helper is called with PrimitiveIndex *before*
    /// GeometryIndex, which differs from this function's own parameter order.
    /// NOTE(review): that presumably mirrors GLSL `hitObjectRecordHitNV`, whose ids are ordered
    /// (instance, primitive, geometry) — the helper's declaration is not visible here.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeHit<attr_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        RayDesc Ray, 
        attr_t attributes)
    {
        HitObject hitObj;

        // Save the attributes
        __ref attr_t attr = __hitObjectAttributes<attr_t>();

        attr = attributes;

        __glslMakeHit(hitObj,
            AccelerationStructure,
            InstanceIndex,
            PrimitiveIndex,
            GeometryIndex,
            HitKind,
            RayContributionToHitGroupIndex,                         /// sbtRecordOffset?
            MultiplierForGeometryContributionToHitGroupIndex,       /// sbtRecordStride?
            Ray.Origin,
            Ray.TMin,
            Ray.Direction, 
            Ray.TMax,
            __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));

        return hitObj;
    }

        /// See MakeHit but handles Motion 
        /// Currently only supported on VK
        ///
        /// Same flow as the GLSL MakeHit: the attributes are stored in the per-type
        /// hit-object-attributes variable and its location is passed on; `CurrentTime` is
        /// forwarded to `__glslMakeMotionHit` just before the attribute location.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeMotionHit<attr_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        RayDesc Ray,
        float CurrentTime,
        attr_t attributes)
    {
        HitObject hitObj;

        // Save the attributes
        __ref attr_t attr = __hitObjectAttributes<attr_t>();

        attr = attributes;

        // As in MakeHit, PrimitiveIndex is passed before GeometryIndex.
        __glslMakeMotionHit(hitObj,
            AccelerationStructure,
            InstanceIndex,
            PrimitiveIndex,
            GeometryIndex,
            HitKind,
            RayContributionToHitGroupIndex,                         /// sbtRecordOffset?
            MultiplierForGeometryContributionToHitGroupIndex,       /// sbtRecordStride?
            Ray.Origin,
            Ray.TMin,
            Ray.Direction, 
            Ray.TMax,
            CurrentTime,
            __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));

        return hitObj;
    }

        /// Builds a HitObject for a hit described entirely by the arguments; no ray is traced.
        /// The primitive identified by AccelerationStructure, InstanceIndex, GeometryIndex and
        /// PrimitiveIndex must exist. Unlike the other MakeHit overload, the shader table index is
        /// given directly as HitGroupRecordIndex rather than derived from the TraceRay indexing
        /// formula; it must reference a valid hit group record in the shader table. `attributes`
        /// must be either an attribute struct (for example BuiltInTriangleIntersectionAttributes)
        /// or another HitObject whose attributes are copied.
        /// This is the HLSL implementation, which forwards to the NVAPI helper.
    __specialized_for_target(hlsl)
    static HitObject MakeHit<attr_t>(
        uint HitGroupRecordIndex,
        RaytracingAccelerationStructure AccelerationStructure,
        uint InstanceIndex,
        uint GeometryIndex,
        uint PrimitiveIndex,
        uint HitKind,
        RayDesc Ray,
        attr_t attributes)
    {
        // The helper reports the hit object through its trailing `out` parameter.
        HitObject madeHit;
        __hlslMakeHitWithRecordIndex(
            HitGroupRecordIndex,
            AccelerationStructure,
            InstanceIndex, GeometryIndex, PrimitiveIndex,
            HitKind,
            Ray,
            attributes,
            madeHit);
        return madeHit;
    }

    /// GLSL implementation of the MakeHit overload that takes an explicit HitGroupRecordIndex.
    ///
    /// The attributes are first written into the per-type hit-object-attributes variable
    /// obtained from `__hitObjectAttributes`, and that variable's location is then passed to
    /// the GLSL helper together with the unpacked ray.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeHit<attr_t>( 
        uint HitGroupRecordIndex, 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        RayDesc Ray, 
        attr_t attributes)
    {
        HitObject hitObj;

        // Save the attributes
        __ref attr_t attr = __hitObjectAttributes<attr_t>();
        attr = attributes;

        // GLSL `hitObjectRecordHitWithIndexNV` orders its ids as
        // (instanceid, primitiveid, geometryindex), so PrimitiveIndex must be
        // passed before GeometryIndex — the same order the non-indexed GLSL
        // MakeHit uses when calling `__glslMakeHit`.
        __glslMakeHitWithIndex(hitObj,
            AccelerationStructure, 
            InstanceIndex,              ///? Same as instanceid ?
            PrimitiveIndex,
            GeometryIndex, 
            HitKind,                    /// Assuming HitKinds are compatible
            HitGroupRecordIndex,        /// sbtRecordIndex
            Ray.Origin,
            Ray.TMin,
            Ray.Direction, 
            Ray.TMax,
            __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));

        return hitObj;
    }

        /// See MakeHit but handles Motion 
        /// Currently only supported on VK
        ///
        /// This overload takes an explicit HitGroupRecordIndex. The attributes are stored in the
        /// per-type hit-object-attributes variable and its location is passed to the GLSL helper;
        /// `CurrentTime` is forwarded just before the attribute location.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeMotionHit<attr_t>( 
        uint HitGroupRecordIndex, 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        RayDesc Ray, 
        float CurrentTime,
        attr_t attributes)
    {
        HitObject hitObj;

        // Save the attributes
        __ref attr_t attr = __hitObjectAttributes<attr_t>();
        attr = attributes;

        // The GLSL record-hit functions order their ids as
        // (instanceid, primitiveid, geometryindex), so PrimitiveIndex must be
        // passed before GeometryIndex — the same order the non-indexed GLSL
        // MakeMotionHit uses when calling `__glslMakeMotionHit`.
        __glslMakeMotionHitWithIndex(hitObj,
            AccelerationStructure, 
            InstanceIndex,              ///? Same as instanceid ?
            PrimitiveIndex,
            GeometryIndex, 
            HitKind,                    /// Assuming HitKinds are compatible
            HitGroupRecordIndex,        /// sbtRecordIndex
            Ray.Origin,
            Ray.TMin,
            Ray.Direction, 
            Ray.TMax,
            CurrentTime,
            __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));

        return hitObj;
    }

        /// Creates a HitObject representing a miss based on values explicitly passed as arguments, without
        /// tracing a ray. The provided shader table index must reference a valid miss record in the shader
        /// table.
        ///
        /// This declaration is the HLSL form: it is emitted as a call to `NvMakeMiss` and requires
        /// NVAPI. A separate GLSL specialization with the same signature exists.
    [__requiresNVAPI]
    __target_intrinsic(hlsl, "NvMakeMiss")
    static HitObject MakeMiss( 
        uint MissShaderIndex, 
        RayDesc Ray);

    /// GLSL implementation of MakeMiss: records the miss into a fresh hit object through
    /// `__glslMakeMiss`, passing the ray as its individual components.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeMiss(
        uint MissShaderIndex,
        RayDesc Ray)
    {
        HitObject missObj;
        __glslMakeMiss(
            missObj,
            MissShaderIndex,
            Ray.Origin,
            Ray.TMin,
            Ray.Direction,
            Ray.TMax);
        return missObj;
    }

        /// Motion variant of MakeMiss: additionally forwards `CurrentTime`.
        /// Currently only supported on VK (only a GLSL specialization is declared).
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeMotionMiss(
        uint MissShaderIndex,
        RayDesc Ray,
        float CurrentTime)
    {
        HitObject missObj;
        __glslMakeMotionMiss(
            missObj,
            MissShaderIndex,
            Ray.Origin,
            Ray.TMin,
            Ray.Direction,
            Ray.TMax,
            CurrentTime);
        return missObj;
    }

        /// Creates a HitObject representing "NOP" (no operation) which is neither a hit nor a miss. Invoking a
        /// NOP hit object using HitObject::Invoke has no effect. Reordering by hit objects using
        /// ReorderThread will group NOP hit objects together. This can be useful in some reordering
        /// scenarios where future control flow for some threads is known to process neither a hit nor a
        /// miss.
        ///
        /// This declaration is the HLSL form: it is emitted as a call to `NvMakeNop` and requires
        /// NVAPI. A separate GLSL specialization exists.
    [__requiresNVAPI]
    __target_intrinsic(hlsl, "NvMakeNop")
    static HitObject MakeNop();

    /// GLSL implementation of MakeNop: records an empty hit object via `__glslMakeNop`.
    [ForceInline]
    __specialized_for_target(glsl)
    static HitObject MakeNop()
    {
        HitObject nopObj;
        __glslMakeNop(nopObj);
        return nopObj;
    }

        /// Invokes closesthit or miss shading for the specified hit object. In case of a NOP HitObject, no
        /// shader is invoked.
        ///
        /// This declaration is the HLSL form: it is emitted as a call to `NvInvokeHitObject` and
        /// requires NVAPI. `Payload` is `inout`, so the invoked shader's writes are visible to the
        /// caller. A separate GLSL specialization exists.
    [__requiresNVAPI]
    __target_intrinsic(hlsl, "NvInvokeHitObject")
    static void Invoke<payload_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        HitObject HitOrMiss,
        inout payload_t Payload);

    /// GLSL implementation of Invoke.
    ///
    /// The payload is staged through a function-local `static` tagged `[__vulkanRayPayload]`,
    /// exactly as the GLSL TraceRay does. `AccelerationStructure` is not used by this path.
    /// NOTE(review): unlike the GLSL TraceRay, this specialization is not marked
    /// `[ForceInline]` — confirm whether that is intentional.
    __specialized_for_target(glsl)
    static void Invoke<payload_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        HitObject HitOrMiss, 
        inout payload_t Payload)
    {
        [__vulkanRayPayload]
        static payload_t p;

        // Save the payload
        p = Payload;

        __glslInvoke(HitOrMiss, __rayPayloadLocation(p));

        // Write payload result
        Payload = p;
    }

        /// Returns true if the HitObject encodes a miss, otherwise returns false.
        /// GLSL: emitted as `hitObjectIsMissNV`. HLSL: requires NVAPI; no expansion string is
        /// given, so presumably the method name is used as-is.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectIsMissNV($0)")
    bool IsMiss();

        /// Returns true if the HitObject encodes a hit, otherwise returns false.
        /// GLSL: emitted as `hitObjectIsHitNV`. HLSL: requires NVAPI; no expansion string is
        /// given, so presumably the method name is used as-is.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectIsHitNV($0)")
    bool IsHit();

        /// Returns true if the HitObject encodes a nop, otherwise returns false.
        /// GLSL: emitted as `hitObjectIsEmptyNV` (GLSL calls a NOP hit object "empty").
        /// HLSL: requires NVAPI; no expansion string is given, so presumably the method name is
        /// used as-is.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectIsEmptyNV($0)")
    bool IsNop();

        /// Queries ray properties from HitObject. Valid if the hit object represents a hit or a miss.
        ///
        /// This declaration is the HLSL form (requires NVAPI). A separate GLSL specialization
        /// assembles the RayDesc from individual queries.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    RayDesc GetRayDesc();

    /// GLSL implementation of GetRayDesc: GLSL has no single query for the whole ray, so the
    /// RayDesc is assembled field by field from the individual hit-object getters.
    __specialized_for_target(glsl)
    RayDesc GetRayDesc()
    {
        RayDesc desc;
        desc.Origin = __glslGetRayWorldOrigin();
        desc.TMin = __glslGetTMin();
        desc.Direction = __glslGetRayDirection();
        desc.TMax = __glslGetTMax();
        return desc;
    }

        /// Queries shader table index from HitObject. Valid if the hit object represents a hit or a miss.
        /// GLSL: emitted as `hitObjectGetShaderBindingTableRecordIndexNV`. HLSL: requires NVAPI.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectGetShaderBindingTableRecordIndexNV($0)")
    uint GetShaderTableIndex();

        /// Returns the instance index of a hit. Valid if the hit object represents a hit.
        ///
        /// On GLSL the instance *index* corresponds to the `InstanceId` query, matching how
        /// `RayQuery.CandidateInstanceIndex` / `CommittedInstanceIndex` map to
        /// `rayQueryGetIntersectionInstanceIdEXT`. The GLSL `InstanceCustomIndex` query is the
        /// counterpart of the user-supplied instance ID instead.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectGetInstanceIdNV($0)")
    uint GetInstanceIndex();

        /// Returns the instance ID of a hit. Valid if the hit object represents a hit.
        ///
        /// On GLSL the user-supplied instance *ID* corresponds to the `InstanceCustomIndex` query,
        /// matching how `RayQuery.CandidateInstanceID` / `CommittedInstanceID` map to
        /// `rayQueryGetIntersectionInstanceCustomIndexEXT`. The GLSL `InstanceId` query is the
        /// counterpart of the instance index instead.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectGetInstanceCustomIndexNV($0)")
    uint GetInstanceID();

        /// Returns the geometry index of a hit. Valid if the hit object represents a hit.
        /// GLSL: emitted as `hitObjectGetGeometryIndexNV`. HLSL: requires NVAPI.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectGetGeometryIndexNV($0)")
    uint GetGeometryIndex();

        /// Returns the primitive index of a hit. Valid if the hit object represents a hit.
        /// GLSL: emitted as `hitObjectGetPrimitiveIndexNV`. HLSL: requires NVAPI.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectGetPrimitiveIndexNV($0)")
    uint GetPrimitiveIndex();

        /// Returns the hit kind. Valid if the hit object represents a hit.
        /// GLSL: emitted as `hitObjectGetHitKindNV`. HLSL: requires NVAPI.
    [__requiresNVAPI]
    __target_intrinsic(hlsl)
    __target_intrinsic(glsl, "hitObjectGetHitKindNV($0)")
    uint GetHitKind();

        /// Returns the attributes of a hit. Valid if the hit object represents a hit or a miss.
        /// HLSL implementation: the NVAPI helper fills the attributes in through an `out`
        /// parameter, so they are received into a local and then returned by value.
    __specialized_for_target(hlsl)
    attr_t GetAttributes<attr_t>()
    {
        attr_t hitAttributes;
        __hlslGetAttributesFromHitObject(hitAttributes);
        return hitAttributes;
    }

    /// GLSL implementation of GetAttributes.
    ///
    /// GLSL reads hit-object attributes into a variable identified by location, so this looks
    /// up the location of the per-type attributes variable from `__hitObjectAttributes`, asks
    /// `__glslGetAttributes` to load into it, and then returns that variable's value.
    __specialized_for_target(glsl)
    attr_t GetAttributes<attr_t>()
    {
        // Work out the location
        int attributeLocation = __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>());

        // Load the attributes from the location
        __glslGetAttributes(attributeLocation);

        // Return the attributes
        return __hitObjectAttributes<attr_t>();
    }
        /// Loads a root constant from the local root table referenced by the hit object. Valid if the hit object
        /// represents a hit or a miss. RootConstantOffsetInBytes must be a multiple of 4.
        ///
        /// Only an HLSL (NVAPI) mapping is declared; no GLSL intrinsic is provided for this method.
    __target_intrinsic(hlsl)
    [__requiresNVAPI]
    uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes);

    /// 
    /// !!!! Internal NVAPI HLSL impl. Not part of interface! !!!!!!!!!!!!
    /// 

    /// Internal: emitted as `NvGetAttributesFromHitObject(hitObject, t)`, where `$0` is this
    /// hit object and `$1` is the `out` destination for the attributes. Requires NVAPI.
    __target_intrinsic(hlsl, "NvGetAttributesFromHitObject($0, $1)")
    [__requiresNVAPI]
    void __hlslGetAttributesFromHitObject<T>(out T t);

    /// Internal: emitted as a call to `NvMakeHitWithRecordIndex`. The created hit object is
    /// returned through the trailing `out hitObj` parameter. Requires NVAPI.
    __target_intrinsic(hlsl, "NvMakeHitWithRecordIndex")
    [__requiresNVAPI]
    static void __hlslMakeHitWithRecordIndex<attr_t>(uint HitGroupRecordIndex, 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        RayDesc Ray, 
        attr_t attributes, 
        out HitObject hitObj);

    /// Internal: emitted as a call to `NvMakeHit`. The created hit object is returned through
    /// the trailing `out hitObj` parameter. Requires NVAPI.
    __target_intrinsic(hlsl, "NvMakeHit")
    [__requiresNVAPI]
    static void __hlslMakeHit<attr_t>(RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        RayDesc Ray, 
        attr_t attributes, 
        out HitObject hitObj);

    /// Internal: emitted as a call to `NvTraceRayHitObject`. `Payload` is `inout`, and the
    /// resulting hit object is returned through the trailing `out hitObj` parameter.
    /// Requires NVAPI.
    __target_intrinsic(hlsl, "NvTraceRayHitObject")
    [__requiresNVAPI]
    static void __hlslTraceRay<payload_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint RayFlags, 
        uint InstanceInclusionMask, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        uint MissShaderIndex, 
        RayDesc Ray, 
        inout payload_t Payload,
        out HitObject hitObj);

    /// 
    /// !!!! Internal GLSL GL_NV_shader_invocation_reorder impl. Not part of interface! !!!!!!!!!!!!
    /// 

    // GLSL binding for the built-in
    // "void hitObjectRecordMissNV(hitObjectNV, uint, vec3, float, vec3, float);"
    // Requires GL_NV_shader_invocation_reorder and GL_EXT_ray_tracing.
    // The parameter list mirrors the built-in's signature one-to-one.
    // NOTE(review): `hitObj` is declared without `out` although the built-in's name suggests
    // it records into the hit object -- presumably fine because the handle is opaque; confirm.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __target_intrinsic(glsl, "hitObjectRecordMissNV")
    static void __glslMakeMiss(
        HitObject hitObj,
        uint MissShaderIndex,
        float3 Origin,
        float TMin,
        float3 Direction,
        float TMax);

    // "void hitObjectRecordMissMotionNV(hitObjectNV, uint, vec3, float, vec3, float, float);"
    //
    // Motion-blur variant of the miss recorder: identical to the non-motion form except for
    // the trailing `CurrentTime` argument, and additionally requires
    // GL_NV_ray_tracing_motion_blur.
    //
    // This must expand to the `...MotionNV` built-in: the non-motion `hitObjectRecordMissNV`
    // takes only six arguments and has no time parameter, so binding this seven-argument
    // declaration to it would emit GLSL that does not match any overload.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectRecordMissMotionNV")
    static void __glslMakeMotionMiss(
        HitObject hitObj,
        uint MissShaderIndex,
        float3 Origin,
        float TMin,
        float3 Direction,
        float TMax, 
        float CurrentTime);

    // GLSL binding for the built-in `hitObjectRecordEmptyNV`, called with `hitObj` (`$0`)
    // as its only argument.
    // Requires GL_NV_shader_invocation_reorder and GL_EXT_ray_tracing.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __target_intrinsic(glsl, "hitObjectRecordEmptyNV($0)")
    static void __glslMakeNop(HitObject hitObj);

    // Returns the world-space direction of the ray recorded in this hit object (`$0`).
    //
    // Bound to `hitObjectGetWorldRayDirectionNV` so the result lives in the same space as
    // `__glslGetRayWorldOrigin`. The object-space built-in (`hitObjectGetObjectRayDirectionNV`)
    // would yield a direction that cannot be combined with a world-space origin to form a
    // consistent ray description.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetWorldRayDirectionNV($0)")
    float3 __glslGetRayDirection();

    // GLSL binding for the built-in `hitObjectGetWorldRayOriginNV`, called on this hit
    // object (`$0`). Requires GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetWorldRayOriginNV($0)")
    float3 __glslGetRayWorldOrigin();

    // GLSL binding for the built-in `hitObjectGetRayTMaxNV`, called on this hit object (`$0`).
    // Requires GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetRayTMaxNV($0)")
    float __glslGetTMax();

    // GLSL binding for the built-in `hitObjectGetRayTMinNV`, called on this hit object (`$0`).
    // Requires GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetRayTMinNV($0)")
    float __glslGetTMin();

    // "void hitObjectRecordHitWithIndexNV(hitObjectNV, accelerationStructureEXT,int,int,int,uint,uint,vec3,float,vec3,float,int);"
    //
    // GLSL binding for the built-in quoted above; the parameter list mirrors it one-to-one.
    // The shader binding table record is selected directly through `sbtRecordIndex`.
    // Requires GL_NV_shader_invocation_reorder and GL_EXT_ray_tracing.
    // `attributeLocation` is presumably the location of the hit-object attribute variable
    // holding the attributes to record -- TODO confirm against the extension spec.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __target_intrinsic(glsl, "hitObjectRecordHitWithIndexNV")
    static void __glslMakeHitWithIndex(
        HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        int attributeLocation);

    //  "void hitObjectRecordHitWithIndexMotionNV(hitObjectNV, accelerationStructureEXT,int,int,int,uint,uint,vec3,float,vec3,float,float,int);"
    //
    // Motion-blur variant of the record-index hit recorder: the parameter list mirrors the
    // built-in quoted above, with `CurrentTime` inserted before `attributeLocation`.
    // Requires GL_NV_shader_invocation_reorder, GL_EXT_ray_tracing and
    // GL_NV_ray_tracing_motion_blur.
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectRecordHitWithIndexMotionNV")
    static void __glslMakeMotionHitWithIndex(
        HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        float CurrentTime,
        int attributeLocation);

    // "void hitObjectRecordHitNV(hitObjectNV,accelerationStructureEXT,int,int,int,uint,uint,uint,vec3,float,vec3,float,int);"
    //
    // GLSL binding for the built-in quoted above; the parameter list mirrors it one-to-one.
    // Here the shader binding table record is described by `sbtRecordOffset` and
    // `sbtRecordStride` instead of a single record index.
    // Requires GL_EXT_ray_tracing and GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectRecordHitNV")
    static void __glslMakeHit(
        HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        int attributeLocation);

        // "void hitObjectRecordHitMotionNV(hitObjectNV,accelerationStructureEXT,int,int,int,uint,uint,uint,vec3,float,vec3,float,float,int);"
        //
        // Motion-blur variant of the offset/stride hit recorder: the parameter list mirrors
        // the built-in quoted above, with `CurrentTime` inserted before `attributeLocation`.
        // Requires GL_EXT_ray_tracing, GL_NV_shader_invocation_reorder and
        // GL_NV_ray_tracing_motion_blur.
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectRecordHitMotionNV")
    static void __glslMakeMotionHit(
        HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        float CurrentTime,
        int attributeLocation);

    
    // GLSL binding for the built-in `hitObjectGetAttributesNV`, called with this hit object
    // (`$0`) and `attributeLocation` (`$1`).
    // Returns nothing: the caller reads the attributes back separately after this call,
    // presumably from the attribute variable at `attributeLocation` -- TODO confirm.
    // Requires GL_EXT_ray_tracing and GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetAttributesNV($0, $1)")
    void __glslGetAttributes(int attributeLocation);

    // GLSL binding for the built-in `hitObjectTraceRayNV`.
    // The expansion string is a bare function name, so the arguments are presumably
    // forwarded in declaration order -- TODO confirm.
    // `payload` is an integer rather than the payload value itself; presumably it is the
    // location of the ray payload variable -- TODO confirm against the extension spec.
    // Requires GL_EXT_ray_tracing and GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectTraceRayNV")
    static void __glslTraceRay(
        HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        uint rayFlags,
        uint cullMask,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        uint missIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        int payload);

    // GLSL binding for the built-in `hitObjectTraceRayMotionNV`: the motion-blur variant of
    // the hit-object trace, taking an additional `currentTime` before `payload`.
    // The expansion string is a bare function name, so the arguments are presumably
    // forwarded in declaration order -- TODO confirm.
    // `payload` is presumably the location of the ray payload variable -- TODO confirm.
    // Requires GL_EXT_ray_tracing, GL_NV_shader_invocation_reorder and
    // GL_NV_ray_tracing_motion_blur.
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectTraceRayMotionNV")
    static void __glslTraceMotionRay(
        HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        uint rayFlags,
        uint cullMask,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        uint missIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        float currentTime,
        int payload);

    // GLSL binding for the built-in `hitObjectExecuteShaderNV`, taking the hit object and an
    // integer `payload` (presumably the location of the ray payload variable -- TODO confirm).
    // Requires GL_EXT_ray_tracing and GL_NV_shader_invocation_reorder.
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectExecuteShaderNV")
    static void __glslInvoke(
        HitObject hitObj,
        int payload);
};

    /// Reorders threads based on a coherence hint value. NumCoherenceHintBitsFromLSB indicates how
    /// many of the least significant bits of CoherenceHint should be considered during reordering
    /// (max: 16).
    /// Applications should set this to the lowest value required to represent all possible values in
    /// CoherenceHint. For best performance, all threads should provide the same value for
    /// NumCoherenceHintBitsFromLSB.
    /// Where possible, reordering will also attempt to retain locality in the thread's launch indices
    /// (DispatchRaysIndex in DXR).
    ///
    /// Target bindings: `NvReorderThread` on HLSL (requires NVAPI) and `reorderThreadNV` on GLSL
    /// (requires GL_NV_shader_invocation_reorder and GL_EXT_ray_tracing).
[__requiresNVAPI]
__target_intrinsic(hlsl, "NvReorderThread")
__glsl_extension(GL_NV_shader_invocation_reorder)
__glsl_extension(GL_EXT_ray_tracing)
__target_intrinsic(glsl, "reorderThreadNV")
void ReorderThread( uint CoherenceHint, uint NumCoherenceHintBitsFromLSB );

    /// Reorders threads based on a hit object, optionally extended by a coherence hint value. Coherence
    /// hints behave as described in the generic variant of ReorderThread. The maximum number of
    /// coherence hint bits in this variant of ReorderThread is 8. If no coherence hint is desired, set
    /// NumCoherenceHintBitsFromLSB to zero.
    /// Reordering will consider information in the HitObject and coherence hint with the following
    /// priority:
    ///
    /// 1. Shader ID stored in the HitObject
    /// 2. Coherence hint, with the most significant hint bit having highest priority
    /// 3. Spatial information stored in the HitObject
    ///
    /// That is, ReorderThread will first attempt to group threads whose HitObject references the
    /// same shader ID. (Miss shaders and NOP HitObjects are grouped separately). Within each of these
    /// groups, it will attempt to order threads by the value of their coherence hints. And within ranges
    /// of equal coherence hints, it will attempt to maximize locality in 3D space of the ray hit (if any).
    ///
    /// Target bindings: `NvReorderThread` on HLSL (requires NVAPI) and `reorderThreadNV` on GLSL
    /// (requires GL_NV_shader_invocation_reorder and GL_EXT_ray_tracing).
[__requiresNVAPI]
__target_intrinsic(hlsl, "NvReorderThread")
__glsl_extension(GL_NV_shader_invocation_reorder)
__glsl_extension(GL_EXT_ray_tracing)
__target_intrinsic(glsl, "reorderThreadNV")
void ReorderThread( HitObject HitOrMiss, uint CoherenceHint, uint NumCoherenceHintBitsFromLSB );

    /// Is equivalent to
    /// ```
    /// void ReorderThread( HitObject HitOrMiss, uint CoherenceHint, uint NumCoherenceHintBitsFromLSB );
    /// ```
    /// With CoherenceHint and NumCoherenceHintBitsFromLSB as 0, meaning they are ignored.
    ///
    /// Target bindings: `NvReorderThread` on HLSL (requires NVAPI) and `reorderThreadNV` on GLSL
    /// (requires GL_NV_shader_invocation_reorder).
    /// NOTE(review): the other two ReorderThread overloads also list GL_EXT_ray_tracing; this one
    /// does not -- confirm whether the omission is intentional.
[__requiresNVAPI]
__target_intrinsic(hlsl, "NvReorderThread")
__glsl_extension(GL_NV_shader_invocation_reorder)
__target_intrinsic(glsl, "reorderThreadNV")
void ReorderThread( HitObject HitOrMiss );


///
/// DebugBreak support 
///
/// There doesn't appear to be an equivalent for debugBreak for HLSL
///
/// Per-target expansion of this declaration:
///  * HLSL: expands to a comment only, so the call emits no executable code.
///  * CUDA: expands to `__brkpt()`.
///  * C++:  expands to `SLANG_BREAKPOINT(0)`.
/// GLSL is handled by a separate target-specialized declaration of `debugBreak`.

__target_intrinsic(hlsl, "/* debugBreak() not currently supported for HLSL */")
__target_intrinsic(cuda,"__brkpt()")
__target_intrinsic(cpp, "SLANG_BREAKPOINT(0)")
void debugBreak();

/// GLSL-specialized `debugBreak`: instead of a textual intrinsic it is tagged with
/// `vk::spirv_instruction`, selecting instruction number 1 of the "NonSemantic.DebugBreak"
/// extended instruction set.
__specialized_for_target(glsl)
[[vk::spirv_instruction(1, "NonSemantic.DebugBreak")]]
void debugBreak();


/// Exposes CUDA's built-in `threadIdx` as a `uint3`. Only a CUDA expansion is declared here.
/// Tagged `[__readNone]`.
__target_intrinsic(cuda, "(threadIdx)")
[__readNone]
uint3 cudaThreadIdx();

/// Exposes CUDA's built-in `blockIdx` as a `uint3`. Only a CUDA expansion is declared here.
/// Tagged `[__readNone]`.
__target_intrinsic(cuda, "(blockIdx)")
[__readNone]
uint3 cudaBlockIdx();

/// Exposes CUDA's built-in `blockDim` as a `uint3`. Only a CUDA expansion is declared here.
/// Tagged `[__readNone]`.
__target_intrinsic(cuda, "(blockDim)")
[__readNone]
uint3 cudaBlockDim();
back to top