// Slang HLSL compatibility library
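//
// This file is "meta" source code: `$(...)` splices values computed by the
// stdlib generator, and `${{{{ ... }}}}` blocks contain C++ code that runs
// at generation time to stamp out families of declarations.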

typedef uint UINT;

__generic<T>
__intrinsic_op($(kIROp_StructuredBufferGetDimensions))
uint2 __structuredBufferGetDimensions(AppendStructuredBuffer<T> buffer);

__generic<T>
__intrinsic_op($(kIROp_StructuredBufferGetDimensions))
uint2 __structuredBufferGetDimensions(ConsumeStructuredBuffer<T> buffer);

__intrinsic_op($(kIROp_StructuredBufferGetDimensions))
uint2 __structuredBufferGetDimensions<T>(StructuredBuffer<T> buffer);

__intrinsic_op($(kIROp_StructuredBufferGetDimensions))
uint2 __structuredBufferGetDimensions<T>(RWStructuredBuffer<T> buffer);

__intrinsic_op($(kIROp_StructuredBufferGetDimensions))
uint2 __structuredBufferGetDimensions<T>(RasterizerOrderedStructuredBuffer<T> buffer);

__generic<T>
__magic_type(HLSLAppendStructuredBufferType)
__intrinsic_type($(kIROp_HLSLAppendStructuredBufferType))
struct AppendStructuredBuffer
{
    __intrinsic_op($(kIROp_StructuredBufferAppend))
    void Append(T value);

    [ForceInline]
    void GetDimensions(
        out uint numStructs,
        out uint stride)
    {
        let result = __structuredBufferGetDimensions(this);
        numStructs = result.x;
        stride = result.y;
    }
};
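
// Illustrative usage (hypothetical `gOutput`):
//
//     AppendStructuredBuffer<float4> gOutput;
//     gOutput.Append(float4(1, 0, 0, 1));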

__magic_type(HLSLByteAddressBufferType)
__intrinsic_type($(kIROp_HLSLByteAddressBufferType))
struct ByteAddressBuffer
{
    [__readNone]
    __target_intrinsic(hlsl)
    __target_intrinsic(cpp)
    __target_intrinsic(cuda)
    [__unsafeForceInlineEarly]
    void GetDimensions(out uint dim);

    [__unsafeForceInlineEarly]
    __specialized_for_target(spirv)
    __specialized_for_target(glsl)
    void GetDimensions(out uint dim)
    {
        dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer<uint>(this)).x*4;
    }

    [__readNone]
    [ForceInline]
    __target_intrinsic(hlsl)
    uint Load(int location)
    {
        return __byteAddressBufferLoad<uint>(this, location);
    }

    [__readNone]
    uint Load(int location, out uint status);

    [__readNone]
    [ForceInline]
    __target_intrinsic(hlsl)
    uint2 Load2(int location)
    {
        return __byteAddressBufferLoad<uint2>(this, location);
    }

    [__readNone]
    uint2 Load2(int location, out uint status);

    [__readNone]
    [ForceInline]
    __target_intrinsic(hlsl)
    uint3 Load3(int location)
    {
        return __byteAddressBufferLoad<uint3>(this, location);
    }

    [__readNone]
    uint3 Load3(int location, out uint status);

    [__readNone]
    [ForceInline]
    __target_intrinsic(hlsl)
    uint4 Load4(int location)
    {
        return __byteAddressBufferLoad<uint4>(this, location);
    }

    [__readNone]
    uint4 Load4(int location, out uint status);

    [__readNone]
    T Load<T>(int location)
    {
        return __byteAddressBufferLoad<T>(this, location);
    }
};
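
// Illustrative usage (hypothetical `gBytes`); byte addresses passed to
// Load/Load2/Load3/Load4 must be 4-byte aligned:
//
//     ByteAddressBuffer gBytes;
//     uint  word  = gBytes.Load(0);
//     uint2 words = gBytes.Load2(8);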

// Texture
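// Each shape type below exposes three constants through `__ITextureShape`:
// `flavor` (the SLANG_TEXTURE_* kind), `dimensions` (how many coordinate
// components the shape needs), and `planeDimensions` (the dimensionality of
// a single plane, e.g. 2 for one face of a cube).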
[sealed]
[builtin]
interface __ITextureShape
{
    static const int flavor;
    static const int dimensions;
    static const int planeDimensions;
}
__magic_type(TextureShape1DType)
__intrinsic_type($(kIROp_TextureShape1DType))
struct __Shape1D : __ITextureShape
{
    static const int flavor = $(SLANG_TEXTURE_1D);
    static const int dimensions = 1;
    static const int planeDimensions = 1;
}
__magic_type(TextureShape2DType)
__intrinsic_type($(kIROp_TextureShape2DType))
struct __Shape2D : __ITextureShape
{
    static const int flavor = $(SLANG_TEXTURE_2D);
    static const int dimensions = 2;
    static const int planeDimensions = 2;
}
__magic_type(TextureShape3DType)
__intrinsic_type($(kIROp_TextureShape3DType))
struct __Shape3D : __ITextureShape
{
    static const int flavor = $(SLANG_TEXTURE_3D);
    static const int dimensions = 3;
    static const int planeDimensions = 3;
}
__magic_type(TextureShapeCubeType)
__intrinsic_type($(kIROp_TextureShapeCubeType))
struct __ShapeCube : __ITextureShape
{
    static const int flavor = $(SLANG_TEXTURE_CUBE);
    static const int dimensions = 3;
    static const int planeDimensions = 2;
}
__magic_type(TextureShapeBufferType)
__intrinsic_type($(kIROp_TextureShapeBufferType))
struct __ShapeBuffer : __ITextureShape
{
    static const int flavor = $(SLANG_TEXTURE_BUFFER);
    static const int dimensions = 1;
    static const int planeDimensions = 1;
}
__intrinsic_op(vectorReshape)
vector<T,N> __vectorReshape<let N : int, T, let M : int>(vector<T,M> vin);

__intrinsic_op(makeVector)
__generic<T, let N:int>
vector<T,N+1> __makeVector(vector<T,N> vec, T scalar);


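// `__TextureImpl` is the one implementation type behind every texture
// flavor. Its parameters select the element type `T`, the shape, whether it
// is arrayed (`isArray`) or multisampled (`isMS`), the sample count, the
// access mode, whether it is a shadow texture, whether it is combined with
// a sampler (`isCombined`), and an optional explicit image `format`.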
__magic_type(TextureType)
__intrinsic_type($(kIROp_TextureType))
struct __TextureImpl<T, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let access:int, let isShadow:int, let isCombined:int, let format:int>
{
}

// Combined texture sampler specific functions
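// These members take no SamplerState argument because for `isCombined == 1`
// the sampler is part of the texture object itself.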
__generic<T, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,isArray,isMS,sampleCount,0,isShadow,1,format>
{
    static const int access = 0;

    typealias TextureCoord = vector<float, Shape.dimensions>;

    [ForceInline]
    [__readNone]
    float CalculateLevelOfDetail(TextureCoord location)
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm "CalculateLevelOfDetail";
        case glsl:
            __intrinsic_asm "textureQueryLod($0, $1).x";
        case spirv:
            return (spirv_asm
            {
                result:$$float2 = OpImageQueryLod $this $location
            }).x;
        }
    }

    [ForceInline]
    [__readNone]
    float CalculateLevelOfDetailUnclamped(TextureCoord location)
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm "CalculateLevelOfDetailUnclamped";
        case glsl:
            __intrinsic_asm "textureQueryLod($0, $1).y";
        case spirv:
            return (spirv_asm
            {
                result:$$float2 = OpImageQueryLod $this $location
            }).y;
        }
    }
    
    __target_intrinsic(glsl, "texture($0, $1)")
    float __glsl_texture(vector<float, Shape.dimensions+isArray+1> value);

    __glsl_extension(GL_EXT_texture_shadow_lod)
    __target_intrinsic(glsl, "textureOffset($0, $1, $2)")
    float __glsl_texture_offset(vector<float, Shape.dimensions+isArray+1> value, constexpr vector<int, Shape.planeDimensions> offset);

    __glsl_extension(GL_EXT_texture_shadow_lod)
    __target_intrinsic(glsl, "textureLod($0, $1, 0)")
    float __glsl_texture_level_zero(vector<float, Shape.dimensions+isArray+1> value);

    __glsl_extension(GL_EXT_texture_shadow_lod)
    __target_intrinsic(glsl, "textureLodOffset($0, $1, 0, $2)")
    float __glsl_texture_offset_level_zero(vector<float, Shape.dimensions+isArray+1> value, constexpr vector<int, Shape.planeDimensions> offset);
}

__generic<T:IFloat, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,isArray,isMS,sampleCount,0,isShadow,1,format>
{
    [__readNone]
    T Sample(vector<float, Shape.dimensions+isArray> location)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Sample";
            case glsl:
                __intrinsic_asm "$ctexture($0, $1)$z";
            case cuda:
                if (isArray != 0)
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1DLayered<$T0>($0, ($1).x, int(($1).y))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2DLayered<$T0>($0, ($1).x, ($1).y, int(($1).z))";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemapLayered<$T0>($0, ($1).x, ($1).y, ($1).z, int(($1).w))";
                    }
                }
                else
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1D<$T0>($0, ($1))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2D<$T0>($0, ($1).x, ($1).y)";
                    case $(SLANG_TEXTURE_3D):
                        __intrinsic_asm "tex3D<$T0>($0, ($1).x, ($1).y, ($1).z)";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemap<$T0>($0, ($1).x, ($1).y, ($1).z)";
                    }
                }
            case spirv:
                return spirv_asm
                {
                    %sampled : __sampledType(T) = OpImageSampleImplicitLod $this $location None;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    __glsl_extension(GL_ARB_sparse_texture_clamp)
    T Sample(vector<float, Shape.dimensions+isArray> location, vector<int, Shape.planeDimensions> offset, float clamp)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Sample";
            case glsl:
                __intrinsic_asm "$ctextureOffsetClampARB($0, $1, $2, $3)$z";
            case spirv:
                return spirv_asm
                {
                    OpCapability MinLod;
                    %sampled : __sampledType(T) = OpImageSampleImplicitLod $this $location None|ConstOffset|MinLod $offset $clamp;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    __target_intrinsic(hlsl)
    T Sample(vector<float, Shape.dimensions+isArray> location, vector<int, Shape.planeDimensions> offset, float clamp, out uint status)
    {
        status = 0;
        return Sample(location, offset, clamp);
    }

    [__readNone]
    T SampleBias(vector<float, Shape.dimensions+isArray> location, float bias)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".SampleBias";
            case glsl:
                __intrinsic_asm "$ctexture($0, $1, $2)$z";
            case spirv:
                return spirv_asm
                {
                    %sampled : __sampledType(T) = OpImageSampleImplicitLod $this $location None|Bias $bias;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T SampleBias(vector<float, Shape.dimensions+isArray> location, float bias, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleBias";
            case glsl:
            __intrinsic_asm "$ctextureOffset($0, $1, $3, $2)$z";
            case spirv:
            return spirv_asm
            {
                %sampled : __sampledType(T) = OpImageSampleImplicitLod $this $location None|Bias|ConstOffset $bias $offset;
                __truncate $$T result __sampledType(T) %sampled;
            };
        }
    }

    [__readNone]
    [ForceInline]
    float SampleCmp(vector<float, Shape.dimensions+isArray> location, float compareValue)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture(__makeVector(location, compareValue));
        case hlsl:
            __intrinsic_asm ".SampleCmp";
        case spirv:
            return spirv_asm
            {
                result:$$float = OpImageSampleDrefImplicitLod $this $location $compareValue;
            };
        }
    }

    [__readNone]
    [ForceInline]
    float SampleCmpLevelZero(vector<float, Shape.dimensions+isArray> location, float compareValue)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture_level_zero(__makeVector(location, compareValue));
        case hlsl:
            __intrinsic_asm ".SampleCmpLevelZero";
        case spirv:
            const float zeroFloat = 0.0f;
            return spirv_asm
            {
                result:$$float = OpImageSampleDrefExplicitLod $this $location $compareValue Lod $zeroFloat;
            };
        }
    }

    [__readNone] 
    [ForceInline]
    float SampleCmp(vector<float, Shape.dimensions+isArray> location, float compareValue, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture_offset(__makeVector(location, compareValue), offset);
        case hlsl:
            __intrinsic_asm ".SampleCmp";
        case spirv:
            return spirv_asm
            {
                result:$$float = OpImageSampleDrefImplicitLod $this $location $compareValue ConstOffset $offset;
            };
        }
    }

    [__readNone]
    [ForceInline]
    float SampleCmpLevelZero(vector<float, Shape.dimensions+isArray> location, float compareValue, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture_offset_level_zero(__makeVector(location, compareValue), offset);
        case hlsl:
            __intrinsic_asm ".SampleCmpLevelZero";
        case spirv:
            const float zeroFloat = 0.0f;
            return spirv_asm
            {
                result:$$float = OpImageSampleDrefExplicitLod $this $location $compareValue Lod|ConstOffset $zeroFloat $offset;
            };
        }
    }

    [__readNone]
    T SampleGrad(vector<float, Shape.dimensions+isArray> location, vector<float, Shape.planeDimensions> gradX, vector<float, Shape.planeDimensions> gradY)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleGrad";
            case glsl:
            __intrinsic_asm "$ctextureGrad($0, $1, $2, $3)$z";
            case spirv:
                return spirv_asm
                {
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod $this $location None|Grad $gradX $gradY;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T SampleGrad(vector<float, Shape.dimensions+isArray> location, vector<float, Shape.planeDimensions> gradX, vector<float, Shape.planeDimensions> gradY, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleGrad";
            case glsl:
            __intrinsic_asm "$ctextureGradOffset($0, $1, $2, $3, $4)$z";
            case spirv:
                return spirv_asm
                {
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod $this $location None|Grad|ConstOffset $gradX $gradY $offset;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    __glsl_extension(GL_ARB_sparse_texture_clamp)
    [__readNone]
    T SampleGrad(vector<float, Shape.dimensions+isArray> location, vector<float, Shape.planeDimensions> gradX, vector<float, Shape.planeDimensions> gradY, constexpr vector<int, Shape.planeDimensions> offset, float lodClamp)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleGrad";
            case glsl:
            __intrinsic_asm "$ctextureGradOffsetClampARB($0, $1, $2, $3, $4, $5)$z";
            case spirv:
                return spirv_asm
                {
                    OpCapability MinLod;
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod $this $location None|Grad|ConstOffset|MinLod $gradX $gradY $offset $lodClamp;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T SampleLevel(vector<float, Shape.dimensions+isArray> location, float level)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".SampleLevel";
            case glsl:
                __intrinsic_asm "$ctextureLod($0, $1, $2)$z";
            case cuda:
                if (isArray != 0)
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1DLayeredLod<$T0>($0, ($1).x, int(($1).y), ($2))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2DLayeredLod<$T0>($0, ($1).x, ($1).y, int(($1).z), ($2))";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemapLayeredLod<$T0>($0, ($1).x, ($1).y, ($1).z, int(($1).w), ($2))";
                    default:
                        __intrinsic_asm "<invalid intrinsic>";
                    }
                }
                else
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1DLod<$T0>($0, ($1), ($2))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2DLod<$T0>($0, ($1).x, ($1).y, ($2))";
                    case $(SLANG_TEXTURE_3D):
                        __intrinsic_asm "tex3DLod<$T0>($0, ($1).x, ($1).y, ($1).z, ($2))";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemapLod<$T0>($0, ($1).x, ($1).y, ($1).z, ($2))";
                    }
                }
            case spirv:
                return spirv_asm
                {
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod $this $location None|Lod $level;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T SampleLevel(vector<float, Shape.dimensions+isArray> location, float level, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".SampleLevel";
            case glsl:
                __intrinsic_asm "$ctextureLodOffset($0, $1, $2, $3)$z";
            case spirv:
                return spirv_asm
                {
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod $this $location None|Lod|ConstOffset $level $offset;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }
}
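
// Illustrative use of the combined forms above (hypothetical `gTex`, `uv`),
// assuming a combined texture-sampler type such as `Sampler2D`:
//
//     Sampler2D gTex;
//     float4 c = gTex.Sample(uv);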

// Non-combined texture types specific functions
__generic<T, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,isArray,isMS,sampleCount,access,isShadow,0,format>
{
    typealias TextureCoord = vector<float, Shape.dimensions>;

    [__readNone]
    [ForceInline]
    float CalculateLevelOfDetail(SamplerState s, TextureCoord location)
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm "CalculateLevelOfDetail";
        case glsl:
            __intrinsic_asm "textureQueryLod($p, $2).x";
        case spirv:
            return (spirv_asm {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                result:$$float2 = OpImageQueryLod %sampledImage $location;
            }).x;
        }
    }

    [__readNone]
    [ForceInline]
    float CalculateLevelOfDetailUnclamped(SamplerState s, TextureCoord location)
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm "CalculateLevelOfDetailUnclamped";
        case glsl:
            __intrinsic_asm "textureQueryLod($p, $2).y";
        case spirv:
            return (spirv_asm {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                result:$$float2 = OpImageQueryLod %sampledImage $location;
            }).y;
        }
    }
    
    __target_intrinsic(glsl, "texture($p, $2)")
    float __glsl_texture(SamplerComparisonState s, vector<float, Shape.dimensions+isArray+1> value);

    __glsl_extension(GL_EXT_texture_shadow_lod)
    __target_intrinsic(glsl, "textureOffset($p, $2, $3)")
    float __glsl_texture_offset(SamplerComparisonState s, vector<float, Shape.dimensions+isArray+1> value, constexpr vector<int, Shape.planeDimensions> offset);

    __glsl_extension(GL_EXT_texture_shadow_lod)
    __target_intrinsic(glsl, "textureLod($p, $2, 0)")
    float __glsl_texture_level_zero(SamplerComparisonState s, vector<float, Shape.dimensions+isArray+1> value);

    __glsl_extension(GL_EXT_texture_shadow_lod)
    __target_intrinsic(glsl, "textureLodOffset($p, $2, 0, $3)")
    float __glsl_texture_offset_level_zero(SamplerComparisonState s, vector<float, Shape.dimensions+isArray+1> value, constexpr vector<int, Shape.planeDimensions> offset);

}

__generic<T:IFloat, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,isArray,isMS,sampleCount,0,isShadow,0,format>
{
    [__readNone]
    T Sample(SamplerState s, vector<float, Shape.dimensions+isArray> location)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Sample";
            case glsl:
                __intrinsic_asm "$ctexture($p, $2)$z";
            case cuda:
                if (isArray != 0)
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1DLayered<$T0>($0, ($2).x, int(($2).y))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2DLayered<$T0>($0, ($2).x, ($2).y, int(($2).z))";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemapLayered<$T0>($0, ($2).x, ($2).y, ($2).z, int(($2).w))";
                    default:
                        __intrinsic_asm "<invalid intrinsic>";
                    }
                }
                else
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1D<$T0>($0, ($2))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2D<$T0>($0, ($2).x, ($2).y)";
                    case $(SLANG_TEXTURE_3D):
                        __intrinsic_asm "tex3D<$T0>($0, ($2).x, ($2).y, ($2).z)";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemap<$T0>($0, ($2).x, ($2).y, ($2).z)";
                    }
                }
            case spirv:
                return spirv_asm
                {
                    %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                    %sampled : __sampledType(T) = OpImageSampleImplicitLod %sampledImage $location None;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T Sample(SamplerState s, vector<float, Shape.dimensions+isArray> location, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Sample";
            case glsl:
                __intrinsic_asm "$ctextureOffset($p, $2, $3)$z";
            case spirv:
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                %sampled : __sampledType(T) = OpImageSampleImplicitLod %sampledImage $location None|ConstOffset $offset;
                __truncate $$T result __sampledType(T) %sampled;
            };
        }
    }

    [__readNone]
    __glsl_extension(GL_ARB_sparse_texture_clamp)
    T Sample(SamplerState s, vector<float, Shape.dimensions+isArray> location, constexpr vector<int, Shape.planeDimensions> offset, float clamp)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Sample";
            case glsl:
                __intrinsic_asm "$ctextureOffsetClampARB($p, $2, $3, $4)$z";
            case spirv:
                return spirv_asm
                {
                    OpCapability MinLod;
                    %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                    %sampled : __sampledType(T) = OpImageSampleImplicitLod %sampledImage $location None|ConstOffset|MinLod $offset $clamp;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    __target_intrinsic(hlsl)
    T Sample(SamplerState s, vector<float, Shape.dimensions+isArray> location, constexpr vector<int, Shape.planeDimensions> offset, float clamp, out uint status)
    {
        status = 0;
        return Sample(s, location, offset, clamp);
    }

    [__readNone]
    T SampleBias(SamplerState s, vector<float, Shape.dimensions+isArray> location, float bias)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleBias";
            case glsl:
            __intrinsic_asm "$ctexture($p, $2, $3)$z";
            case spirv:
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                %sampled : __sampledType(T) = OpImageSampleImplicitLod %sampledImage $location None|Bias $bias;
                __truncate $$T result __sampledType(T) %sampled;
            };
        }
    }

    [__readNone]
    T SampleBias(SamplerState s, vector<float, Shape.dimensions+isArray> location, float bias, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleBias";
            case glsl:
            __intrinsic_asm "$ctextureOffset($p, $2, $4, $3)$z";
            case spirv:
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                %sampled : __sampledType(T) = OpImageSampleImplicitLod %sampledImage $location None|Bias|ConstOffset $bias $offset;
                __truncate $$T result __sampledType(T) %sampled;
            };
        }
    }

    [__readNone] [ForceInline]
    float SampleCmp(SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, float compareValue)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture(s, __makeVector(location,compareValue));
        case hlsl:
            __intrinsic_asm ".SampleCmp";
        case spirv:
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                result:$$float = OpImageSampleDrefImplicitLod %sampledImage $location $compareValue;
            };
        }
    }

    [__readNone] [ForceInline]
    float SampleCmpLevelZero(SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, float compareValue)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture_level_zero(s, __makeVector(location,compareValue));
        case hlsl:
            __intrinsic_asm ".SampleCmpLevelZero";
        case spirv:
            const float zeroFloat = 0.0f;
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                result:$$float = OpImageSampleDrefExplicitLod %sampledImage $location $compareValue Lod $zeroFloat;
            };
        }
    }

    [__readNone] [ForceInline]
    float SampleCmp(SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, float compareValue, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture_offset(s, __makeVector(location,compareValue), offset);
        case hlsl:
            __intrinsic_asm ".SampleCmp";
        case spirv:
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                result:$$float = OpImageSampleDrefImplicitLod %sampledImage $location $compareValue ConstOffset $offset;
            };
        }
    }

    [__readNone] [ForceInline]
    float SampleCmpLevelZero(SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, float compareValue, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
        case glsl:
            return __glsl_texture_offset_level_zero(s, __makeVector(location,compareValue), offset);
        case hlsl:
            __intrinsic_asm ".SampleCmpLevelZero";
        case spirv:
            const float zeroFloat = 0.0f;
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                result:$$float = OpImageSampleDrefExplicitLod %sampledImage $location $compareValue Lod|ConstOffset $zeroFloat $offset;
            };
        }
    }

    [__readNone]
    T SampleGrad(SamplerState s, vector<float, Shape.dimensions+isArray> location, vector<float, Shape.planeDimensions> gradX, vector<float, Shape.planeDimensions> gradY)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleGrad";
            case glsl:
            __intrinsic_asm "$ctextureGrad($p, $2, $3, $4)$z";
            case spirv:
                return spirv_asm
                {
                    %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod %sampledImage $location None|Grad $gradX $gradY;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T SampleGrad(SamplerState s, vector<float, Shape.dimensions+isArray> location, vector<float, Shape.planeDimensions> gradX, vector<float, Shape.planeDimensions> gradY, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleGrad";
            case glsl:
            __intrinsic_asm "$ctextureGradOffset($p, $2, $3, $4, $5)$z";
            case spirv:
                return spirv_asm
                {
                    %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod %sampledImage $location None|Grad|ConstOffset $gradX $gradY $offset;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    __glsl_extension(GL_ARB_sparse_texture_clamp)
    [__readNone]
    T SampleGrad(SamplerState s, vector<float, Shape.dimensions+isArray> location, vector<float, Shape.planeDimensions> gradX, vector<float, Shape.planeDimensions> gradY, constexpr vector<int, Shape.planeDimensions> offset, float lodClamp)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".SampleGrad";
            case glsl:
            __intrinsic_asm "$ctextureGradOffsetClampARB($p, $2, $3, $4, $5, $6)$z";
            case spirv:
                return spirv_asm
                {
                    OpCapability MinLod;
                    %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod %sampledImage $location None|Grad|ConstOffset|MinLod $gradX $gradY $offset $lodClamp;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    T SampleLevel(SamplerState s, vector<float, Shape.dimensions+isArray> location, float level)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".SampleLevel";
            case glsl:
                __intrinsic_asm "$ctextureLod($p, $2, $3)$z";
            case cuda:
                if (isArray != 0)
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1DLayeredLod<$T0>($0, ($2).x, int(($2).y), ($3))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2DLayeredLod<$T0>($0, ($2).x, ($2).y, int(($2).z), ($3))";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemapLayeredLod<$T0>($0, ($2).x, ($2).y, ($2).z, int(($2).w), ($3))";
                    default:
                        __intrinsic_asm "<invalid intrinsic>";
                    }
                }
                else
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "tex1DLod<$T0>($0, ($2), ($3))";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "tex2DLod<$T0>($0, ($2).x, ($2).y, ($3))";
                    case $(SLANG_TEXTURE_3D):
                        __intrinsic_asm "tex3DLod<$T0>($0, ($2).x, ($2).y, ($2).z, ($3))";
                    case $(SLANG_TEXTURE_CUBE):
                        __intrinsic_asm "texCubemapLod<$T0>($0, ($2).x, ($2).y, ($2).z, ($3))";
                    }
                }
            case spirv:
            return spirv_asm
            {
                %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                %sampled : __sampledType(T) = OpImageSampleExplicitLod %sampledImage $location None|Lod $level;
                __truncate $$T result __sampledType(T) %sampled;
            };
        }
    }

    [__readNone]
    T SampleLevel(SamplerState s, vector<float, Shape.dimensions+isArray> location, float level, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".SampleLevel";
            case glsl:
                __intrinsic_asm "$ctextureLodOffset($p, $2, $3, $4)$z";
            case spirv:
                return spirv_asm
                {
                    %sampledImage : __sampledImageType(this) = OpSampledImage $this $s;
                    %sampled : __sampledType(T) = OpImageSampleExplicitLod %sampledImage $location None|Lod|ConstOffset $level $offset;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }
}

// Texture.GetDimensions and Sampler.GetDimensions
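// The generator below emits a GetDimensions extension for each valid
// (shape, array, multisample) combination: multisampling is only emitted
// for 2D shapes, and array variants are skipped for 3D shapes.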
${{{{
const char* kTextureShapeTypeNames[] = {
    "__Shape1D", "__Shape2D", "__Shape3D", "__ShapeCube"};
for (int shapeIndex = 0; shapeIndex < 4; shapeIndex++)
for (int isArray = 0; isArray <= 1; isArray++)
for (int isMS = 0; isMS <= 1; isMS++) {
    if (isMS)
    {
        if (shapeIndex != kStdlibShapeIndex2D)
            continue;
    }
    if (isArray)
    {
        if (shapeIndex == kStdlibShapeIndex3D)
            continue;
    }
    auto shapeTypeName = kTextureShapeTypeNames[shapeIndex];
    TextureTypeInfo textureTypeInfo(kBaseTextureShapes[shapeIndex], isArray, isMS, 0, sb, path);
}}}}

__generic<T, let sampleCount:int, let access:int, let isShadow:int, let isCombined:int, let format:int>
extension __TextureImpl<T,$(shapeTypeName),$(isArray),$(isMS),sampleCount,access,isShadow,isCombined,format>
{
    ${{{{
    textureTypeInfo.writeGetDimensionFunctions();
    }}}}
}

${{{{
}
}}}}

// Texture.GetSamplePosition(int s);
__generic<T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let isCombined:int, let format:int>
extension __TextureImpl<T,Shape,isArray,1,sampleCount,access,isShadow,isCombined,format>
{
    float2 GetSamplePosition(int s);
}
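
// Note: `GetSamplePosition` is only declared above; no body appears in this
// file, so its lowering is provided outside this file.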

__intrinsic_op($(kIROp_MakeArray))
Array<T,4> __makeArray<T>(T v0, T v1, T v2, T v3);

// Gather for scalar textures.
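// The helpers below map HLSL's Gather* family onto GLSL `textureGather*`
// and SPIR-V `OpImageGather`/`OpImageDrefGather`: each returns the chosen
// component of the four texels that bilinear filtering would blend.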
__generic<TElement, T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
[ForceInline]
vector<TElement,4> __glsl_gather(__TextureImpl<T, Shape, isArray, 0, sampleCount, access, isShadow, 0, format> texture, SamplerState s, vector<float, Shape.dimensions+isArray> location, int component)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "textureGather($p, $2, $3)";
    case spirv:
        return spirv_asm {
            %sampledImage : __sampledImageType(texture) = OpSampledImage $texture $s;
            result:$$vector<TElement,4> = OpImageGather %sampledImage $location $component;
        };
    }
}
__generic<TElement, T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
[ForceInline]
vector<TElement,4> __glsl_gather_offset(__TextureImpl<T, Shape, isArray, 0, sampleCount, access, isShadow, 0, format> texture, SamplerState s, vector<float, Shape.dimensions+isArray> location, int component, vector<int, Shape.planeDimensions> offset)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "textureGatherOffset($p, $2, $3, $4)";
    case spirv:
        return spirv_asm {
            %sampledImage : __sampledImageType(texture) = OpSampledImage $texture $s;
            result:$$vector<TElement,4> = OpImageGather %sampledImage $location $component ConstOffset $offset;
        };
    }
}
__generic<TElement, T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
[ForceInline]
vector<TElement,4> __glsl_gather_offsets(__TextureImpl<T, Shape, isArray, 0, sampleCount, access, isShadow, 0, format> texture, SamplerState s, vector<float, Shape.dimensions+isArray> location, int component,
    vector<int, Shape.planeDimensions> offset1,
    vector<int, Shape.planeDimensions> offset2,
    vector<int, Shape.planeDimensions> offset3,
    vector<int, Shape.planeDimensions> offset4)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "textureGatherOffsets($p, $2, $3, $T4[]($4, $5, $6, $7))";
    case spirv:
        let offsets = __makeArray(offset1,offset2,offset3,offset4);
        return spirv_asm {
            OpCapability ImageGatherExtended;
            %sampledImage : __sampledImageType(texture) = OpSampledImage $texture $s;
            result:$$vector<TElement,4> = OpImageGather %sampledImage $location $component ConstOffsets $offsets;
        };
    }
}
__generic<TElement, T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
[ForceInline]
vector<TElement,4> __glsl_gatherCmp(__TextureImpl<T, Shape, isArray, 0, sampleCount, access, isShadow, 0, format> texture, SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, int componentIndex, TElement compareValue)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "textureGather($p, $2, $4)";
    case spirv:
        return spirv_asm {
            %sampledImage : __sampledImageType(texture) = OpSampledImage $texture $s;
            result:$$vector<TElement,4> = OpImageDrefGather %sampledImage $location $compareValue;
        };
    }
}
__generic<TElement, T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
[ForceInline]
vector<TElement,4> __glsl_gatherCmp_offset(__TextureImpl<T, Shape, isArray, 0, sampleCount, access, isShadow, 0, format> texture, SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, int componentIndex, TElement compareValue, vector<int, Shape.planeDimensions> offset)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "textureGatherOffset($p, $2, $4, $5)";
    case spirv:
        return spirv_asm {
            %sampledImage : __sampledImageType(texture) = OpSampledImage $texture $s;
            result:$$vector<TElement,4> = OpImageDrefGather %sampledImage $location $compareValue ConstOffset $offset;
        };
    }
}
__generic<TElement, T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let access:int, let isShadow:int, let format:int>
[ForceInline]
vector<TElement,4> __glsl_gatherCmp_offsets(__TextureImpl<T, Shape, isArray, 0, sampleCount, access, isShadow, 0, format> texture, SamplerComparisonState s, vector<float, Shape.dimensions+isArray> location, int componentIndex, TElement compareValue,
    vector<int, Shape.planeDimensions> offset1,
    vector<int, Shape.planeDimensions> offset2,
    vector<int, Shape.planeDimensions> offset3,
    vector<int, Shape.planeDimensions> offset4
    )
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "textureGatherOffsets($p, $2, $4, $T5[]($5, $6, $7, $8))";
    case spirv:
        let offsets = __makeArray(offset1,offset2,offset3,offset4);
        return spirv_asm {
            OpCapability ImageGatherExtended;
            %sampledImage : __sampledImageType(texture) = OpSampledImage $texture $s;
            result:$$vector<TElement,4> = OpImageDrefGather %sampledImage $location $compareValue ConstOffsets $offsets;
        };
    }
}

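// Emit the public Gather* methods twice: once for scalar element types and
// once for vector element types, so both `Texture2D<float>` and
// `Texture2D<float4>` receive the full Gather family.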
${{{{
for (int isScalarTexture = 0; isScalarTexture <= 1; isScalarTexture++) {
    if (isScalarTexture == 0)
    {
        sb << "__generic<T:__BuiltinArithmeticType, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let isShadow:int, let format:int>\n";
        sb << "extension __TextureImpl<T,Shape,isArray,0,sampleCount,0,isShadow,0,format>\n";
    }
    else
    {
        sb << "__generic<T:__BuiltinArithmeticType, let N:int, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let isShadow:int, let format:int>\n";
        sb << "extension __TextureImpl<vector<T,N>,Shape,isArray,0,sampleCount,0,isShadow,0,format>\n";
    }
}}}}
{ // begin extension for gather
${{{{
    // Gather component
    for (int isCmp = 0; isCmp <= 1; ++isCmp) {
        const char* cmp = isCmp ? "Cmp" : "";
        const char* cmpParam = isCmp? "T compareValue, " : "";
        const char* compareArg = isCmp ? "compareValue, " : "";
        const char* samplerStateType = isCmp ? "SamplerComparisonState" : "SamplerState";
        const char* componentNames[] = {"", "Red", "Green", "Blue", "Alpha"};
        for (auto componentId = 0; componentId < 5; componentId++) {
            auto component = componentNames[componentId];
            auto componentIndex = componentId == 0 ? 0 : componentId - 1;
    }}}}
    [ForceInline]
    vector<T,4> Gather$(cmp)$(component)($(samplerStateType) s, vector<float, Shape.dimensions+isArray> location, $(cmpParam))
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".Gather$(cmp)$(component)";
        case glsl:
        case spirv:
            return __glsl_gather$(cmp)<T>(this, s, location, $(componentIndex), $(compareArg));
        }
    }
    [ForceInline]
    vector<T,4> Gather$(cmp)$(component)($(samplerStateType) s, vector<float, Shape.dimensions+isArray> location, $(cmpParam) vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".Gather$(cmp)$(component)";
        case glsl:
        case spirv:
            return __glsl_gather$(cmp)_offset<T>(this, s, location, $(componentIndex), $(compareArg) offset);
        }
    }
    [ForceInline]
    vector<T,4> Gather$(cmp)$(component)($(samplerStateType) s, vector<float, Shape.dimensions+isArray> location, $(cmpParam)
        vector<int, Shape.planeDimensions> offset1,
        vector<int, Shape.planeDimensions> offset2,
        vector<int, Shape.planeDimensions> offset3,
        vector<int, Shape.planeDimensions> offset4)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".Gather$(cmp)$(component)";
        case glsl:
        case spirv:
            return __glsl_gather$(cmp)_offsets<T>(this, s, location, $(componentIndex), $(compareArg) offset1,offset2,offset3,offset4);
        }
    }
    ${{{{
    } // for (component)
    } // for (isCmp)
    }}}}
} // end extension for gather

${{{{
} // for (isScalarTexture)
}}}}

// Load/Subscript for readonly, no MS textures
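//
// For these Load methods the integer `location` carries one extra trailing
// component that selects the mip level (matching HLSL's
// `Texture2D.Load(int3)` convention); the SPIR-V path splits that component
// back out as the `Lod` operand.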

__generic<T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let isShadow:int, let isCombined:int, let format:int>
extension __TextureImpl<T,Shape,isArray,0,sampleCount,0,isShadow,isCombined,format>
{
    static const int isMS = 0;
    static const int access = $(kStdlibResourceAccessReadOnly);

    __glsl_extension(GL_EXT_samplerless_texture_functions)
    [__readNone]
    T __glsl_load(vector<int, Shape.dimensions+isArray> location)
    {
        __intrinsic_asm "$ctexelFetch($0, ($1), 0)$z";
    }

    __glsl_extension(GL_EXT_samplerless_texture_functions)
    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray+1> location)
    {
        __target_switch
        {
        case cpp:
        case hlsl:
            __intrinsic_asm ".Load";
        case glsl:
            __intrinsic_asm "$ctexelFetch($0, ($1).$w1b, ($1).$w1e)$z";
        case spirv:
            const int lodLoc = Shape.dimensions+isArray;
            let coord = __vectorReshape<Shape.dimensions+isArray>(location);
            let lod = location[lodLoc];
            if (isCombined != 0)
            {
                return spirv_asm
                {
                    %image:__imageType(this) = OpImage $this;
                    %sampled:__sampledType(T) = OpImageFetch %image $coord Lod $lod;
                    __truncate $$T result __sampledType(T) %sampled;
                };
            }
            else
            {
                return spirv_asm
                {
                    %sampled:__sampledType(T) = OpImageFetch $this $coord Lod $lod;
                    __truncate $$T result __sampledType(T) %sampled;
                };
            }
        }
    }

    __glsl_extension(GL_EXT_samplerless_texture_functions)
    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray+1> location, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".Load";
            case glsl:
            __intrinsic_asm "$ctexelFetchOffset($0, ($1).$w1b, ($1).$w1e, ($2))$z";
            case spirv:
                const int lodLoc = Shape.dimensions+isArray;
                let coord = __vectorReshape<Shape.dimensions+isArray>(location);
                let lod = location[lodLoc];
                if (isCombined != 0)
                {
                    return spirv_asm
                    {
                        %image:__imageType(this) = OpImage $this;
                        %sampled:__sampledType(T) = OpImageFetch %image $coord Lod|ConstOffset $lod $offset;
                        __truncate $$T result __sampledType(T) %sampled;
                    };
                }
                else
                {
                    return spirv_asm
                    {
                        %sampled:__sampledType(T) = OpImageFetch $this $coord Lod|ConstOffset $lod $offset;
                        __truncate $$T result __sampledType(T) %sampled;
                    };
                }
        }
    }

    [__readNone]
    [ForceInline]
    __target_intrinsic(hlsl)
    T Load(vector<int, Shape.dimensions+isArray+1> location, constexpr vector<int, Shape.planeDimensions> offset, out uint status)
    {
        status = 0;
        return Load(location, offset);
    }

    __subscript(vector<uint, Shape.dimensions+isArray> location) -> T
    {
        __glsl_extension(GL_EXT_samplerless_texture_functions)
        [__readNone]
        [ForceInline]
        get
        {
            __target_switch
            {
                case cpp:
                case hlsl:
                    __intrinsic_asm ".operator[]";
                case glsl:
                    return __glsl_load(location);
                case spirv:
                    if (isCombined != 0)
                    {
                        return spirv_asm
                        {
                            %image:__imageType(this) = OpImage $this;
                            %sampled:__sampledType(T) = OpImageFetch %image $location;
                            __truncate $$T result __sampledType(T) %sampled;
                        };
                    }
                    else
                    {
                        return spirv_asm
                        {
                            %sampled:__sampledType(T) = OpImageFetch $this $location;
                            __truncate $$T result __sampledType(T) %sampled;
                        };
                    }
            }
        }
    }
}

// Texture Load/Subscript for readonly, MS textures
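//
// Multisampled textures have no mip chain, so here Load takes the sample
// index as a separate argument rather than as an extra coordinate component.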

__generic<T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let isShadow:int, let isCombined:int, let format:int>
extension __TextureImpl<T,Shape,isArray,1,sampleCount,0,isShadow,isCombined,format>
{
    static const int access = $(kStdlibResourceAccessReadOnly);
    static const int isMS = 1;

    __glsl_extension(GL_EXT_samplerless_texture_functions)
    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, int sampleIndex)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".Load";
            case glsl:
                __intrinsic_asm "$ctexelFetch($0, $1, ($2))$z";
            case spirv:
                if (isCombined != 0)
                {
                    return spirv_asm
                    {
                        %image:__imageType(this) = OpImage $this;
                        %sampled:__sampledType(T) = OpImageFetch %image $location Sample $sampleIndex;
                        __truncate $$T result __sampledType(T) %sampled;
                    };
                }
                else
                {
                    return spirv_asm
                    {
                        %sampled:__sampledType(T) = OpImageFetch $this $location Sample $sampleIndex;
                        __truncate $$T result __sampledType(T) %sampled;
                    };
                }
        }
    }

    __glsl_extension(GL_EXT_samplerless_texture_functions)
    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, int sampleIndex, constexpr vector<int, Shape.planeDimensions> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
            __intrinsic_asm ".Load";
            case glsl:
            __intrinsic_asm "$ctexelFetchOffset($0, $1, ($2), ($3))$z";
            case spirv:
                if (isCombined != 0)
                {
                    return spirv_asm
                    {
                        %image:__imageType(this) = OpImage $this;
                        %sampled:__sampledType(T) = OpImageFetch %image $location ConstOffset|Sample $offset $sampleIndex;
                        __truncate $$T result __sampledType(T) %sampled;
                    };
                }
                else
                {
                    return spirv_asm
                    {
                        %sampled:__sampledType(T) = OpImageFetch $this $location ConstOffset|Sample $offset $sampleIndex;
                        __truncate $$T result __sampledType(T) %sampled;
                    };
                }
        }
    }

    [__readNone]
    [ForceInline]
    __target_intrinsic(hlsl)
    T Load(vector<int, Shape.dimensions+isArray> location, int sampleIndex, constexpr vector<int, Shape.planeDimensions> offset, out uint status)
    {
        status = 0;
        return Load(location, sampleIndex, offset);
    }

    __subscript(vector<uint, Shape.dimensions+isArray> location, int sampleIndex) -> T
    {
        __glsl_extension(GL_EXT_samplerless_texture_functions)
        [__readNone]
        [ForceInline]
        get
        {
            __target_switch
            {
                case cpp:
                case hlsl:
                    __intrinsic_asm "($0).sample[$2][$1]";
                case glsl:
                case spirv:
                case cuda:
                    return Load(location, sampleIndex);
            }
        }
    }
}

// Load/Subscript for readwrite textures
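// The loop below emits the Load/subscript members for both plain read-write
// and rasterizer-ordered access; on GLSL these lower to `imageLoad`, and on
// SPIR-V to `OpImageRead`.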
${{{{
    for (int access = kStdlibResourceAccessReadWrite; access <= kStdlibResourceAccessRasterizerOrdered; access++) {
        const char* glslIntrinsic = "$cimageLoad($0, $1)$z";
        const char* glslIntrinsicOffset = "$cimageLoad($0, ($1)+($2))$z";
        const char* glslIntrinsicMS = "$cimageLoad($0, $1, $2)$z";
        const char* glslIntrinsicMSOffset = "$cimageLoad($0, ($1)+($3), $2)$z";
}}}}
__generic<T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,isArray,0,sampleCount,$(access),isShadow, 0,format>
{
    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Load";
            case glsl:
                __intrinsic_asm "$(glslIntrinsic)";
            case cuda:
                if (isArray != 0)
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "surf1DLayeredread$C<$T0>($0, ($1).x * $E, ($1).y, SLANG_CUDA_BOUNDARY_MODE)";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "surf2DLayeredread$C<$T0>($0, ($1).x * $E, ($1).y, ($1).z, SLANG_CUDA_BOUNDARY_MODE)";
                    case $(SLANG_TEXTURE_3D):
                        __intrinsic_asm "surf3DLayeredread$C<$T0>($0, ($1).x * $E, ($1).y, ($1).z, ($1).w, SLANG_CUDA_BOUNDARY_MODE)";
                    default:
                        __intrinsic_asm "<invalid intrinsic>";
                    }
                }
                else
                {
                    switch(Shape.flavor)
                    {
                    case $(SLANG_TEXTURE_1D):
                        __intrinsic_asm "surf1Dread$C<$T0>($0, ($1) * $E, SLANG_CUDA_BOUNDARY_MODE)";
                    case $(SLANG_TEXTURE_2D):
                        __intrinsic_asm "surf2Dread$C<$T0>($0, ($1).x * $E, ($1).y, SLANG_CUDA_BOUNDARY_MODE)";
                    case $(SLANG_TEXTURE_3D):
                        __intrinsic_asm "surf3Dread$C<$T0>($0, ($1).x * $E, ($1).y, ($1).z, SLANG_CUDA_BOUNDARY_MODE)";
                    default:
                        __intrinsic_asm "<invalid intrinsic>";
                    }
                }
            case spirv:
                return spirv_asm
                {
                    %sampled:__sampledType(T) = OpImageRead $this $location;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, vector<int, Shape.dimensions+isArray> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Load";
            case glsl:
                __intrinsic_asm "$(glslIntrinsicOffset)";
            case spirv:
                return spirv_asm
                {
                    %sampled:__sampledType(T) = OpImageRead $this $location ConstOffset $offset;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, vector<int, Shape.dimensions+isArray> offset, out uint status)
    {
        __target_switch
        {
        case hlsl:
        case cpp:
            __intrinsic_asm ".Load";
        default:
            status = 0;
            return Load(location, offset);
        }
    }

    void __glslImageStore(vector<int, Shape.dimensions+isArray> location, T value)
    {
        __intrinsic_asm "imageStore($0, $1, $V2)";
    }

    __subscript(vector<uint, Shape.dimensions+isArray> location) -> T
    {
        [__readNone]
        [ForceInline]
        get
        {
            __target_switch
            {
                case cpp:
                case hlsl:
                    __intrinsic_asm ".operator[]";
                case glsl:
                case spirv:
                case cuda:
                    return Load(location);
            }
        }

        [nonmutating]
        [ForceInline]
        set(T newValue)
        {
            __target_switch
            {
                case cpp:
                case hlsl:
                __intrinsic_asm ".operator[]";
                case glsl:
                    __glslImageStore(location, newValue);
                case cuda:
                    if (isArray != 0)
                    {
                        switch(Shape.flavor)
                        {
                        case $(SLANG_TEXTURE_1D):
                            __intrinsic_asm "surf1DLayeredwrite$C<$T0>($2, $0, ($1).x * $E, ($1).y, SLANG_CUDA_BOUNDARY_MODE)";
                        case $(SLANG_TEXTURE_2D):
                            __intrinsic_asm "surf2DLayeredwrite$C<$T0>($2, $0, ($1).x * $E, ($1).y, ($1).z, SLANG_CUDA_BOUNDARY_MODE)";
                        case $(SLANG_TEXTURE_3D):
                            __intrinsic_asm "surf3DLayeredwrite$C<$T0>($2, $0, ($1).x * $E, ($1).y, ($1).z, ($1).w, SLANG_CUDA_BOUNDARY_MODE)";
                        default:
                            __intrinsic_asm "<invalid intrinsic>";
                        }
                    }
                    else
                    {
                        switch(Shape.flavor)
                        {
                        case $(SLANG_TEXTURE_1D):
                            __intrinsic_asm "surf1Dwrite$C<$T0>($2, $0, ($1) * $E, SLANG_CUDA_BOUNDARY_MODE)";
                        case $(SLANG_TEXTURE_2D):
                            __intrinsic_asm "surf2Dwrite$C<$T0>($2, $0, ($1).x * $E, ($1).y, SLANG_CUDA_BOUNDARY_MODE)";
                        case $(SLANG_TEXTURE_3D):
                            __intrinsic_asm "surf3Dwrite$C<$T0>($2, $0, ($1).x * $E, ($1).y, ($1).z, SLANG_CUDA_BOUNDARY_MODE)";
                        default:
                            __intrinsic_asm "<invalid intrinsic>";
                        }
                    }
                case spirv:
                    return spirv_asm
                    {
                        OpImageWrite $this $location $newValue;
                    };
            }
        }

        __intrinsic_op($(kIROp_ImageSubscript)) ref;
    }
}
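
// An illustrative sketch of the accessors above (not part of the stdlib;
// assumes a global `RWTexture2D<float4> gImage` and integer coordinates x, y):
//
//     float4 v = gImage.Load(int2(x, y)); // explicit load
//     gImage[uint2(x, y)] = v * 0.5;      // subscript store via the setter above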

${{{{
if (access == kStdlibResourceAccessReadWrite) {
}}}}
// RW MS textures.
__generic<T, Shape: __ITextureShape, let isArray:int, let sampleCount:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,isArray,1,sampleCount,$(access),isShadow, 0,format>
{
    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, int sampleIndex)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Load";
            case glsl:
                __intrinsic_asm "$(glslIntrinsicMS)";
            case spirv:
                return spirv_asm
                {
                    %sampled:__sampledType(T) = OpImageRead $this $location Sample $sampleIndex;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, int sampleIndex, vector<int, Shape.dimensions+isArray> offset)
    {
        __target_switch
        {
            case cpp:
            case hlsl:
                __intrinsic_asm ".Load";
            case glsl:
                __intrinsic_asm "$(glslIntrinsicMSOffset)";
            case spirv:
                return spirv_asm
                {
                    %sampled:__sampledType(T) = OpImageRead $this $location ConstOffset|Sample $offset $sampleIndex;
                    __truncate $$T result __sampledType(T) %sampled;
                };
        }
    }

    [__readNone]
    [ForceInline]
    T Load(vector<int, Shape.dimensions+isArray> location, int sampleIndex, vector<int, Shape.dimensions+isArray> offset, out uint status)
    {
        __target_switch
        {
        case hlsl:
        case cpp:
            __intrinsic_asm ".Load";
        default:
            status = 0;
            return Load(location, sampleIndex, offset);
        }
    }

    void __glslImageStore(vector<int, Shape.dimensions+isArray> location, int sampleIndex, T value)
    {
        __intrinsic_asm "imageStore($0, $1, $2, $V3)";
    }

    __subscript(vector<uint, Shape.dimensions+isArray> location, int sampleIndex) -> T
    {
        [__readNone]
        [ForceInline]
        get
        {
            __target_switch
            {
                case cpp:
                case hlsl:
                    __intrinsic_asm "$0.sample[$2][$1]";
                case glsl:
                case spirv:
                case cuda:
                    return Load(location, sampleIndex);
            }
        }

        [nonmutating]
        [ForceInline]
        set(T newValue)
        {
            __target_switch
            {
                case cpp:
                case hlsl:
                __intrinsic_asm "$0.sample[$2][$1]";
                case glsl:
                    __glslImageStore(location, sampleIndex, newValue);
                case spirv:
                    return spirv_asm
                    {
                        OpImageWrite $this $location $newValue Sample $sampleIndex;
                    };
            }
        }

        __intrinsic_op($(kIROp_ImageSubscript)) ref;
    }
}

${{{{
} // if (access == kStdlibResourceAccessReadWrite) // for RW MS textures.
} // for (access).
}}}}

// Texture type aliases.
// T, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let access:int, let isShadow:int, let isCombined:int, let format:int
${{{{
    const char* shapeTypeNames[] = {"1D", "2D", "3D", "Cube"};
    const char* accessPrefix[] = {"", "RW", "RasterizerOrdered", "Feedback"};
    const char* arrayPostFix[] = {"", "Array"};
    const char* msPostFix[] = {"", "MS"};
    for (int shape = 0; shape < 4; shape++)
    for (int isArray = 0; isArray<=1; isArray++)
    for (int isMS = 0; isMS<=1; isMS++)
    for (int isCombined = 0; isCombined<=1; isCombined++)
    for (int access = kStdlibResourceAccessReadOnly; access<=kStdlibResourceAccessFeedback; access++) {
        if (access != kStdlibResourceAccessReadOnly)
        {
            // No RW Cube.
            if (shape == kStdlibShapeIndexCube) continue;
        }
        if (access == kStdlibResourceAccessFeedback)
        {
            // Feedback only defined for Texture2D and Texture2DArray.
            if (shape != kStdlibShapeIndex2D) continue;
            if (isMS) continue;
            if (isCombined) continue;
        }
        if (isMS)
        {
            // Only Texture2DMS.
            if (shape != kStdlibShapeIndex2D)
                continue;
            // Only Texture2DMS or RWTexture2DMS.
            if (access > kStdlibResourceAccessReadWrite)
                continue;
        }
        // No 3D Array.
        if (shape == kStdlibShapeIndex3D && isArray == 1)
            continue;
        const char* textureTypeName = isCombined ? "Sampler" : "Texture";
}}}}
typealias $(accessPrefix[access])$(textureTypeName)$(shapeTypeNames[shape])$(arrayPostFix[isArray])$(msPostFix[isMS])<T=float4, let sampleCount:int=0, let format:int=0> = __TextureImpl<T, __Shape$(shapeTypeNames[shape]), $(isArray), $(isMS), sampleCount, $(access), 0, $(isCombined), format>;
${{{{
}
}}}}
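
// For instance, with access = read-write, shape = 2D, isArray = 1, isMS = 0, and
// isCombined = 0, the loop above generates an alias of the following form
// (illustrative; ACCESS stands for the numeric value of
// kStdlibResourceAccessReadWrite, following the order of `accessPrefix` above):
//
//     typealias RWTexture2DArray<T=float4, let sampleCount:int=0, let format:int=0> =
//         __TextureImpl<T, __Shape2D, 1, 0, sampleCount, ACCESS, 0, 0, format>;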

// AtomicAdd

// Make the GLSL atomicAdd available.
// We have separate int/float implementations, as the float version requires some specific extensions
// https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_atomic_float.txt

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_float)
float __atomicAdd(__ref float value, float amount)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpExtension "SPV_EXT_shader_atomic_float_add";
            OpCapability AtomicFloat32AddEXT;
            result:$$float = OpAtomicFAddEXT &value Device None $amount
        };
    }
}

// Helper for hlsl, using NVAPI
__target_intrinsic(hlsl, "NvInterlockedAddUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicAdd(RWByteAddressBuffer buf, uint offset, uint2);

// atomic add for hlsl using SM6.6
__target_intrinsic(hlsl, "$0.InterlockedAdd64($1, $2, $3)")
void __atomicAdd(RWByteAddressBuffer buf, uint offset, int64_t value, out int64_t originalValue);
__target_intrinsic(hlsl, "$0.InterlockedAdd64($1, $2, $3)")
void __atomicAdd(RWByteAddressBuffer buf, uint offset, uint64_t value, out uint64_t originalValue);

// Int versions require glsl 4.30
// https://www.khronos.org/registry/OpenGL-Refpages/gl4/html/atomicAdd.xhtml

__glsl_version(430)
int __atomicAdd(__ref int value, int amount)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
    case spirv:
        return spirv_asm
        {
            result:$$int = OpAtomicIAdd &value Device None $amount;
        };
    }
}

__glsl_version(430)
uint __atomicAdd(__ref uint value, uint amount)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
    case spirv:
        return spirv_asm
        {
            result:$$uint = OpAtomicIAdd &value Device None $amount;
        };
    }
}

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
int64_t __atomicAdd(__ref int64_t value, int64_t amount)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$int64_t = OpAtomicIAdd &value Device None $amount
        };
    }
}

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicAdd(__ref uint64_t value, uint64_t amount)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicAdd($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicIAdd &value Device None $amount
        };
    }
}

// Cas - Compare and swap

// Helper for HLSL, using NVAPI

__target_intrinsic(hlsl, "NvInterlockedCompareExchangeUint64($0, $1, $2, $3)")
[__requiresNVAPI]
uint2 __cas(RWByteAddressBuffer buf, uint offset, uint2 compareValue, uint2 value);

// CAS using SM6.6
__target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)")
void __cas(RWByteAddressBuffer buf, uint offset, in int64_t compare_value, in int64_t value, out int64_t original_value);
__target_intrinsic(hlsl, "$0.InterlockedCompareExchange64($1, $2, $3, $4)")
void __cas(RWByteAddressBuffer buf, uint offset, in uint64_t compare_value, in uint64_t value, out uint64_t original_value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
int64_t __cas(__ref int64_t ioValue, int64_t compareValue, int64_t newValue)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$int64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue
        };
    }
}

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __cas(__ref uint64_t ioValue, uint64_t compareValue, uint64_t newValue)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicCompSwap($0, $1, $2)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicCompareExchange &ioValue Device None None $newValue $compareValue
        };
    }
}

// Max

__target_intrinsic(hlsl, "NvInterlockedMaxUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicMax(RWByteAddressBuffer buf, uint offset, uint2 value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicMax(__ref uint64_t ioValue, uint64_t value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicMax($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicUMax &ioValue Device None $value
        };
    }
}

// Min

__target_intrinsic(hlsl, "NvInterlockedMinUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicMin(RWByteAddressBuffer buf, uint offset, uint2 value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicMin(__ref uint64_t ioValue, uint64_t value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicMin($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicUMin &ioValue Device None $value
        };
    }
}

// And

__target_intrinsic(hlsl, "NvInterlockedAndUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicAnd(RWByteAddressBuffer buf, uint offset, uint2 value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicAnd(__ref uint64_t ioValue, uint64_t value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicAnd($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicAnd &ioValue Device None $value
        };
    }
}

// Or

__target_intrinsic(hlsl, "NvInterlockedOrUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicOr(RWByteAddressBuffer buf, uint offset, uint2 value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicOr(__ref uint64_t ioValue, uint64_t value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicOr($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicOr &ioValue Device None $value
        };
    }
}

// Xor

__target_intrinsic(hlsl, "NvInterlockedXorUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicXor(RWByteAddressBuffer buf, uint offset, uint2 value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicXor(__ref uint64_t ioValue, uint64_t value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicXor($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicXor &ioValue Device None $value
        };
    }
}

// Exchange

__target_intrinsic(hlsl, "NvInterlockedExchangeUint64($0, $1, $2)")
[__requiresNVAPI]
uint2 __atomicExchange(RWByteAddressBuffer buf, uint offset, uint2 value);

__glsl_version(430)
__glsl_extension(GL_EXT_shader_atomic_int64)
uint64_t __atomicExchange(__ref uint64_t ioValue, uint64_t value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "atomicExchange($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability Int64Atomics;
            result:$$uint64_t = OpAtomicExchange &ioValue Device None $value
        };
    }
}

// Conversion between uint64_t and uint2

uint2 __asuint2(uint64_t i)
{
    return uint2(uint(i), uint(i >> 32));
}

uint64_t __asuint64(uint2 i)
{
    return (uint64_t(i.y) << 32) | i.x;
}
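
// A quick sanity check of the packing convention (x holds the low 32 bits,
// y holds the high 32 bits):
//
//     __asuint2(0x0123456789ABCDEFULL)          == uint2(0x89ABCDEF, 0x01234567)
//     __asuint64(uint2(0x89ABCDEF, 0x01234567)) == 0x0123456789ABCDEFULL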

//

__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(ByteAddressBuffer buffer, int offset);

__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(RWByteAddressBuffer buffer, int offset);

__intrinsic_op($(kIROp_ByteAddressBufferLoad))
T __byteAddressBufferLoad<T>(RasterizerOrderedByteAddressBuffer buffer, int offset);

__intrinsic_op($(kIROp_ByteAddressBufferStore))
void __byteAddressBufferStore<T>(RWByteAddressBuffer buffer, int offset, T value);

__intrinsic_op($(kIROp_ByteAddressBufferStore))
void __byteAddressBufferStore<T>(RasterizerOrderedByteAddressBuffer buffer, int offset, T value);

__generic<T>
__magic_type(HLSLStructuredBufferType)
__intrinsic_type($(kIROp_HLSLStructuredBufferType))
struct StructuredBuffer
{
    [__readNone]
    [__unsafeForceInlineEarly]
    void GetDimensions(
        out uint numStructs,
        out uint stride)
    {
        let rs = __structuredBufferGetDimensions(this);
        numStructs = rs.x;
        stride = rs.y;
    }

    __intrinsic_op($(kIROp_StructuredBufferLoad))
    __target_intrinsic(glsl, "$0._data[$1]")
    __target_intrinsic(spirv, "%addr = OpAccessChain resultType*StorageBuffer resultId _0 const(int, 0) _1; OpLoad resultType resultId %addr;")
    [__readNone]
    T Load(int location);

    __intrinsic_op($(kIROp_StructuredBufferLoadStatus))
    T Load(int location, out uint status);

    __subscript(uint index) -> T
    {
        [__readNone]
        __intrinsic_op($(kIROp_StructuredBufferLoad))
        get;
    };
};

__generic<T>
__magic_type(HLSLConsumeStructuredBufferType)
__intrinsic_type($(kIROp_HLSLConsumeStructuredBufferType))
struct ConsumeStructuredBuffer
{
    __intrinsic_op($(kIROp_StructuredBufferConsume))
    T Consume();

    [ForceInline]
    void GetDimensions(
        out uint numStructs,
        out uint stride)
    {
        let result = __structuredBufferGetDimensions(this);
        numStructs = result.x;
        stride = result.y;
    }
};

__generic<T, let N : int>
__magic_type(HLSLInputPatchType)
__intrinsic_type($(kIROp_HLSLInputPatchType))
struct InputPatch
{
    __subscript(uint index) -> T;
};

__generic<T, let N : int>
__magic_type(HLSLOutputPatchType)
__intrinsic_type($(kIROp_HLSLOutputPatchType))
struct OutputPatch
{
    __subscript(uint index) -> T;
};

${{{{
static const struct {
    IROp op;
    char const* name;
} kMutableByteAddressBufferCases[] =
{
    { kIROp_HLSLRWByteAddressBufferType,                "RWByteAddressBuffer" },
    { kIROp_HLSLRasterizerOrderedByteAddressBufferType, "RasterizerOrderedByteAddressBuffer" },
};
for(auto item : kMutableByteAddressBufferCases) {
}}}}

__magic_type(HLSL$(item.name)Type)
__intrinsic_type($(item.op))
struct $(item.name)
{
    // Note(tfoley): supports all operations from `ByteAddressBuffer`
    // TODO(tfoley): can this be made a sub-type?

    __target_intrinsic(hlsl)
    __target_intrinsic(cpp)
    __target_intrinsic(cuda)
    [__unsafeForceInlineEarly]
    void GetDimensions(out uint dim);

    [__unsafeForceInlineEarly]
    __specialized_for_target(spirv)
    __specialized_for_target(glsl)
    void GetDimensions(out uint dim)
    {
        dim = __structuredBufferGetDimensions(__getEquivalentStructuredBuffer<uint>(this)).x*4;
    }

    __target_intrinsic(hlsl)
    [__NoSideEffect]
    uint Load(int location)
    {
        return __byteAddressBufferLoad<uint>(this, location);
    }

    [__NoSideEffect]
    uint Load(int location, out uint status);

    __target_intrinsic(hlsl)
    [__NoSideEffect]
    uint2 Load2(int location)
    {
        return __byteAddressBufferLoad<uint2>(this, location);
    }

    [__NoSideEffect]
    uint2 Load2(int location, out uint status);

    __target_intrinsic(hlsl)
    [__NoSideEffect]
    uint3 Load3(int location)
    {
        return __byteAddressBufferLoad<uint3>(this, location);
    }

    [__NoSideEffect]
    uint3 Load3(int location, out uint status);

    __target_intrinsic(hlsl)
    [__NoSideEffect]
    uint4 Load4(int location)
    {
        return __byteAddressBufferLoad<uint4>(this, location);
    }

    [__NoSideEffect]
    uint4 Load4(int location, out uint status);

    [__NoSideEffect]
    T Load<T>(int location)
    {
        return __byteAddressBufferLoad<T>(this, location);
    }
${{{{
    if (item.op == kIROp_HLSLRWByteAddressBufferType)
    {
}}}}

    // float32 and int64 atomic support. This is a Slang-specific extension; it uses
    // GL_EXT_shader_atomic_float on Vulkan and NVAPI support on DX.
    // NOTE! To use this feature on HLSL-based targets, the path to 'nvHLSLExtns.h' from the NvAPI SDK
    // must be set. This include will be added to the *output* that is passed to a downstream compiler.
    // Also note that you *can* include NVAPI headers in your Slang source and directly use NVAPI
    // functions. Directly using NVAPI functions does *not* add the #include to the output.
    // Finally, note that you can *mix* direct NVAPI calls with the NVAPI-backed intrinsics below. This
    // doesn't cause any clashes, as Slang emits any NVAPI function it parsed (say, via an include in
    // Slang source) as a uniquely named function.
    //
    // https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#VK_EXT_shader_atomic_float
    // https://htmlpreview.github.io/?https://github.com/KhronosGroup/SPIRV-Registry/blob/master/extensions/EXT/SPV_EXT_shader_atomic_float_add.html

    // F32 Add

    __cuda_sm_version(2.0)
    [__requiresNVAPI]
    void InterlockedAddF32(uint byteAddress, float valueToAdd, out float originalValue)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm "($3 = NvInterlockedAddFp32($0, $1, $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<float>($1), $2))";
        case glsl:
        case spirv:
            {
                let buf = __getEquivalentStructuredBuffer<float>(this);
                originalValue = __atomicAdd(buf[byteAddress / 4], valueToAdd);
                return;
            }
        }
    }

    // Without returning original value

    [__requiresNVAPI]
    __cuda_sm_version(2.0)
    void InterlockedAddF32(uint byteAddress, float valueToAdd)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm "(NvInterlockedAddFp32($0, $1, $2))";
        case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<float>($1), $2)";
        case glsl:
        case spirv:
            {
                let buf = __getEquivalentStructuredBuffer<float>(this);
                __atomicAdd(buf[byteAddress / 4], valueToAdd);
                return;
            }
        }
    }
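
    // A minimal usage sketch (illustrative, not part of the stdlib): assuming a
    // global `RWByteAddressBuffer gAccum`, and NVAPI configured as described
    // above when targeting HLSL:
    //
    //     float original;
    //     gAccum.InterlockedAddF32(0, 1.0f, original); // add at byte offset 0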

    // Int64 Add
    __cuda_sm_version(6.0)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
    {
        __target_switch
        {
        case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint64_t>($1), $2))";
        case hlsl:
            originalValue = __asuint64(__atomicAdd(this, byteAddress, __asuint2(valueToAdd)));
        case glsl:
        case spirv:
            {
                let buf = __getEquivalentStructuredBuffer<int64_t>(this);
                originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
            }
        }
    }

    // Without returning original value
    __cuda_sm_version(6.0)
    __target_intrinsic(cuda, "atomicAdd($0._getPtrAt<uint64_t>($1), $2)")
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd);

    __specialized_for_target(hlsl)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
    {
        __atomicAdd(this, byteAddress, __asuint2(valueToAdd));
    }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    void InterlockedAddI64(uint byteAddress, int64_t valueToAdd)
    {
        let buf = __getEquivalentStructuredBuffer<int64_t>(this);
        __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }

    // Cas uint64_t

    __target_intrinsic(cuda, "(*$4 = atomicCAS($0._getPtrAt<uint64_t>($1), $2, $3))")
    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue);

    __specialized_for_target(hlsl)
    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        outOriginalValue = __asuint64(__cas(this, byteAddress, __asuint2(compareValue), __asuint2(value)));
    }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    void InterlockedCompareExchangeU64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
    }
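
    // A typical retry loop built on the CAS above (illustrative; assumes a
    // global `RWByteAddressBuffer gBuf` and a caller-provided `newValue`):
    //
    //     uint64_t seen = 0;
    //     uint64_t prev;
    //     for (;;)
    //     {
    //         gBuf.InterlockedCompareExchangeU64(0, seen, newValue, prev);
    //         if (prev == seen) break; // the slot held `seen`, so the swap landed
    //         seen = prev;             // lost the race; retry with the observed value
    //     }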

    // Max

    __cuda_sm_version(3.5)
    __target_intrinsic(cuda, "atomicMax($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMax(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    uint64_t InterlockedMaxU64(uint byteAddress, uint64_t value)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicMax(buf[byteAddress / 8], value);
    }

    // Min

    __cuda_sm_version(3.5)
    __target_intrinsic(cuda, "atomicMin($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicMin(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    uint64_t InterlockedMinU64(uint byteAddress, uint64_t value)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicMin(buf[byteAddress / 8], value);
    }

    // And

    __target_intrinsic(cuda, "atomicAnd($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicAnd(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    uint64_t InterlockedAndU64(uint byteAddress, uint64_t value)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicAnd(buf[byteAddress / 8], value);
    }

    // Or

    __target_intrinsic(cuda, "atomicOr($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicOr(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    uint64_t InterlockedOrU64(uint byteAddress, uint64_t value)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicOr(buf[byteAddress / 8], value);
    }

    // Xor

    __target_intrinsic(cuda, "atomicXor($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicXor(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    uint64_t InterlockedXorU64(uint byteAddress, uint64_t value)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicXor(buf[byteAddress / 8], value);
    }

    // Exchange

    __target_intrinsic(cuda, "atomicExch($0._getPtrAt<uint64_t>($1), $2)")
    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value);

    __specialized_for_target(hlsl)
    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value) { return __asuint64(__atomicExchange(this, byteAddress, __asuint2(value))); }

    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    uint64_t InterlockedExchangeU64(uint byteAddress, uint64_t value)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        return __atomicExchange(buf[byteAddress / 8], value);
    }

    // SM6.6 64-bit atomics.
    __specialized_for_target(hlsl)
    void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t outOriginalValue)
    {
        __atomicAdd(this, byteAddress, valueToAdd, outOriginalValue);
    }
    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    void InterlockedAdd64(uint byteAddress, int64_t valueToAdd, out int64_t originalValue)
    {
        let buf = __getEquivalentStructuredBuffer<int64_t>(this);
        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }
    __specialized_for_target(hlsl)
    void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t outOriginalValue)
    {
        __atomicAdd(this, byteAddress, valueToAdd, outOriginalValue);
    }
    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    void InterlockedAdd64(uint byteAddress, uint64_t valueToAdd, out uint64_t originalValue)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        originalValue = __atomicAdd(buf[byteAddress / 8], valueToAdd);
    }
    __specialized_for_target(hlsl)
    void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue)
    {
        __cas(this, byteAddress, compareValue, value, outOriginalValue);
    }
    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    void InterlockedCompareExchange64(uint byteAddress, int64_t compareValue, int64_t value, out int64_t outOriginalValue)
    {
        let buf = __getEquivalentStructuredBuffer<int64_t>(this);
        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
    }
    __specialized_for_target(hlsl)
    void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        __cas(this, byteAddress, compareValue, value, outOriginalValue);
    }
    __specialized_for_target(glsl)
    __specialized_for_target(spirv)
    void InterlockedCompareExchange64(uint byteAddress, uint64_t compareValue, uint64_t value, out uint64_t outOriginalValue)
    {
        let buf = __getEquivalentStructuredBuffer<uint64_t>(this);
        outOriginalValue = __cas(buf[byteAddress / 8], compareValue, value);
    }
${{{{
    } // end if (item.op == kIROp_HLSLRWByteAddressBufferType)
}}}}

    // Added operations:
    void InterlockedAdd(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($3 = atomicAdd($0._data[$1/4], $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicAdd($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedAdd";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedAdd(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedAdd(
        UINT dest,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicAdd($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "atomicAdd($0._getPtrAt<uint32_t>($1), $2)";
        case hlsl: __intrinsic_asm ".InterlockedAdd";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedAdd(buf[dest / 4], value);
        }
    }

    void InterlockedAnd(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "$3 = atomicAnd($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "(*$3 = atomicAnd($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedAnd";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedAnd(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedAnd(
        UINT dest,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicAnd($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "atomicAnd($0._getPtrAt<uint32_t>($1), $2)";
        case hlsl: __intrinsic_asm ".InterlockedAnd";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedAnd(buf[dest / 4], value);
        }
    }

    void InterlockedCompareExchange(
        UINT dest,
        UINT compare_value,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($4 = atomicCompSwap($0._data[$1/4], $2, $3))";
        case cuda: __intrinsic_asm "(*$4 = atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3))";
        case hlsl: __intrinsic_asm ".InterlockedCompareExchange";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedCompareExchange(buf[dest / 4], compare_value, value, original_value);
        }
    }

    void InterlockedCompareStore(
        UINT dest,
        UINT compare_value,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicCompSwap($0._data[$1/4], $2, $3)";
        case cuda: __intrinsic_asm "atomicCAS($0._getPtrAt<uint32_t>($1), $2, $3)";
        case hlsl: __intrinsic_asm ".InterlockedCompareStore";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedCompareStore(buf[dest / 4], compare_value, value);
        }
    }

    void InterlockedExchange(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($3 = atomicExchange($0._data[$1/4], $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicExch($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedExchange";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedExchange(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedMax(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($3 = atomicMax($0._data[$1/4], $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicMax($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedMax";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedMax(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedMax(
        UINT dest,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicMax($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "atomicMax($0._getPtrAt<uint32_t>($1), $2)";
        case hlsl: __intrinsic_asm ".InterlockedMax";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedMax(buf[dest / 4], value);
        }
    }

    void InterlockedMin(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($3 = atomicMin($0._data[$1/4], $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicMin($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedMin";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedMin(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedMin(
        UINT dest,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicMin($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "atomicMin($0._getPtrAt<uint32_t>($1), $2)";
        case hlsl: __intrinsic_asm ".InterlockedMin";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedMin(buf[dest / 4], value);
        }
    }

    void InterlockedOr(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($3 = atomicOr($0._data[$1/4], $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicOr($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedOr";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedOr(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedOr(
        UINT dest,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicOr($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "atomicOr($0._getPtrAt<uint32_t>($1), $2)";
        case hlsl: __intrinsic_asm ".InterlockedOr";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedOr(buf[dest / 4], value);
        }
    }

    void InterlockedXor(
        UINT dest,
        UINT value,
        out UINT original_value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "($3 = atomicXor($0._data[$1/4], $2))";
        case cuda: __intrinsic_asm "(*$3 = atomicXor($0._getPtrAt<uint32_t>($1), $2))";
        case hlsl: __intrinsic_asm ".InterlockedXor";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedXor(buf[dest / 4], value, original_value);
        }
    }

    void InterlockedXor(
        UINT dest,
        UINT value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "atomicXor($0._data[$1/4], $2)";
        case cuda: __intrinsic_asm "atomicXor($0._getPtrAt<uint32_t>($1), $2)";
        case hlsl: __intrinsic_asm ".InterlockedXor";
        case spirv:
            let buf = __getEquivalentStructuredBuffer<uint>(this);
            ::InterlockedXor(buf[dest / 4], value);
        }
    }

    __target_intrinsic(hlsl)
    [ForceInline]
    void Store(
        uint address,
        uint value)
    {
        __byteAddressBufferStore(this, address, value);
    }

    __target_intrinsic(hlsl)
    [ForceInline]
    void Store2(uint address, uint2 value)
    {
        __byteAddressBufferStore(this, address, value);
    }

    __target_intrinsic(hlsl)
    [ForceInline]
    void Store3(
        uint address,
        uint3 value)
    {
        __byteAddressBufferStore(this, address, value);
    }

    __target_intrinsic(hlsl)
    [ForceInline]
    void Store4(
        uint address,
        uint4 value)
    {
        __byteAddressBufferStore(this, address, value);
    }

    void Store<T>(int offset, T value)
    {
        __byteAddressBufferStore(this, offset, value);
    }
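
    // The generic Load<T>/Store<T> pair above reads and writes whole values at a
    // byte offset. An illustrative sketch (`Particle` and `kParticleStride` are
    // assumed user-side definitions, with the stride matching the target layout):
    //
    //     Particle p = gBuf.Load<Particle>(index * kParticleStride);
    //     p.position += p.velocity * dt;
    //     gBuf.Store<Particle>(index * kParticleStride, p);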
};

${{{{
}
}}}}

${{{{
static const struct {
    IROp op;
    char const* name;
} kMutableStructuredBufferCases[] =
{
    { kIROp_HLSLRWStructuredBufferType,                "RWStructuredBuffer" },
    { kIROp_HLSLRasterizerOrderedStructuredBufferType, "RasterizerOrderedStructuredBuffer" },
};
for(auto item : kMutableStructuredBufferCases) {
}}}}

__generic<T>
__magic_type(HLSL$(item.name)Type)
__intrinsic_type($(item.op))
struct $(item.name)
{
    uint DecrementCounter();

    [__readNone]
    [__unsafeForceInlineEarly]
    __target_intrinsic(hlsl)
    void GetDimensions(
        out uint numStructs,
        out uint stride)
    {
        let rs = __structuredBufferGetDimensions(this);
        numStructs = rs.x;
        stride = rs.y;
    }

    uint IncrementCounter();

    [__NoSideEffect]
    __intrinsic_op($(kIROp_RWStructuredBufferLoad))
    T Load(int location);

    [__NoSideEffect]
    __intrinsic_op($(kIROp_RWStructuredBufferLoadStatus))
    T Load(int location, out uint status);

    __subscript(uint index) -> T
    {
        [__NoSideEffect]
        __intrinsic_op($(kIROp_RWStructuredBufferGetElementPtr))
        ref;
    }
};
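
// An illustrative usage sketch for the mutable structured-buffer types above
// (assumes a global `RWStructuredBuffer<float4> gData`):
//
//     uint slot = gData.IncrementCounter(); // bump the hidden counter
//     gData[slot] = float4(1, 2, 3, 4);     // the subscript yields a writable ref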

${{{{
}
}}}}

__generic<T>
__magic_type(HLSLPointStreamType)
__intrinsic_type($(kIROp_HLSLPointStreamType))
struct PointStream
{
    [KnownBuiltin("GeometryStreamAppend")]
    void Append(T value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "EmitVertex()";
        case hlsl: __intrinsic_asm ".Append";
        case spirv: spirv_asm { OpEmitVertex; };
        }
    }

    [KnownBuiltin("GeometryStreamRestart")]
    void RestartStrip()
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "EndPrimitive()";
        case hlsl: __intrinsic_asm ".RestartStrip";
        case spirv: spirv_asm { OpEndPrimitive; };
        }
    }
};

__generic<T>
__magic_type(HLSLLineStreamType)
__intrinsic_type($(kIROp_HLSLLineStreamType))
struct LineStream
{
    [KnownBuiltin("GeometryStreamAppend")]
    void Append(T value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "EmitVertex()";
        case hlsl: __intrinsic_asm ".Append";
        case spirv: spirv_asm { OpEmitVertex; };
        }
    }

    [KnownBuiltin("GeometryStreamRestart")]
    void RestartStrip()
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "EndPrimitive()";
        case hlsl: __intrinsic_asm ".RestartStrip";
        case spirv: spirv_asm { OpEndPrimitive; };
        }
    }
};

__generic<T>
__magic_type(HLSLTriangleStreamType)
__intrinsic_type($(kIROp_HLSLTriangleStreamType))
struct TriangleStream
{
    [KnownBuiltin("GeometryStreamAppend")]
    void Append(T value)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "EmitVertex()";
        case hlsl: __intrinsic_asm ".Append";
        case spirv: spirv_asm { OpEmitVertex; };
        }
    }

    [KnownBuiltin("GeometryStreamRestart")]
    void RestartStrip()
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "EndPrimitive()";
        case hlsl: __intrinsic_asm ".RestartStrip";
        case spirv: spirv_asm { OpEndPrimitive; };
        }
    }
};
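
// A minimal geometry-shader sketch using the stream types above (illustrative;
// `V` stands for an assumed user-defined vertex struct):
//
//     [shader("geometry")]
//     [maxvertexcount(3)]
//     void gsMain(triangle V input[3], inout TriangleStream<V> stream)
//     {
//         for (int i = 0; i < 3; ++i)
//             stream.Append(input[i]);
//         stream.RestartStrip();
//     }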

#define VECTOR_MAP_UNARY(TYPE, COUNT, FUNC, VALUE) \
    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(VALUE[i]); } return result

#define MATRIX_MAP_UNARY(TYPE, ROWS, COLS, FUNC, VALUE) \
    matrix<TYPE,ROWS,COLS> result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(VALUE[i]); } return result

#define VECTOR_MAP_BINARY(TYPE, COUNT, FUNC, LEFT, RIGHT) \
    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result

#define MATRIX_MAP_BINARY(TYPE, ROWS, COLS, FUNC, LEFT, RIGHT) \
    matrix<TYPE,ROWS,COLS> result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(LEFT[i], RIGHT[i]); } return result

#define VECTOR_MAP_TRINARY(TYPE, COUNT, FUNC, A, B, C) \
    vector<TYPE,COUNT> result; for(int i = 0; i < COUNT; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result

#define MATRIX_MAP_TRINARY(TYPE, ROWS, COLS, FUNC, A, B, C) \
    matrix<TYPE,ROWS,COLS> result; for(int i = 0; i < ROWS; ++i) { result[i] = FUNC(A[i], B[i], C[i]); } return result
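
// For example, the body of the vector `abs` overload below expands via
// VECTOR_MAP_UNARY(T, N, abs, x) to:
//
//     vector<T,N> result;
//     for(int i = 0; i < N; ++i) { result[i] = abs(x[i]); }
//     return result;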

// Try to terminate the current draw or dispatch call (HLSL SM 4.0)
void abort();

// Absolute value (HLSL SM 1.0)

__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_abs($0)")
__target_intrinsic(cpp, "$P_abs($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
T abs(T x);
/*{
    // Note: this simple definition may not be appropriate for floating-point inputs
    return x < 0 ? -x : x;
}*/

__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
vector<T, N> abs(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, abs, x);
}

__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> abs(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, abs, x);
}

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_abs($0)")
__target_intrinsic(cpp, "$P_abs($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
T abs(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fi(FAbs, SAbs) _0")
[__readNone]
vector<T, N> abs(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, abs, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> abs(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, abs, x);
}

// Inverse cosine (HLSL SM 1.0)

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_acos($0)")
__target_intrinsic(cpp, "$P_acos($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Acos _0")
[__readNone]
T acos(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Acos _0")
[__readNone]
vector<T, N> acos(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, acos, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> acos(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, acos, x);
}

// Test if all components are non-zero (HLSL SM 1.0)
__generic<T : __BuiltinType>
[__readNone]
bool all(T x)
{
    __target_switch
    {
    default:
        __intrinsic_asm "bool($0)";
    case hlsl:
        __intrinsic_asm "all";
    case spirv:
        let zero = __default<T>();
        if (__isInt<T>())
            return spirv_asm
            {
                OpINotEqual $$bool result $x $zero
            };
        else if (__isFloat<T>())
            return spirv_asm
            {
                OpFUnordNotEqual $$bool result $x $zero
            };
        else if (__isBool<T>())
            return __slang_noop_cast<bool>(x);
    }
}

__generic<T : __BuiltinType, let N : int>
[__readNone]
bool all(vector<T,N> x)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "all";
    case glsl:
        __intrinsic_asm "all(bvec$N0($0))";
    case spirv:
        if (__isBool<T>())
            return spirv_asm
            {
                OpAll $$bool result $x
            };
        else if (__isInt<T>())
        {
            let zero = __default<vector<T,N>>();
            return spirv_asm
            {
                OpINotEqual $$vector<bool,N> %castResult $x $zero;
                OpAll $$bool result %castResult
            };
        }
        else
        {
            let zero = __default<T>();
            return spirv_asm
            {
                OpFUnordNotEqual $$vector<bool,N> %castResult $x $zero;
                OpAll $$bool result %castResult
            };
        }
    default:
        bool result = true;
        for(int i = 0; i < N; ++i)
            result = result && all(x[i]);
        return result;
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
bool all(matrix<T,N,M> x)
{
    bool result = true;
    for(int i = 0; i < N; ++i)
        result = result && all(x[i]);
    return result;
}

// Barrier for writes to all memory spaces (HLSL SM 5.0)
__glsl_extension(GL_KHR_memory_scope_semantics)
void AllMemoryBarrier()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "AllMemoryBarrier";
    case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)";
    case cuda: __intrinsic_asm "__threadfence()";
    case spirv: spirv_asm
        {
            OpMemoryBarrier Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory;
        };
    }
}

// Thread-group sync and barrier for writes to all memory spaces (HLSL SM 5.0)
__glsl_extension(GL_KHR_memory_scope_semantics)
void AllMemoryBarrierWithGroupSync()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "AllMemoryBarrierWithGroupSync";
    case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsShared|gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)";
    case cuda: __intrinsic_asm "__syncthreads()";
    case spirv: spirv_asm
        {
            OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|WorkgroupMemory|ImageMemory;
        };
    }
}
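
// An illustrative compute sketch: every write made before the barrier is visible
// to all threads in the group after it (assumes a groupshared array `tile` and a
// global `RWStructuredBuffer<float> gOut`):
//
//     groupshared float tile[64];
//
//     [shader("compute")]
//     [numthreads(64, 1, 1)]
//     void csMain(uint3 tid : SV_GroupThreadID)
//     {
//         tile[tid.x] = float(tid.x);
//         AllMemoryBarrierWithGroupSync();
//         gOut[tid.x] = tile[63 - tid.x]; // safely reads another thread's write
//     }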

// Test if any component is non-zero (HLSL SM 1.0)

__generic<T : __BuiltinType>
[__readNone]
bool any(T x)
{
    __target_switch
    {
    default:
        __intrinsic_asm "bool($0)";
    case hlsl:
        __intrinsic_asm "any";
    case spirv:
        let zero = __default<T>();
        if (__isInt<T>())
            return spirv_asm
            {
                OpINotEqual $$bool result $x $zero
            };
        else if (__isFloat<T>())
            return spirv_asm
            {
                OpFUnordNotEqual $$bool result $x $zero
            };
        else if (__isBool<T>())
            return __slang_noop_cast<bool>(x);
    }
}

__generic<T : __BuiltinType, let N : int>
[__readNone]
bool any(vector<T, N> x)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "any";
    case glsl:
        __intrinsic_asm "any(bvec$N0($0))";
    case spirv:
        if (__isBool<T>())
            return spirv_asm
            {
                OpAny $$bool result $x
            };
        else if (__isInt<T>())
        {
            let zero = __default<vector<T,N>>();
            return spirv_asm
            {
                OpINotEqual $$vector<bool,N> %castResult $x $zero;
                OpAny $$bool result %castResult
            };
        }
        else
        {
            let zero = __default<T>();
            return spirv_asm
            {
                OpFUnordNotEqual $$vector<bool,N> %castResult $x $zero;
                OpAny $$bool result %castResult
            };
        }
    default:
        bool result = false;
        for(int i = 0; i < N; ++i)
            result = result || any(x[i]);
        return result;
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
bool any(matrix<T, N, M> x)
{
    bool result = false;
    for(int i = 0; i < N; ++i)
        result = result || any(x[i]);
    return result;
}
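
// Illustrative results, chosen to be easy to check against the definitions above:
//
//     all(int3(1, 2, 3))       // true:  every component is non-zero
//     all(float2(1.0, 0.0))    // false: one component is zero
//     any(uint4(0, 0, 4, 0))   // true:  at least one component is non-zero
//     any(bool2(false, false)) // false: no component is set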


// Reinterpret bits as a double (HLSL SM 5.0)

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "packDouble2x32(uvec2($0, $1))")
__target_intrinsic(cpp, "$P_asdouble($0, $1)")
__target_intrinsic(cuda, "$P_asdouble($0, $1)")
__target_intrinsic(spirv, "%v = OpCompositeConstruct _type(uint2) resultId _0 _1; OpExtInst resultType resultId glsl450 59 %v")
__glsl_extension(GL_ARB_gpu_shader5)
[__readNone]
double asdouble(uint lowbits, uint highbits);
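
// Round-trip sketch with `asuint(double, out uint, out uint)` (defined later in
// this file) as the inverse:
//
//     uint lo, hi;
//     double d0 = 3.25;
//     asuint(d0, lo, hi);           // split the 64-bit pattern into two words
//     double d1 = asdouble(lo, hi); // d1 == d0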

// Reinterpret bits as a float (HLSL SM 4.0)

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "intBitsToFloat")
__target_intrinsic(cpp, "$P_asfloat($0)")
__target_intrinsic(cuda, "$P_asfloat($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
float asfloat(int x);

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uintBitsToFloat")
__target_intrinsic(cpp, "$P_asfloat($0)")
__target_intrinsic(cuda, "$P_asfloat($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
float asfloat(uint x);

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "intBitsToFloat")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
vector<float, N> asfloat(vector< int, N> x)
{
    VECTOR_MAP_UNARY(float, N, asfloat, x);
}

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uintBitsToFloat")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
vector<float,N> asfloat(vector<uint,N> x)
{
    VECTOR_MAP_UNARY(float, N, asfloat, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<float,N,M> asfloat(matrix< int,N,M> x)
{
    MATRIX_MAP_UNARY(float, N, M, asfloat, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<float,N,M> asfloat(matrix<uint,N,M> x)
{
    MATRIX_MAP_UNARY(float, N, M, asfloat, x);
}

// No op
[__unsafeForceInlineEarly]
[__readNone]
float asfloat(float x)
{ return x; }

__generic<let N : int>
[__unsafeForceInlineEarly]
[__readNone]
vector<float,N> asfloat(vector<float,N> x)
{ return x; }

__generic<let N : int, let M : int>
[__unsafeForceInlineEarly]
[__readNone]
matrix<float,N,M> asfloat(matrix<float,N,M> x)
{ return x; }

// Inverse sine (HLSL SM 1.0)
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_asin($0)")
__target_intrinsic(cpp, "$P_asin($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Asin _0")
[__readNone]
T asin(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Asin _0")
[__readNone]
vector<T, N> asin(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T,N,asin,x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> asin(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T,N,M,asin,x);
}

// Reinterpret bits as an int (HLSL SM 4.0)

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToInt")
__target_intrinsic(cpp, "$P_asint($0)")
__target_intrinsic(cuda, "$P_asint($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
int asint(float x);

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "int($0)")
__target_intrinsic(cpp, "$P_asint($0)")
__target_intrinsic(cuda, "$P_asint($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
int asint(uint x);

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToInt")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
vector<int, N> asint(vector<float, N> x)
{
    VECTOR_MAP_UNARY(int, N, asint, x);
}

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "ivec$N0($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
vector<int, N> asint(vector<uint, N> x)
{
    VECTOR_MAP_UNARY(int, N, asint, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<int, N, M> asint(matrix<float, N, M> x)
{
    MATRIX_MAP_UNARY(int, N, M, asint, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<int, N, M> asint(matrix<uint, N, M> x)
{
    MATRIX_MAP_UNARY(int, N, M, asint, x);
}

// No op
[__unsafeForceInlineEarly]
[__readNone]
int asint(int x)
{ return x; }

__generic<let N : int>
[__unsafeForceInlineEarly]
[__readNone]
vector<int,N> asint(vector<int,N> x)
{ return x; }

__generic<let N : int, let M : int>
[__unsafeForceInlineEarly]
[__readNone]
matrix<int,N,M> asint(matrix<int,N,M> x)
{ return x; }

// Reinterpret bits of double as a uint (HLSL SM 5.0)

__glsl_extension(GL_ARB_gpu_shader5)
[__readNone]
void asuint(double value, out uint lowbits, out uint highbits)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "asuint";
    case glsl: __intrinsic_asm "{ uvec2 v = unpackDouble2x32($0); $1 = v.x; $2 = v.y; }";
    case cpp:
    case cuda:
        __intrinsic_asm "$P_asuint($0, $1, $2)";
    case spirv:
        let uv = spirv_asm
        {
            result : $$uint2 = OpBitcast $value;
        };
        lowbits = uv.x;
        highbits = uv.y;
        return;
    }
}

// Reinterpret bits as a uint (HLSL SM 4.0)

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToUint")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
__target_intrinsic(cpp, "$P_asuint($0)")
__target_intrinsic(cuda, "$P_asuint($0)")
[__readNone]
uint asuint(float x);

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uint($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
__target_intrinsic(cpp, "$P_asuint($0)")
__target_intrinsic(cuda, "$P_asuint($0)")
[__readNone]
uint asuint(int x);

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "floatBitsToUint")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
vector<uint,N> asuint(vector<float,N> x)
{
    VECTOR_MAP_UNARY(uint, N, asuint, x);
}

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uvec$N0($0)")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
vector<uint, N> asuint(vector<int, N> x)
{
    VECTOR_MAP_UNARY(uint, N, asuint, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<uint,N,M> asuint(matrix<float,N,M> x)
{
    MATRIX_MAP_UNARY(uint, N, M, asuint, x);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<uint, N, M> asuint(matrix<int, N, M> x)
{
    MATRIX_MAP_UNARY(uint, N, M, asuint, x);
}

[__unsafeForceInlineEarly]
[__readNone]
uint asuint(uint x)
{ return x; }

__generic<let N : int>
[__unsafeForceInlineEarly]
[__readNone]
vector<uint,N> asuint(vector<uint,N> x)
{ return x; }

__generic<let N : int, let M : int>
[__unsafeForceInlineEarly]
[__readNone]
matrix<uint,N,M> asuint(matrix<uint,N,M> x)
{ return x; }


// 16-bit bitcast ops (HLSL SM 6.2)
//
// TODO: We need to map these to GLSL/SPIR-V
// operations that don't require an intermediate
// conversion to fp32.

// Identity cases:

[__unsafeForceInlineEarly][__readNone] float16_t asfloat16(float16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<float16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return value; }

[__unsafeForceInlineEarly][__readNone] int16_t asint16(int16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<int16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }

[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(uint16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<uint16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }

// Signed<->unsigned cases:

[__unsafeForceInlineEarly][__readNone] int16_t asint16(uint16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<uint16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<uint16_t,R,C> value) { return value; }

[__unsafeForceInlineEarly][__readNone] uint16_t asuint16(int16_t value) { return value; }
[__unsafeForceInlineEarly][__readNone] vector<uint16_t,N> asuint16<let N : int>(vector<int16_t,N> value) { return value; }
[__unsafeForceInlineEarly][__readNone] matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return value; }

// Float->unsigned cases:

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "uint16_t(packHalf2x16(vec2($0, 0.0)))")
__target_intrinsic(cuda, "__half_as_ushort")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
uint16_t asuint16(float16_t value);

[__readNone]
vector<uint16_t,N> asuint16<let N : int>(vector<float16_t,N> value)
{ VECTOR_MAP_UNARY(uint16_t, N, asuint16, value); }

[__readNone]
matrix<uint16_t,R,C> asuint16<let R : int, let C : int>(matrix<float16_t,R,C> value)
{ MATRIX_MAP_UNARY(uint16_t, R, C, asuint16, value); }

// Unsigned->float cases:

__target_intrinsic(hlsl)
__target_intrinsic(glsl, "float16_t(unpackHalf2x16($0).x)")
__target_intrinsic(cuda, "__ushort_as_half")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
float16_t asfloat16(uint16_t value);

[__readNone]
vector<float16_t,N> asfloat16<let N : int>(vector<uint16_t,N> value)
{ VECTOR_MAP_UNARY(float16_t, N, asfloat16, value); }

[__readNone]
matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<uint16_t,R,C> value)
{ MATRIX_MAP_UNARY(float16_t, R, C, asfloat16, value); }

// Float<->signed cases:

__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__half_as_short")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__unsafeForceInlineEarly][__readNone] int16_t asint16(float16_t value) { return asuint16(value); }
__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<int16_t,N> asint16<let N : int>(vector<float16_t,N> value) { return asuint16(value); }
__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<int16_t,R,C> asint16<let R : int, let C : int>(matrix<float16_t,R,C> value) { return asuint16(value); }

__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__short_as_half")
__target_intrinsic(spirv, "OpBitcast resultType resultId _0")
[__readNone]
[__unsafeForceInlineEarly] float16_t asfloat16(int16_t value) { return asfloat16(asuint16(value)); }

__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] vector<float16_t,N> asfloat16<let N : int>(vector<int16_t,N> value) { return asfloat16(asuint16(value)); }
__target_intrinsic(hlsl) [__unsafeForceInlineEarly][__readNone] matrix<float16_t,R,C> asfloat16<let R : int, let C : int>(matrix<int16_t,R,C> value) { return asfloat16(asuint16(value)); }

// Inverse tangent (HLSL SM 1.0)
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_atan($0)")
__target_intrinsic(cpp, "$P_atan($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan _0")
[__readNone]
T atan(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan _0")
[__readNone]
vector<T, N> atan(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, atan, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> atan(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, atan, x);
}

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"atan($0,$1)")
__target_intrinsic(cuda, "$P_atan2($0, $1)")
__target_intrinsic(cpp, "$P_atan2($0, $1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
[__readNone]
T atan2(T y, T x);
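
// Example (illustrative): atan2(y, x) gives the angle of the point (x, y), e.g.
//
//     float a = atan2(1.0f, 1.0f); // pi/4, ~0.7853982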

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"atan($0,$1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Atan2 _0 _1")
[__readNone]
vector<T, N> atan2(vector<T, N> y, vector<T, N> x)
{
    VECTOR_MAP_BINARY(T, N, atan2, y, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> atan2(matrix<T,N,M> y, matrix<T,N,M> x)
{
    MATRIX_MAP_BINARY(T, N, M, atan2, y, x);
}

// Ceiling (HLSL SM 1.0)
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_ceil($0)")
__target_intrinsic(cpp, "$P_ceil($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Ceil _0")
[__readNone]
T ceil(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Ceil _0")
[__readNone]
vector<T, N> ceil(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, ceil, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ceil(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, ceil, x);
}


// Check whether a tiled-resource access touched only mapped tiles
bool CheckAccessFullyMapped(uint status);

// Clamp (HLSL SM 1.0)
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
T clamp(T x, T minBound, T maxBound)
{
    return min(max(x, minBound), maxBound);
}

__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
{
    return min(max(x, minBound), maxBound);
}

__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
{
    return min(max(x, minBound), maxBound);
}

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
T clamp(T x, T minBound, T maxBound)
{
    return min(max(x, minBound), maxBound);
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FClamp, UClamp, SClamp) _0 _1 _2")
[__readNone]
vector<T, N> clamp(vector<T, N> x, vector<T, N> minBound, vector<T, N> maxBound)
{
    return min(max(x, minBound), maxBound);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> clamp(matrix<T,N,M> x, matrix<T,N,M> minBound, matrix<T,N,M> maxBound)
{
    return min(max(x, minBound), maxBound);
}

// Clip (discard) fragment conditionally
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
void clip(T x)
{
    if(x < T(0)) discard;
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
void clip(vector<T,N> x)
{
    if(any(x < T(0))) discard;
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
void clip(matrix<T,N,M> x)
{
    if(any(x < T(0))) discard;
}
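
// Example (illustrative sketch; `albedo` is a hypothetical fragment input):
//
//     clip(albedo.a - 0.5f); // discards the fragment when albedo.a < 0.5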

// Cosine
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_cos($0)")
__target_intrinsic(cpp, "$P_cos($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cos _0")
[__readNone]
T cos(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cos _0")
[__readNone]
vector<T, N> cos(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T,N, cos, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> cos(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, cos, x);
}

// Hyperbolic cosine
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_cosh($0)")
__target_intrinsic(cpp, "$P_cosh($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cosh _0")
[__readNone]
T cosh(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cosh _0")
[__readNone]
vector<T,N> cosh(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T,N, cosh, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> cosh(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, cosh, x);
}

// Population count
[__readNone]
uint countbits(uint value)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "countbits";
    case glsl:
        __intrinsic_asm "bitCount";
    case cuda:
    case cpp:
        __intrinsic_asm "$P_countbits($0)";
    case spirv:
        return spirv_asm {OpBitCount $$uint result $value};
    }
}
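
// Example (illustrative):
//
//     uint n = countbits(0xF0F0u); // 8 set bits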

// Cross product
// TODO: The SPIR-V GLSL.std.450 Cross instruction does not support integer vectors.
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cross _0 _1")
[__readNone]
vector<T,3> cross(vector<T,3> left, vector<T,3> right)
{
    return vector<T,3>(
        left.y * right.z - left.z * right.y,
        left.z * right.x - left.x * right.z,
        left.x * right.y - left.y * right.x);
}

__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Cross _0 _1")
[__readNone]
vector<T, 3> cross(vector<T, 3> left, vector<T, 3> right)
{
    return vector<T, 3>(
        left.y * right.z - left.z * right.y,
        left.z * right.x - left.x * right.z,
        left.x * right.y - left.y * right.x);
}

// Convert a D3DCOLOR-encoded color to a UBYTE4
__target_intrinsic(hlsl)
[__readNone]
int4 D3DCOLORtoUBYTE4(float4 color)
{
    let scaled = color.zyxw * 255.001999f;
    return int4(scaled);
}
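
// Example (illustrative): the zyxw swizzle and 0..255 scaling mean that
//
//     int4 c = D3DCOLORtoUBYTE4(float4(1, 0, 0, 1)); // int4(0, 0, 255, 255)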

// Screen-space partial derivatives (ddx/ddy)
${{{{
const char* diffDimensions[2] = {"x", "y"};
for (auto xOrY : diffDimensions) {
}}}}
__generic<T : __BuiltinFloatingPointType>
[__readNone]
T dd$(xOrY)(T x)
{
    __target_switch
    {
    case hlsl:
    case cpp:
    case cuda:
        __intrinsic_asm "dd$(xOrY)";
    case glsl:
        __intrinsic_asm "dFd$(xOrY)";
    case spirv:
        return spirv_asm {OpDPd$(xOrY) $$T result $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int>
[__readNone]
vector<T, N> dd$(xOrY)(vector<T, N> x)
{
    __target_switch
    {
    case hlsl:
    case cpp:
    case cuda:
        __intrinsic_asm "dd$(xOrY)";
    case glsl:
        __intrinsic_asm "dFd$(xOrY)";
    case spirv:
        return spirv_asm {OpDPd$(xOrY) $$vector<T, N> result $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> dd$(xOrY)(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, dd$(xOrY), x);
}

__generic<T : __BuiltinFloatingPointType>
__glsl_extension(GL_ARB_derivative_control)
[__readNone]
T dd$(xOrY)_coarse(T x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "dd$(xOrY)_coarse";
    case glsl: __intrinsic_asm "dFd$(xOrY)Coarse";
    case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$T = OpDPd$(xOrY)Coarse $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__glsl_extension(GL_ARB_derivative_control)
[__readNone]
vector<T, N> dd$(xOrY)_coarse(vector<T, N> x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "dd$(xOrY)_coarse";
    case glsl: __intrinsic_asm "dFd$(xOrY)Coarse";
    case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$vector<T,N> = OpDPd$(xOrY)Coarse $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> dd$(xOrY)_coarse(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, dd$(xOrY)_coarse, x);
}

__generic<T : __BuiltinFloatingPointType>
__glsl_extension(GL_ARB_derivative_control)
[__readNone]
T dd$(xOrY)_fine(T x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "dd$(xOrY)_fine";
    case glsl: __intrinsic_asm "dFd$(xOrY)Fine";
    case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$T = OpDPd$(xOrY)Fine $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__glsl_extension(GL_ARB_derivative_control)
[__readNone]
vector<T, N> dd$(xOrY)_fine(vector<T, N> x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "dd$(xOrY)_fine";
    case glsl: __intrinsic_asm "dFd$(xOrY)Fine";
    case spirv: return spirv_asm {OpCapability DerivativeControl; result:$$vector<T,N> = OpDPd$(xOrY)Fine $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> dd$(xOrY)_fine(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, dd$(xOrY)_fine, x);
}

${{{{
} // for (xOrY)
}}}}


// Radians to degrees

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Degrees _0")
[__readNone]
T degrees(T x)
{
    return x * (T(180) / T.getPi());
}
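
// Example (illustrative): degrees(T.getPi()) evaluates to T(180).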

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Degrees _0")
[__readNone]
vector<T, N> degrees(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, degrees, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> degrees(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, degrees, x);
}

// Matrix determinant

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Determinant _0")
[__readNone]
[PreferCheckpoint]
T determinant(matrix<T,N,N> m);

// Barrier for device memory
__glsl_extension(GL_KHR_memory_scope_semantics)
void DeviceMemoryBarrier()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "DeviceMemoryBarrier";
    case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)";
    case cuda: __intrinsic_asm "__threadfence()";
    case spirv: spirv_asm
        {
            OpMemoryBarrier Device AcquireRelease|UniformMemory|ImageMemory;
        };
    }
}

__glsl_extension(GL_KHR_memory_scope_semantics)
void DeviceMemoryBarrierWithGroupSync()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "DeviceMemoryBarrierWithGroupSync";
    case glsl: __intrinsic_asm "controlBarrier(gl_ScopeWorkgroup, gl_ScopeDevice, (gl_StorageSemanticsImage|gl_StorageSemanticsBuffer), gl_SemanticsAcquireRelease)";
    case cuda: __intrinsic_asm "__syncthreads()";
    case spirv: spirv_asm
        {
            OpControlBarrier Workgroup Device AcquireRelease|UniformMemory|ImageMemory;
        };
    }
}

// Vector distance

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Distance _0 _1")
[__readNone]
T distance(vector<T, N> x, vector<T, N> y)
{
    return length(x - y);
}

__generic<T : __BuiltinFloatingPointType>
[__readNone]
T distance(T x, T y)
{
    return length(x - y);
}
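
// Example (illustrative):
//
//     float d = distance(float2(0, 0), float2(3, 4)); // 5.0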

// Vector dot product

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpDot resultType resultId _0 _1")
[__readNone]
T dot(vector<T, N> x, vector<T, N> y)
{
    T result = T(0);
    for(int i = 0; i < N; ++i)
        result += x[i] * y[i];
    return result;
}

__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
T dot(vector<T, N> x, vector<T, N> y)
{
    T result = T(0);
    for(int i = 0; i < N; ++i)
        result += x[i] * y[i];
    return result;
}
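
// Example (illustrative): dot multiplies componentwise and sums, so
//
//     int d = dot(int3(1, 2, 3), int3(4, 5, 6)); // 1*4 + 2*5 + 3*6 = 32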


// Helper for computing distance terms for lighting (obsolete)

__generic<T : __BuiltinFloatingPointType> vector<T,4> dst(vector<T,4> x, vector<T,4> y);

// Given a RWByteAddressBuffer, allow it to be interpreted as a RWStructuredBuffer
__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer))
RWStructuredBuffer<T> __getEquivalentStructuredBuffer<T>(RWByteAddressBuffer b);

__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer))
StructuredBuffer<T> __getEquivalentStructuredBuffer<T>(ByteAddressBuffer b);

__intrinsic_op($(kIROp_GetEquivalentStructuredBuffer))
RasterizerOrderedStructuredBuffer<T> __getEquivalentStructuredBuffer<T>(RasterizerOrderedByteAddressBuffer b);

// Error message

// void errorf( string format, ... );

// Attribute evaluation

// TODO: The matrix cases of these functions won't actually work
// when compiled to GLSL, since the GLSL intrinsics only support scalars/vectors

// TODO: Should these be constrained to `__BuiltinFloatingPointType`?
// TODO: SPIRV-direct does not support non-floating-point types.

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(glsl, interpolateAtCentroid)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
[__readNone]
T EvaluateAttributeAtCentroid(T x);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(glsl, interpolateAtCentroid)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtCentroid _0")
[__readNone]
vector<T,N> EvaluateAttributeAtCentroid(vector<T,N> x);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(glsl, interpolateAtCentroid)
[__readNone]
matrix<T,N,M> EvaluateAttributeAtCentroid(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, EvaluateAttributeAtCentroid, x);
}

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
[__readNone]
T EvaluateAttributeAtSample(T x, uint sampleindex);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InterpolateAtSample _0 _1")
[__readNone]
vector<T,N> EvaluateAttributeAtSample(vector<T,N> x, uint sampleindex);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(glsl, "interpolateAtSample($0, int($1))")
[__readNone]
matrix<T,N,M> EvaluateAttributeAtSample(matrix<T,N,M> x, uint sampleindex)
{
    matrix<T,N,M> result;
    for(int i = 0; i < N; ++i)
    {
        result[i] = EvaluateAttributeAtSample(x[i], sampleindex);
    }
    return result;
}

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
__target_intrinsic(spirv, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
[__readNone]
T EvaluateAttributeSnapped(T x, int2 offset);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
__target_intrinsic(spirv, "%foffset = OpConvertSToF _type(float2) resultId _1; %offsetdiv16 = 136 _type(float2) resultId %foffset const(float2, 16.0, 16.0); OpExtInst resultType resultId glsl450 78 _0 %offsetdiv16")
[__readNone]
vector<T,N> EvaluateAttributeSnapped(vector<T,N> x, int2 offset);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(glsl, "interpolateAtOffset($0, vec2($1) / 16.0f)")
[__readNone]
matrix<T,N,M> EvaluateAttributeSnapped(matrix<T,N,M> x, int2 offset)
{
    matrix<T,N,M> result;
    for(int i = 0; i < N; ++i)
    {
        result[i] = EvaluateAttributeSnapped(x[i], offset);
    }
    return result;
}

// Base-e exponential

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_exp($0)")
__target_intrinsic(cpp, "$P_exp($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp _0")
[__readNone]
T exp(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp _0")
[__readNone]
vector<T, N> exp(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, exp, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> exp(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, exp, x);
}

// Base-2 exponential

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_exp2($0)")
__target_intrinsic(cpp, "$P_exp2($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0")
[__readNone]
T exp2(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Exp2 _0")
[__readNone]
vector<T,N> exp2(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, exp2, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> exp2(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, exp2, x);
}


// Convert a 16-bit float stored in the low bits of a uint to a 32-bit float
__target_intrinsic(glsl, "unpackHalf2x16($0).x")
__glsl_version(420)
__target_intrinsic(hlsl)
__cuda_sm_version(6.0)
__target_intrinsic(cuda, "__half2float(__ushort_as_half($0))")
__target_intrinsic(spirv, R"(
                    %lowBits = OpUConvert _type(uint16_t) resultId _0;
                    %half = OpBitcast _type(half) resultId %lowBits;
                    OpFConvert resultType resultId %half)")
[__readNone]
float f16tof32(uint value);

__generic<let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<float, N> f16tof32(vector<uint, N> value)
{
    VECTOR_MAP_UNARY(float, N, f16tof32, value);
}



// Convert a 32-bit float to a 16-bit float stored in the low bits of a uint
__target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))")
__glsl_version(420)
__target_intrinsic(hlsl)
__cuda_sm_version(6.0)
__target_intrinsic(cuda, "__half_as_ushort(__float2half($0))")
__target_intrinsic(spirv, R"(
                    %half = OpFConvert _type(half) resultId _0;
                    %lowBits = OpBitcast _type(uint16_t) resultId %half;
                    OpUConvert resultType resultId %lowBits)")
[__readNone]
uint f32tof16(float value);

__generic<let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<uint, N> f32tof16(vector<float, N> value)
{
    VECTOR_MAP_UNARY(uint, N, f32tof16, value);
}
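
// Example (illustrative): the IEEE half bit pattern of 1.0 is 0x3C00, so
//
//     uint  h = f32tof16(1.0f); // 0x3C00
//     float f = f16tof32(h);    // 1.0f (round-trips exactly)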

// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// The following is Slang-specific and NOT part of standard HLSL.
// It's not clear what happens with float16 types in HLSL -> can a float16 coerce
// to uint, for example? If so, that would give the wrong result.

__target_intrinsic(glsl, "unpackHalf2x16($0).x")
__target_intrinsic(cuda, "__half2float")
__target_intrinsic(spirv, "OpFConvert resultType resultId _0")
__glsl_version(420)
[__readNone]
float f16tof32(float16_t value);

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "__half2float")
__target_intrinsic(spirv, "OpFConvert resultType resultId _0")
[__readNone]
vector<float, N> f16tof32(vector<float16_t, N> value)
{
    VECTOR_MAP_UNARY(float, N, f16tof32, value);
}

// Convert to float16_t
__target_intrinsic(glsl, "packHalf2x16(vec2($0,0.0))")
__glsl_version(420)
__target_intrinsic(cuda, "__float2half")
__target_intrinsic(spirv, "OpFConvert resultType resultId _0")
[__readNone]
float16_t f32tof16_(float value);

__generic<let N : int>
__target_intrinsic(cuda, "__float2half")
__target_intrinsic(spirv, "OpFConvert resultType resultId _0")
[__readNone]
vector<float16_t, N> f32tof16_(vector<float, N> value)
{
    VECTOR_MAP_UNARY(float16_t, N, f32tof16_, value);
}
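
// Example (illustrative): these Slang-specific variants keep the value typed as
// float16_t instead of packing it through a uint:
//
//     float16_t h = f32tof16_(1.0f); // typed half
//     float     f = f16tof32(h);     // 1.0f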

// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

// Flip surface normal to face forward, if needed
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FaceForward _0 _1 _2")
[__readNone]
vector<T,N> faceforward(vector<T,N> n, vector<T,N> i, vector<T,N> ng)
{
    return dot(ng, i) < T(0.0f) ? n : -n;
}

// Find first set bit starting at high bit and working down
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(cuda, "$P_firstbithigh($0)")
__target_intrinsic(cpp, "$P_firstbithigh($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindSMsb _0")
[__readNone]
int firstbithigh(int value);

__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindSMsb _0")
__generic<let N : int>
[__readNone]
vector<int, N> firstbithigh(vector<int, N> value)
{
    VECTOR_MAP_UNARY(int, N, firstbithigh, value);
}

__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(cuda, "$P_firstbithigh($0)")
__target_intrinsic(cpp, "$P_firstbithigh($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindUMsb _0")
[__readNone]
uint firstbithigh(uint value);

__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findMSB")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindUMsb _0")
__generic<let N : int>
[__readNone]
vector<uint,N> firstbithigh(vector<uint,N> value)
{
    VECTOR_MAP_UNARY(uint, N, firstbithigh, value);
}

// Find first set bit starting at low bit and working up
__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__target_intrinsic(cuda, "$P_firstbitlow($0)")
__target_intrinsic(cpp, "$P_firstbitlow($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0")
[__readNone]
int firstbitlow(int value);

__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0")
__generic<let N : int>
[__readNone]
vector<int,N> firstbitlow(vector<int,N> value)
{
    VECTOR_MAP_UNARY(int, N, firstbitlow, value);
}

__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__target_intrinsic(cuda, "$P_firstbitlow($0)")
__target_intrinsic(cpp, "$P_firstbitlow($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0")
[__readNone]
uint firstbitlow(uint value);

__target_intrinsic(hlsl)
__target_intrinsic(glsl,"findLSB")
__generic<let N : int>
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FindILsb _0")
[__readNone]
vector<uint,N> firstbitlow(vector<uint,N> value)
{
    VECTOR_MAP_UNARY(uint, N, firstbitlow, value);
}
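
// Examples (illustrative): bit indices count from the least-significant bit, so
//
//     uint hi = firstbithigh(16u); // 4 (16 == 1 << 4)
//     uint lo = firstbitlow(8u);   // 3 (8  == 1 << 3)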

// Floor (HLSL SM 1.0)

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_floor($0)")
__target_intrinsic(cpp, "$P_floor($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Floor _0")
[__readNone]
T floor(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Floor _0")
[__readNone]
vector<T, N> floor(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, floor, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> floor(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, floor, x);
}

// Fused multiply-add for doubles
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
double fma(double a, double b, double c);

__generic<let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
vector<double, N> fma(vector<double, N> a, vector<double, N> b, vector<double, N> c)
{
    VECTOR_MAP_TRINARY(double, N, fma, a, b, c);
}

__generic<let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<double, N, M> fma(matrix<double, N, M> a, matrix<double, N, M> b, matrix<double, N, M> c)
{
    MATRIX_MAP_TRINARY(double, N, M, fma, a, b, c);
}

// Floating-point remainder of x/y
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "$P_fmod($0, $1)")
__target_intrinsic(cpp, "$P_fmod($0, $1)")
[__readNone]
T fmod(T x, T y)
{
    return x - y * trunc(x/y);
}
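
// Example (illustrative): because the quotient is truncated (not floored),
// the result keeps the sign of x:
//
//     float r0 = fmod( 5.5f, 2.0f); //  1.5
//     float r1 = fmod(-5.5f, 2.0f); // -1.5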

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> fmod(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, fmod, x, y);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> fmod(matrix<T, N, M> x, matrix<T, N, M> y)
{
    MATRIX_MAP_BINARY(T, N, M, fmod, x, y);
}

// Fractional part
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fract)
__target_intrinsic(cuda, "$P_frac($0)")
__target_intrinsic(cpp, "$P_frac($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fract _0")
[__readNone]
T frac(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fract)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fract _0")
[__readNone]
vector<T, N> frac(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, frac, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
[__readNone]
matrix<T, N, M> frac(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, frac, x);
}

// Split float into mantissa and exponent
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(cpp, "$P_frexp($0, $1)")
__target_intrinsic(cuda, "$P_frexp($0, $1)")
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
[__readNone]
T frexp(T x, out int exp);
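
// Example (illustrative): frexp splits x into a mantissa in [0.5, 1) and a
// power-of-two exponent; 8.0 = 0.5 * 2^4:
//
//     int e;
//     float m = frexp(8.0f, e); // m = 0.5, e = 4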

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Frexp _0 _1")
[__readNone]
vector<T, N> frexp(vector<T, N> x, out vector<int, N> exp)
{
    VECTOR_MAP_BINARY(T, N, frexp, x, exp);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int, let L : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> frexp(matrix<T, N, M> x, out matrix<int, N, M, L> exp)
{
    MATRIX_MAP_BINARY(T, N, M, frexp, x, exp);
}

// Filter width: abs(ddx(x)) + abs(ddy(x))
__generic<T : __BuiltinFloatingPointType>
[__readNone]
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpFwidth resultType resultId _0")
T fwidth(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpFwidth resultType resultId _0")
[__readNone]
vector<T, N> fwidth(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, fwidth, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> fwidth(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, fwidth, x);
}

/// Get the value of a vertex attribute at a specific vertex.
///
/// The `GetAttributeAtVertex()` function can be used in a fragment shader
/// to get the value of the given `attribute` at the vertex of the primitive
/// that corresponds to the given `vertexIndex`.
///
/// Note that the `attribute` must have been declared as a varying input to
/// the fragment shader with the `nointerpolation` modifier.
///
/// This function can be applied to scalars, vectors, and matrices of
/// built-in scalar types.
///
__generic<T : __BuiltinType>
[__readNone]
__glsl_version(450)
T GetAttributeAtVertex(T attribute, uint vertexIndex)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "GetAttributeAtVertex";
    case _GL_NV_fragment_shader_barycentric:
    case _GL_EXT_fragment_shader_barycentric: 
        __intrinsic_asm "$0[$1]";
    case spirv:
        return spirv_asm {
            %_ptr_Input_T = OpTypePointer Input $$T;
            %addr = OpAccessChain %_ptr_Input_T $attribute $vertexIndex;
            result:$$T = OpLoad %addr;
        };
    }
}
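
// Example (illustrative sketch; `color` stands for a hypothetical fragment-shader
// input declared with the `nointerpolation` modifier):
//
//     nointerpolation float3 color;
//     float3 c0 = GetAttributeAtVertex(color, 0); // value at vertex 0 of the primitive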

/// Get the value of a vertex attribute at a specific vertex.
///
/// The `GetAttributeAtVertex()` function can be used in a fragment shader
/// to get the value of the given `attribute` at the vertex of the primitive
/// that corresponds to the given `vertexIndex`.
///
/// Note that the `attribute` must have been declared as a varying input to
/// the fragment shader with the `nointerpolation` modifier.
///
/// This function can be applied to scalars, vectors, and matrices of
/// built-in scalar types.
///
__generic<T : __BuiltinType, let N : int>
[__readNone]
__glsl_version(450)
vector<T,N> GetAttributeAtVertex(vector<T,N> attribute, uint vertexIndex)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "GetAttributeAtVertex";
    case _GL_NV_fragment_shader_barycentric:
    case _GL_EXT_fragment_shader_barycentric: 
        __intrinsic_asm "$0[$1]";
    case spirv:
        return spirv_asm {
            %_ptr_Input_vectorT = OpTypePointer Input $$vector<T,N>;
            %addr = OpAccessChain %_ptr_Input_vectorT $attribute $vertexIndex;
            result:$$vector<T,N> = OpLoad %addr;
        };
    }
}

/// Get the value of a vertex attribute at a specific vertex.
///
/// The `GetAttributeAtVertex()` function can be used in a fragment shader
/// to get the value of the given `attribute` at the vertex of the primitive
/// that corresponds to the given `vertexIndex`.
///
/// Note that the `attribute` must have been declared as a varying input to
/// the fragment shader with the `nointerpolation` modifier.
///
/// This function can be applied to scalars, vectors, and matrices of
/// built-in scalar types.
///
__generic<T : __BuiltinType, let N : int, let M : int>
[__readNone]
__glsl_version(450)
matrix<T,N,M> GetAttributeAtVertex(matrix<T,N,M> attribute, uint vertexIndex)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "GetAttributeAtVertex";
    case _GL_NV_fragment_shader_barycentric:
    case _GL_EXT_fragment_shader_barycentric: 
        __intrinsic_asm "$0[$1]";
    case spirv:
        return spirv_asm {
            %_ptr_Input_matrixT = OpTypePointer Input $$matrix<T,N,M>;
            %addr = OpAccessChain %_ptr_Input_matrixT $attribute $vertexIndex;
            result:$$matrix<T,N,M> = OpLoad %addr;
        };
    }
}

// Get number of samples in render target
[__readNone]
uint GetRenderTargetSampleCount();

// Get position of given sample
[__readNone]
float2 GetRenderTargetSamplePosition(int Index);

// Group memory barrier
__glsl_extension(GL_KHR_memory_scope_semantics)
void GroupMemoryBarrier()
{
    __target_switch
    {
    case glsl: __intrinsic_asm "memoryBarrier(gl_ScopeWorkgroup, gl_StorageSemanticsShared, gl_SemanticsAcquireRelease)";
    case hlsl: __intrinsic_asm "GroupMemoryBarrier";
    case cuda: __intrinsic_asm "__threadfence_block";
    case spirv:
        spirv_asm
        {
            OpMemoryBarrier Workgroup AcquireRelease|WorkgroupMemory
        };
    }
}

void __subgroupBarrier()
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBarrier";
    case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync";
    case cuda: __intrinsic_asm "__syncthreads()";
    case spirv:
        spirv_asm
        {
            OpControlBarrier Subgroup Subgroup AcquireRelease|WorkgroupMemory|ImageMemory|UniformMemory
        };
    }
}

void GroupMemoryBarrierWithGroupSync()
{
    __target_switch
    {
    case glsl: __intrinsic_asm "barrier";
    case hlsl: __intrinsic_asm "GroupMemoryBarrierWithGroupSync";
    case cuda: __intrinsic_asm "__syncthreads()";
    case spirv:
        spirv_asm
        {
            OpControlBarrier Workgroup Workgroup AcquireRelease|WorkgroupMemory
        };
    }
}

// Atomics

__glsl_version(430)
void InterlockedAdd(__ref  int dest,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAdd";
    case cuda: __intrinsic_asm "atomicAdd($0, $1)";
    case glsl: __intrinsic_asm "$atomicAdd($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicIAdd &dest Device None $value 
        };
    }
}

__glsl_version(430)
void InterlockedAdd(__ref uint dest, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAdd";
    case cuda: __intrinsic_asm "atomicAdd((int*)$0, $1)";
    case glsl: __intrinsic_asm "$atomicAdd($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicIAdd &dest Device None $value 
        };
    }
}

[ForceInline]
void InterlockedAdd(__ref uint dest, int value)
{
    InterlockedAdd(dest, (uint)value);
}

__glsl_version(430)
void InterlockedAdd(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAdd";
    case cuda: __intrinsic_asm "(*$2 = atomicAdd($0, $1))";
    case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$int = OpAtomicIAdd &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedAdd(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAdd";
    case cuda: __intrinsic_asm "(*$2 = (uint)atomicAdd((int*)$0, $1))";
    case glsl: __intrinsic_asm "($2 = $atomicAdd($A, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$uint = OpAtomicIAdd &dest Device None $value;
            OpStore &original_value %original
        };
    }
}
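
// Example (illustrative sketch; `counter` stands for a hypothetical groupshared
// or buffer-backed uint):
//
//     uint slot;
//     InterlockedAdd(counter, 1, slot); // atomically increments; slot gets the old value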

__glsl_version(430)
void InterlockedAnd(__ref  int dest,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAnd";
    case cuda: __intrinsic_asm "atomicAnd($0, $1)";
    case glsl: __intrinsic_asm "$atomicAnd($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicAnd &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedAnd(__ref uint dest, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAnd";
    case cuda: __intrinsic_asm "atomicAnd((int*)$0, $1)";
    case glsl: __intrinsic_asm "$atomicAnd($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicAnd &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedAnd(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAnd";
    case cuda: __intrinsic_asm "(*$2 = atomicAnd($0, $1))";
    case glsl: __intrinsic_asm "($2 = $atomicAnd($A, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$int = OpAtomicAnd &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedAnd(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedAnd";
    case glsl: __intrinsic_asm "($2 = atomicAnd($0, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicAnd((int*)$0, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$uint = OpAtomicAnd &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedCompareExchange(__ref  int dest,  int compare_value,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedCompareExchange";
    case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
    case cuda: __intrinsic_asm "(*$3 = atomicCAS($0, $1, $2))";
    case spirv:
        spirv_asm
        {
            %original:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedCompareExchange(__ref uint dest, uint compare_value, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedCompareExchange";
    case glsl: __intrinsic_asm "($3 = $atomicCompSwap($A, $1, $2))";
    case cuda: __intrinsic_asm "(*$3 = (uint)atomicCAS((int*)$0, $1, $2))";
    case spirv:
        spirv_asm
        {
            %original:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedCompareStore(__ref int dest,  int compare_value,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedCompareStore";
    case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
    case cuda: __intrinsic_asm "atomicCAS($0, $1, $2)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicCompareExchange &dest Device None None $value $compare_value;
        };
    }
}

__glsl_version(430)
void InterlockedCompareStore(__ref uint dest, uint compare_value, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedCompareStore";
    case glsl: __intrinsic_asm "$atomicCompSwap($A, $1, $2)";
    case cuda: __intrinsic_asm "atomicCAS((int*)$0, $1, $2)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicCompareExchange &dest Device None None $value $compare_value;
        };
    }
}

__glsl_version(430)
void InterlockedExchange(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedExchange";
    case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicExch($0, $1))";
    case spirv:
        spirv_asm
        {
            %r:$$int = OpAtomicExchange &dest Device None $value;
            OpStore &original_value %r
        };
    }
}

__glsl_version(430)
void InterlockedExchange(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedExchange";
    case glsl: __intrinsic_asm "($2 = $atomicExchange($A, $1))";
    case cuda: __intrinsic_asm "(*$2 = (uint)atomicExch((int*)$0, $1))";
    case spirv:
        spirv_asm
        {
            %r:$$uint = OpAtomicExchange &dest Device None $value;
            OpStore &original_value %r
        };
    }
}

__glsl_version(430)
void InterlockedMax(__ref  int dest,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMax";
    case glsl: __intrinsic_asm "$atomicMax($A, $1)";
    case cuda: __intrinsic_asm "atomicMax($0, $1)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicSMax &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedMax(__ref uint dest, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMax";
    case glsl: __intrinsic_asm "$atomicMax($A, $1)";
    case cuda: __intrinsic_asm "atomicMax((int*)$0, $1)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicUMax &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedMax(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMax";
    case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicMax($0, $1))";
    case spirv:
        spirv_asm
        {
            %v:$$int = OpAtomicSMax &dest Device None $value;
            OpStore &original_value %v
        };
    }
}

__glsl_version(430)
void InterlockedMax(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMax";
    case glsl: __intrinsic_asm "($2 = $atomicMax($A, $1))";
    case cuda: __intrinsic_asm "(*$2 = (uint)atomicMax((int*)$0, $1))";
    case spirv:
        spirv_asm
        {
            %v:$$uint = OpAtomicUMax &dest Device None $value;
            OpStore &original_value %v
        };
    }
}

__glsl_version(430)
void InterlockedMin(__ref  int dest,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMin";
    case glsl: __intrinsic_asm "$atomicMin($A, $1)";
    case cuda: __intrinsic_asm "atomicMin($0, $1)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicSMin &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedMin(__ref uint dest, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMin";
    case glsl: __intrinsic_asm "$atomicMin($A, $1)";
    case cuda: __intrinsic_asm "atomicMin((int*)$0, $1)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicUMin &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedMin(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMin";
    case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicMin($0, $1))";
    case spirv:
        spirv_asm
        {
            %v:$$int = OpAtomicSMin &dest Device None $value;
            OpStore &original_value %v
        };
    }
}

__glsl_version(430)
void InterlockedMin(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedMin";
    case glsl: __intrinsic_asm "($2 = $atomicMin($A, $1))";
    case cuda: __intrinsic_asm "(*$2 = (uint)atomicMin((int*)$0, $1))";
    case spirv:
        spirv_asm
        {
            %v:$$uint = OpAtomicUMin &dest Device None $value;
            OpStore &original_value %v
        };
    }
}

__glsl_version(430)
void InterlockedOr(__ref  int dest,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedOr";
    case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)";
    case glsl: __intrinsic_asm "$atomicOr($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicOr &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedOr(__ref uint dest, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedOr";
    case cuda: __intrinsic_asm "atomicOr((int*)$0, $1)";
    case glsl: __intrinsic_asm "$atomicOr($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicOr &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedOr(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedOr";
    case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicOr($0, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$int = OpAtomicOr &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedOr(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedOr";
    case glsl: __intrinsic_asm "($2 = atomicOr($0, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicOr((int*)$0, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$uint = OpAtomicOr &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedXor(__ref  int dest,  int value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedXor";
    case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)";
    case glsl: __intrinsic_asm "$atomicXor($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$int = OpAtomicXor &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedXor(__ref uint dest, uint value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedXor";
    case cuda: __intrinsic_asm "atomicXor((int*)$0, $1)";
    case glsl: __intrinsic_asm "$atomicXor($A, $1)";
    case spirv:
        spirv_asm
        {
            result:$$uint = OpAtomicXor &dest Device None $value;
        };
    }
}

__glsl_version(430)
void InterlockedXor(__ref  int dest,  int value, out  int original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedXor";
    case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))";
    case cuda: __intrinsic_asm "(*$2 = atomicXor($0, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$int = OpAtomicXor &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

__glsl_version(430)
void InterlockedXor(__ref uint dest, uint value, out uint original_value)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "InterlockedXor";
    case glsl: __intrinsic_asm "($2 = atomicXor($0, $1))";
    case cuda: __intrinsic_asm "(*$2 = (uint)atomicXor((int*)$0, $1))";
    case spirv:
        spirv_asm
        {
            %original:$$uint = OpAtomicXor &dest Device None $value;
            OpStore &original_value %original
        };
    }
}

// Is floating-point value finite?

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
bool isfinite(T x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "isfinite";
    case cuda:
    case cpp:
        __intrinsic_asm "$P_isfinite($0)";
    default:
        return !(isinf(x) || isnan(x));
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int>
[__readNone]
vector<bool, N> isfinite(vector<T, N> x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "isfinite";
    default:
        VECTOR_MAP_UNARY(bool, N, isfinite, x);
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<bool, N, M> isfinite(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(bool, N, M, isfinite, x);
}

// Is floating-point value infinite?
__generic<T : __BuiltinFloatingPointType>
[__readNone]
bool isinf(T x)
{
    __target_switch
    {
    case hlsl:
    case glsl:
        __intrinsic_asm "isinf";
    case cuda:
    case cpp:
        __intrinsic_asm "$P_isinf($0)";
    case spirv:
        return spirv_asm { result:$$bool = OpIsInf $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int>
[__readNone]
vector<bool, N> isinf(vector<T, N> x)
{
    __target_switch
    {
    case hlsl:
    case glsl:
        __intrinsic_asm "isinf";
    case spirv:
        return spirv_asm { result:$$vector<bool,N> = OpIsInf $x};
    default:
        VECTOR_MAP_UNARY(bool, N, isinf, x);
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<bool, N, M> isinf(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(bool, N, M, isinf, x);
}

// Is floating-point value not-a-number?
__generic<T : __BuiltinFloatingPointType>
[__readNone]
bool isnan(T x)
{
    __target_switch
    {
    case hlsl:
    case glsl:
        __intrinsic_asm "isnan";
    case cuda:
    case cpp:
        __intrinsic_asm "$P_isnan($0)";
    case spirv:
        return spirv_asm { result:$$bool = OpIsNan $x};
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int>
[__readNone]
vector<bool, N> isnan(vector<T, N> x)
{
    __target_switch
    {
    case hlsl:
    case glsl:
        __intrinsic_asm "isnan";
    case spirv:
        return spirv_asm { result:$$vector<bool, N> = OpIsNan $x};
    default:
        VECTOR_MAP_UNARY(bool, N, isnan, x);
    }
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<bool, N, M> isnan(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(bool, N, M, isnan, x);
}

// Construct float from mantissa and exponent

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
T ldexp(T x, T exp)
{
    return x * exp2(exp);
}
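
// Example (illustrative): ldexp(x, exp) scales x by 2^exp, so
//
//     float v = ldexp(1.25f, 3.0f); // 1.25 * 8 = 10.0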

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> ldexp(vector<T, N> x, vector<T, N> exp)
{
    return x * exp2(exp);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> ldexp(matrix<T, N, M> x, matrix<T, N, M> exp)
{
    MATRIX_MAP_BINARY(T, N, M, ldexp, x, exp);
}

// Vector length
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Length _0")
[__readNone]
T length(vector<T, N> x)
{
    return sqrt(dot(x, x));
}

// Scalar float length
__generic<T : __BuiltinFloatingPointType>
T length(T x)
{
    return abs(x);
}

// Linear interpolation
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, mix)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
[__readNone]
T lerp(T x, T y, T s)
{
    return x * (T(1.0f) - s) + y * s;
}
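
// Example (illustrative):
//
//     float v = lerp(0.0f, 10.0f, 0.25f); // 2.5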

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, mix)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 FMix _0 _1 _2")
[__readNone]
vector<T, N> lerp(vector<T, N> x, vector<T, N> y, vector<T, N> s)
{
    return x * (T(1.0f) - s) + y * s;
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> lerp(matrix<T,N,M> x, matrix<T,N,M> y, matrix<T,N,M> s)
{
    MATRIX_MAP_TRINARY(T, N, M, lerp, x, y, s);
}

// Legacy lighting function (obsolete)
__target_intrinsic(hlsl)
[__readNone]
float4 lit(float n_dot_l, float n_dot_h, float m)
{
    let ambient = 1.0f;
    let diffuse = max(n_dot_l, 0.0f);
    let specular = step(0.0f, n_dot_l) * max(pow(n_dot_h, m), 0.0f);
    return float4(ambient, diffuse, specular, 1.0f);
}

// Base-e logarithm
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_log($0)")
__target_intrinsic(cpp, "$P_log($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log _0")
[__readNone]
T log(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log _0")
[__readNone]
vector<T, N> log(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, log, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> log(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, log, x);
}

// Base-10 logarithm
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "(log( $0 ) * $S0( 0.43429448190325182765112891891661) )" )
__target_intrinsic(cuda, "$P_log10($0)")
__target_intrinsic(cpp, "$P_log10($0)")
__target_intrinsic(spirv, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpFMul resultType resultId %baseElog const(_p,0.43429448190325182765112891891661)")
[__readNone]
T log10(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "(log( $0 ) * $S0(0.43429448190325182765112891891661) )" )
__target_intrinsic(spirv, "%baseElog = OpExtInst resultType resultId glsl450 Log _0; OpVectorTimesScalar resultType resultId %baseElog const(_p,0.43429448190325182765112891891661)")
[__readNone]
vector<T,N> log10(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, log10, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> log10(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, log10, x);
}

// Base-2 logarithm
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_log2($0)")
__target_intrinsic(cpp, "$P_log2($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log2 _0")
[__readNone]
T log2(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Log2 _0")
[__readNone]
vector<T,N> log2(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, log2, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> log2(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, log2, x);
}

// multiply-add

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fma)
__target_intrinsic(cuda, "$P_fma($0, $1, $2)")
__target_intrinsic(cpp, "$P_fma($0, $1, $2)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
T mad(T mvalue, T avalue, T bvalue);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, fma)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Fma _0 _1 _2")
[__readNone]
vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
{
    VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
{
    MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
}

__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
[__readNone]
T mad(T mvalue, T avalue, T bvalue)
{
    // GLSL's `fma` and the GLSL450 `Fma` extended instruction only accept
    // floating-point operands, so the integer overloads fall back to an
    // explicit multiply-add.
    return mvalue * avalue + bvalue;
}

__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> mad(vector<T, N> mvalue, vector<T, N> avalue, vector<T, N> bvalue)
{
    VECTOR_MAP_TRINARY(T, N, mad, mvalue, avalue, bvalue);
}

__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> mad(matrix<T, N, M> mvalue, matrix<T, N, M> avalue, matrix<T, N, M> bvalue)
{
    MATRIX_MAP_TRINARY(T, N, M, mad, mvalue, avalue, bvalue);
}


// maximum
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_max($0, $1)")
__target_intrinsic(cpp, "$P_max($0, $1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
T max(T x, T y);
// Note: a stdlib implementation of `max` (or `min`) will require splitting
// floating-point and integer cases apart, because the floating-point
// version needs to correctly handle the case where one of the inputs
// is not-a-number.
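// For example (an illustrative sketch; the exact NaN behavior is
// target-dependent), a maxNum-style floating-point `max` treats a NaN input
// as missing data rather than propagating it:
//
//     max(sqrt(-1.0f), 2.0f);  // NaN vs. 2.0f: expected to yield 2.0f
//     max(int(-3), int(5));    // integer case: no NaN handling needed, yields 5
//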

__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
vector<T, N> max(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, max, x, y);
}

__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
{
    MATRIX_MAP_BINARY(T, N, M, max, x, y);
}

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_max($0, $1)")
__target_intrinsic(cpp, "$P_max($0, $1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
T max(T x, T y);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMax, UMax, SMax) _0 _1")
[__readNone]
vector<T, N> max(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, max, x, y);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> max(matrix<T, N, M> x, matrix<T, N, M> y)
{
    MATRIX_MAP_BINARY(T, N, M, max, x, y);
}

// minimum
__generic<T : __BuiltinIntegerType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_min($0, $1)")
__target_intrinsic(cpp, "$P_min($0, $1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
T min(T x, T y);

__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
vector<T,N> min(vector<T,N> x, vector<T,N> y)
{
    VECTOR_MAP_BINARY(T, N, min, x, y);
}

__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
{
    MATRIX_MAP_BINARY(T, N, M, min, x, y);
}

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_min($0, $1)")
__target_intrinsic(cpp, "$P_min($0, $1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
T min(T x, T y);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 fus(FMin, UMin, SMin) _0 _1")
[__readNone]
vector<T,N> min(vector<T,N> x, vector<T,N> y)
{
    VECTOR_MAP_BINARY(T, N, min, x, y);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> min(matrix<T,N,M> x, matrix<T,N,M> y)
{
    MATRIX_MAP_BINARY(T, N, M, min, x, y);
}

// split into integer and fractional parts (both with same sign)
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Modf _0 _1")
[__readNone]
T modf(T x, out T ip);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
[__readNone]
vector<T,N> modf(vector<T,N> x, out vector<T,N> ip)
{
    VECTOR_MAP_BINARY(T, N, modf, x, ip);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int, let L : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> modf(matrix<T,N,M> x, out matrix<T,N,M,L> ip)
{
    MATRIX_MAP_BINARY(T, N, M, modf, x, ip);
}

// msad4: masked sum of absolute differences on packed bytes
__target_intrinsic(hlsl)
[__readNone]
uint4 msad4(uint reference, uint2 source, uint4 accum)
{
    int4 bytesRef = (reference >> uint4(24, 16, 8, 0)) & 0xFF;
    int4 bytesX   = (source.x  >> uint4(24, 16, 8, 0)) & 0xFF;
    int4 bytesY   = (source.y  >> uint4(24, 16, 8, 0)) & 0xFF;

    uint4 mask = select(bytesRef == 0, 0, 0xFFFFFFFFu);

    uint4 result = accum;
    result += mask.x & abs(bytesRef - int4(bytesX.x,           bytesY.y, bytesY.z, bytesY.w));
    result += mask.y & abs(bytesRef - int4(bytesX.x, bytesX.y,           bytesY.z, bytesY.w));
    result += mask.z & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z,           bytesY.w));
    result += mask.w & abs(bytesRef - int4(bytesX.x, bytesX.y, bytesX.z, bytesX.w));
    return result;
}

// General inner products

// scalar-scalar
__generic<T : __BuiltinArithmeticType>
__intrinsic_op($(kIROp_Mul))
[__readNone]
T mul(T x, T y);

// scalar-vector and vector-scalar
__generic<T : __BuiltinArithmeticType, let N : int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
vector<T, N> mul(vector<T, N> x, T y);

__generic<T : __BuiltinArithmeticType, let N : int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
vector<T, N> mul(T x, vector<T, N> y);

// scalar-matrix and matrix-scalar
__generic<T : __BuiltinArithmeticType, let N : int, let M :int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
matrix<T, N, M> mul(matrix<T, N, M> x, T y);

__generic<T : __BuiltinArithmeticType, let N : int, let M :int>
__intrinsic_op($(kIROp_Mul))
[__readNone]
matrix<T, N, M> mul(T x, matrix<T, N, M> y);

// vector-vector (dot product)
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "dot")
[__readNone]
T mul(vector<T, N> x, vector<T, N> y)
{
    return dot(x, y);
}
__generic<T : __BuiltinIntegerType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
T mul(vector<T, N> x, vector<T, N> y)
{
    return dot(x, y);
}

// vector-matrix
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
__target_intrinsic(spirv, "OpMatrixTimesVector resultType resultId _1 _0")
[__readNone]
vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
{
    vector<T,M> result;
    for( int j = 0; j < M; ++j )
    {
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum += left[i] * right[i][j];
        }
        result[j] = sum;
    }
    return result;
}
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
{
    vector<T,M> result;
    for( int j = 0; j < M; ++j )
    {
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum += left[i] * right[i][j];
        }
        result[j] = sum;
    }
    return result;
}
__generic<T : __BuiltinLogicalType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T, M> mul(vector<T, N> left, matrix<T, N, M> right)
{
    vector<T,M> result;
    for( int j = 0; j < M; ++j )
    {
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum |= left[i] & right[i][j];
        }
        result[j] = sum;
    }
    return result;
}

// matrix-vector
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
__target_intrinsic(spirv, "OpVectorTimesMatrix resultType resultId _1 _0")
[__readNone]
vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
{
    vector<T,N> result;
    for( int i = 0; i < N; ++i )
    {
        T sum = T(0);
        for( int j = 0; j < M; ++j )
        {
            sum += left[i][j] * right[j];
        }
        result[i] = sum;
    }
    return result;
}
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
{
    vector<T,N> result;
    for( int i = 0; i < N; ++i )
    {
        T sum = T(0);
        for( int j = 0; j < M; ++j )
        {
            sum += left[i][j] * right[j];
        }
        result[i] = sum;
    }
    return result;
}
__generic<T : __BuiltinLogicalType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
vector<T,N> mul(matrix<T,N,M> left, vector<T,M> right)
{
    vector<T,N> result;
    for( int i = 0; i < N; ++i )
    {
        T sum = T(0);
        for( int j = 0; j < M; ++j )
        {
            sum |= left[i][j] & right[j];
        }
        result[i] = sum;
    }
    return result;
}

// matrix-matrix
__generic<T : __BuiltinFloatingPointType, let R : int, let N : int, let C : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
__target_intrinsic(spirv, "OpMatrixTimesMatrix resultType resultId _1 _0")
[__readNone]
matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
{
    matrix<T,R,C> result;
    for( int r = 0; r < R; ++r)
    for( int c = 0; c < C; ++c)
    {
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum += left[r][i] * right[i][c];
        }
        result[r][c] = sum;
    }
    return result;
}
__generic<T : __BuiltinIntegerType, let R : int, let N : int, let C : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
{
    matrix<T,R,C> result;
    for( int r = 0; r < R; ++r)
    for( int c = 0; c < C; ++c)
    {
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum += left[r][i] * right[i][c];
        }
        result[r][c] = sum;
    }
    return result;
}
__generic<T : __BuiltinLogicalType, let R : int, let N : int, let C : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "($1 * $0)")
[__readNone]
matrix<T,R,C> mul(matrix<T,R,N> left, matrix<T,N,C> right)
{
    matrix<T,R,C> result;
    for( int r = 0; r < R; ++r)
    for( int c = 0; c < C; ++c)
    {
        T sum = T(0);
        for( int i = 0; i < N; ++i )
        {
            sum |= left[r][i] & right[i][c];
        }
        result[r][c] = sum;
    }
    return result;
}

// noise (deprecated)

[__readNone]
[deprecated("Always returns 0")]
float noise(float x)
{
    return 0;
}

[__readNone]
[deprecated("Always returns 0")]
__generic<let N : int> float noise(vector<float, N> x)
{
    return 0;
}

/// Indicate that an index may be non-uniform at execution time.
///
/// Shader Model 5.1 and 6.x introduce support for dynamic indexing
/// of arrays of resources, but place the restriction that *by default*
/// the implementation can assume that any value used as an index into
/// such arrays will be dynamically uniform across an entire `Draw` or `Dispatch`
/// (when using instancing, the value must be uniform across all instances;
/// it does not seem that the restriction extends to draws within a multi-draw).
///
/// In order to indicate to the implementation that it cannot make the
/// uniformity assumption, a shader programmer is required to pass the index
/// to the `NonUniformResourceIndex` function before using it as an index.
/// The function superficially acts like an identity function.
///
/// Note: a future version of Slang may take responsibility for inserting calls
/// to this function as necessary in output code, rather than make this
/// the user's responsibility, so that the default behavior of the language
/// is more semantically "correct."
[ForceInline]
T __copyObject<T>(T v)
{
    __target_switch
    {
    case spirv:
        return spirv_asm {
            result:$$T = OpCopyObject $v;
        };
    default:
        return v;
    }
}


__glsl_extension(GL_EXT_nonuniform_qualifier)
[__readNone]
[ForceInline]
uint NonUniformResourceIndex(uint index)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "NonUniformResourceIndex";
    case glsl:
        __intrinsic_asm "nonuniformEXT";
    case spirv:
        var indexCopy = __copyObject(index);
        spirv_asm
        {
            OpCapability ShaderNonUniform;
            OpDecorate $indexCopy NonUniform;
        };
        return indexCopy;
    default:
        return index;
    }
}

__glsl_extension(GL_EXT_nonuniform_qualifier)
[__readNone]
[ForceInline]
int NonUniformResourceIndex(int index)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "NonUniformResourceIndex";
    case glsl:
        __intrinsic_asm "nonuniformEXT";
    case spirv:
        var indexCopy = __copyObject(index);
        spirv_asm
        {
            OpCapability ShaderNonUniform;
            OpDecorate $indexCopy NonUniform;
        };
        return indexCopy;
    default:
        return index;
    }
}

/// HLSL allows NonUniformResourceIndex around non-int/uint types.
/// Its effect is presumably to ignore it, which is what the following implementation does.
/// We should also look into adding a warning for this scenario.
[__unsafeForceInlineEarly]
[deprecated("NonUniformResourceIndex on a type other than uint/int is deprecated and has no effect")]
T NonUniformResourceIndex<T>(T value) { return value; }
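
// Example usage (an illustrative sketch, not part of the library; `gTextures`,
// `gSampler`, `materialIndex`, and `uv` are hypothetical names): when indexing
// a resource array with a value that may diverge across lanes, the index must
// be wrapped first:
//
//     Texture2D t = gTextures[NonUniformResourceIndex(materialIndex)];
//     float4 color = t.Sample(gSampler, uv);
//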

// Normalize a vector
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Normalize _0")
[__readNone]
vector<T,N> normalize(vector<T,N> x)
{
    return x / length(x);
}

// Raise to a power
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_pow($0, $1)")
__target_intrinsic(cpp, "$P_pow($0, $1)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Pow _0 _1")
[__readNone]
T pow(T x, T y);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Pow _0 _1")
[__readNone]
vector<T, N> pow(vector<T, N> x, vector<T, N> y)
{
    VECTOR_MAP_BINARY(T, N, pow, x, y);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> pow(matrix<T,N,M> x, matrix<T,N,M> y)
{
    MATRIX_MAP_BINARY(T, N, M, pow, x, y);
}

// Output message
// TODO: add check to ensure format is const literal.

${{{{
for (int argCount = 0; argCount < 12; argCount++)
{
    StringBuilder paramList;
    StringBuilder argList;
    StringBuilder spirvArgList;
    StringBuilder genericParamList;
    if (argCount > 0)
        genericParamList << "<";
    for (int i = 0; i < argCount; i++)
    {
        if (i > 0)
            genericParamList << ", ";
        genericParamList << "T" << i;

        paramList << ", T" << i << " v" << i;
        argList << ", $" << i+1;
        spirvArgList << " $v" << i;
    }
    if (argCount > 0)
        genericParamList << ">";
    auto spirvArgs = spirvArgList.toString();
}}}}
__glsl_extension(GL_EXT_debug_printf)
void printf$(genericParamList.toString())(NativeString format $(paramList))
{
    __target_switch
    {
    case hlsl:
    case cpp:
    case cuda:
        __intrinsic_asm "printf";
    case glsl:
        __intrinsic_asm "debugPrintfEXT($0 $(argList))";
    case spirv:
        spirv_asm {
          OpExtension "SPV_KHR_non_semantic_info";
          result:$$void = OpExtInst debugPrintf 1 $format $(spirvArgs);
        };
    }
}
${{{{
}
}}}}
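
// Example usage (an illustrative sketch; requires a target with debug-printf
// support, e.g. Vulkan via GL_EXT_debug_printf / SPV_KHR_non_semantic_info;
// `laneIndex` and `value` are hypothetical names):
//
//     printf("lane %d: value = %f\n", laneIndex, value);
//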

// Tessellation factor fixup routines

void Process2DQuadTessFactorsAvg(
    in  float4 RawEdgeFactors,
    in  float2 InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void Process2DQuadTessFactorsMax(
    in  float4 RawEdgeFactors,
    in  float2 InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void Process2DQuadTessFactorsMin(
    in  float4 RawEdgeFactors,
    in  float2 InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void ProcessIsolineTessFactors(
    in  float RawDetailFactor,
    in  float RawDensityFactor,
    out float RoundedDetailFactor,
    out float RoundedDensityFactor);

void ProcessQuadTessFactorsAvg(
    in  float4 RawEdgeFactors,
    in  float InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void ProcessQuadTessFactorsMax(
    in  float4 RawEdgeFactors,
    in  float InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void ProcessQuadTessFactorsMin(
    in  float4 RawEdgeFactors,
    in  float InsideScale,
    out float4 RoundedEdgeTessFactors,
    out float2 RoundedInsideTessFactors,
    out float2 UnroundedInsideTessFactors);

void ProcessTriTessFactorsAvg(
    in  float3 RawEdgeFactors,
    in  float InsideScale,
    out float3 RoundedEdgeTessFactors,
    out float RoundedInsideTessFactor,
    out float UnroundedInsideTessFactor);

void ProcessTriTessFactorsMax(
    in  float3 RawEdgeFactors,
    in  float InsideScale,
    out float3 RoundedEdgeTessFactors,
    out float RoundedInsideTessFactor,
    out float UnroundedInsideTessFactor);

void ProcessTriTessFactorsMin(
    in  float3 RawEdgeFactors,
    in  float InsideScale,
    out float3 RoundedEdgeTessFactors,
    out float RoundedInsideTessFactor,
    out float UnroundedInsideTessFactor);

// Degrees to radians
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Radians _0")
[__readNone]
T radians(T x)
{
    return x * (T.getPi() / T(180.0f));
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Radians _0")
[__readNone]
vector<T, N> radians(vector<T, N> x)
{
    return x * (T.getPi() / T(180.0f));
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> radians(matrix<T, N, M> x)
{
    return x * (T.getPi() / T(180.0f));
}

// Approximate reciprocal
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
T rcp(T x)
{
    return T(1.0) / x;
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T, N> rcp(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, rcp, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> rcp(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, rcp, x);
}

// Reflect incident vector across plane with given normal
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Reflect _0 _1")
[__readNone]
vector<T,N> reflect(vector<T,N> i, vector<T,N> n)
{
    return i - T(2) * dot(n,i) * n;
}

// Refract incident vector given surface normal and index of refraction
__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Refract _0 _1 _2")
[__readNone]
vector<T,N> refract(vector<T,N> i, vector<T,N> n, T eta)
{
    let dotNI = dot(n,i);
    let k = T(1) - eta*eta*(T(1) - dotNI * dotNI);
    if(k < T(0)) return vector<T,N>(T(0));
    return eta * i - (eta * dotNI + sqrt(k)) * n;
}

// Reverse order of bits
[__readNone]
uint reversebits(uint value)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "reversebits";
    case glsl:
        __intrinsic_asm "bitfieldReverse";
    case cuda:
    case cpp:
        __intrinsic_asm "$P_reversebits($0)";
    case spirv:
        return spirv_asm {OpBitReverse $$uint result $value};
    }
}

__target_intrinsic(glsl, "bitfieldReverse")
__generic<let N : int>
[__readNone]
vector<uint, N> reversebits(vector<uint, N> value)
{
    __target_switch
    {
    default:
        VECTOR_MAP_UNARY(uint, N, reversebits, value);
    case glsl:
        __intrinsic_asm "bitfieldReverse";
    case spirv:
        return spirv_asm {OpBitReverse $$vector<uint, N> result $value};
    }
}

// Round-to-nearest
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_round($0)")
__target_intrinsic(cpp, "$P_round($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Round _0")
[__readNone]
T round(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Round _0")
[__readNone]
vector<T, N> round(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, round, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> round(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, round, x);
}

// Reciprocal of square root
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "inversesqrt($0)")
__target_intrinsic(cuda, "$P_rsqrt($0)")
__target_intrinsic(cpp, "$P_rsqrt($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
[__readNone]
T rsqrt(T x)
{
    return T(1.0) / sqrt(x);
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl, "inversesqrt($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 InverseSqrt _0")
[__readNone]
vector<T, N> rsqrt(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, rsqrt, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> rsqrt(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, rsqrt, x);
}

// Clamp value to [0,1] range

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
[__readNone]
T saturate(T x)
{
    return clamp<T>(x, T(0), T(1));
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
vector<T,N> saturate(vector<T,N> x)
{
    return clamp<T,N>(x,
        vector<T,N>(T(0)),
        vector<T,N>(T(1)));
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> saturate(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, saturate, x);
}

__generic<T:__BuiltinArithmeticType, U:__BuiltinArithmeticType>
__intrinsic_op($(kIROp_IntCast))
T __int_cast(U val);

__generic<T:__BuiltinArithmeticType, U:__BuiltinArithmeticType, let N : int>
__intrinsic_op($(kIROp_IntCast))
vector<T,N> __int_cast(vector<U,N> val);

// Extract sign of value
__generic<T : __BuiltinSignedArithmeticType>
[__readNone]
int sign(T x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "sign";
    case glsl: __intrinsic_asm "int(sign($0))";
    case cuda:
    case cpp:
        __intrinsic_asm "$P_sign($0)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm
            {
                %fsign:$$T = OpExtInst glsl450 FSign $x;
                result:$$int = OpConvertFToS %fsign
            };
        else
            return __int_cast<int>(spirv_asm {OpExtInst $$T result glsl450 SSign $x});
    }
}

__generic<T : __BuiltinSignedArithmeticType, let N : int>
[__readNone]
vector<int, N> sign(vector<T, N> x)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "sign";
    case glsl: __intrinsic_asm "ivec$N0(sign($0))";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm
            {
                %fsign:$$vector<T, N> = OpExtInst glsl450 FSign $x;
                result:$$vector<int, N> = OpConvertFToS %fsign
            };
        else
            return __int_cast<int>(spirv_asm {OpExtInst $$vector<T,N> result glsl450 SSign $x});
    default:
        VECTOR_MAP_UNARY(int, N, sign, x);
    }
}

__generic<T : __BuiltinSignedArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<int, N, M> sign(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(int, N, M, sign, x);
}


// Sine

__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_sin($0)")
__target_intrinsic(cpp, "$P_sin($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sin _0")
[__readNone]
T sin(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sin _0")
[__readNone]
vector<T, N> sin(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, sin, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> sin(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, sin, x);
}

// Sine and cosine
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "$P_sincos($0, $1, $2)")
[__readNone]
void sincos(T x, out T s, out T c)
{
    s = sin(x);
    c = cos(x);
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
[__readNone]
void sincos(vector<T,N> x, out vector<T,N> s, out vector<T,N> c)
{
    s = sin(x);
    c = cos(x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int, let L1: int, let L2 : int>
__target_intrinsic(hlsl)
[__readNone]
void sincos(matrix<T,N,M> x, out matrix<T,N,M,L1> s, out matrix<T,N,M,L2> c)
{
    s = sin(x);
    c = cos(x);
}

// Hyperbolic Sine
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_sinh($0)")
__target_intrinsic(cpp, "$P_sinh($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sinh _0")
[__readNone]
T sinh(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sinh _0")
[__readNone]
vector<T, N> sinh(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, sinh, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> sinh(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, sinh, x);
}

// Smooth step (Hermite interpolation)
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
[__readNone]
T smoothstep(T min, T max, T x)
{
    let t = saturate((x - min) / (max - min));
    return t * t * (T(3.0f) - (t + t));
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 SmoothStep _0 _1 _2")
[__readNone]
vector<T, N> smoothstep(vector<T, N> min, vector<T, N> max, vector<T, N> x)
{
    VECTOR_MAP_TRINARY(T, N, smoothstep, min, max, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> smoothstep(matrix<T, N, M> min, matrix<T, N, M> max, matrix<T, N, M> x)
{
    MATRIX_MAP_TRINARY(T, N, M, smoothstep, min, max, x);
}

// Square root
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_sqrt($0)")
__target_intrinsic(cpp, "$P_sqrt($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sqrt _0")
[__readNone]
T sqrt(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Sqrt _0")
[__readNone]
vector<T, N> sqrt(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, sqrt, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> sqrt(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, sqrt, x);
}

// Step function
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Step _0 _1")
[__readNone]
T step(T y, T x)
{
    return x < y ? T(0.0f) : T(1.0f);
}

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Step _0 _1")
[__readNone]
vector<T,N> step(vector<T,N> y, vector<T,N> x)
{
    VECTOR_MAP_BINARY(T, N, step, y, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> step(matrix<T, N, M> y, matrix<T, N, M> x)
{
    MATRIX_MAP_BINARY(T, N, M, step, y, x);
}

// Tangent
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_tan($0)")
__target_intrinsic(cpp, "$P_tan($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tan _0")
[__readNone]
T tan(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tan _0")
[__readNone]
vector<T, N> tan(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, tan, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> tan(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, tan, x);
}

// Hyperbolic tangent
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_tanh($0)")
__target_intrinsic(cpp, "$P_tanh($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tanh _0")
[__readNone]
T tanh(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Tanh _0")
[__readNone]
vector<T,N> tanh(vector<T,N> x)
{
    VECTOR_MAP_UNARY(T, N, tanh, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T,N,M> tanh(matrix<T,N,M> x)
{
    MATRIX_MAP_UNARY(T, N, M, tanh, x);
}

// Matrix transpose
__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpTranspose resultType resultId _0")
[__readNone]
[PreferRecompute]
matrix<T, M, N> transpose(matrix<T, N, M> x)
{
    matrix<T,M,N> result;
    for(int r = 0; r < M; ++r)
        for(int c = 0; c < N; ++c)
            result[r][c] = x[c][r];
    return result;
}
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpTranspose resultType resultId _0")
[__readNone]
[PreferRecompute]
matrix<T, M, N> transpose(matrix<T, N, M> x)
{
    matrix<T, M, N> result;
    for (int r = 0; r < M; ++r)
        for (int c = 0; c < N; ++c)
            result[r][c] = x[c][r];
    return result;
}
__generic<T : __BuiltinLogicalType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpTranspose resultType resultId _0")
[__readNone]
[PreferRecompute]
matrix<T, M, N> transpose(matrix<T, N, M> x)
{
    matrix<T, M, N> result;
    for (int r = 0; r < M; ++r)
        for (int c = 0; c < N; ++c)
            result[r][c] = x[c][r];
    return result;
}

// Truncate to integer
__generic<T : __BuiltinFloatingPointType>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(cuda, "$P_trunc($0)")
__target_intrinsic(cpp, "$P_trunc($0)")
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Trunc _0")
[__readNone]
T trunc(T x);

__generic<T : __BuiltinFloatingPointType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(glsl)
__target_intrinsic(spirv, "OpExtInst resultType resultId glsl450 Trunc _0")
[__readNone]
vector<T, N> trunc(vector<T, N> x)
{
    VECTOR_MAP_UNARY(T, N, trunc, x);
}

__generic<T : __BuiltinFloatingPointType, let N : int, let M : int>
__target_intrinsic(hlsl)
[__readNone]
matrix<T, N, M> trunc(matrix<T, N, M> x)
{
    MATRIX_MAP_UNARY(T, N, M, trunc, x);
}

// Slang Specific 'Mask' Wave Intrinsics

typedef uint WaveMask;

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
WaveMask WaveGetConvergedMask()
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupBallot(true).x";
    case hlsl:
        __intrinsic_asm "WaveActiveBallot(true).x";
    case cuda:
        __intrinsic_asm "__activemask()";
    case spirv:
        let _true = true;
        return (spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBallot $$uint4 result Subgroup $_true
        }).x;
    }
}

__intrinsic_op($(kIROp_WaveGetActiveMask))
WaveMask __WaveGetActiveMask();

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
WaveMask WaveGetActiveMask()
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupBallot(true).x";
    case hlsl:
        __intrinsic_asm "WaveActiveBallot(true).x";
    case spirv:
        let _true = true;
        return (spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBallot $$uint4 result Subgroup $_true
        }).x;
    default:
        return __WaveGetActiveMask();
    }
}

__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
bool WaveMaskIsFirstLane(WaveMask mask)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupElect()";
    case cuda:
        __intrinsic_asm "(($0 & -$0) == (WarpMask(1) << _getLaneId()))";
    case hlsl:
        __intrinsic_asm "WaveIsFirstLane()";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformElect $$bool result Subgroup
        };
    default:
        return false;
    }
}

__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
bool WaveMaskAllTrue(WaveMask mask, bool condition)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAll($1)";
    case cuda:
        __intrinsic_asm "(__all_sync($0, $1) != 0)";
    case hlsl:
        __intrinsic_asm "WaveActiveAllTrue($1)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformAll $$bool result Subgroup $condition
        };
    default:
        return false;
    }
}

__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
bool WaveMaskAnyTrue(WaveMask mask, bool condition)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAny($1)";
    case cuda:
        __intrinsic_asm "(__any_sync($0, $1) != 0)";
    case hlsl:
        __intrinsic_asm "WaveActiveAnyTrue($1)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformAny $$bool result Subgroup $condition
        };
    default:
        return false;
    }
}

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
WaveMask WaveMaskBallot(WaveMask mask, bool condition)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupBallot($1).x";
    case cuda:
        __intrinsic_asm "__ballot_sync($0, $1)";
    case hlsl:
        __intrinsic_asm "WaveActiveBallot($1)";
    case spirv:
        return (spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBallot $$uint4 result Subgroup $condition
        }).x;
    default:
        return 0;
    }
}

uint WaveMaskCountBits(WaveMask mask, bool value)
{
    __target_switch
    {
    case cuda:
        __intrinsic_asm  "__popc(__ballot_sync($0, $1))";
    case hlsl:
        __intrinsic_asm  "WaveActiveCountBits($1)";
    default:
        return _WaveCountBits(WaveActiveBallot(value));
    }
}

// Waits until all warp lanes named in mask have executed a WaveMaskSharedSync (with the same mask)
// before resuming execution. Guarantees memory ordering in shared memory among threads participating
// in the barrier.
//
// The CUDA intrinsic says it orders *all* memory accesses, which appears to match subgroupBarrier most closely.
//
// TODO(JS):
// For HLSL it's not clear what to do: there is no explicit mechanism to 'reconverge' threads. The docs describe the
// behavior as
// "These intrinsics are dependent on active lanes and therefore flow control. In the model of this document, implementations
// must enforce that the number of active lanes exactly corresponds to the programmer's view of flow control."
//
// It seems this can only mean that the active threads are the "threads the program flow would lead to". This implies a lockstep
// "straight SIMD" style interpretation. That being the case, this op on HLSL is just a memory barrier without any sync.

void AllMemoryBarrierWithWaveMaskSync(WaveMask mask)
{
    __target_switch
    {
    case cuda:
        __intrinsic_asm "__syncwarp($0)";
    case hlsl:
        __intrinsic_asm "AllMemoryBarrier()";
    case glsl:
    case spirv:
        __subgroupBarrier();
        return;
    }
}
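
// Example usage (an illustrative sketch; `gShared`, `laneId`, and
// `producedValue` are hypothetical names): a warp-level producer/consumer
// exchange through shared memory, ordered by the mask-style barrier:
//
//     WaveMask mask = WaveGetActiveMask();
//     gShared[laneId] = producedValue;
//     AllMemoryBarrierWithWaveMaskSync(mask);
//     float neighbor = gShared[(laneId + 1) % WaveGetLaneCount()];
//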

// On GLSL, it appears we can't use subgroupMemoryBarrierShared, because it only implies a memory ordering; it does not
// imply convergence. For subgroupBarrier we have from the docs...
// "The function subgroupBarrier() enforces that all active invocations within a subgroup must execute this function before any
// are allowed to continue their execution"
// TODO(JS):
// It's not entirely clear what to do here on HLSL.
// Reading the dxc wiki (https://github.com/Microsoft/DirectXShaderCompiler/wiki/Wave-Intrinsics), we have statements like:
//    ... these intrinsics enable the elimination of barrier constructs when the scope of synchronization is within the width of the SIMD processor.
//    Wave: A set of lanes executed simultaneously in the processor. No explicit barriers are required to guarantee that they execute in parallel.
// This seems to imply that at least some memory barriers, like Shared, might not be needed.
//
// The barrier is left here though, because not only does it make writes performed before it visible across the wave afterwards, it
// also informs the compiler about the order in which reads and writes can take place. This might seem silly, because the 'active' lanes
// aspect of HLSL appears to make everything run in lockstep - but that's not quite so: it only has to appear that way as far as the
// programmer's model is concerned - divergence could perhaps still happen.

void GroupMemoryBarrierWithWaveMaskSync(WaveMask mask)
{
    __target_switch
    {
    case cuda:
        __intrinsic_asm "__syncwarp($0)";
    case hlsl:
        __intrinsic_asm "GroupMemoryBarrier()";
    case glsl:
    case spirv:
        __subgroupBarrier();
        return;
    }
}

void AllMemoryBarrierWithWaveSync()
{
    __target_switch
    {
    case cuda:
        __intrinsic_asm "__syncwarp()";
    case hlsl:
        __intrinsic_asm "AllMemoryBarrier()";
    case glsl:
    case spirv:
        __subgroupBarrier();
        return;
    }
}

void GroupMemoryBarrierWithWaveSync()
{
    __target_switch
    {
    case cuda:
        __intrinsic_asm "__syncwarp()";
    case hlsl:
        __intrinsic_asm "GroupMemoryBarrier()";
    case glsl:
    case spirv:
        __subgroupBarrier();
        return;
    }
}

// NOTE! WaveMaskBroadcastLaneAt is *NOT* standard HLSL.
// It is provided as access to subgroupBroadcast, which can only take a
// constexpr laneId.
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
// SPIR-V versions greater than 1.4 loosen this restriction and allow a 'dynamically uniform' index.
// If that's the behavior required, then client code should use WaveReadLaneAt, which works that way.

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
T WaveMaskBroadcastLaneAt(WaveMask mask, T value, constexpr int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcast($1, $2)";
    case cuda: __intrinsic_asm "__shfl_sync($0, $1, $2)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt($1, $2)";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBroadcast $$T result Subgroup $value $ulane;
        };
    }
}

__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
vector<T,N> WaveMaskBroadcastLaneAt(WaveMask mask, vector<T,N> value, constexpr int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcast($1, $2)";
    case cuda: __intrinsic_asm "_waveShuffleMultiple($0, $1, $2)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt($1, $2)";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBroadcast $$vector<T,N> result Subgroup $value $ulane;
        };
    }
}
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
matrix<T,N,M> WaveMaskBroadcastLaneAt(WaveMask mask, matrix<T,N,M> value, constexpr int lane);
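
// Example (an illustrative sketch; `value` and `someLane` are hypothetical
// names): WaveMaskBroadcastLaneAt requires a compile-time-constant lane index,
// while WaveMaskReadLaneAt (below) accepts a dynamic one:
//
//     WaveMask mask = WaveGetActiveMask();
//     float lane0Value = WaveMaskBroadcastLaneAt(mask, value, 0);       // constexpr lane
//     float otherValue = WaveMaskReadLaneAt(mask, value, someLane);     // dynamic lane
//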

// TODO(JS): If it can be determined that the `laneId` is constexpr, then subgroupBroadcast
// could be used on GLSL. For now we just use subgroupShuffle.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
T WaveMaskReadLaneAt(WaveMask mask, T value, int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupShuffle($1, $2)";
    case cuda: __intrinsic_asm "__shfl_sync($0, $1, $2)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt($1, $2)";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {
            OpCapability GroupNonUniformShuffle;
            OpGroupNonUniformShuffle $$T result Subgroup $value $ulane;
        };
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
vector<T,N> WaveMaskReadLaneAt(WaveMask mask, vector<T,N> value, int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupShuffle($1, $2)";
    case cuda: __intrinsic_asm "_waveShuffleMultiple($0, $1, $2)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt($1, $2)";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {
            OpCapability GroupNonUniformShuffle;
            OpGroupNonUniformShuffle $$vector<T,N> result Subgroup $value $ulane;
        };
    }
}
__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple($0, $1, $2)")
__target_intrinsic(hlsl, "WaveReadLaneAt($1, $2)")
matrix<T,N,M> WaveMaskReadLaneAt(WaveMask mask, matrix<T,N,M> value, int lane);

// NOTE! WaveMaskShuffle is a NON-STANDARD HLSL intrinsic! It will map to WaveReadLaneAt on HLSL,
// which means it will only work on hardware that allows arbitrary laneIds. That is not true
// in general, because the HLSL standard requires the laneId to be 'dynamically uniform' across the Wave.
__generic<T : __BuiltinType>
[__unsafeForceInlineEarly]
T WaveMaskShuffle(WaveMask mask, T value, int lane)
{
    return WaveMaskReadLaneAt(mask, value, lane);
}
__generic<T : __BuiltinType, let N : int>
[__unsafeForceInlineEarly]
vector<T,N> WaveMaskShuffle(WaveMask mask, vector<T,N> value, int lane)
{
    return WaveMaskReadLaneAt(mask, value, lane);
}
__generic<T : __BuiltinType, let N : int, let M : int>
[__unsafeForceInlineEarly]
matrix<T,N,M> WaveMaskShuffle(WaveMask mask, matrix<T,N,M> value, int lane)
{
    return WaveMaskReadLaneAt(mask, value, lane);
}
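
// Example (an illustrative sketch; `laneId` and `laneCount` are hypothetical
// names): rotating a value around the converged lanes with WaveMaskShuffle:
//
//     WaveMask mask = WaveGetConvergedMask();
//     float rotated = WaveMaskShuffle(mask, value, (laneId + 1) % laneCount);
//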

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
uint WaveMaskPrefixCountBits(WaveMask mask, bool value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBallotExclusiveBitCount(subgroupBallot($1))";
    case cuda: __intrinsic_asm "__popc(__ballot_sync($0, $1)  & _getLaneLtMask())";
    case hlsl: __intrinsic_asm "WavePrefixCountBits($1)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            %mask:$$uint4 = OpGroupNonUniformBallot Subgroup $value;
            OpGroupNonUniformBallotBitCount $$uint result Subgroup 2 %mask
        };
    }
}

// Across lane ops

__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskBitAnd(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupAnd($1)";
    case cuda: __intrinsic_asm "_waveAnd($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveBitAnd($1)";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformArithmetic;
            OpGroupNonUniformBitwiseAnd $$T result Subgroup 0 $expr
        };
    }
}

__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskBitAnd(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupAnd($1)";
    case cuda: __intrinsic_asm "_waveAndMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveBitAnd($1)";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformArithmetic;
            OpGroupNonUniformBitwiseAnd $$vector<T,N> result Subgroup 0 $expr
        };
    }
}
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveAndMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitAnd($1)")
matrix<T,N,M> WaveMaskBitAnd(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskBitOr(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupOr($1)";
    case cuda: __intrinsic_asm "_waveOr($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveBitOr($1)";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformArithmetic;
            OpGroupNonUniformBitwiseOr $$T result Subgroup 0 $expr
        };
    }
}
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskBitOr(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupOr($1)";
    case cuda: __intrinsic_asm "_waveOrMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveBitOr($1)";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformArithmetic;
            OpGroupNonUniformBitwiseOr $$vector<T,N> result Subgroup 0 $expr
        };
    }
}
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveOrMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitOr($1)")
matrix<T,N,M> WaveMaskBitOr(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskBitXor(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupXor($1)";
    case cuda: __intrinsic_asm "_waveXor($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveBitXor($1)";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformArithmetic;
            OpGroupNonUniformBitwiseXor $$T result Subgroup 0 $expr
        };
    }
}
__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskBitXor(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupXor($1)";
    case cuda: __intrinsic_asm "_waveXorMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveBitXor($1)";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformArithmetic;
            OpGroupNonUniformBitwiseXor $$vector<T,N> result Subgroup 0 $expr
        };
    }
}
__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveXorMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveBitXor($1)")
matrix<T,N,M> WaveMaskBitXor(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskMax(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupMax($1)";
    case cuda: __intrinsic_asm "_waveMax($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveMax($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMax $$T result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMax $$T result Subgroup 0 $expr};
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMax $$T result Subgroup 0 $expr};
    }
}
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskMax(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupMax($1)";
    case cuda: __intrinsic_asm "_waveMaxMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveMax($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMax $$vector<T,N> result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMax $$vector<T,N> result Subgroup 0 $expr};
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMax $$vector<T,N> result Subgroup 0 $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveMaxMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMax($1)")
matrix<T,N,M> WaveMaskMax(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskMin(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupMin($1)";
    case cuda: __intrinsic_asm "_waveMin($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveMin($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMin $$T result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMin $$T result Subgroup 0 $expr};
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMin $$T result Subgroup 0 $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskMin(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupMin($1)";
    case cuda: __intrinsic_asm "_waveMinMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveMin($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMin $$vector<T,N>  result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformSMin $$vector<T,N>  result Subgroup 0 $expr};
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformUMin $$vector<T,N>  result Subgroup 0 $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveMinMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveMin($1)")
matrix<T,N,M> WaveMaskMin(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskProduct(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupMul($1)";
    case cuda: __intrinsic_asm "_waveProduct($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveProduct($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$T result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$uint %uvalue $expr;
                OpGroupNonUniformIMul $$uint %mulResult Subgroup 0 %uvalue;
                OpBitcast $$T result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIMul $$T result Subgroup 0 $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskProduct(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupMul($1)";
    case cuda: __intrinsic_asm "_waveProductMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveProduct($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$vector<T,N> result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$vector<uint,N> %uvalue $expr;
                OpGroupNonUniformIMul $$vector<uint,N> %mulResult Subgroup 0 %uvalue;
                OpBitcast $$vector<T,N> result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIMul $$vector<T,N> result Subgroup 0 $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveProductMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveProduct($1)")
matrix<T,N,M> WaveMaskProduct(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskSum(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupAdd($1)";
    case cuda: __intrinsic_asm "_waveSum($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveSum($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$T result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$uint %uvalue $expr;
                OpGroupNonUniformIAdd $$uint %addResult Subgroup 0 %uvalue;
                OpBitcast $$T result %addResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIAdd $$T result Subgroup 0 $expr};
    }
}
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskSum(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupAdd($1)";
    case cuda: __intrinsic_asm "_waveSumMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveActiveSum($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$vector<T,N> result Subgroup 0 $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic; 
                // TODO: use the correct integer width
                OpBitcast $$vector<uint,N> %uvalue $expr;
                OpGroupNonUniformIAdd $$vector<uint,N> %addResult Subgroup 0 %uvalue;
                OpBitcast $$vector<T,N> result %addResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIAdd $$vector<T,N> result Subgroup 0 $expr};
    }
}
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveSumMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveSum($1)")
matrix<T,N,M> WaveMaskSum(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__cuda_sm_version(7.0)
bool WaveMaskAllEqual(WaveMask mask, T value)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAllEqual($1)";
    case hlsl:
        __intrinsic_asm "WaveActiveAllEqual($1)";
    case cuda:
        __intrinsic_asm "_waveAllEqual($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformVote;
            OpGroupNonUniformAllEqual $$bool result Subgroup $value
        };
    default:
        return false;
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
__cuda_sm_version(7.0)
bool WaveMaskAllEqual(WaveMask mask, vector<T,N> value)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAllEqual($1)";
    case hlsl:
        __intrinsic_asm "WaveActiveAllEqual($1)";
    case cuda:
        __intrinsic_asm "_waveAllEqualMultiple($0, $1)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformVote;
            OpGroupNonUniformAllEqual $$bool result Subgroup $value
        };
    default:
        return false;
    }
}
__generic<T : __BuiltinType, let N : int, let M : int>
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveAllEqualMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveActiveAllEqual($1)")
bool WaveMaskAllEqual(WaveMask mask, matrix<T,N,M> value);

// Prefix

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskPrefixProduct(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveMul($1)";
    case cuda: __intrinsic_asm "_wavePrefixProduct($0, $1)";
    case hlsl: __intrinsic_asm "WavePrefixProduct($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$T result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$uint %uvalue $expr;
                OpGroupNonUniformIMul $$uint %mulResult Subgroup ExclusiveScan %uvalue;
                OpBitcast $$T result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIMul $$T result Subgroup ExclusiveScan $expr};
    }
}
__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskPrefixProduct(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveMul($1)";
    case cuda: __intrinsic_asm "_wavePrefixProductMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WavePrefixProduct($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$vector<T,N> result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$vector<uint,N> %uvalue $expr;
                OpGroupNonUniformIMul $$vector<uint,N> %mulResult Subgroup ExclusiveScan %uvalue;
                OpBitcast $$vector<T,N> result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIMul $$vector<T,N> result Subgroup ExclusiveScan $expr};
    }
}
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_wavePrefixProductMultiple($0, $1)")
__target_intrinsic(hlsl, "WavePrefixProduct($1)")
matrix<T,N,M> WaveMaskPrefixProduct(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskPrefixSum(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveAdd($1)";
    case cuda: __intrinsic_asm "_wavePrefixSum($0, $1)";
    case hlsl: __intrinsic_asm "WavePrefixSum($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$T result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                %uvalue:$$uint = OpBitcast $expr;
                %addResult:$$uint = OpGroupNonUniformIAdd Subgroup ExclusiveScan %uvalue;
                result:$$T = OpBitcast %addResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIAdd $$T result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskPrefixSum(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveAdd($1)";
    case cuda: __intrinsic_asm "_wavePrefixSumMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WavePrefixSum($1)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$vector<T,N> result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                %uvalue:$$vector<uint,N> = OpBitcast $expr;
                %addResult:$$vector<uint,N> = OpGroupNonUniformIAdd Subgroup ExclusiveScan %uvalue;
                result:$$vector<T,N> = OpBitcast %addResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIAdd $$vector<T,N> result Subgroup ExclusiveScan $expr};
    }
}
__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(cuda, "_wavePrefixSumMultiple($0, $1)")
__target_intrinsic(hlsl, "WavePrefixSum($1)")
matrix<T,N,M> WaveMaskPrefixSum(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
T WaveMaskReadLaneFirst(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcastFirst($1)";
    case cuda: __intrinsic_asm "_waveReadFirst($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneFirst($1)";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformBallot; OpGroupNonUniformBroadcastFirst $$T result Subgroup $expr};
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
vector<T,N> WaveMaskReadLaneFirst(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcastFirst($1)";
    case cuda: __intrinsic_asm "_waveReadFirstMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneFirst($1)";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformBallot; OpGroupNonUniformBroadcastFirst $$vector<T,N> result Subgroup $expr};
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveReadFirstMultiple($0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneFirst($1)")
matrix<T,N,M> WaveMaskReadLaneFirst(WaveMask mask, matrix<T,N,M> expr);

// WaveMask SM6.5 like intrinsics

// TODO(JS): On HLSL this only works for types of 32 bits or less

__generic<T : __BuiltinType>
__glsl_extension(GL_NV_shader_subgroup_partitioned)
__spirv_version(1.3)
__cuda_sm_version(7.0)
WaveMask WaveMaskMatch(WaveMask mask, T value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupPartitionNV($1).x";
    case cuda: __intrinsic_asm "_waveMatchScalar($0, $1).x";
    case hlsl: __intrinsic_asm "WaveMatch($1).x";
    case spirv:
        return (spirv_asm
        {
            OpCapability GroupNonUniformPartitionedNV;
            OpExtension "SPV_NV_shader_subgroup_partitioned";
            OpGroupNonUniformPartitionNV $$uint4 result $value
        }).x;
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_NV_shader_subgroup_partitioned)
__spirv_version(1.3)
__cuda_sm_version(7.0)
WaveMask WaveMaskMatch(WaveMask mask, vector<T,N> value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupPartitionNV($1).x";
    case cuda: __intrinsic_asm "_waveMatchMultiple($0, $1).x";
    case hlsl: __intrinsic_asm "WaveMatch($1).x";
    case spirv:
        return (spirv_asm
        {
            OpCapability GroupNonUniformPartitionedNV;
            OpExtension "SPV_NV_shader_subgroup_partitioned";
            OpGroupNonUniformPartitionNV $$uint4 result $value
        }).x;
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMatch($1).x")
__glsl_extension(GL_NV_shader_subgroup_partitioned)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupPartitionNV($1).x")
__cuda_sm_version(7.0)
__target_intrinsic(cuda, "_waveMatchMultiple($0, $1)")
WaveMask WaveMaskMatch(WaveMask mask, matrix<T,N,M> value);
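
// Illustrative usage (not part of the library): `WaveMaskMatch` partitions the
// lanes in `mask` by value, so lanes holding equal values receive the same
// result mask. Here `key` stands for a hypothetical per-lane value:
//
//     WaveMask peers = WaveMaskMatch(WaveGetActiveMask(), key);
//     // `peers` has a bit set for every active lane whose `key` equals ours.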

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskPrefixBitAnd(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveAnd($1)";
    case cuda: __intrinsic_asm "_wavePrefixAnd($0, $1)";
    case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseAnd $$T result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskPrefixBitAnd(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveAnd($1)";
    case cuda: __intrinsic_asm "_wavePrefixAndMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseAnd $$vector<T,N> result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitAnd($1, uint4($0, 0, 0, 0))")
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask($0, $1)")
matrix<T,N,M> WaveMaskPrefixBitAnd(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskPrefixBitOr(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveOr($1)";
    case cuda: __intrinsic_asm "_wavePrefixOr($0, $1)";
    case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseOr $$T result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskPrefixBitOr(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveOr($1)";
    case cuda: __intrinsic_asm "_wavePrefixOrMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseOr $$vector<T,N> result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitOr($1, uint4($0, 0, 0, 0))")
__target_intrinsic(cuda, "_wavePrefixOrMultiple($0, $1)")
matrix<T,N,M> WaveMaskPrefixBitOr(WaveMask mask, matrix<T,N,M> expr);

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveMaskPrefixBitXor(WaveMask mask, T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveXor($1)";
    case cuda: __intrinsic_asm "_wavePrefixXor($0, $1)";
    case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseXor $$T result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WaveMaskPrefixBitXor(WaveMask mask, vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveXor($1)";
    case cuda: __intrinsic_asm "_wavePrefixXorMultiple($0, $1)";
    case hlsl: __intrinsic_asm "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformBitwiseXor $$vector<T,N> result Subgroup ExclusiveScan $expr};
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveMultiPrefixBitXor($1, uint4($0, 0, 0, 0))")
__target_intrinsic(cuda, "_wavePrefixXorMultiple($0, $1)")
matrix<T,N,M> WaveMaskPrefixBitXor(WaveMask mask, matrix<T,N,M> expr);

// Shader model 6.0 stuff

// Information for GLSL wave/subgroup support
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
T QuadReadLaneAt(T sourceValue, uint quadLaneID)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadLaneAt";
    case glsl:
        __intrinsic_asm "subgroupQuadBroadcast";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$T = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID;
        };
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
vector<T,N> QuadReadLaneAt(vector<T,N> sourceValue, uint quadLaneID)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadLaneAt";
    case glsl:
        __intrinsic_asm "subgroupQuadBroadcast";
    case spirv:
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$vector<T,N> = OpGroupNonUniformQuadBroadcast Subgroup $sourceValue $quadLaneID;
        };
    }
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadLaneAt(matrix<T,N,M> sourceValue, uint quadLaneID);


__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
T QuadReadAcrossX(T localValue)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadAcrossX";
    case glsl:
        __intrinsic_asm "subgroupQuadSwapHorizontal($0)";
    case spirv:
        uint direction = 0u;
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
        };
    }
}

__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
vector<T,N> QuadReadAcrossX(vector<T,N> localValue)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadAcrossX";
    case glsl:
        __intrinsic_asm "subgroupQuadSwapHorizontal($0)";
    case spirv:
        uint direction = 0u;
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
        };
    }
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossX(matrix<T,N,M> localValue);

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
T QuadReadAcrossY(T localValue)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadAcrossY";
    case glsl:
        __intrinsic_asm "subgroupQuadSwapVertical($0)";
    case spirv:
        uint direction = 1u;
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
        };
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
vector<T,N> QuadReadAcrossY(vector<T,N> localValue)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadAcrossY";
    case glsl:
        __intrinsic_asm "subgroupQuadSwapVertical($0)";
    case spirv:
        uint direction = 1u;
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
        };
    }
}

__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossY(matrix<T,N,M> localValue);

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
T QuadReadAcrossDiagonal(T localValue)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadAcrossDiagonal";
    case glsl:
        __intrinsic_asm "subgroupQuadSwapDiagonal($0)";
    case spirv:
        uint direction = 2u;
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$T = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
        };
    }
}
__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_quad)
__spirv_version(1.3)
vector<T,N> QuadReadAcrossDiagonal(vector<T,N> localValue)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "QuadReadAcrossDiagonal";
    case glsl:
        __intrinsic_asm "subgroupQuadSwapDiagonal($0)";
    case spirv:
        uint direction = 2u;
        return spirv_asm {
            OpCapability GroupNonUniformQuad;
            result:$$vector<T,N> = OpGroupNonUniformQuadSwap Subgroup $localValue $direction;
        };
    }
}
__generic<T : __BuiltinType, let N : int, let M : int> matrix<T,N,M> QuadReadAcrossDiagonal(matrix<T,N,M> localValue);

// WaveActiveBitAnd, WaveActiveBitOr, WaveActiveBitXor
${{{{
struct WaveActiveBitOpEntry { const char* hlslName; const char* glslName; const char* spirvName; };
const WaveActiveBitOpEntry kWaveActiveBitOpEntries[] = {{"BitAnd", "And", "BitwiseAnd"}, {"BitOr", "Or", "BitwiseOr"}, {"BitXor", "Xor", "BitwiseXor"}};
for (auto opName : kWaveActiveBitOpEntries) {
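// Each loop iteration emits the scalar, vector, and matrix overloads that follow.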
}}}}

__generic<T : __BuiltinIntegerType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveActive$(opName.hlslName)(T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroup$(opName.glslName)($0)";
    case hlsl: __intrinsic_asm "WaveActive$(opName.hlslName)";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniform$(opName.spirvName) $$T result Subgroup Reduce $expr};
    default:
        return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinIntegerType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T, N> WaveActive$(opName.hlslName)(vector<T, N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroup$(opName.glslName)($0)";
    case hlsl: __intrinsic_asm "WaveActive$(opName.hlslName)";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniform$(opName.spirvName) $$vector<T, N> result Subgroup Reduce $expr};
    default:
        return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinIntegerType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActive$(opName.hlslName)(matrix<T, N, M> expr)
{
    return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr);
}
${{{{
} // WaveActiveBitAnd, WaveActiveBitOr, WaveActiveBitXor
}}}}

// WaveActiveMin/Max
${{{{
const char* kWaveActiveMinMaxNames[] = {"Min", "Max"};
for (const char* opName : kWaveActiveMinMaxNames) {
}}}}

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveActive$(opName)(T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroup$(opName)($0)";
    case hlsl: __intrinsic_asm "WaveActive$(opName)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformF$(opName) $$T result Subgroup Reduce $expr};
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformU$(opName) $$T result Subgroup Reduce $expr};
        else
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformS$(opName) $$T result Subgroup Reduce $expr};
    default:
        return WaveMask$(opName)(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T, N> WaveActive$(opName)(vector<T, N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroup$(opName)($0)";
    case hlsl: __intrinsic_asm "WaveActive$(opName)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformF$(opName) $$vector<T, N> result Subgroup Reduce $expr};
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformU$(opName) $$vector<T, N> result Subgroup Reduce $expr};
        else
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformS$(opName) $$vector<T, N> result Subgroup Reduce $expr};
    default:
        return WaveMask$(opName)(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActive$(opName)(matrix<T, N, M> expr)
{
    return WaveMask$(opName)(WaveGetActiveMask(), expr);
}

${{{{
} // WaveActiveMinMax.
}}}}

// WaveActiveProduct/Sum
${{{{
struct WaveActiveProductSumEntry { const char* hlslName; const char* glslName; };
const WaveActiveProductSumEntry kWaveActiveProductSumNames[] = {{"Product", "Mul"}, {"Sum", "Add"}};
for (auto opName : kWaveActiveProductSumNames) {
}}}}

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WaveActive$(opName.hlslName)(T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroup$(opName.glslName)($0)";
    case hlsl: __intrinsic_asm "WaveActive$(opName.hlslName)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {
                OpCapability GroupNonUniformArithmetic;
                OpGroupNonUniformF$(opName.glslName) $$T result Subgroup 0 $expr
            };
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$uint %uvalue $expr;
                OpGroupNonUniformI$(opName.glslName) $$uint %mulResult Subgroup 0 %uvalue;
                OpBitcast $$T result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                OpGroupNonUniformI$(opName.glslName) $$T result Subgroup 0 $expr
            };
    default:
        return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(hlsl)
vector<T,N> WaveActive$(opName.hlslName)(vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroup$(opName.glslName)($0)";
    case hlsl: __intrinsic_asm "WaveActive$(opName.hlslName)";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {
                OpCapability GroupNonUniformArithmetic;
                OpGroupNonUniformF$(opName.glslName) $$vector<T,N> result Subgroup 0 $expr
            };
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$vector<uint,N> %uvalue $expr;
                OpGroupNonUniformI$(opName.glslName) $$vector<uint,N> %$(opName.glslName)Result Subgroup 0 %uvalue;
                OpBitcast $$vector<T,N> result %$(opName.glslName)Result
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                OpGroupNonUniformI$(opName.glslName) $$vector<T,N> result Subgroup 0 $expr
            };
    default:
        return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WaveActive$(opName.hlslName)(matrix<T, N, M> expr)
{
    return WaveMask$(opName.hlslName)(WaveGetActiveMask(), expr);
}
${{{{
} // WaveActiveProduct/WaveActiveSum.
}}}}

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
bool WaveActiveAllEqual(T value)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAllEqual($0)";
    case hlsl:
        __intrinsic_asm "WaveActiveAllEqual";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformVote;
            OpGroupNonUniformAllEqual $$bool result Subgroup $value
        };
    default:
        return WaveMaskAllEqual(WaveGetActiveMask(), value);
    }
}

__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
bool WaveActiveAllEqual(vector<T,N> value)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAllEqual($0)";
    case hlsl:
        __intrinsic_asm "WaveActiveAllEqual";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformVote;
            OpGroupNonUniformAllEqual $$bool result Subgroup $value
        };
    default:
        return WaveMaskAllEqual(WaveGetActiveMask(), value);
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
bool WaveActiveAllEqual(matrix<T, N, M> value)
{
    return WaveMaskAllEqual(WaveGetActiveMask(), value);
}

__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
bool WaveActiveAllTrue(bool condition)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAll($0)";
    case hlsl:
        __intrinsic_asm "WaveActiveAllTrue($0)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformVote;
            OpGroupNonUniformAll $$bool result Subgroup $condition
        };
    default:
        return WaveMaskAllTrue(WaveGetActiveMask(), condition);
    }
}

__glsl_extension(GL_KHR_shader_subgroup_vote)
__spirv_version(1.3)
bool WaveActiveAnyTrue(bool condition)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupAny($0)";
    case hlsl:
        __intrinsic_asm "WaveActiveAnyTrue($0)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformVote;
            OpGroupNonUniformAny $$bool result Subgroup $condition
        };
    default:
        return WaveMaskAnyTrue(WaveGetActiveMask(), condition);
    }
}

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
uint4 WaveActiveBallot(bool condition)
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupBallot($0)";
    case hlsl:
        __intrinsic_asm "WaveActiveBallot";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBallot $$uint4 result Subgroup $condition
        };
    default:
        return WaveMaskBallot(WaveGetActiveMask(), condition);
    }
}

__target_intrinsic(hlsl)
uint WaveActiveCountBits(bool value)
{
    return WaveMaskCountBits(WaveGetActiveMask(), value);
}
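
// Illustrative usage (not part of the library): counting the currently active
// lanes can be expressed as:
//
//     uint activeLaneCount = WaveActiveCountBits(true);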

__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
uint WaveGetLaneCount()
{
    __target_switch
    {
    case glsl: __intrinsic_asm  "(gl_SubgroupSize)";
    case cuda: __intrinsic_asm  "(warpSize)";
    case hlsl: __intrinsic_asm  "WaveGetLaneCount()";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniform;
            result:$$uint = OpLoad builtin(SubgroupSize:uint)
        };
    }
}

__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
uint WaveGetLaneIndex()
{
    __target_switch
    {
    case glsl: __intrinsic_asm  "(gl_SubgroupInvocationID)";
    case cuda: __intrinsic_asm  "_getLaneId()";
    case hlsl: __intrinsic_asm  "WaveGetLaneIndex()";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniform;
            result:$$uint = OpLoad builtin(SubgroupLocalInvocationId:uint)
        };
    }
}

__glsl_extension(GL_KHR_shader_subgroup_basic)
__spirv_version(1.3)
bool WaveIsFirstLane()
{
    __target_switch
    {
    case glsl:
        __intrinsic_asm "subgroupElect()";
    case hlsl:
        __intrinsic_asm "WaveIsFirstLane()";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformElect $$bool result Subgroup
        };
    default:
        return WaveMaskIsFirstLane(WaveGetActiveMask());
    }
}

// It's useful to have a uint4 wave version of countbits, because some wave functions return uint4.
// This implementation limits the amount of work performed based on the actual lane count.
uint _WaveCountBits(uint4 value)
{
    __target_switch
    {
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBallotBitCount $$uint result Subgroup Reduce $value
        };
    default:
        // Since WaveGetLaneCount() should be known at compile time, the branches should boil away
        const uint waveLaneCount = WaveGetLaneCount();
        switch ((waveLaneCount - 1) / 32)
        {
            default:
            case 0: return countbits(value.x);
            case 1: return countbits(value.x) + countbits(value.y);
            case 2: return countbits(value.x) + countbits(value.y) + countbits(value.z);
            case 3: return countbits(value.x) + countbits(value.y) + countbits(value.z) + countbits(value.w);
        }
    }
}
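
// Illustrative usage (not part of the library): count the lanes for which a
// hypothetical per-lane `condition` holds, using the uint4 ballot form:
//
//     uint laneCount = _WaveCountBits(WaveActiveBallot(condition));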


// Prefix

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WavePrefixProduct(T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveMul($0)";
    case hlsl: __intrinsic_asm "WavePrefixProduct";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {
                OpCapability GroupNonUniformArithmetic;
                OpGroupNonUniformFMul $$T result Subgroup ExclusiveScan $expr
            };
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$uint %uvalue $expr;
                OpGroupNonUniformIMul $$uint %mulResult Subgroup ExclusiveScan %uvalue;
                OpBitcast $$T result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIMul $$T result Subgroup ExclusiveScan $expr};
    default:
        return WaveMaskPrefixProduct(WaveGetActiveMask(), expr);
    }
}


__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WavePrefixProduct(vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveMul($0)";
    case hlsl: __intrinsic_asm "WavePrefixProduct";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFMul $$vector<T,N> result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                OpBitcast $$vector<uint,N> %uvalue $expr;
                OpGroupNonUniformIMul $$vector<uint,N> %mulResult Subgroup ExclusiveScan %uvalue;
                OpBitcast $$vector<T,N> result %mulResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIMul $$vector<T,N> result Subgroup ExclusiveScan $expr};
    default:
        return WaveMaskPrefixProduct(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T, N, M> WavePrefixProduct(matrix<T, N, M> expr)
{
    return WaveMaskPrefixProduct(WaveGetActiveMask(), expr);
}

__generic<T : __BuiltinArithmeticType>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
T WavePrefixSum(T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveAdd($0)";
    case hlsl: __intrinsic_asm "WavePrefixSum";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$T result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                %uvalue:$$uint = OpBitcast $expr;
                %addResult:$$uint = OpGroupNonUniformIAdd Subgroup ExclusiveScan %uvalue;
                result:$$T = OpBitcast %addResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIAdd $$T result Subgroup ExclusiveScan $expr};
    default:
        return WaveMaskPrefixSum(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
vector<T,N> WavePrefixSum(vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupExclusiveAdd($0)";
    case hlsl: __intrinsic_asm "WavePrefixSum";
    case spirv:
        if (__isFloat<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformFAdd $$vector<T,N> result Subgroup ExclusiveScan $expr};
        else if (__isSignedInt<T>())
        {
            return spirv_asm
            {
                OpCapability GroupNonUniformArithmetic;
                // TODO: use the correct integer width
                %uvalue:$$vector<uint,N> = OpBitcast $expr;
                %addResult:$$vector<uint,N> = OpGroupNonUniformIAdd Subgroup ExclusiveScan %uvalue;
                result:$$vector<T,N> = OpBitcast %addResult
            };
        }
        else if (__isUnsignedInt<T>())
            return spirv_asm {OpCapability GroupNonUniformArithmetic; OpGroupNonUniformIAdd $$vector<T,N> result Subgroup ExclusiveScan $expr};
    default:
        return WaveMaskPrefixSum(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T,N,M> WavePrefixSum(matrix<T,N,M> expr)
{
    return WaveMaskPrefixSum(WaveGetActiveMask(), expr);
}
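
// Illustrative note (not part of the library): WavePrefixSum is an *exclusive*
// scan; for a hypothetical per-lane value `count`, lane i receives the sum of
// `count` over active lanes 0..i-1, and the first lane receives zero. This is
// the usual building block for stream compaction:
//
//     uint outputOffset = WavePrefixSum(count);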

__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
T WaveReadLaneFirst(T expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcastFirst($0)";
    case hlsl: __intrinsic_asm "WaveReadLaneFirst";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformBallot; OpGroupNonUniformBroadcastFirst $$T result Subgroup $expr};
    default:
        return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
vector<T,N> WaveReadLaneFirst(vector<T,N> expr)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcastFirst($0)";
    case hlsl: __intrinsic_asm "WaveReadLaneFirst";
    case spirv:
        return spirv_asm {OpCapability GroupNonUniformBallot; OpGroupNonUniformBroadcastFirst $$vector<T,N> result Subgroup $expr};
    default:
        return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr);
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
matrix<T,N,M> WaveReadLaneFirst(matrix<T,N,M> expr)
{
    return WaveMaskReadLaneFirst(WaveGetActiveMask(), expr);
}

// NOTE! WaveBroadcastLaneAt is *NOT* standard HLSL.
// It is provided as access to subgroupBroadcast, which can only take a
// constexpr laneId.
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/khr/GL_KHR_shader_subgroup.txt
// SPIR-V versions after 1.4 loosen this restriction and allow a 'dynamically uniform' index.
// If that behavior is required, client code should use WaveReadLaneAt, which works that way.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
T WaveBroadcastLaneAt(T value, constexpr int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcast($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {OpCapability GroupNonUniformBallot; OpGroupNonUniformBroadcast $$T result Subgroup $value $ulane};
    default:
        return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane);
    }
}

__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
vector<T,N> WaveBroadcastLaneAt(vector<T,N> value, constexpr int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBroadcast($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {OpCapability GroupNonUniformBallot; OpGroupNonUniformBroadcast $$vector<T,N> result Subgroup $value $ulane};
    default:
        return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane);
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl, "WaveReadLaneAt")
matrix<T, N, M> WaveBroadcastLaneAt(matrix<T, N, M> value, constexpr int lane)
{
    return WaveMaskBroadcastLaneAt(WaveGetActiveMask(), value, lane);
}
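
// Illustrative usage (not part of the library): the lane index passed to
// WaveBroadcastLaneAt must be a compile-time constant, unlike WaveReadLaneAt.
// Here `value` and `someLaneId` are hypothetical per-lane variables:
//
//     float v0 = WaveBroadcastLaneAt(value, 0);      // OK: constexpr lane
//     float vi = WaveReadLaneAt(value, someLaneId);  // use this for a dynamic lane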

// TODO(JS): If it can be determined that the `laneId` is constexpr, then subgroupBroadcast
// could be used on GLSL. For now we just use subgroupShuffle
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
T WaveReadLaneAt(T value, int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupShuffle($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {OpCapability GroupNonUniformShuffle; OpGroupNonUniformShuffle $$T result Subgroup $value $ulane};
    default:
        return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane);
    }
}

__generic<T : __BuiltinType, let N : int>
__spirv_version(1.3)
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
vector<T,N> WaveReadLaneAt(vector<T,N> value, int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupShuffle($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {OpCapability GroupNonUniformShuffle; OpGroupNonUniformShuffle $$vector<T,N> result Subgroup $value $ulane};
    default:
        return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane);
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(cuda, "_waveShuffleMultiple(_getActiveMask(), $0, $1)")
__target_intrinsic(hlsl)
matrix<T, N, M> WaveReadLaneAt(matrix<T, N, M> value, int lane)
{
    return WaveMaskReadLaneAt(WaveGetActiveMask(), value, lane);
}

// NOTE! WaveShuffle is a NON-STANDARD HLSL intrinsic! It maps to WaveReadLaneAt on HLSL,
// which means it will only work on hardware that allows arbitrary lane ids. That is not
// guaranteed in general, because the HLSL standard requires the lane id to be
// 'dynamically uniform' across the wave.
__generic<T : __BuiltinType>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
T WaveShuffle(T value, int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupShuffle($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {OpCapability GroupNonUniformShuffle; OpGroupNonUniformShuffle $$T result Subgroup $value $ulane};
    default:
        return WaveMaskShuffle(WaveGetActiveMask(), value, lane);
    }
}

__generic<T : __BuiltinType, let N : int>
__glsl_extension(GL_KHR_shader_subgroup_shuffle)
__spirv_version(1.3)
vector<T,N> WaveShuffle(vector<T,N> value, int lane)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupShuffle($0, $1)";
    case hlsl: __intrinsic_asm "WaveReadLaneAt";
    case spirv:
        let ulane = uint(lane);
        return spirv_asm {OpCapability GroupNonUniformShuffle; OpGroupNonUniformShuffle $$vector<T,N> result Subgroup $value $ulane};
    default:
        return WaveMaskShuffle(WaveGetActiveMask(), value, lane);
    }
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl, "WaveReadLaneAt")
matrix<T, N, M> WaveShuffle(matrix<T, N, M> value, int lane)
{
    return WaveMaskShuffle(WaveGetActiveMask(), value, lane);
}
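
// Illustrative usage (not part of the library): rotate values across the wave
// by reading from the next lane; `value` is a hypothetical per-lane variable.
//
//     uint next = (WaveGetLaneIndex() + 1) % WaveGetLaneCount();
//     float rotated = WaveShuffle(value, int(next));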

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
uint WavePrefixCountBits(bool value)
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBallotExclusiveBitCount(subgroupBallot($0))";
    case hlsl: __intrinsic_asm "WavePrefixCountBits($0)";
    case spirv:
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            %mask:$$uint4 = OpGroupNonUniformBallot Subgroup $value;
            OpGroupNonUniformBallotBitCount $$uint result Subgroup ExclusiveScan %mask
        };
    default:
        return WaveMaskPrefixCountBits(WaveGetActiveMask(), value);
    }
}

__glsl_extension(GL_KHR_shader_subgroup_ballot)
__spirv_version(1.3)
uint4 WaveGetConvergedMulti()
{
    __target_switch
    {
    case glsl: __intrinsic_asm "subgroupBallot(true)";
    case hlsl: __intrinsic_asm "WaveActiveBallot(true)";
    case cuda: __intrinsic_asm "make_uint4(__activemask(), 0, 0, 0)";
    case spirv:
        let _true = true;
        return spirv_asm
        {
            OpCapability GroupNonUniformBallot;
            OpGroupNonUniformBallot $$uint4 result Subgroup $_true
        };
    }
}

[ForceInline]
uint4 WaveGetActiveMulti()
{
    return WaveGetConvergedMulti();
}

// Shader model 6.5 stuff
// https://github.com/microsoft/DirectX-Specs/blob/master/d3d/HLSL_ShaderModel6_5.md

__generic<T : __BuiltinType>
__target_intrinsic(hlsl)
uint4 WaveMatch(T value)
{
    return WaveMaskMatch(WaveGetActiveMask(), value);
}

__generic<T : __BuiltinType, let N : int>
__target_intrinsic(hlsl)
uint4 WaveMatch(vector<T,N> value)
{
    return WaveMaskMatch(WaveGetActiveMask(), value);
}

__generic<T : __BuiltinType, let N : int, let M : int>
__target_intrinsic(hlsl)
uint4 WaveMatch(matrix<T,N,M> value)
{
    return WaveMaskMatch(WaveGetActiveMask(), value);
}

__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_popc(__ballot_sync(($1).x, $0) & _getLaneLtMask())")
uint WaveMultiPrefixCountBits(bool value, uint4 mask);

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
__target_intrinsic(cuda, "_wavePrefixAnd(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitAnd(T expr, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveAnd($0)")
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitAnd(vector<T,N> expr, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixAndMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixBitAnd(matrix<T,N,M> expr, uint4 mask);

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
//__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
__target_intrinsic(cuda, "_wavePrefixOr(, _getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitOr(T expr, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
//__target_intrinsic(glsl, "subgroupExclusiveOr($0)")
__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitOr(vector<T,N> expr, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixOrMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixBitOr(matrix<T,N,M> expr, uint4 mask);

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
__target_intrinsic(cuda, "_wavePrefixXor(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixBitXor(T expr, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__glsl_extension(GL_KHR_shader_subgroup_arithmetic)
__spirv_version(1.3)
__target_intrinsic(glsl, "subgroupExclusiveXor($0)")
__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixBitXor(vector<T,N> expr, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixXorMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixBitXor(matrix<T,N,M> expr, uint4 mask);

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProduct(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixProduct(T value, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)")
vector<T,N> WaveMultiPrefixProduct(vector<T,N> value, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixProductMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixProduct(matrix<T,N,M> value, uint4 mask);

__generic<T : __BuiltinArithmeticType>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSum(_getMultiPrefixMask(($1).x), $0)")
T WaveMultiPrefixSum(T value, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0 )")
vector<T,N> WaveMultiPrefixSum(vector<T,N> value, uint4 mask);

__generic<T : __BuiltinArithmeticType, let N : int, let M : int>
__target_intrinsic(hlsl)
__target_intrinsic(cuda, "_wavePrefixSumMultiple(_getMultiPrefixMask(($1).x), $0)")
matrix<T,N,M> WaveMultiPrefixSum(matrix<T,N,M> value, uint4 mask);

// `typedef`s to help with the fact that HLSL has been sorta-kinda case insensitive at various points
typedef Texture2D texture2D;

${{{{

// Buffer types

static const struct {
    char const*         name;
    SlangResourceAccess access;
} kBaseBufferAccessLevels[] = {
    { "",                   SLANG_RESOURCE_ACCESS_READ },
    { "RW",                 SLANG_RESOURCE_ACCESS_READ_WRITE },
    { "RasterizerOrdered",  SLANG_RESOURCE_ACCESS_RASTER_ORDERED },
};
static const int kBaseBufferAccessLevelCount = sizeof(kBaseBufferAccessLevels) / sizeof(kBaseBufferAccessLevels[0]);
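
// For each access level we emit a `typealias` for the corresponding buffer
// type; the extension emitted below then provides GetDimensions, Load, and a
// subscript for it.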

for (int aa = 0; aa < kBaseBufferAccessLevelCount; ++aa)
{
    auto access = kBaseBufferAccessLevels[aa].access;
    sb << "__generic<T,let format:int=0>\n";
    sb << "typealias ";
    sb << kBaseBufferAccessLevels[aa].name;
    sb << "Buffer = __TextureImpl<T, __ShapeBuffer, 0, 0, 0, " << aa << ", 0, 0, format>;\n";
    
    bool isReadOnly = aa == 0;

    char const* glslTextureSizeFunc = (isReadOnly) ? "textureSize" : "imageSize";
    char const* glslLoadFuncName = (isReadOnly) ? "texelFetch" : "imageLoad";
    char const* spvLoadInstName = (isReadOnly) ? "OpImageFetch" : "OpImageRead";
}}}}

__generic<T, let format:int>
extension __TextureImpl<T, __ShapeBuffer, 0, 0, 0, $(aa), 0, 0, format>
{
    [__readNone]
    void GetDimensions(out uint dim)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetDimensions";
        case glsl: __intrinsic_asm "($1 = $(glslTextureSizeFunc)($0))";
        case spirv:
            dim = spirv_asm {
                OpCapability ImageQuery;
                result:$$uint = OpImageQuerySize $this;
            };
        }
    }

    __glsl_extension(GL_EXT_samplerless_texture_functions)
    $(isReadOnly?"[__readNone] ":"")
    T Load(int location)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".Load";
        case glsl: __intrinsic_asm "$(glslLoadFuncName)($0, $1)$z";
        case spirv: return spirv_asm {
                %sampled:__sampledType(T) = $(spvLoadInstName) $this $location;
                __truncate $$T result __sampledType(T) %sampled;
            };
        }
    }

    $(isReadOnly?"[__readNone] ":"")
    T Load(int location, out uint status);

    __subscript(uint index) -> T {

        $(isReadOnly?"[__readNone] ":"")
        [ForceInline]
        get { return Load((int)index); }
${{{{
        if (access != SLANG_RESOURCE_ACCESS_READ) {
}}}}
            [nonmutating] set
            {
                __target_switch
                {
                case hlsl: __intrinsic_asm "($0)[$1] = $2";
                case glsl: __intrinsic_asm "imageStore($0, int($1), $V2)";
                case spirv: spirv_asm {
                        OpImageWrite $this $index $newValue;
                    };
                }
            }

            __intrinsic_op($(kIROp_ImageSubscript))
            ref;
${{{{
        } // access != SLANG_RESOURCE_ACCESS_READ
}}}}

    }
};  // end extension
${{{{
}
}}}}


// DirectX Raytracing (DXR) Support
//
// The following is based on the experimental DXR SDK v0.09.01.
//
// Numbering follows the sections in the "D3D12 Raytracing Functional Spec" v0.09 (2018-03-12)
//

// 10.1.1 - Ray Flags

typedef uint RAY_FLAG;

static const RAY_FLAG RAY_FLAG_NONE                             = 0x00;
static const RAY_FLAG RAY_FLAG_FORCE_OPAQUE                     = 0x01;
static const RAY_FLAG RAY_FLAG_FORCE_NON_OPAQUE                 = 0x02;
static const RAY_FLAG RAY_FLAG_ACCEPT_FIRST_HIT_AND_END_SEARCH  = 0x04;
static const RAY_FLAG RAY_FLAG_SKIP_CLOSEST_HIT_SHADER          = 0x08;
static const RAY_FLAG RAY_FLAG_CULL_BACK_FACING_TRIANGLES       = 0x10;
static const RAY_FLAG RAY_FLAG_CULL_FRONT_FACING_TRIANGLES      = 0x20;
static const RAY_FLAG RAY_FLAG_CULL_OPAQUE                      = 0x40;
static const RAY_FLAG RAY_FLAG_CULL_NON_OPAQUE                  = 0x80;
static const RAY_FLAG RAY_FLAG_SKIP_TRIANGLES                   = 0x100;
static const RAY_FLAG RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES       = 0x200;
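
// Ray flags form a bitmask and may be combined with bitwise OR, e.g.
// (an illustrative sketch):
//
//      RAY_FLAG flags = RAY_FLAG_CULL_BACK_FACING_TRIANGLES
//                     | RAY_FLAG_SKIP_PROCEDURAL_PRIMITIVES;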

// 10.1.2 - Ray Description Structure

__target_intrinsic(hlsl, RayDesc)
__target_intrinsic(cuda, RayDesc)
struct RayDesc
{
    __target_intrinsic(hlsl, Origin)
    __target_intrinsic(cuda, Origin)
    float3 Origin;

    __target_intrinsic(hlsl, TMin)
    __target_intrinsic(cuda, TMin)
    float  TMin;

    __target_intrinsic(hlsl, Direction)
    __target_intrinsic(cuda, Direction)
    float3 Direction;

    __target_intrinsic(hlsl, TMax)
    __target_intrinsic(cuda, TMax)
    float  TMax;
};
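
// As an illustrative sketch (hypothetical names), a ray description is
// typically filled in field by field:
//
//      RayDesc ray;
//      ray.Origin    = cameraPos;
//      ray.Direction = normalize(rayDir);
//      ray.TMin      = 0.001;
//      ray.TMax      = 1e30;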

// 10.1.3 - Ray Acceleration Structure

__builtin
__magic_type(RaytracingAccelerationStructureType)
__intrinsic_type($(kIROp_RaytracingAccelerationStructureType))
struct RaytracingAccelerationStructure {};

// 10.1.4 - Subobject Definitions

// TODO: We may decide to support these, but their reliance on C++ implicit
// constructor call syntax (`SomeType someVar(arg0, arg1);`) makes them
// annoying for the current Slang parsing strategy, and using global variables
// for this stuff comes across as a kludge rather than the best possible design.

// 10.1.5 - Intersection Attributes Structure

__target_intrinsic(hlsl, BuiltInTriangleIntersectionAttributes)
struct BuiltInTriangleIntersectionAttributes
{
    __target_intrinsic(hlsl, barycentrics)
    float2 barycentrics;
};

// 10.2 Shaders

// Right now new shader stages need to be added directly to the compiler
// implementation, rather than being something that can be declared in the stdlib.

// 10.3 - Intrinsics

// 10.3.1

// `executeCallableNV` is the GLSL intrinsic that will be used to implement
// `CallShader()` for GLSL-based targets.
//
__target_intrinsic(GL_NV_ray_tracing, "executeCallableNV")
__target_intrinsic(GL_EXT_ray_tracing, "executeCallableEXT")
void __executeCallable(uint shaderIndex, int payloadLocation);

// Next is the custom intrinsic that will compute the payload location
// for a type being used in a `CallShader()` call for GLSL-based targets.
//
__generic<Payload>
[__readNone]
__intrinsic_op($(kIROp_GetVulkanRayTracingPayloadLocation))
int __callablePayloadLocation(__ref Payload payload);

// Now we provide a hard-coded definition of `CallShader()` for GLSL-based
// targets, which maps the generic HLSL operation into the non-generic
// GLSL equivalent.
//
__generic<Payload>
void CallShader(uint shaderIndex, inout Payload payload)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "CallShader";
    case glsl:
        {
            [__vulkanCallablePayload]
            static Payload p;

            p = payload;
            __executeCallable(shaderIndex, __callablePayloadLocation(p));
            payload = p;
        }
    case spirv:
        {
            [__vulkanCallablePayload]
            static Payload p;

            p = payload;
            spirv_asm {
                OpExecuteCallableKHR $shaderIndex &p
            };
            payload = p;
        }
    }
}
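
// As an illustrative sketch (hypothetical names, not part of this library):
//
//      MyParams params;
//      params.input = someValue;           // hypothetical field
//      CallShader(callableIndex, params);
//      // `params` now holds whatever the callable shader wrote back.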

// 10.3.2

__target_intrinsic(GL_NV_ray_tracing, "traceNV")
__target_intrinsic(GL_EXT_ray_tracing, "traceRayEXT")
void __traceRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    float3                          Origin,
    float                           TMin,
    float3                          Direction,
    float                           TMax,
    int                             PayloadLocation);

// TODO: Slang's parsing logic currently puts modifiers on
// the `GenericDecl` rather than the inner decl when
// using our default syntax, which seems wrong. We need
// to fix this, but for now using the expanded `__generic`
// syntax works in a pinch.
//
__generic<Payload>
[__readNone]
__intrinsic_op($(kIROp_GetVulkanRayTracingPayloadLocation))
int __rayPayloadLocation(__ref Payload payload);

__generic<payload_t>
void TraceRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    RayDesc                         Ray,
    inout payload_t                 Payload)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "TraceRay";
    case cuda: __intrinsic_asm "traceOptiXRay";
    case glsl:
    {
        [__vulkanRayPayload]
        static payload_t p;

        p = Payload;
        __traceRay(
            AccelerationStructure,
            RayFlags,
            InstanceInclusionMask,
            RayContributionToHitGroupIndex,
            MultiplierForGeometryContributionToHitGroupIndex,
            MissShaderIndex,
            Ray.Origin,
            Ray.TMin,
            Ray.Direction,
            Ray.TMax,
            __rayPayloadLocation(p));
        Payload = p;
    }
    case spirv:
    {
        [__vulkanRayPayload]
        static payload_t p;

        p = Payload;
        let origin = Ray.Origin;
        let direction = Ray.Direction;
        let tmin = Ray.TMin;
        let tmax = Ray.TMax;
        spirv_asm {
            OpTraceRayKHR 
                /**/ $AccelerationStructure
                /**/ $RayFlags
                /**/ $InstanceInclusionMask
                /**/ $RayContributionToHitGroupIndex
                /**/ $MultiplierForGeometryContributionToHitGroupIndex
                /**/ $MissShaderIndex
                /**/ $origin
                /**/ $tmin
                /**/ $direction
                /**/ $tmax
                /**/ &p;
        };
        Payload = p;
    }
    }
}
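
// As an illustrative sketch (hypothetical names, not part of this library),
// a ray generation shader might trace a primary ray like this:
//
//      MyPayload payload;
//      payload.color = float3(0, 0, 0);
//      TraceRay(gScene, RAY_FLAG_NONE, 0xFF, 0, 1, 0, ray, payload);
//      // `payload` now holds whatever the closest-hit or miss shader wrote.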

// NOTE!
// The names of the following functions may change when DXR supports
// a feature similar to the `GL_NV_ray_tracing_motion_blur` extension
//
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/nv/GLSL_NV_ray_tracing_motion_blur.txt

__target_intrinsic(glsl, "traceRayMotionNV")
__glsl_version(460)
__glsl_extension(GL_NV_ray_tracing_motion_blur)
__glsl_extension(GL_EXT_ray_tracing)
void __traceMotionRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    float3                          Origin,
    float                           TMin,
    float3                          Direction,
    float                           TMax,
    float                           CurrentTime,
    int                             PayloadLocation);

__generic<payload_t>
void TraceMotionRay(
    RaytracingAccelerationStructure AccelerationStructure,
    uint                            RayFlags,
    uint                            InstanceInclusionMask,
    uint                            RayContributionToHitGroupIndex,
    uint                            MultiplierForGeometryContributionToHitGroupIndex,
    uint                            MissShaderIndex,
    RayDesc                         Ray,
    float                           CurrentTime,
    inout payload_t                 Payload)
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "TraceMotionRay";
    case glsl:
    {
        [__vulkanRayPayload]
        static payload_t p;

        p = Payload;
        __traceMotionRay(
            AccelerationStructure,
            RayFlags,
            InstanceInclusionMask,
            RayContributionToHitGroupIndex,
            MultiplierForGeometryContributionToHitGroupIndex,
            MissShaderIndex,
            Ray.Origin,
            Ray.TMin,
            Ray.Direction,
            Ray.TMax,
            CurrentTime,
            __rayPayloadLocation(p));
        Payload = p;
    }
    case spirv:
    {
        [__vulkanRayPayload]
        static payload_t p;
        
        let origin = Ray.Origin;
        let direction = Ray.Direction;
        let tmin = Ray.TMin;
        let tmax = Ray.TMax;

        p = Payload;
        spirv_asm {
            OpCapability RayTracingMotionBlurNV;
            OpExtension "SPV_NV_ray_tracing_motion_blur";

            OpTraceRayMotionNV
                /**/ $AccelerationStructure
                /**/ $RayFlags
                /**/ $InstanceInclusionMask
                /**/ $RayContributionToHitGroupIndex
                /**/ $MultiplierForGeometryContributionToHitGroupIndex
                /**/ $MissShaderIndex
                /**/ $origin
                /**/ $tmin
                /**/ $direction
                /**/ $tmax
                /**/ $CurrentTime
                /**/ &p;
        };
        Payload = p;
    }
    }
}

// 10.3.3
__target_intrinsic(hlsl)
bool ReportHit<A>(float tHit, uint hitKind, A attributes);

bool __reportIntersection(float tHit, uint hitKind)
{
    __target_switch
    {
    case _GL_EXT_ray_tracing: __intrinsic_asm "reportIntersectionEXT";
    case _GL_NV_ray_tracing: __intrinsic_asm "reportIntersectionNV";
    case spirv:
        return spirv_asm {
            result:$$bool = OpReportIntersectionKHR $tHit $hitKind;
        };
    }
}

__generic<A>
__specialized_for_target(glsl)
__specialized_for_target(spirv)
bool ReportHit(float tHit, uint hitKind, A attributes)
{
    [__vulkanHitAttributes]
    static A a;

    a = attributes;
    return __reportIntersection(tHit, hitKind);
}
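
// As an illustrative sketch (hypothetical names), an intersection shader
// reports a candidate hit together with its attributes:
//
//      MyAttributes attrs;             // hypothetical attribute struct
//      attrs.normal = candidateNormal;
//      bool accepted = ReportHit(tCandidate, /* hitKind: */ 0, attrs);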

// 10.3.4
void IgnoreHit()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "IgnoreHit";
    case _GL_EXT_ray_tracing: __intrinsic_asm "ignoreIntersectionEXT;";
    case _GL_NV_ray_tracing: __intrinsic_asm "ignoreIntersectionNV";
    case cuda: __intrinsic_asm "optixIgnoreIntersection";
    case spirv: spirv_asm { OpIgnoreIntersectionKHR; %_ = OpLabel };
    }
}

// 10.3.5
void AcceptHitAndEndSearch()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "AcceptHitAndEndSearch";
    case _GL_EXT_ray_tracing: __intrinsic_asm "terminateRayEXT;";
    case _GL_NV_ray_tracing: __intrinsic_asm "terminateRayNV";
    case cuda: __intrinsic_asm "optixTerminateRay";
    case spirv: spirv_asm { OpTerminateRayKHR; %_ = OpLabel };
    }
}

// 10.4 - System Values and Special Semantics

// TODO: Many of these functions need to be restricted so that
// they can only be accessed from specific stages.

// 10.4.1 - Ray Dispatch System Values

uint3 DispatchRaysIndex()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "DispatchRaysIndex";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_LaunchIDEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_LaunchIDNV)";
    case cuda: __intrinsic_asm "optixGetLaunchIndex";
    case spirv:
        return spirv_asm {
            result:$$uint3 = OpLoad builtin(LaunchIdKHR:uint3);
        };
    }
}

uint3 DispatchRaysDimensions()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "DispatchRaysDimensions";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_LaunchSizeEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_LaunchSizeNV)";
    case cuda: __intrinsic_asm "optixGetLaunchDimensions";
    case spirv:
        return spirv_asm {
            result:$$uint3 = OpLoad builtin(LaunchSizeKHR:uint3);
        };
    }
}
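
// As an illustrative sketch, these two values are commonly combined to
// compute a normalized coordinate for the current ray:
//
//      uint2  pixel = DispatchRaysIndex().xy;
//      uint2  dims  = DispatchRaysDimensions().xy;
//      float2 uv    = (float2(pixel) + 0.5) / float2(dims);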

// 10.4.2 - Ray System Values

float3 WorldRayOrigin()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "WorldRayOrigin";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_WorldRayOriginEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_WorldRayOriginNV)";
    case cuda: __intrinsic_asm "optixGetWorldRayOrigin";
    case spirv:
        return spirv_asm {
            result:$$float3 = OpLoad builtin(WorldRayOriginKHR:float3);
        };
    }
}

float3 WorldRayDirection()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "WorldRayDirection";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_WorldRayDirectionEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_WorldRayDirectionNV)";
    case cuda: __intrinsic_asm "optixGetWorldRayDirection";
    case spirv:
        return spirv_asm {
            result:$$float3 = OpLoad builtin(WorldRayDirectionKHR:float3);
        };
    }
}

float RayTMin()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "RayTMin";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_RayTminEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_RayTminNV)";
    case cuda: __intrinsic_asm "optixGetRayTmin";
    case spirv:
        return spirv_asm {
            result:$$float = OpLoad builtin(RayTminKHR:float);
        };
    }
}

// Note: The `RayTCurrent()` intrinsic should translate to
// either `gl_HitTNV` (for hit shaders) or `gl_RayTmaxNV`
// (for intersection shaders). Right now we are handling this
// during code emission, for simplicity.
//
// TODO: Once the compiler supports a more refined concept
// of profiles/capabilities and overloading based on them,
// we should simply provide two overloads here, specialized
// to the appropriate Vulkan stages.
//
float RayTCurrent()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "RayTCurrent";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_RayTmaxEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_RayTmaxNV)";
    case cuda: __intrinsic_asm "optixGetRayTmax";
    case spirv:
        return spirv_asm {
            result:$$float = OpLoad builtin(RayTmaxKHR:float);
        };
    }
}

uint RayFlags()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "RayFlags";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_IncomingRayFlagsEXT)";
    case _GL_NV_ray_tracing: __intrinsic_asm "(gl_IncomingRayFlagsNV)";
    case cuda: __intrinsic_asm "optixGetRayFlags";
    case spirv:
        return spirv_asm {
            result:$$uint = OpLoad builtin(IncomingRayFlagsKHR:uint);
        };
    }
}

// 10.4.3 - Primitive/Object Space System Values

uint InstanceIndex()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "InstanceIndex";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_InstanceID)";
    case cuda: __intrinsic_asm "optixGetInstanceIndex";
    case spirv:
        return spirv_asm {
            result:$$uint = OpLoad builtin(InstanceId:uint);
        };
    }
}

uint InstanceID()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "InstanceID";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_InstanceCustomIndexEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "(gl_InstanceCustomIndexNV)";
    case cuda: __intrinsic_asm "optixGetInstanceId";
    case spirv:
        return spirv_asm {
            result:$$uint = OpLoad builtin(InstanceCustomIndexKHR:uint);
        };
    }
}

uint PrimitiveIndex()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "PrimitiveIndex";
    case _GL_EXT_ray_tracing:  __intrinsic_asm "(gl_PrimitiveID)";
    case cuda: __intrinsic_asm "optixGetPrimitiveIndex";
    case spirv:
        return spirv_asm {
            result:$$uint = OpLoad builtin(PrimitiveId:uint);
        };
    }
}

float3 ObjectRayOrigin()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "ObjectRayOrigin";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_ObjectRayOriginEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "(gl_ObjectRayOriginNV)";
    case cuda: __intrinsic_asm "optixGetObjectRayOrigin";
    case spirv:
        return spirv_asm {
            result:$$float3 = OpLoad builtin(ObjectRayOriginKHR:float3);
        };
    }
}

float3 ObjectRayDirection()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "ObjectRayDirection";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_ObjectRayDirectionEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "(gl_ObjectRayDirectionNV)";
    case cuda: __intrinsic_asm "optixGetObjectRayDirection";
    case spirv:
        return spirv_asm {
            result:$$float3 = OpLoad builtin(ObjectRayDirectionKHR:float3);
        };
    }
}

// TODO: OptiX has an optixGetObjectToWorldTransformMatrix function that returns 12
// floats by reference.
float3x4 ObjectToWorld3x4()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "ObjectToWorld3x4";
    case _GL_EXT_ray_tracing: __intrinsic_asm "transpose(gl_ObjectToWorldEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "transpose(gl_ObjectToWorldNV)";
    case spirv:
        return spirv_asm {
            %mat:$$float4x3 = OpLoad builtin(ObjectToWorldKHR:float4x3);
            result:$$float3x4 = OpTranspose %mat;
        };
    }
}

float3x4 WorldToObject3x4()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "WorldToObject3x4";
    case _GL_EXT_ray_tracing: __intrinsic_asm "transpose(gl_WorldToObjectEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "transpose(gl_WorldToObjectNV)";
    case spirv:
        return spirv_asm {
            %mat:$$float4x3 = OpLoad builtin(WorldToObjectKHR:float4x3);
            result:$$float3x4 = OpTranspose %mat;
        };
    }
}

float4x3 ObjectToWorld4x3()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "ObjectToWorld4x3";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_ObjectToWorldEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "(gl_ObjectToWorldNV)";
    case spirv:
        return spirv_asm {
            result:$$float4x3 = OpLoad builtin(ObjectToWorldKHR:float4x3);
        };
    }
}

float4x3 WorldToObject4x3()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "WorldToObject4x3";
    case _GL_EXT_ray_tracing: __intrinsic_asm "(gl_WorldToObjectEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "(gl_WorldToObjectNV)";
    case spirv:
        return spirv_asm {
            result:$$float4x3 = OpLoad builtin(WorldToObjectKHR:float4x3);
        };
    }
}
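
// As an illustrative sketch, an object-space position can be taken to
// world space with the 3x4 form of the transform:
//
//      float3 worldPos = mul(ObjectToWorld3x4(), float4(objectPos, 1.0));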

// NOTE!
// The names of the following functions may change when DXR supports
// a feature similar to the `GL_NV_ray_tracing_motion_blur` extension

__glsl_version(460)
__glsl_extension(GL_NV_ray_tracing_motion_blur)
__glsl_extension(GL_EXT_ray_tracing)
float RayCurrentTime()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "RayCurrentTime";
    case glsl:  __intrinsic_asm "(gl_CurrentRayTimeNV)";
    case spirv:
        return spirv_asm {
            result:$$float = OpLoad builtin(CurrentRayTimeNV:float);
        };
    }
}

// Note: The provisional DXR spec included these unadorned
// `ObjectToWorld()` and `WorldToObject()` functions, so
// we will forward them to the new names as a convenience
// for users who are porting their code.
//
// TODO: Should we provide a deprecation warning on these
// declarations, so that users can know they aren't coding
// against the final spec?
//
float3x4 ObjectToWorld() { return ObjectToWorld3x4(); }
float3x4 WorldToObject() { return WorldToObject3x4(); }

// 10.4.4 - Hit Specific System values
uint HitKind()
{
    __target_switch
    {
    case hlsl:  __intrinsic_asm "HitKind";
    case _GL_EXT_ray_tracing:  __intrinsic_asm "(gl_HitKindEXT)";
    case _GL_NV_ray_tracing:  __intrinsic_asm "(gl_HitKindNV)";
    case cuda:  __intrinsic_asm "optixGetHitKind";
    case spirv:
        return spirv_asm {
            result:$$uint = OpLoad builtin(HitKindKHR:uint);
        };
    }
}

// Pre-defined hit kinds (not documented explicitly)
static const uint HIT_KIND_TRIANGLE_FRONT_FACE  = 254;
static const uint HIT_KIND_TRIANGLE_BACK_FACE   = 255;

//
// Shader Model 6.4
//

// Treats `left` and `right` as 4-component vectors of `UInt8` and computes `dot(left, right) + acc`
uint dot4add_u8packed(uint left, uint right, uint acc);
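
// As an illustrative worked example: with `left = 0x04030201` the packed
// bytes are (1, 2, 3, 4) (least-significant byte first), so
//
//      dot4add_u8packed(0x04030201, 0x01010101, 10) == 1 + 2 + 3 + 4 + 10 == 20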

// Treats `left` and `right` as 4-component vectors of `Int8` and computes `dot(left, right) + acc`
int dot4add_i8packed(uint left, uint right, int acc);

// Computes `dot(left, right) + acc`.
//
// May not produce infinities or NaNs for intermediate results that overflow the range of `half`
float dot2add(float2 left, float2 right, float acc);

//
// Shader Model 6.5
//

//
// Mesh Shaders
//

// Set the number of output vertices and primitives for a mesh shader invocation.
__glsl_extension(GL_EXT_mesh_shader)
__glsl_version(450)
void SetMeshOutputCounts(uint vertexCount, uint primitiveCount)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "SetMeshOutputCounts";
    case glsl:
        __intrinsic_asm "SetMeshOutputsEXT";
    case spirv:
        return spirv_asm
        {
            OpCapability MeshShadingEXT;
            OpExtension "SPV_EXT_mesh_shader";
            OpSetMeshOutputsEXT $vertexCount $primitiveCount;
        };
    }
}

// Specify the number of downstream mesh shader thread groups to invoke from an amplification shader,
// and provide the values for per-mesh payload parameters.
//
// This function doesn't return.
//
[KnownBuiltin("DispatchMesh")]
void DispatchMesh<P>(uint threadGroupCountX, uint threadGroupCountY, uint threadGroupCountZ, __ref P meshPayload)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "DispatchMesh";
    case glsl:
        // This intrinsic doesn't take into account writing meshPayload. That
        // is dealt with separately by 'legalizeDispatchMeshPayloadForGLSL'.
        __intrinsic_asm "EmitMeshTasksEXT($0, $1, $2)";
    case spirv:
        return spirv_asm
        {
            OpCapability MeshShadingEXT;
            OpExtension "SPV_EXT_mesh_shader";
            OpEmitMeshTasksEXT $threadGroupCountX $threadGroupCountY $threadGroupCountZ &meshPayload;
            // OpEmitMeshTasksExt is a terminator, so we need to start a new
            // block to hold whatever comes after this intrinsic
            %_ = OpLabel
        };
    }
}
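
// As an illustrative sketch (hypothetical names, not part of this library),
// an amplification shader fills in a payload and launches mesh groups:
//
//      groupshared MeshPayload payload;    // `MeshPayload` is hypothetical
//
//      payload.instanceID = groupID;
//      DispatchMesh(taskCountX, 1, 1, payload);
//
// while the matching mesh shader declares its output counts up front:
//
//      SetMeshOutputCounts(vertexCount, primitiveCount);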

//
// "Sampler feedback" types `FeedbackTexture2D` and `FeedbackTexture2DArray`.
//

// https://microsoft.github.io/DirectX-Specs/d3d/SamplerFeedback.html

// The docs describe these as 'types', but their syntax makes them seem enum-like, and an enum would be a simpler way to implement them.
// However, Slang enums are always 'enum class'-like, so we use an empty struct type here instead.

[sealed]
[builtin]
interface __BuiltinSamplerFeedbackType {};

[sealed]
__magic_type(FeedbackType, $(int(FeedbackType::Kind::MinMip)))
__target_intrinsic(hlsl, SAMPLER_FEEDBACK_MIN_MIP)
struct SAMPLER_FEEDBACK_MIN_MIP : __BuiltinSamplerFeedbackType {};

[sealed]
__magic_type(FeedbackType, $(int(FeedbackType::Kind::MipRegionUsed)))
__target_intrinsic(hlsl, SAMPLER_FEEDBACK_MIP_REGION_USED)
struct SAMPLER_FEEDBACK_MIP_REGION_USED : __BuiltinSamplerFeedbackType {};

// All of these objects are write-only resources that point to a special kind of unordered access view meant for sampler feedback.
__generic<T:__BuiltinSamplerFeedbackType>
extension __TextureImpl<T,__Shape2D, 0, 0, 0, $(kStdlibResourceAccessFeedback), 0, 0, 0>
{
    // With Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    void WriteSamplerFeedback<S>(Texture2D<S> tex, SamplerState samp, float2 location, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackBias<S>(Texture2D<S> tex, SamplerState samp, float2 location, float bias, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    void WriteSamplerFeedbackGrad<S>(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy, float clamp);

    // Level

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    void WriteSamplerFeedbackLevel<S>(Texture2D<S> tex, SamplerState samp, float2 location, float lod);

    // Without Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3)")
    void WriteSamplerFeedback<S>(Texture2D<S> tex, SamplerState samp, float2 location);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    void WriteSamplerFeedbackBias<S>(Texture2D<S> tex, SamplerState samp, float2 location, float bias);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackGrad<S>(Texture2D<S> tex, SamplerState samp, float2 location, float2 ddx, float2 ddy);
};
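
// As an illustrative sketch (hypothetical resource names), minimum-mip
// feedback is written alongside an ordinary texture fetch:
//
//      FeedbackTexture2D<SAMPLER_FEEDBACK_MIN_MIP> gFeedback;
//      Texture2D<float4>                           gColor;
//      SamplerState                                gSampler;
//
//      gFeedback.WriteSamplerFeedback(gColor, gSampler, uv);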

__generic<T:__BuiltinSamplerFeedbackType>
extension __TextureImpl<T,__Shape2D, 1, 0, 0, $(kStdlibResourceAccessFeedback), 0, 0, 0>
{
    // With Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3, $4)")
    void WriteSamplerFeedback<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackBias<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias, float clamp);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5, $6)")
    void WriteSamplerFeedbackGrad<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy, float clamp);

    // Level

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackLevel($1, $2, $3, $4)")
    void WriteSamplerFeedbackLevel<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float lod);

    // Without Clamp

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedback($1, $2, $3)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedback($1, $2, $3)")
    void WriteSamplerFeedback<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackBias($1, $2, $3, $4)")
    void WriteSamplerFeedbackBias<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float bias);

    __target_intrinsic(hlsl, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    __target_intrinsic(cpp, "($0).WriteSamplerFeedbackGrad($1, $2, $3, $4, $5)")
    void WriteSamplerFeedbackGrad<S>(Texture2DArray<S> texArray, SamplerState samp, float3 location, float3 ddx, float3 ddy);
};

//
// DXR 1.1 and `TraceRayInline` support
//

// Get the index of the geometry that was hit in an intersection, any-hit, or closest-hit shader
__glsl_extension(GL_EXT_ray_tracing)
uint GeometryIndex()
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "GeometryIndex";
    case glsl: __intrinsic_asm "(gl_GeometryIndexEXT)";
    case spirv: return spirv_asm {
            result:$$uint = OpLoad builtin(RayGeometryIndexKHR:uint);
        };
    }
}

// Get the vertex positions of the currently hit triangle in an any-hit or closest-hit shader.
// https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GLSL_EXT_ray_tracing_position_fetch.txt
__glsl_extension(GL_EXT_ray_tracing)
__glsl_extension(GL_EXT_ray_tracing_position_fetch)
__glsl_version(460)
[ForceInline]
float3 HitTriangleVertexPosition(uint index)
{
    __target_switch
    {
        case glsl:
            __intrinsic_asm "gl_HitTriangleVertexPositionsEXT[$0]";
        case spirv:
            return spirv_asm {
                OpCapability RayTracingKHR;
                OpCapability RayTracingPositionFetchKHR;
                OpExtension "SPV_KHR_ray_tracing";
                OpExtension "SPV_KHR_ray_tracing_position_fetch";
                %_ptr_Input_v3float = OpTypePointer Input $$float3;
                %addr : %_ptr_Input_v3float = OpAccessChain builtin(HitTriangleVertexPositionsKHR:float3[3]) $index;
                result:$$float3 = OpLoad %addr;
            };
    }
}
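
// As an illustrative sketch, the three fetched positions can be combined
// with the hit barycentrics (`attr` is a hypothetical
// BuiltInTriangleIntersectionAttributes value) to reconstruct the
// object-space hit position:
//
//      float3 p0 = HitTriangleVertexPosition(0);
//      float3 p1 = HitTriangleVertexPosition(1);
//      float3 p2 = HitTriangleVertexPosition(2);
//      float2 b  = attr.barycentrics;
//      float3 objPos = p0 * (1 - b.x - b.y) + p1 * b.x + p2 * b.y;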

// Status of whether a (closest) hit has been committed in a `RayQuery`.
typedef uint COMMITTED_STATUS;

// No hit committed.
static const COMMITTED_STATUS COMMITTED_NOTHING = 0;

// Closest hit is a triangle.
//
// This could be an opaque triangle hit found by the fixed-function
// traversal and intersection implementation, or a non-opaque
// triangle hit committed by user code with `RayQuery.CommitNonOpaqueTriangleHit`.
//
static const COMMITTED_STATUS COMMITTED_TRIANGLE_HIT = 1;

// Closest hit is a procedural primitive.
//
// A procedural hit primitive is committed using `RayQuery.CommitProceduralPrimitiveHit`.
static const COMMITTED_STATUS COMMITTED_PROCEDURAL_PRIMITIVE_HIT = 2;

// Type of candidate hit that a `RayQuery` is pausing at.
//
// A `RayQuery` can automatically commit hits with opaque triangles,
// but yields to user code for other hits to allow them to be
// dismissed or committed.
//
typedef uint CANDIDATE_TYPE;

// Candidate hit is a non-opaque triangle.
static const CANDIDATE_TYPE CANDIDATE_NON_OPAQUE_TRIANGLE = 0;

// Candidate hit is a procedural primitive.
static const CANDIDATE_TYPE CANDIDATE_PROCEDURAL_PRIMITIVE = 1;

// Handle to state of an in-progress ray-tracing query.
//
// The ray query is effectively a coroutine that user shader
// code can resume to continue tracing the ray, and which yields
// back to the user code at interesting events along the ray.
//
// Note: The treatment of the `RayQuery` type in Slang does not
// perfectly match its semantics in vanilla HLSL in some corner
// cases. Specifically, a `RayQuery` in vanilla HLSL is an
// opaque handle to mutable storage, and assigning a `RayQuery`
// or passing one as a parameter will only copy the *handle*,
// potentially resulting in aliasing of the underlying mutable
// storage.
//
// In contrast, Slang considers a `RayQuery` to own its mutable
// state, and (because the API does not support cloning of queries),
// `RayQuery` values are non-copyable (aka "move-only").
//
// The main place where this arises as a consideration is when
// passing a `RayQuery` down into a function that will perform
// mutating operations on it (e.g., `TraceRay` or `Proceed`):
//
//      void myFunc( inout RayQuery<FLAGS> q )
//      {
//          q.Proceed();
//      }
//
// In Slang, a parameter like `q` above should be declared `inout`.
// HLSL does not care about whether `q` is declared `inout` or not.
//
__glsl_extension(GL_EXT_ray_query)
__glsl_version(460)
[__NonCopyableType]
__intrinsic_type($(kIROp_RayQueryType))
struct RayQuery <let rayFlagsGeneric : RAY_FLAG = RAY_FLAG_NONE>
{
    // Create a new ray query, initialized to its default state.
    //
    __intrinsic_op($(kIROp_AllocateOpaqueHandle))
    __init();

    
    __target_intrinsic(glsl, "rayQueryInitializeEXT($0, $1, $2, $3, $4, $5, $6, $7)")
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [mutating]
    void __rayQueryInitializeEXT(
        RaytracingAccelerationStructure accelerationStructure,
        RAY_FLAG                        rayFlags,
        uint                            instanceInclusionMask,
        float3                          origin,
        float                           tMin,
        float3                          direction,
        float                           tMax)
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "rayQueryInitializeEXT($0, $1, $2, $3, $4, $5, $6, $7)";
        case spirv:
            spirv_asm {
                OpRayQueryInitializeKHR &this $accelerationStructure $rayFlags $instanceInclusionMask $origin $tMin $direction $tMax;
            };
        }
    }

    // Initialize a ray-tracing query.
    //
    // This method may be called on a "fresh" ray query, or
    // on one that is already tracing a ray. In the latter
    // case any state related to the ray previously being
    // traced is overwritten.
    //
    // The `rayFlags` here will be bitwise ORed with
    // the `rayFlags` passed as a generic argument to
    // `RayQuery` to get the effective ray flags, which
    // must obey any API-imposed restrictions.
    //
    [__unsafeForceInlineEarly]
    [mutating]
    void TraceRayInline(
        RaytracingAccelerationStructure accelerationStructure,
        RAY_FLAG                        rayFlags,
        uint                            instanceInclusionMask,
        RayDesc                         ray)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".TraceRayInline";
        case glsl:
        case spirv:
            __rayQueryInitializeEXT(
                accelerationStructure,
                rayFlags | rayFlagsGeneric,
                instanceInclusionMask,
                ray.Origin,
                ray.TMin,
                ray.Direction,
                ray.TMax);
        }
    }
    
    // Resume the ray query coroutine.
    //
    // If the coroutine suspends because of encountering
    // a candidate hit that cannot be resolved with fixed-function
    // logic, this function returns `true`, and the `Candidate*()`
    // functions should be used by application code to resolve
    // the candidate hit (by either committing or ignoring it).
    //
    // If the coroutine terminates because traversal is
    // complete (or has been aborted), this function returns
    // `false`, and application code should use the `Committed*()`
    // functions to appropriately handle the closest hit (if any)
    // that was found.
    //
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [mutating]
    bool Proceed()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".Proceed";
        case glsl: __intrinsic_asm "rayQueryProceedEXT";
        case spirv: return spirv_asm
            {
                result:$$bool = OpRayQueryProceedKHR &this
            };
        }
    }

    // Causes the ray query to terminate.
    //
    // This function causes the ray query to act as if
    // traversal has terminated, so that subsequent
    // `Proceed()` calls will return `false`.
    //
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [mutating]
    void Abort()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".Abort";
        case glsl: __intrinsic_asm "rayQueryTerminateEXT";
        case spirv: spirv_asm { OpRayQueryTerminateKHR &this };
        }
    }

    // Commit the current non-opaque triangle hit.
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    [mutating]
    void CommitNonOpaqueTriangleHit()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CommitNonOpaqueTriangleHit";
        case glsl: __intrinsic_asm "rayQueryConfirmIntersectionEXT";
        case spirv: spirv_asm { OpRayQueryConfirmIntersectionKHR &this };
        }
    }

    // Commit the current procedural primitive hit, with hit time `t`.
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    [mutating]
    void CommitProceduralPrimitiveHit(float t)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CommitProceduralPrimitiveHit";
        case glsl: __intrinsic_asm "rayQueryGenerateIntersectionEXT";
        case spirv: spirv_asm { OpRayQueryGenerateIntersectionKHR &this $t };
        }
    }

    // Get the type of candidate hit being considered.
    //
    // The ray query coroutine will suspend when it encounters
    // a hit that cannot be resolved with fixed-function logic
    // (either a non-opaque triangle or a procedural primitive).
    // In either of those cases, `CandidateType()` will return
    // the kind of candidate hit that must be resolved by
    // user code.
    //
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    CANDIDATE_TYPE CandidateType()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CandidateType";
        case glsl: __intrinsic_asm "rayQueryGetIntersectionTypeEXT($0, false)";
        case spirv:
            uint RayQueryCandidateIntersectionKHR = 0;
            return spirv_asm {
                result:$$CANDIDATE_TYPE = OpRayQueryGetIntersectionTypeKHR &this $RayQueryCandidateIntersectionKHR;
            };
        }
    }

    // Get the status of the committed (closest) hit, if any.
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    COMMITTED_STATUS CommittedStatus()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CommittedStatus";
        case glsl: __intrinsic_asm "rayQueryGetIntersectionTypeEXT($0, true)";
        case spirv:
            uint RayQueryCommittedIntersectionKHR = 1;
            return spirv_asm
            {
                result:$$COMMITTED_STATUS = OpRayQueryGetIntersectionTypeKHR &this $RayQueryCommittedIntersectionKHR;
            };
        }
    }

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    bool CandidateProceduralPrimitiveNonOpaque()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CandidateProceduralPrimitiveNonOpaque";
        case glsl: __intrinsic_asm "(!rayQueryGetIntersectionCandidateAABBOpaqueEXT($0, false))";
        case spirv:
            return spirv_asm
            {
                %rr:$$bool = OpRayQueryGetIntersectionCandidateAABBOpaqueKHR &this;
                result:$$bool = OpLogicalNot %rr;
            };
        }
    }

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    float CandidateTriangleRayT()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CandidateTriangleRayT";
        case glsl: __intrinsic_asm "rayQueryGetIntersectionTEXT($0, false)";
        case spirv:
            uint iCandidateOrCommitted = 0;
            return spirv_asm
            {
                result:$$float = OpRayQueryGetIntersectionTKHR &this $iCandidateOrCommitted;
            };
        }
    }
    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    float CommittedRayT()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".CommittedRayT";
        case glsl: __intrinsic_asm "rayQueryGetIntersectionTEXT($0, true)";
        case spirv:
            uint iCandidateOrCommitted = 1;
            return spirv_asm
            {
                result:$$float = OpRayQueryGetIntersectionTKHR &this $iCandidateOrCommitted;
            };
        }
    }

${{{{
    const char* kCandidateCommitted[] = {"Candidate", "Committed"};

    // Access Candidate and Committed Matrices.
    for (uint32_t candidateOrCommitted = 0; candidateOrCommitted < 2; candidateOrCommitted++)
    {
        auto ccName = kCandidateCommitted[candidateOrCommitted];
        auto ccTF = candidateOrCommitted == 0 ? "false" : "true";
}}}}

    // CandidateObjectToWorld3x4, CandidateWorldToObject4x3
    // CommittedObjectToWorld3x4, CommittedObjectToWorld4x3
    ${{{{
    const char* kRayQueryMatrixNames[] = {"ObjectToWorld", "WorldToObject"};
    for (auto matName : kRayQueryMatrixNames) {
    }}}}

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    float3x4 $(ccName)$(matName)3x4()
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "transpose(rayQueryGetIntersection$(matName)EXT($0, $(ccTF)))";
        case hlsl: __intrinsic_asm ".$(ccName)$(matName)3x4";
        case spirv:
            uint iCandidateOrCommitted = $(candidateOrCommitted);
            return spirv_asm {
                %m:$$float4x3 = OpRayQueryGetIntersection$(matName)KHR &this $iCandidateOrCommitted;
                result:$$float3x4 = OpTranspose %m;
            };
        }
    }

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__readNone]
    float4x3 $(ccName)$(matName)4x3()
    {
        __target_switch
        {
        case glsl: __intrinsic_asm "rayQueryGetIntersection$(matName)EXT($0, $(ccTF))";
        case hlsl: __intrinsic_asm ".$(ccName)$(matName)4x3";
        case spirv:
            uint iCandidateOrCommitted = $(candidateOrCommitted);
            return spirv_asm {
                result:$$float4x3 = OpRayQueryGetIntersection$(matName)KHR &this $iCandidateOrCommitted;
            };
        }
    }

${{{{
    } // ObjectToWorld/WorldToObject.

    // Access Candidate and Committed properties.
    struct RayQueryMethodEntry
    {
        const char* type;
        const char* hlslName;
        const char* glslName;
    };
    const RayQueryMethodEntry rayQueryMethods[] = {
        {"uint", "InstanceIndex", "InstanceId"},
        {"uint", "InstanceID", "InstanceCustomIndex"},
        {"uint", "PrimitiveIndex", "PrimitiveIndex"},
        {"uint", "GeometryIndex", "GeometryIndex"},
        {"uint", "InstanceContributionToHitGroupIndex", "InstanceShaderBindingTableRecordOffset"},
        {"float3", "ObjectRayOrigin", "ObjectRayOrigin"},
        {"float3", "ObjectRayDirection", "ObjectRayDirection"},
        {"bool", "TriangleFrontFace", "FrontFace"},
        {"float2", "TriangleBarycentrics", "Barycentrics"},
    };
    for (auto method : rayQueryMethods) {
}}}}

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    $(method.type) $(ccName)$(method.hlslName)()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".$(ccName)$(method.hlslName)";
        case glsl: __intrinsic_asm "rayQueryGetIntersection$(method.glslName)EXT($0, $(ccTF))";
        case spirv:
            uint iCandidateOrCommitted = $(candidateOrCommitted);
            return spirv_asm {
                result:$$$(method.type) = OpRayQueryGetIntersection$(method.glslName)KHR &this $iCandidateOrCommitted;
            };
        }
    }
${{{{
    } // Candidate/Committed properties.
    } // for ("Candidate", "Committed")
}}}}

    // Access properties of the ray being traced.

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    uint RayFlags()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".RayFlags";
        case glsl: __intrinsic_asm "rayQueryGetRayFlagsEXT";
        case spirv:
            return spirv_asm {
                result:$$uint = OpRayQueryGetRayFlagsKHR &this;
            };
        }
    }

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    float3 WorldRayOrigin()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".WorldRayOrigin";
        case glsl: __intrinsic_asm "rayQueryGetWorldRayOriginEXT";
        case spirv:
            return spirv_asm {
                result:$$float3 = OpRayQueryGetWorldRayOriginKHR &this;
            };
        }
    }

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    float3 WorldRayDirection()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".WorldRayDirection";
        case glsl: __intrinsic_asm "rayQueryGetWorldRayDirectionEXT";
        case spirv:
            return spirv_asm {
                result:$$float3 = OpRayQueryGetWorldRayDirectionKHR &this;
            };
        }
    }

    __glsl_extension(GL_EXT_ray_query)
    __glsl_version(460)
    [__NoSideEffect]
    float RayTMin()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".RayTMin";
        case glsl: __intrinsic_asm "rayQueryGetRayTMinEXT";
        case spirv:
            return spirv_asm {
                result:$$float = OpRayQueryGetRayTMinKHR &this;
            };
        }
    };
}
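
// As an illustrative sketch (hypothetical names, not part of this library),
// a typical inline-traversal loop looks like:
//
//      RayQuery<RAY_FLAG_NONE> q;
//      q.TraceRayInline(gScene, RAY_FLAG_NONE, 0xFF, ray);
//      while (q.Proceed())
//      {
//          if (q.CandidateType() == CANDIDATE_NON_OPAQUE_TRIANGLE)
//          {
//              // e.g. after an alpha test:
//              q.CommitNonOpaqueTriangleHit();
//          }
//      }
//      if (q.CommittedStatus() == COMMITTED_TRIANGLE_HIT)
//      {
//          float t = q.CommittedRayT();
//      }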

//
// Vulkan/SPIR-V specific features
//

struct VkSubpassInput<T>
{
    T SubpassLoad();
}

struct VkSubpassInputMS<T>
{
    T SubpassLoad(int sampleIndex);
}
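
// As an illustrative sketch (the attribute syntax below is an assumption,
// not something these declarations guarantee), a subpass input is typically
// bound to an input attachment and read at the current fragment location:
//
//      [[vk::input_attachment_index(0)]]
//      VkSubpassInput<float4> gInput;
//
//      float4 prev = gInput.SubpassLoad();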


///
/// Shader Execution Reordering (SER)
///
/// NOTE! This API is currently experimental and may change in the future as SER is made available
/// in different APIs and downstream compilers.
///
/// Currently based on NVAPI, and available on D3D12 only.
///
/// White paper on SER via NVAPI: https://developer.nvidia.com/sites/default/files/akamai/gameworks/ser-whitepaper.pdf
///
/// The NVAPI headers (R520) required for this functionality to work can be found here:
///
/// https://developer.nvidia.com/rtx/path-tracing/nvapi/get-started
///
/// For VK, the specification is currently in this PR:
///
/// https://github.com/KhronosGroup/GLSL/pull/196/files

/// Internal helper functions

// This is a bit of a hack for GLSL HitObjectAttributes.
// It relies on [ForceInline] removing the surrounding function and just inserting the *contained* `t` as a global.
// The __ref should indicate that the returned value is not a copy of `t`, but `t` itself;
// in practice, however, __ref does not currently have this effect.
//
// We need this to be able to access the payload outside of a function (which is all that, for example, TraceRay needs).
// We access the HitObjectAttributes via this function for the desired type, and it acts *as if* it were just an access
// to the global `t`.
[ForceInline]
Ref<T> __hitObjectAttributes<T>()
{
    [__vulkanHitObjectAttributes]   
    static T t;
    return t;
}
[ForceInline]
Ptr<T> __allocHitObjectAttributes<T>()
{
    [__vulkanHitObjectAttributes]   
    static T t;
    return &t;
}

// Next is the custom intrinsic that will compute the hitObjectAttributes location
// for GLSL-based targets.
//
__generic<Attributes>
__intrinsic_op($(kIROp_GetVulkanRayTracingPayloadLocation))
int __hitObjectAttributesLocation(__ref Attributes attributes);

    /// Immutable data type representing a ray hit or a miss. Can be used to invoke hit or miss shading,
    /// or as a key in ReorderThread. Created by one of several methods described below. HitObject
    /// and its related functions are available in raytracing shader types only.
[__requiresNVAPI]
__glsl_extension(GL_NV_shader_invocation_reorder)
__glsl_extension(GL_EXT_ray_tracing)
[__NonCopyableType]
__intrinsic_type($(kIROp_HitObjectType))
struct HitObject
{
    __intrinsic_op($(kIROp_AllocateOpaqueHandle))
    __init();

        /// Executes ray traversal (including anyhit and intersection shaders) like TraceRay, but returns the
        /// resulting hit information as a HitObject and does not trigger closesthit or miss shaders.
    [ForceInline]
    static HitObject TraceRay<payload_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        uint RayFlags,
        uint InstanceInclusionMask,
        uint RayContributionToHitGroupIndex,
        uint MultiplierForGeometryContributionToHitGroupIndex,
        uint MissShaderIndex,
        RayDesc Ray,
        inout payload_t Payload)
    {
        __target_switch
        {
        case hlsl:
            {
                HitObject hitObj;
                __hlslTraceRay(
                    AccelerationStructure, 
                    RayFlags, 
                    InstanceInclusionMask, 
                    RayContributionToHitGroupIndex, 
                    MultiplierForGeometryContributionToHitGroupIndex, 
                    MissShaderIndex, 
                    Ray, 
                    Payload,
                    hitObj);
                return hitObj;
            }
        case glsl:
            {
                [__vulkanRayPayload]
                static payload_t p;

                // Save the payload
                p = Payload;

                __glslTraceRay(
                    __return_val,
                    AccelerationStructure,
                    RayFlags,                                           // Assumes D3D/VK have the same RayFlags values
                    InstanceInclusionMask,                              // cullMask
                    RayContributionToHitGroupIndex,                     // sbtRecordOffset
                    MultiplierForGeometryContributionToHitGroupIndex,   // sbtRecordStride
                    MissShaderIndex,
                    Ray.Origin,
                    Ray.TMin,
                    Ray.Direction, 
                    Ray.TMax,
                    __rayPayloadLocation(p));
        
                // Write the payload out
                Payload = p;
            }
        case spirv:
            {
                [__vulkanRayPayload]
                static payload_t p;

                // Save the payload
                p = Payload;

                let origin = Ray.Origin;
                let direction = Ray.Direction;
                let tmin = Ray.TMin;
                let tmax = Ray.TMax;
                spirv_asm {
                    OpHitObjectTraceRayNV
                        /**/ &__return_val
                        /**/ $AccelerationStructure
                        /**/ $RayFlags
                        /**/ $InstanceInclusionMask
                        /**/ $RayContributionToHitGroupIndex
                        /**/ $MultiplierForGeometryContributionToHitGroupIndex
                        /**/ $MissShaderIndex
                        /**/ $origin
                        /**/ $tmin
                        /**/ $direction
                        /**/ $tmax
                        /**/ &p;
                };

                // Write the payload out
                Payload = p;
            }
        }
    }
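
    // As an illustrative sketch (hypothetical names; not part of this
    // library), traversal can be separated from shading:
    //
    //      MyPayload payload;
    //      HitObject hit = HitObject::TraceRay(
    //          gScene, RAY_FLAG_NONE, 0xFF, 0, 1, 0, ray, payload);
    //      // No closesthit or miss shader has run yet; `hit` records the
    //      // traversal result and can be used for reordering or shading later.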

        /// Executes motion ray traversal (including anyhit and intersection shaders) like TraceRay, but returns the
        /// resulting hit information as a HitObject and does not trigger closesthit or miss shaders.
    [ForceInline]
    static HitObject TraceMotionRay<payload_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint RayFlags, 
        uint InstanceInclusionMask, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        uint MissShaderIndex, 
        RayDesc Ray,
        float CurrentTime,
        inout payload_t Payload)
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm "TraceMotionRay";
        case glsl:
            {
                [__vulkanRayPayload]
                static payload_t p;

                // Save the payload
                p = Payload;

                __glslTraceMotionRay(
                    __return_val,
                    AccelerationStructure,
                    RayFlags,                                           // Assumes D3D/VK have the same RayFlags values
                    InstanceInclusionMask,                              // cullMask
                    RayContributionToHitGroupIndex,                     // sbtRecordOffset
                    MultiplierForGeometryContributionToHitGroupIndex,   // sbtRecordStride
                    MissShaderIndex,
                    Ray.Origin,
                    Ray.TMin,
                    Ray.Direction, 
                    Ray.TMax,
                    CurrentTime,
                    __rayPayloadLocation(p));
        
                // Write the payload out
                Payload = p;
            }
        case spirv:
            {
                [__vulkanRayPayload]
                static payload_t p;

                // Save the payload
                p = Payload;

                let origin = Ray.Origin;
                let direction = Ray.Direction;
                let tmin = Ray.TMin;
                let tmax = Ray.TMax;
                spirv_asm {
                    OpCapability RayTracingMotionBlurNV;
                    OpExtension "SPV_NV_ray_tracing_motion_blur";
                    OpHitObjectTraceRayMotionNV
                        /**/ &__return_val
                        /**/ $AccelerationStructure
                        /**/ $RayFlags
                        /**/ $InstanceInclusionMask
                        /**/ $RayContributionToHitGroupIndex
                        /**/ $MultiplierForGeometryContributionToHitGroupIndex
                        /**/ $MissShaderIndex
                        /**/ $origin
                        /**/ $tmin
                        /**/ $direction
                        /**/ $tmax
                        /**/ $CurrentTime
                        /**/ &p;
                };
        
                // Write the payload out
                Payload = p;
            }
        }
        
    }

        /// Creates a HitObject representing a hit based on values explicitly passed as arguments, without
        /// tracing a ray. The primitive specified by AccelerationStructure, InstanceIndex, GeometryIndex,
        /// and PrimitiveIndex must exist. The shader table index is computed using the formula used with
        /// TraceRay. The computed index must reference a valid hit group record in the shader table. The
        /// Attributes parameter must either be an attribute struct, such as
        /// BuiltInTriangleIntersectionAttributes, or another HitObject to copy the attributes from.
    [ForceInline]
    static HitObject MakeHit<attr_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        uint InstanceIndex,
        uint GeometryIndex,
        uint PrimitiveIndex,
        uint HitKind,
        uint RayContributionToHitGroupIndex,
        uint MultiplierForGeometryContributionToHitGroupIndex,
        RayDesc Ray,
        attr_t attributes)
    {
        __target_switch
        {
        case hlsl:
            HitObject hitObj;
            __hlslMakeHit(
                AccelerationStructure, 
                InstanceIndex,
                GeometryIndex,
                PrimitiveIndex,
                HitKind,
                RayContributionToHitGroupIndex,
                MultiplierForGeometryContributionToHitGroupIndex,
                Ray,
                attributes,
                hitObj);
            return hitObj;
        case glsl:
            {
                // Save the attributes
                __hitObjectAttributes<attr_t>() = attributes;

                __glslMakeHit(
                    __return_val,
                    AccelerationStructure,
                    InstanceIndex,
                    PrimitiveIndex,
                    GeometryIndex,
                    HitKind,
                    RayContributionToHitGroupIndex,                         /// sbtRecordOffset?
                    MultiplierForGeometryContributionToHitGroupIndex,       /// sbtRecordStride?
                    Ray.Origin,
                    Ray.TMin,
                    Ray.Direction, 
                    Ray.TMax,
                    __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));
            }
        case spirv:
            {
                // Save the attributes
                Ptr<attr_t> attr = __allocHitObjectAttributes<attr_t>();

                *attr = attributes;

                let origin = Ray.Origin;
                let direction = Ray.Direction;
                let tmin = Ray.TMin;
                let tmax = Ray.TMax;
                spirv_asm {
                    OpHitObjectRecordHitNV
                        /**/ &__return_val
                        /**/ $AccelerationStructure
                        /**/ $InstanceIndex
                        /**/ $PrimitiveIndex
                        /**/ $GeometryIndex
                        /**/ $HitKind
                        /**/ $RayContributionToHitGroupIndex
                        /**/ $MultiplierForGeometryContributionToHitGroupIndex
                        /**/ $origin
                        /**/ $tmin
                        /**/ $direction
                        /**/ $tmax
                        /**/ $attr;
                };
            }
        }
    }

        /// Like MakeHit, but for motion rays: takes an additional CurrentTime parameter.
        /// Currently only supported on Vulkan.
    [ForceInline]
    static HitObject MakeMotionHit<attr_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        RayDesc Ray,
        float CurrentTime,
        attr_t attributes)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm "MakeMotionHit";
        case glsl:
        {
            // Save the attributes
            __hitObjectAttributes<attr_t>() = attributes;

            __glslMakeMotionHit(
                __return_val,
                AccelerationStructure,
                InstanceIndex,
                PrimitiveIndex,
                GeometryIndex,
                HitKind,
                RayContributionToHitGroupIndex,                         /// sbtRecordOffset?
                MultiplierForGeometryContributionToHitGroupIndex,       /// sbtRecordStride?
                Ray.Origin,
                Ray.TMin,
                Ray.Direction, 
                Ray.TMax,
                CurrentTime,
                __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));
        }
        case spirv:
        {
            // Save the attributes
            Ptr<attr_t> attr = __allocHitObjectAttributes<attr_t>();

            *attr = attributes;

            let origin = Ray.Origin;
            let direction = Ray.Direction;
            let tmin = Ray.TMin;
            let tmax = Ray.TMax;
            spirv_asm {
                OpCapability RayTracingMotionBlurNV;
                OpExtension "SPV_NV_ray_tracing_motion_blur";
                OpHitObjectRecordHitMotionNV
                    /**/ &__return_val
                    /**/ $AccelerationStructure
                    /**/ $InstanceIndex
                    /**/ $PrimitiveIndex
                    /**/ $GeometryIndex
                    /**/ $HitKind
                    /**/ $RayContributionToHitGroupIndex
                    /**/ $MultiplierForGeometryContributionToHitGroupIndex
                    /**/ $origin
                    /**/ $tmin
                    /**/ $direction
                    /**/ $tmax
                    /**/ $CurrentTime
                    /**/ $attr;
            };
        }
        }
    }

        /// Creates a HitObject representing a hit based on values explicitly passed as arguments, without
        /// tracing a ray. The primitive specified by AccelerationStructure, InstanceIndex, GeometryIndex,
        /// and PrimitiveIndex must exist. The shader table index is explicitly provided as an argument
        /// instead of being computed from the indexing formula used in TraceRay. The provided index must
        /// reference a valid hit group record in the shader table. The Attributes parameter must either be an
        /// attribute struct, such as BuiltInTriangleIntersectionAttributes, or another HitObject to copy the
        /// attributes from.
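        ///
        /// A minimal usage sketch (illustrative names; the shader table record index
        /// is supplied directly rather than computed from the TraceRay formula):
        ///
        ///     HitObject hit = HitObject::MakeHit(
        ///         hitGroupRecordIdx, tlas, instanceIdx, geometryIdx, primitiveIdx,
        ///         HIT_KIND_TRIANGLE_FRONT_FACE, ray, attribs);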
    [ForceInline]
    static HitObject MakeHit<attr_t>(
        uint HitGroupRecordIndex,
        RaytracingAccelerationStructure AccelerationStructure,
        uint InstanceIndex,
        uint GeometryIndex,
        uint PrimitiveIndex,
        uint HitKind,
        RayDesc Ray,
        attr_t attributes)
    {
        __target_switch
        {
        case hlsl:
            HitObject hitObj;
            __hlslMakeHitWithRecordIndex(
                HitGroupRecordIndex, 
                AccelerationStructure, 
                InstanceIndex,
                GeometryIndex,
                PrimitiveIndex,
                HitKind,
                Ray,
                attributes,
                hitObj);
            return hitObj;
        case glsl:
        {
            // Save the attributes
            __hitObjectAttributes<attr_t>() = attributes;

            __glslMakeHitWithIndex(
                __return_val,
                AccelerationStructure, 
                InstanceIndex,              ///? Same as instanceid ?
                GeometryIndex, 
                PrimitiveIndex,
                HitKind,                    /// Assuming HitKinds are compatible
                HitGroupRecordIndex,        /// sbtRecordIndex
                Ray.Origin,
                Ray.TMin,
                Ray.Direction, 
                Ray.TMax,
                __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));
        }
        case spirv:
        {
            // Save the attributes
            Ptr<attr_t> attr = __allocHitObjectAttributes<attr_t>();
            *attr = attributes;
            let origin = Ray.Origin;
            let direction = Ray.Direction;
            let tmin = Ray.TMin;
            let tmax = Ray.TMax;
            spirv_asm {
                OpHitObjectRecordHitWithIndexNV
                    /**/ &__return_val
                    /**/ $AccelerationStructure
                    /**/ $InstanceIndex
                    /**/ $PrimitiveIndex
                    /**/ $GeometryIndex
                    /**/ $HitKind
                    /**/ $HitGroupRecordIndex
                    /**/ $origin
                    /**/ $tmin
                    /**/ $direction
                    /**/ $tmax
                    /**/ $attr;
            };
        }
        }
    }
        /// Like the explicit-record-index MakeHit, but for motion rays: takes an
        /// additional CurrentTime parameter. Currently only supported on Vulkan.
    [ForceInline]
    static HitObject MakeMotionHit<attr_t>( 
        uint HitGroupRecordIndex, 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        RayDesc Ray, 
        float CurrentTime,
        attr_t attributes)
    {
        __target_switch
        {
        case glsl:
        {
            // Save the attributes
            __hitObjectAttributes<attr_t>() = attributes;

            __glslMakeMotionHitWithIndex(
                __return_val,
                AccelerationStructure, 
                InstanceIndex,              ///? Same as instanceid ?
                GeometryIndex, 
                PrimitiveIndex,
                HitKind,                    /// Assuming HitKinds are compatible
                HitGroupRecordIndex,        /// sbtRecordIndex
                Ray.Origin,
                Ray.TMin,
                Ray.Direction, 
                Ray.TMax,
                CurrentTime,
                __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>()));
        }
        case spirv:
        {
            // Save the attributes
            Ptr<attr_t> attr = __allocHitObjectAttributes<attr_t>();
            *attr = attributes;
            let origin = Ray.Origin;
            let direction = Ray.Direction;
            let tmin = Ray.TMin;
            let tmax = Ray.TMax;
            spirv_asm {
                OpCapability RayTracingMotionBlurNV;
                OpExtension "SPV_NV_ray_tracing_motion_blur";
                OpHitObjectRecordHitWithIndexMotionNV
                    /**/ &__return_val
                    /**/ $AccelerationStructure
                    /**/ $InstanceIndex
                    /**/ $PrimitiveIndex
                    /**/ $GeometryIndex
                    /**/ $HitKind
                    /**/ $HitGroupRecordIndex
                    /**/ $origin
                    /**/ $tmin
                    /**/ $direction
                    /**/ $tmax
                    /**/ $CurrentTime
                    /**/ $attr;
            };
        }
        }
    }

        /// Creates a HitObject representing a miss based on values explicitly passed as arguments, without
        /// tracing a ray. The provided shader table index must reference a valid miss record in the shader
        /// table.
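        ///
        /// A minimal usage sketch (`ray` is an illustrative RayDesc value):
        ///
        ///     HitObject missObj = HitObject::MakeMiss(/* MissShaderIndex: */ 0, ray);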
    [__requiresNVAPI]
    [ForceInline]
    static HitObject MakeMiss( 
        uint MissShaderIndex, 
        RayDesc Ray)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm "($2=NvMakeMiss($0,$1))";
        case glsl:
            __glslMakeMiss(__return_val, MissShaderIndex, Ray.Origin, Ray.TMin, Ray.Direction, Ray.TMax);
        case spirv:
            {
                let origin = Ray.Origin;
                let direction = Ray.Direction;
                let tmin = Ray.TMin;
                let tmax = Ray.TMax;
                spirv_asm {
                    OpHitObjectRecordMissNV
                        /**/ &__return_val
                        /**/ $MissShaderIndex
                        /**/ $origin
                        /**/ $tmin
                        /**/ $direction
                        /**/ $tmax;
                };
            }
        }
    }

        /// Like MakeMiss, but for motion rays: takes an additional CurrentTime parameter.
        /// Currently only supported on Vulkan.
    [ForceInline]
    static HitObject MakeMotionMiss( 
        uint MissShaderIndex, 
        RayDesc Ray,
        float CurrentTime)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm "($3=NvMakeMotionMiss($0,$1,$2))";
        case glsl:
            __glslMakeMotionMiss(__return_val, MissShaderIndex, Ray.Origin, Ray.TMin, Ray.Direction, Ray.TMax, CurrentTime);
        case spirv:
            {
                let origin = Ray.Origin;
                let direction = Ray.Direction;
                let tmin = Ray.TMin;
                let tmax = Ray.TMax;
                spirv_asm {
                    OpCapability RayTracingMotionBlurNV;
                    OpExtension "SPV_NV_ray_tracing_motion_blur";
                    OpHitObjectRecordMissMotionNV
                        /**/ &__return_val
                        /**/ $MissShaderIndex
                        /**/ $origin
                        /**/ $tmin
                        /**/ $direction
                        /**/ $tmax
                        /**/ $CurrentTime;
                };
            }
        }
    }

        /// Creates a HitObject representing “NOP” (no operation) which is neither a hit nor a miss. Invoking a
        /// NOP hit object using HitObject::Invoke has no effect. Reordering by hit objects using
        /// ReorderThread will group NOP hit objects together. This can be useful in some reordering
        /// scenarios where future control flow for some threads is known to process neither a hit nor a
        /// miss.
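        ///
        /// A minimal usage sketch (hypothetical; `tlas` and `payload` are illustrative):
        ///
        ///     HitObject nop = HitObject::MakeNop();
        ///     ReorderThread(nop);                     // NOP hit objects group together
        ///     HitObject::Invoke(tlas, nop, payload);  // no shader is invoked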
    [__requiresNVAPI]
    [ForceInline]
    static HitObject MakeNop()
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm "($0 = NvMakeNop())";
        case glsl:
            __glslMakeNop(__return_val);
        case spirv:
            spirv_asm {
                OpHitObjectRecordEmptyNV
                    /**/ &__return_val;
            };
        }
    }

        /// Invokes closest-hit or miss shading for the specified hit object. In case of a NOP HitObject, no
        /// shader is invoked.
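        ///
        /// A minimal sketch of the trace/reorder/invoke pattern (hypothetical raygen
        /// shader; `tlas`, `ray`, and `payload` are illustrative):
        ///
        ///     HitObject hit = HitObject::TraceRay(tlas, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload);
        ///     ReorderThread(hit, 0, 0);
        ///     HitObject::Invoke(tlas, hit, payload);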
    [__requiresNVAPI]
    [ForceInline]
    static void Invoke<payload_t>(
        RaytracingAccelerationStructure AccelerationStructure,
        HitObject HitOrMiss,
        inout payload_t Payload)
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm "NvInvokeHitObject";
        case glsl:
            {
                [__vulkanRayPayload]
                static payload_t p;

                // Save the payload
                p = Payload;

                __glslInvoke(HitOrMiss, __rayPayloadLocation(p));

                // Write payload result
                Payload = p;
            }
        case spirv:
            {
                [__vulkanRayPayload]
                static payload_t p;

                // Save the payload
                p = Payload;

                spirv_asm {
                    OpHitObjectExecuteShaderNV
                        /**/ &HitOrMiss
                        /**/ &p;
                };

                // Write payload result
                Payload = p;
            }
        }
    }

        /// Returns true if the HitObject encodes a miss, otherwise returns false.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    bool IsMiss()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".IsMiss";
        case glsl: __intrinsic_asm "hitObjectIsMissNV($0)";
        case spirv: return spirv_asm {
                result:$$bool = OpHitObjectIsMissNV &this;
            };
        }
    }

        /// Returns true if the HitObject encodes a hit, otherwise returns false.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    bool IsHit()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".IsHit";
        case glsl: __intrinsic_asm "hitObjectIsHitNV($0)";
        case spirv: return spirv_asm {
                result:$$bool = OpHitObjectIsHitNV &this;
            };
        }
    }

        /// Returns true if the HitObject encodes a NOP, otherwise returns false.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    bool IsNop()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".IsNop";
        case glsl: __intrinsic_asm "hitObjectIsEmptyNV($0)";
        case spirv: return spirv_asm {
                result:$$bool = OpHitObjectIsEmptyNV &this;
            };
        }
    }

        /// Queries ray properties from HitObject. Valid if the hit object represents a hit or a miss.
    [__requiresNVAPI]
    [ForceInline]
    __target_intrinsic(hlsl)
    RayDesc GetRayDesc()
    {
        __target_switch
        {
        case hlsl:
            __intrinsic_asm ".GetRayDesc";
        case glsl:
            {
                RayDesc ray = { __glslGetRayWorldOrigin(), __glslGetTMin(), __glslGetRayWorldDirection(), __glslGetTMax() };
                return ray;
            }
        case spirv:
            return spirv_asm {
                %origin:$$float3 = OpHitObjectGetWorldRayOriginNV &this;
                %tmin:$$float = OpHitObjectGetRayTMinNV &this;
                %direction:$$float3 = OpHitObjectGetWorldRayDirectionNV &this;
                %tmax:$$float = OpHitObjectGetRayTMaxNV &this;
                result:$$RayDesc = OpCompositeConstruct %origin %tmin %direction %tmax;
            };
        }
    }

        /// Queries shader table index from HitObject. Valid if the hit object represents a hit or a miss.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    uint GetShaderTableIndex()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetShaderTableIndex";
        case glsl: __intrinsic_asm "hitObjectGetShaderBindingTableRecordIndexNV($0)";
        case spirv: return spirv_asm {
                result:$$uint = OpHitObjectGetShaderBindingTableRecordIndexNV &this;
            };
        }
    }

        /// Returns the instance index of a hit. Valid if the hit object represents a hit.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    uint GetInstanceIndex()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetInstanceIndex";
        case glsl: __intrinsic_asm "hitObjectGetInstanceIdNV($0)";
        case spirv: return spirv_asm {
                result:$$uint = OpHitObjectGetInstanceIdNV &this;
            };
        }
    }

        /// Returns the instance ID of a hit. Valid if the hit object represents a hit.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    uint GetInstanceID()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetInstanceID";
        case glsl: __intrinsic_asm "hitObjectGetInstanceCustomIndexNV($0)";
        case spirv: return spirv_asm {
                result:$$uint = OpHitObjectGetInstanceCustomIndexNV &this;
            };
        }
    }

        /// Returns the geometry index of a hit. Valid if the hit object represents a hit.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    uint GetGeometryIndex()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetGeometryIndex";
        case glsl: __intrinsic_asm "hitObjectGetGeometryIndexNV($0)";
        case spirv: return spirv_asm {
                result:$$uint = OpHitObjectGetGeometryIndexNV &this;
            };
        }
    }

        /// Returns the primitive index of a hit. Valid if the hit object represents a hit.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    uint GetPrimitiveIndex()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetPrimitiveIndex";
        case glsl: __intrinsic_asm "hitObjectGetPrimitiveIndexNV($0)";
        case spirv: return spirv_asm {
                result:$$uint = OpHitObjectGetPrimitiveIndexNV &this;
            };
        }
    }

        /// Returns the hit kind. Valid if the hit object represents a hit.
    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    uint GetHitKind()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetHitKind";
        case glsl: __intrinsic_asm "hitObjectGetHitKindNV($0)";
        case spirv: return spirv_asm {
                result:$$uint = OpHitObjectGetHitKindNV &this;
            };
        }
    }

    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    float4x3 GetWorldToObject()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetWorldToObject";
        case glsl: __intrinsic_asm "hitObjectGetWorldToObjectNV($0)";
        case spirv: return spirv_asm {
                result:$$float4x3 = OpHitObjectGetWorldToObjectNV &this;
            };
        }
    }

    [__requiresNVAPI]
    [ForceInline]
    __glsl_extension(GL_EXT_ray_tracing)
    float4x3 GetObjectToWorld()
    {
        __target_switch
        {
        case hlsl: __intrinsic_asm ".GetObjectToWorld";
        case glsl: __intrinsic_asm "hitObjectGetObjectToWorldNV($0)";
        case spirv: return spirv_asm {
                result:$$float4x3 = OpHitObjectGetObjectToWorldNV &this;
            };
        }
    }

        /// Returns the attributes of a hit. Valid if the hit object represents a hit.
    [ForceInline]
    attr_t GetAttributes<attr_t>()
    {
        __target_switch
        {
        case hlsl:
            {
                attr_t v;
                __hlslGetAttributesFromHitObject(v);
                return v;
            }
        case glsl:
            {
                // Work out the location
                int attributeLocation = __hitObjectAttributesLocation(__hitObjectAttributes<attr_t>());

                // Load the attributes from the location
                __glslGetAttributes(attributeLocation);

                // Return the attributes
                return __hitObjectAttributes<attr_t>();
            }
        case spirv:
            {
                Ptr<attr_t> attr = __allocHitObjectAttributes<attr_t>();
                spirv_asm {
                    OpHitObjectGetAttributesNV &this $attr;
                };
                return *attr;
            }
        }
    }
        /// Loads a root constant from the local root table referenced by the hit object. Valid if the hit object
        /// represents a hit or a miss. RootConstantOffsetInBytes must be a multiple of 4.
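        ///
        /// A minimal usage sketch (offset 0 reads the first 32-bit constant; its
        /// meaning is defined by the application's local root table layout):
        ///
        ///     uint materialId = hit.LoadLocalRootTableConstant(0);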
    __target_intrinsic(hlsl)
    [__requiresNVAPI]
    uint LoadLocalRootTableConstant(uint RootConstantOffsetInBytes);

    /// 
    /// !!!! Internal NVAPI HLSL impl. Not part of interface! !!!!!!!!!!!!
    /// 

    __target_intrinsic(hlsl, "NvGetAttributesFromHitObject($0, $1)")
    [__requiresNVAPI]
    void __hlslGetAttributesFromHitObject<T>(out T t);

    __target_intrinsic(hlsl, "NvMakeHitWithRecordIndex")
    [__requiresNVAPI]
    static void __hlslMakeHitWithRecordIndex<attr_t>(
        uint HitGroupRecordIndex, 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        RayDesc Ray, 
        attr_t attributes, 
        out HitObject hitObj);

    __target_intrinsic(hlsl, "NvMakeHit")
    [__requiresNVAPI]
    static void __hlslMakeHit<attr_t>(RaytracingAccelerationStructure AccelerationStructure, 
        uint InstanceIndex, 
        uint GeometryIndex, 
        uint PrimitiveIndex, 
        uint HitKind, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        RayDesc Ray, 
        attr_t attributes, 
        out HitObject hitObj);

    __target_intrinsic(hlsl, "NvTraceRayHitObject")
    [__requiresNVAPI]
    static void __hlslTraceRay<payload_t>( 
        RaytracingAccelerationStructure AccelerationStructure, 
        uint RayFlags, 
        uint InstanceInclusionMask, 
        uint RayContributionToHitGroupIndex, 
        uint MultiplierForGeometryContributionToHitGroupIndex, 
        uint MissShaderIndex, 
        RayDesc Ray, 
        inout payload_t Payload,
        out HitObject hitObj);

    /// 
    /// !!!! Internal GLSL GL_NV_shader_invocation_reorder impl. Not part of interface! !!!!!!!!!!!!
    /// 

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_version(460)
    __target_intrinsic(glsl, "hitObjectRecordMissNV")
    static void __glslMakeMiss(
        out HitObject hitObj,
        uint MissShaderIndex,
        float3 Origin,
        float TMin,
        float3 Direction,
        float TMax);

    // "void hitObjectRecordMissNV(hitObjectNV, uint, vec3, float, vec3, float);"
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __glsl_version(460)
    __target_intrinsic(glsl, "hitObjectRecordMissMotionNV")
    static void __glslMakeMotionMiss(
        out HitObject hitObj,
        uint MissShaderIndex,
        float3 Origin,
        float TMin,
        float3 Direction,
        float TMax, 
        float CurrentTime);

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __target_intrinsic(glsl, "hitObjectRecordEmptyNV")
    static void __glslMakeNop(out HitObject hitObj);

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetObjectRayDirectionNV($0)")
    float3 __glslGetRayDirection();

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetWorldRayDirectionNV($0)")
    float3 __glslGetRayWorldDirection();

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetWorldRayOriginNV($0)")
    float3 __glslGetRayWorldOrigin();

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetRayTMaxNV($0)")
    float __glslGetTMax();

    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetRayTMinNV($0)")
    float __glslGetTMin();

    // "void hitObjectRecordHitWithIndexNV(hitObjectNV, accelerationStructureEXT,int,int,int,uint,uint,vec3,float,vec3,float,int);"
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_version(460)
    __target_intrinsic(glsl, "hitObjectRecordHitWithIndexNV")
    static void __glslMakeHitWithIndex(
        out HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        int attributeLocation);

    //  "void hitObjectRecordHitWithIndexMotionNV(hitObjectNV, accelerationStructureEXT,int,int,int,uint,uint,vec3,float,vec3,float,float,int);"
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectRecordHitWithIndexMotionNV")
    static void __glslMakeMotionHitWithIndex(
        out HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        float CurrentTime,
        int attributeLocation);

    // "void hitObjectRecordHitNV(hitObjectNV,accelerationStructureEXT,int,int,int,uint,uint,uint,vec3,float,vec3,float,int);"
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectRecordHitNV")
    static void __glslMakeHit(
        out HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        int attributeLocation);

        // "void hitObjectRecordHitMotionNV(hitObjectNV,accelerationStructureEXT,int,int,int,uint,uint,uint,vec3,float,vec3,float,float,int);"
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectRecordHitMotionNV")
    static void __glslMakeMotionHit(
        out HitObject hitObj,
        RaytracingAccelerationStructure accelerationStructure,
        int instanceid,
        int primitiveid,
        int geometryindex,
        uint hitKind,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        float CurrentTime,
        int attributeLocation);

    
    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectGetAttributesNV($0, $1)")
    void __glslGetAttributes(int attributeLocation);

    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectTraceRayNV")
    static void __glslTraceRay(
        out HitObject hitObject,
        RaytracingAccelerationStructure accelerationStructure,
        uint rayFlags,
        uint cullMask,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        uint missIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        int payload);

    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __glsl_extension(GL_NV_ray_tracing_motion_blur)
    __target_intrinsic(glsl, "hitObjectTraceRayMotionNV")
    static void __glslTraceMotionRay(
        out HitObject hitObject,
        RaytracingAccelerationStructure accelerationStructure,
        uint rayFlags,
        uint cullMask,
        uint sbtRecordOffset,
        uint sbtRecordStride,
        uint missIndex,
        float3 origin,
        float Tmin,
        float3 direction,
        float Tmax,
        float currentTime,
        int payload);

    __glsl_extension(GL_EXT_ray_tracing)
    __glsl_extension(GL_NV_shader_invocation_reorder)
    __target_intrinsic(glsl, "hitObjectExecuteShaderNV")
    static void __glslInvoke(
        HitObject hitObj,
        int payload);
};

    /// Reorders threads based on a coherence hint value. NumCoherenceHintBits indicates how many of
    /// the least significant bits of CoherenceHint should be considered during reordering (max: 16).
    /// Applications should set this to the lowest value required to represent all possible values in
    /// CoherenceHint. For best performance, all threads should provide the same value for
    /// NumCoherenceHintBits.
    /// Where possible, reordering will also attempt to retain locality in the thread’s launch indices
    /// (DispatchRaysIndex in DXR).
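    ///
    /// A minimal usage sketch (a hypothetical 2-bit material classification; the
    /// hint values are illustrative):
    ///
    ///     uint hint = isEmissive ? 3 : (isGlass ? 2 : (isMetal ? 1 : 0));
    ///     ReorderThread(hint, 2);  // only the low 2 bits carry information here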
[__requiresNVAPI]
__glsl_extension(GL_NV_shader_invocation_reorder)
__glsl_extension(GL_EXT_ray_tracing)
void ReorderThread( uint CoherenceHint, uint NumCoherenceHintBitsFromLSB )
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "NvReorderThread";
    case glsl: __intrinsic_asm "reorderThreadNV";
    case spirv:
        spirv_asm {
            OpCapability ShaderInvocationReorderNV;
            OpExtension "SPV_NV_shader_invocation_reorder";
            OpReorderThreadWithHintNV $CoherenceHint $NumCoherenceHintBitsFromLSB;
        };
    }
}

    /// Reorders threads based on a hit object, optionally extended by a coherence hint value. Coherence
    /// hints behave as described in the generic variant of ReorderThread. The maximum number of
    /// coherence hint bits in this variant of ReorderThread is 8. If no coherence hint is desired, set
    /// NumCoherenceHintBits to zero.
    /// Reordering will consider information in the HitObject and coherence hint with the following
    /// priority:
    ///
    /// 1. Shader ID stored in the HitObject
    /// 2. Coherence hint, with the most significant hint bit having highest priority
    /// 3. Spatial information stored in the HitObject
    ///
    /// That is, ReorderThread will first attempt to group threads whose HitObject references the
    /// same shader ID. (Miss shaders and NOP HitObjects are grouped separately). Within each of these
    /// groups, it will attempt to order threads by the value of their coherence hints. And within ranges
    /// of equal coherence hints, it will attempt to maximize locality in 3D space of the ray hit (if any).
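    ///
    /// A minimal usage sketch (reorder primarily by the hit's shader ID, then by a
    /// hypothetical 4-bit application-defined hint):
    ///
    ///     HitObject hit = HitObject::TraceRay(tlas, RAY_FLAG_NONE, 0xff, 0, 1, 0, ray, payload);
    ///     ReorderThread(hit, appHint, 4);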
[__requiresNVAPI]
__glsl_extension(GL_NV_shader_invocation_reorder)
__glsl_extension(GL_EXT_ray_tracing)
void ReorderThread( HitObject HitOrMiss, uint CoherenceHint, uint NumCoherenceHintBitsFromLSB )
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "NvReorderThread";
    case glsl: __intrinsic_asm "reorderThreadNV";
    case spirv:
        spirv_asm {
            OpReorderThreadWithHitObjectNV &HitOrMiss $CoherenceHint $NumCoherenceHintBitsFromLSB;
        };
    }
}

    /// Equivalent to
    /// ```
    /// void ReorderThread( HitObject HitOrMiss, uint CoherenceHint, uint NumCoherenceHintBitsFromLSB );
    /// ```
    /// with CoherenceHint and NumCoherenceHintBitsFromLSB both set to 0, meaning the hint is ignored.
[__requiresNVAPI]
__glsl_extension(GL_NV_shader_invocation_reorder)
void ReorderThread( HitObject HitOrMiss )
{
    __target_switch
    {
    case hlsl: __intrinsic_asm "NvReorderThread";
    case glsl: __intrinsic_asm "reorderThreadNV";
    case spirv:
        spirv_asm {
            OpReorderThreadWithHitObjectNV &HitOrMiss;
        };
    }
}


///
/// DebugBreak support
///
/// There doesn't appear to be an equivalent of debugBreak() for HLSL.

__target_intrinsic(hlsl, "/* debugBreak() not currently supported for HLSL */")
__target_intrinsic(cuda,"__brkpt()")
__target_intrinsic(cpp, "SLANG_BREAKPOINT(0)")
void debugBreak();

__specialized_for_target(glsl)
[[vk::spirv_instruction(1, "NonSemantic.DebugBreak")]]
void debugBreak();

// 
// Realtime Clock support
//

// https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GL_EXT_shader_realtime_clock.txt
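//
// A minimal sketch for timing a region of shader code (`doWork` is an
// illustrative workload; tick frequency is implementation-defined):
//
//     uint2 t0 = getRealtimeClock();
//     doWork();
//     uint2 t1 = getRealtimeClock();
//     uint elapsedLow = t1.x - t0.x;  // low 32 bits of the elapsed tick count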

[__requiresNVAPI]
__glsl_extension(GL_EXT_shader_realtime_clock)
uint getRealtimeClockLow()
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "NvGetSpecial( NV_SPECIALOP_GLOBAL_TIMER_LO)";
    case glsl:
        return getRealtimeClock().x;
    case cuda:
        __intrinsic_asm "clock";
    case spirv:
        return getRealtimeClock().x;
    }
}

__target_intrinsic(cuda, "clock64")
int64_t __cudaGetRealtimeClock();

[__requiresNVAPI]
__glsl_extension(GL_EXT_shader_realtime_clock)
uint2 getRealtimeClock()
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "uint2(NvGetSpecial(NV_SPECIALOP_GLOBAL_TIMER_LO), NvGetSpecial( NV_SPECIALOP_GLOBAL_TIMER_HI))";
    case glsl:
        __intrinsic_asm "clockRealtime2x32EXT()";
    case cuda:
        int64_t ticks = __cudaGetRealtimeClock();
        return uint2(uint(ticks), uint(uint64_t(ticks) >> 32));
    case spirv:
        return spirv_asm
        {
            OpCapability ShaderClockKHR;
            OpExtension "SPV_KHR_shader_clock";
            result : $$uint2 = OpReadClockKHR Device
        };
    }
}

// 
// CUDA specific 
// 

__target_intrinsic(cuda, "(threadIdx)")
[__readNone]
uint3 cudaThreadIdx();

__target_intrinsic(cuda, "(blockIdx)")
[__readNone]
uint3 cudaBlockIdx();

__target_intrinsic(cuda, "(blockDim)")
[__readNone]
uint3 cudaBlockDim();

//
// Workgroup cooperation
//

//
// `saturated_cooperation(c, f, s, u)` will call `f(s, u)` if not all lanes in the
// workgroup are currently executing. However, if all lanes are saturated, then
// for each unique `s` across all the active lanes `c(s, u)` is called. The
// return value is the one corresponding to the input `s` from this lane.
//
// Adjacent calls to saturated_cooperation are subject to fusion, i.e.
//      saturated_cooperation(c1, f1, s, u1);
//      saturated_cooperation(c2, f2, s, u2);
// will be transformed to:
//      saturated_cooperation(c1c2, f1f2, s, u1u2);
// where
//      c1c2 is a function which calls c1(s, u1) and then c2(s, u2);
//      f1f2 is a function which calls f1(s, u1) and then f2(s, u2);
//
// Calls are also fused when the inputs differ:
//      saturated_cooperation(c1, f1, s1, u1);
//      saturated_cooperation(c2, f2, s2, u2);
// will be transformed to:
//      saturated_cooperation(c1c2, f1f2, s1s2, u1u2);
// where
//      s1s2 is a tuple of s1 and s2
//      c1c2 is a function which calls c1(s1, u1) and then c2(s2, u2);
//      f1f2 is a function which calls f1(s1, u1) and then f2(s2, u2);
// Note that in this case, we will make a call to c1c2 for every unique pair
// s1s2 across all lanes.
//
// (This fusion takes place in the fuse-satcoop pass, and as such any changes to
// the signature or behavior of this function should be adjusted for there).
//
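// A minimal usage sketch (hypothetical `cooperateFn`/`fallbackFn`; `matId` is the
// input that is expected to be uniform within each cooperating group):
//
//     func cooperateFn(uint matId, float3 arg) -> float3 { /* whole-wave path */ }
//     func fallbackFn(uint matId, float3 arg) -> float3 { /* per-lane path */ }
//     let result = saturated_cooperation(cooperateFn, fallbackFn, matId, arg);
//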
[KnownBuiltin("saturated_cooperation")]
func saturated_cooperation<A : __BuiltinType, B, C>(
    cooperate : functype (A, B) -> C,
    fallback : functype (A, B) -> C,
    A input,
    B otherArg)
    -> C
{
    return saturated_cooperation_using(cooperate, fallback, __WaveMatchBuitin<A>, __WaveReadLaneAtBuiltin<A>, input, otherArg);
}

// These two functions are a temporary (circa May 2023) workaround to the fact
// that we can't deduce which overload to pass to saturated_cooperation_using
// in the call above.
[__unsafeForceInlineEarly]
func __WaveMatchBuitin<T : __BuiltinType>(T t) -> uint4
{
    return WaveMatch(t);
}
[__unsafeForceInlineEarly]
func __WaveReadLaneAtBuiltin<T : __BuiltinType>(T t, int i) -> T
{
    return WaveReadLaneAt(t, i);
}

//
// saturated_cooperation, but with the wave helper functions specified manually:
//
// waveMatch: a function to return a mask of lanes with the same input as this one
// broadcast: a function which returns the value passed into it on the specified lane
//
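// A minimal usage sketch, passing the built-in wave operations explicitly (this
// mirrors what saturated_cooperation itself does):
//
//     let result = saturated_cooperation_using(
//         cooperateFn, fallbackFn,
//         __WaveMatchBuitin<uint>, __WaveReadLaneAtBuiltin<uint>,
//         matId, arg);
//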
[KnownBuiltin("saturated_cooperation_using")]
func saturated_cooperation_using<A, B, C>(
    cooperate : functype (A, B) -> C,
    fallback : functype (A, B) -> C,
    waveMatch : functype (A) -> uint4,
    broadcast : functype (A, int) -> A,
    A input,
    B otherArg)
    -> C
{
    const bool isWaveSaturated = WaveActiveCountBits(true) == WaveGetLaneCount();
    if(isWaveSaturated)
    {
        let lanesWithSameInput = waveMatch(input).x;
        // Isolate the least significant set bit: the representative lane of our set
        let ourRepresentative = lanesWithSameInput & -lanesWithSameInput;
        // The representative lanes for all lanes
        var allRepresentatives = WaveActiveBitOr(ourRepresentative);

        C ret;

        // Iterate over set bits in mask from low to high.
        // In each iteration the lowest bit is cleared.
        while(bool(allRepresentatives))
        {
            // Broadcast input across warp.
            let laneIdx = firstbitlow(allRepresentatives);
            let uniformInput = broadcast(input, int(laneIdx));

            // All lanes perform some cooperative computation with dynamic
            // uniform input
            C c = cooperate(uniformInput, otherArg);

            // Update our return value while our representative's bit is still set;
            // the final write happens in the iteration that processes our own set.
            if(bool(allRepresentatives & ourRepresentative))
                ret = c;

            // Clear the lowest bit
            allRepresentatives &= allRepresentatives - 1;
        }

        return ret;
    }
    else
    {
        return fallback(input, otherArg);
    }
}


${
// The NVAPI operations are defined to take the space/register
// indices of their texture and sampler parameters, rather than
// taking the texture/sampler objects directly.
//
// In order to support this approach, we need intrinsics that
// can magically fetch the binding information for a resource.
//
// TODO: These operations are kind of *screaming* for us to
// have a built-in `interface` that all of the opaque resource
// types conform to, so that we can define builtins that work
// for any resource type.
}

__intrinsic_op($(kIROp_GetRegisterSpace)) uint __getRegisterSpace<T, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let access:int, let isShadow:int, let isCombined:int, let format:int>(__TextureImpl<T,Shape,isArray,isMS,sampleCount,access,isShadow,isCombined,format> texture);
__intrinsic_op($(kIROp_GetRegisterSpace)) uint __getRegisterSpace(SamplerState sampler);

__intrinsic_op($(kIROp_GetRegisterIndex)) uint __getRegisterIndex<T, Shape: __ITextureShape, let isArray:int, let isMS:int, let sampleCount:int, let access:int, let isShadow:int, let isCombined:int, let format:int>(__TextureImpl<T,Shape,isArray,isMS,sampleCount,access,isShadow,isCombined,format> texture);
__intrinsic_op($(kIROp_GetRegisterIndex)) uint __getRegisterIndex(SamplerState sampler);


${{{{
//
// Texture Footprint Queries
//
// This section introduces the types and methods related
// to the `GL_NV_shader_texture_footprint` GLSL extension,
// and the matching NVAPI operations.
//
// Footprint queries are allowed on both 2D and 3D textures,
// and are structurally similar for the two, so we will
// use a meta-loop to deduplicate the code for the two
// cases.
//

// A footprint query yields a data structure
// that describes blocks of texels that
// conservatively cover the data that might
// be fetched in the query.
//
// A given sampling operation might access two
// mip levels of a texture when, e.g., trilinear
// filtering is on. A footprint query may ask for
// a footprint in either the coarse or fine level
// of the pair.
//
// We first define a `struct` type that closely maps
// to how a footprint is defined for each of the
// implementations we support, and then wrap that
// in a derived `struct` that includes the extra
// data that is returned by the GLSL API via the
// function result.
//
}}}}
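//
// A minimal sketch of consuming a footprint result (illustrative names; the
// `queryFootprintCoarse` method is defined by the meta-loop further below):
//
//     TextureFootprint2D fp = tex.queryFootprintCoarse(granularity, sampler, uv);
//     if (fp.isSingleLevel) { /* only a single mip level would be accessed */ }
//     let anchor = fp.anchor;  // anchor of the footprint's texel-group grid
//     let bits   = fp.mask;    // coverage bitmask within the footprint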

[__NoSideEffect]
[__requiresNVAPI]
vector<uint, ND> __textureFootprintGetAnchor<let ND:int>(__TextureFootprintData<ND> data, int nd)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "NvFootprintExtractAnchorTileLoc$!1D($0)";
    case glsl:
        __intrinsic_asm "$0.anchor";
    case spirv:
        return spirv_asm {
            result:$$vector<uint,ND> = OpCompositeExtract $data 1;
        };
    }
}

[__NoSideEffect]
[__requiresNVAPI]
vector<uint, ND> __textureFootprintGetOffset<let ND:int>(__TextureFootprintData<ND> data, int nd)
{
    __target_switch
    {
    case hlsl:
        __intrinsic_asm "NvFootprintExtractOffset$!1D($0)";
    case glsl:
        __intrinsic_asm "$0.offset";
    case spirv:
        return spirv_asm {
            result:$$vector<uint,ND> = OpCompositeExtract $data 2;
        };
    }
}

__intrinsic_type($(kIROp_TextureFootprintType))
struct __TextureFootprintData<let ND:int>
{
    typealias Anchor        = vector<uint, ND>;
    typealias Offset        = vector<uint, ND>;
    typealias Mask          = uint2;
    typealias LOD           = uint;
    typealias Granularity   = uint;

    property anchor : Anchor
    {
        [__NoSideEffect]
        [__requiresNVAPI]
        [ForceInline]
        get { return __textureFootprintGetAnchor(this, ND); }
    }

    property offset : Offset
    {
        [__NoSideEffect]
        [__requiresNVAPI]
        [ForceInline]
        get { return __textureFootprintGetOffset(this, ND); }
    }

    property mask : Mask
    {
        [__NoSideEffect]
        [__requiresNVAPI]
        get
        {
            __target_switch
            {
            case hlsl:
                __intrinsic_asm "NvFootprintExtractBitmask";
            case glsl:
                __intrinsic_asm "$0.mask";
            case spirv:
                return spirv_asm {
                    result:$$Mask = OpCompositeExtract $this 3;
                };
            }
        }
    }

    property lod : LOD
    {
        [__NoSideEffect]
        [__requiresNVAPI]
        get
        {
            __target_switch
            {
            case hlsl:
                __intrinsic_asm "NvFootprintExtractLOD";
            case glsl:
                __intrinsic_asm "$0.lod";
            case spirv:
                return spirv_asm {
                    result:$$LOD = OpCompositeExtract $this 4;
                };
            }
        }
    }

    property granularity : Granularity
    {
        [__NoSideEffect]
        [__requiresNVAPI]
        get
        {
            __target_switch
            {
            case hlsl:
                __intrinsic_asm "NvFootprintExtractReturnGran";
            case glsl:
                __intrinsic_asm "$0.granularity";
            case spirv:
                return spirv_asm {
                    result:$$Granularity = OpCompositeExtract $this 5;
                };
            }
        }
    }
}

struct TextureFootprint<let ND:int> : __TextureFootprintData<ND>
{
    bool _isSingleLevel;

    property isSingleLevel : bool
    {
        [__NoSideEffect]
        get
        {
            return _isSingleLevel;
        }
    }
}

typealias TextureFootprint2D = TextureFootprint<2>;
typealias TextureFootprint3D = TextureFootprint<3>;

${
// We define the new operations via an `extension`
// on the relevant texture type(s), rather than
// further clutter the original type declarations.
}

__generic<T, Shape: __ITextureShape, let sampleCount:int, let isShadow:int, let format:int>
extension __TextureImpl<T,Shape,0,0,sampleCount,0,isShadow,0,format>
{
${
// We introduce a few convenience type aliases here,
// which both keep our declarations simpler and easier
// to understand, but which might *also* be useful to
// users of the stdlib, so that they can write things
// like `Texture2D.Footprint`, and also have auto-complete
// help them find such members.
//
// TODO: The `Coords` type really ought to be something
// defined on the base texture types, rather than via
// this `extension`.
}
    typealias Coords = vector<float, Shape.dimensions>;
    typealias Footprint = TextureFootprint<Shape.dimensions>;
    typealias __FootprintData = __TextureFootprintData<Shape.dimensions>;
    typealias FootprintGranularity = Footprint.Granularity;

${
// For the GLSL extension, the choice between the
// coarse and fine level is modeled as a `bool`
// parameter to the query operation(s). We define
// the GLSL functions here as intrinsics, so that
// we can refer to them later in the definitions
// of our stdlib operations.
//
// Note: despite the GLSL extension defining the `granularity`
// member of the query result as having type `uint`, the
// function signatures all take `int` parameters for the
// granularity instead.
//
}

    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    bool __queryFootprintGLSL(
            SamplerState    sampler,
            Coords          coords,
            int             granularity,
            bool            useCoarseLevel,
            out __FootprintData footprint)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }

    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    bool __queryFootprintGLSL(
            SamplerState    sampler,
            Coords          coords,
            int             granularity,
            bool            useCoarseLevel,
            out __FootprintData footprint,
            float           bias)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel Bias $bias;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }

    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    __glsl_extension(GL_ARB_sparse_texture_clamp)
    bool __queryFootprintClampGLSL(
            SamplerState    sampler,
            Coords          coords,
            float           lodClamp,
            int             granularity,
            bool            useCoarseLevel,
            out __FootprintData footprint)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintClampNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpCapability MinLod;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel MinLod $lodClamp;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }

    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    __glsl_extension(GL_ARB_sparse_texture_clamp)
    bool __queryFootprintClampGLSL(
            SamplerState    sampler,
            Coords          coords,
            float           lodClamp,
            int             granularity,
            bool            useCoarseLevel,
            out __FootprintData footprint,
            float           bias)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintClampNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpCapability MinLod;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel Bias|MinLod $bias $lodClamp;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }

    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    bool __queryFootprintLodGLSL(
            SamplerState            sampler,
            Coords                  coords,
            float                   lod,
            int                     granularity,
            bool                    useCoarseLevel,
            out __FootprintData         footprint)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintLodNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel Lod $lod;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }


${{{
    // Texture sampling with gradient is only available for 2D textures.
}}}
    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    bool __queryFootprintGradGLSL(
            SamplerState    sampler,
            Coords          coords,
            Coords          dx,
            Coords          dy,
            int             granularity,
            bool            useCoarseLevel,
            out __FootprintData footprint)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintGradNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel Grad $dx $dy;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }

    [__NoSideEffect]
    __glsl_version(450)
    __glsl_extension(GL_NV_shader_texture_footprint)
    __glsl_extension(GL_ARB_sparse_texture_clamp)
    bool __queryFootprintGradClampGLSL(
            SamplerState    sampler,
            Coords          coords,
            Coords          dx,
            Coords          dy,
            float           lodClamp,
            int             granularity,
            bool            useCoarseLevel,
            out __FootprintData footprint)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "textureFootprintGradClampNV($p, $*2)";
        case spirv:
            return spirv_asm {
                OpCapability ImageFootprintNV;
                OpCapability MinLod;
                OpExtension "SPV_NV_shader_image_footprint";
                %sampledImage:__sampledImageType(this) = OpSampledImage $this $sampler;
                %resultVal:$$__FootprintData = OpImageSampleFootprintNV %sampledImage $coords $granularity $useCoarseLevel Grad|MinLod $dx $dy $lodClamp;
                OpStore &footprint %resultVal;
                result:$$bool = OpCompositeExtract %resultVal 0;
            };
        }
    }
${{{
    // End texture2D specific functions.
}}}


${{{{
// The NVAPI texture query operations encode the choice
// between coarse and fine levels as part of the function
// name, and so we are forced to match this convention
// if we want to provide a more portable API.
//
// TODO: We could conceivably define the functions to use
// a parameter for the coarse/fine choice, which is required
// to be `constexpr` for the HLSL/NVAPI target.
//
static const struct LevelChoice
{
char const* name;
char const* isCoarseVal;
} kLevelChoices[] =
{
    { "Coarse", "true" },
    { "Fine", "false" },
};
for(auto levelChoice : kLevelChoices)
{
    auto CoarseOrFine = levelChoice.name;
    auto isCoarseVal = levelChoice.isCoarseVal;

// We now go ahead and define the intrinsics provided by NVAPI,
// which have a very different signature from the GLSL ones.
//
// Note: the NVAPI functions also support an optional texel
// offset parameter. For now we are not including overloads
// with that parameter, since they have no equivalent in
// the GLSL extension.
//
}}}}

    [__NoSideEffect]
    [__requiresNVAPI]
    __target_intrinsic(hlsl,
        "NvFootprint$(CoarseOrFine)($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)")
    static __FootprintData __queryFootprint$(CoarseOrFine)NVAPI(
        int                     nd,
        uint                    textureSpace,
        uint                    textureIndex,
        uint                    samplerSpace,
        uint                    samplerIndex,
        float3                  coords,
        FootprintGranularity    granularity, 
        out uint                isSingleLod);

    [__NoSideEffect]
    [__requiresNVAPI]
    __target_intrinsic(hlsl,
        "NvFootprint$(CoarseOrFine)Bias($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)")
    static __FootprintData __queryFootprint$(CoarseOrFine)BiasNVAPI(
        int                     nd,
        uint                    textureSpace,
        uint                    textureIndex,
        uint                    samplerSpace,
        uint                    samplerIndex,
        float3                  coords,
        FootprintGranularity    granularity,
        float                   lodBias, 
        out uint                isSingleLod);

    [__NoSideEffect]
    [__requiresNVAPI]
    __target_intrinsic(hlsl,
        "NvFootprint$(CoarseOrFine)Level($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)")
    static __FootprintData __queryFootprint$(CoarseOrFine)LevelNVAPI(
        int                     nd,
        uint                    textureSpace,
        uint                    textureIndex,
        uint                    samplerSpace,
        uint                    samplerIndex,
        float3                  coords,
        FootprintGranularity    granularity,
        float                   lod, 
        out uint                isSingleLod);

    [__NoSideEffect]
    [__requiresNVAPI]
    __target_intrinsic(hlsl,
        "NvFootprint$(CoarseOrFine)Grad($1, $2, $3, $4, NV_EXTN_TEXTURE_$!0D, $*5)")
    static __FootprintData __queryFootprint$(CoarseOrFine)GradNVAPI(
        int                     nd,
        uint                    textureSpace,
        uint                    textureIndex,
        uint                    samplerSpace,
        uint                    samplerIndex,
        float3                  coords,
        FootprintGranularity    granularity,
        float3                  dx,
        float3                  dy, 
        out uint                isSingleLod);

${
// We now define the portable operations that will be officially
// supported by the standard library. For each operation, we
// need to provide both a version that maps to the GLSL extension,
// and a version that uses the NVAPI functions.
//
// Some function variations are only available with one extension
// or the other, so we try our best to only define them where
// each is available.
//
// Note that these functions cannot be marked [ForceInline] for now,
// because the texture resource may get removed after DCE: the only
// uses of those resources go through __getRegisterIndex/Space, which are
// replaced early in the compilation process with their binding slots.
// Not inlining these functions is a quick way to ensure the texture
// always has live uses.
//
}

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.Sample(sampler, coords);
    ///
    [__NoSideEffect]
    Footprint queryFootprint$(CoarseOrFine)(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintGLSL(sampler, coords, granularity, $(isCoarseVal), footprint);
            return footprint;

        case hlsl:
            uint isSingleLod = 0;
            Footprint footprint = {__queryFootprint$(CoarseOrFine)NVAPI(
                Shape.dimensions,
                __getRegisterSpace(this), __getRegisterIndex(this),
                __getRegisterSpace(sampler), __getRegisterIndex(sampler),
                __vectorReshape<3>(coords), granularity, /* out */isSingleLod), false};
            footprint._isSingleLevel = (isSingleLod != 0);
            return footprint;
        }
    }
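
    // A minimal usage sketch (hypothetical; `t`, `s`, `uv`, and `granularity`
    // are caller-provided values, not part of this library):
    //
    //     Footprint fp = t.queryFootprint$(CoarseOrFine)(granularity, s, uv);
    //
    // The returned footprint describes the set of texels that an equivalent
    // `t.Sample(s, uv)` call could read.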

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.SampleBias(sampler, coords, lodBias);
    ///
    [__NoSideEffect]
    Footprint queryFootprint$(CoarseOrFine)Bias(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords,
            float                   lodBias)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintGLSL(sampler, coords, granularity, $(isCoarseVal), footprint, lodBias);
            return footprint;
        case hlsl:
            uint isSingleLod = 0;
            Footprint footprint = {__queryFootprint$(CoarseOrFine)BiasNVAPI(
                Shape.dimensions,
                __getRegisterSpace(this), __getRegisterIndex(this),
                __getRegisterSpace(sampler), __getRegisterIndex(sampler),
                __vectorReshape<3>(coords), granularity, lodBias, /* out */isSingleLod), false};

            footprint._isSingleLevel = (isSingleLod != 0);
            return footprint;
        }
    }

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.SampleClamp(sampler, coords, lodClamp);
    ///
    [__NoSideEffect]
    Footprint queryFootprint$(CoarseOrFine)Clamp(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords,
            float                   lodClamp)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintClampGLSL(sampler, coords, lodClamp, granularity, $(isCoarseVal), footprint);
            return footprint;
        }
    }

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.SampleBiasClamp(sampler, coords, lodBias, lodClamp);
    ///
    [__NoSideEffect]
    Footprint queryFootprint$(CoarseOrFine)BiasClamp(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords,
            float                   lodBias,
            float                   lodClamp)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintClampGLSL(sampler, coords, lodClamp, granularity, $(isCoarseVal), footprint, lodBias);
            return footprint;
        }
    }

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.SampleLevel(sampler, coords, lod);
    ///
    [__NoSideEffect]
    Footprint queryFootprint$(CoarseOrFine)Level(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords,
            float                   lod)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintLodGLSL(sampler, coords, lod, granularity, $(isCoarseVal), footprint);
            return footprint;
        case hlsl:
            uint isSingleLod = 0;
            Footprint footprint = {__queryFootprint$(CoarseOrFine)LevelNVAPI(
                Shape.dimensions,
                __getRegisterSpace(this), __getRegisterIndex(this),
                __getRegisterSpace(sampler), __getRegisterIndex(sampler),
                __vectorReshape<3>(coords), granularity, lod, /* out */isSingleLod), false};

            footprint._isSingleLevel = (isSingleLod != 0);
            return footprint;
        }
    }

${{{
    // TODO: Texture sampling with gradient is only available for 2D textures.
}}}

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.SampleGrad(sampler, coords, dx, dy);
    ///
    [__NoSideEffect] [ForceInline]
    Footprint queryFootprint$(CoarseOrFine)Grad(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords,
            Coords                  dx,
            Coords                  dy)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintGradGLSL(sampler, coords, dx, dy, granularity, $(isCoarseVal), footprint);
            return footprint;
        case hlsl:
            uint isSingleLod = 0;
            Footprint footprint = {__queryFootprint$(CoarseOrFine)GradNVAPI(
                Shape.dimensions,
                __getRegisterSpace(this), __getRegisterIndex(this),
                __getRegisterSpace(sampler), __getRegisterIndex(sampler),
                __vectorReshape<3>(coords), granularity, __vectorReshape<3>(dx), __vectorReshape<3>(dy), /* out */isSingleLod), false};

            footprint._isSingleLevel = (isSingleLod != 0);
            return footprint;
        }
    }
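
    // A usage sketch for the gradient variant (hypothetical; `t`, `s`, `uv`,
    // `dx`, `dy`, and `granularity` are caller-provided values):
    //
    //     Footprint fp = t.queryFootprint$(CoarseOrFine)Grad(granularity, s, uv, dx, dy);
    //
    // This mirrors `t.SampleGrad(s, uv, dx, dy)`, but returns the footprint
    // instead of performing the sample.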

    /// Query the footprint that would be accessed by a texture sampling operation.
    ///
    /// This operation queries the footprint that would be accessed
    /// by a comparable call to:
    ///
    ///     t.SampleGradClamp(sampler, coords, dx, dy, lodClamp);
    ///
    [__NoSideEffect] [ForceInline]
    Footprint queryFootprint$(CoarseOrFine)GradClamp(
            FootprintGranularity    granularity,
            SamplerState            sampler,
            Coords                  coords,
            Coords                  dx,
            Coords                  dy,
            float                   lodClamp)
    {
        __target_switch
        {
        case glsl:
        case spirv:
            Footprint footprint;
            footprint._isSingleLevel = __queryFootprintGradClampGLSL(sampler, coords, dx, dy, lodClamp, granularity, $(isCoarseVal), footprint);
            return footprint;
        }
    }

${{{
    // TODO: end texture2D specific functions.
}}}

${{{{
}
}}}}

} // extension

// Buffer Pointer

__generic<T, let Alignment : int = 16>
__intrinsic_type($(kIROp_HLSLConstBufferPointerType))
__glsl_extension(GL_EXT_buffer_reference)
__magic_type(ConstBufferPointerType)
struct ConstBufferPointer
{
    __glsl_version(450)
    __glsl_extension(GL_EXT_buffer_reference)
    [__NoSideEffect]
    T get()
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "$0._data";
        case spirv:
            return spirv_asm {
                result:$$T = OpLoad $this Aligned !Alignment;
            };
        }
    }

    __subscript(int index) -> T
    {
        [ForceInline]
        get { return ConstBufferPointer<T>.fromUInt(toUInt() + __naturalStrideOf<T>() * index).get(); }
    }

    __glsl_version(450)
    __glsl_extension(GL_EXT_shader_explicit_arithmetic_types_int64)
    __glsl_extension(GL_EXT_buffer_reference)
    static ConstBufferPointer<T> fromUInt(uint64_t val)
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "$TR($0)";
        case spirv:
            return spirv_asm {
                result:$$ConstBufferPointer<T> = OpConvertUToPtr $val;
            };
        }
    }

    __glsl_version(450)
    __glsl_extension(GL_EXT_shader_explicit_arithmetic_types_int64)
    __glsl_extension(GL_EXT_buffer_reference)
    uint64_t toUInt()
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "uint64_t($0)";
        case spirv:
            return spirv_asm {
                result:$$uint64_t = OpConvertPtrToU $this;
            };
        }
    }

    __glsl_version(450)
    __glsl_extension(GL_EXT_shader_explicit_arithmetic_types_int64)
    __glsl_extension(GL_EXT_buffer_reference)
    [__NoSideEffect]
    [ForceInline]
    bool isValid()
    {
        __target_switch
        {
        case glsl:
            __intrinsic_asm "(uint64_t($0) != 0)";
        case spirv:
            uint64_t zero = 0ULL;
            return spirv_asm {
                %ptrval:$$uint64_t = OpConvertPtrToU $this;
                result:$$bool = OpINotEqual %ptrval $zero;
            };
        }
    }
}
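
// A minimal usage sketch for ConstBufferPointer (hypothetical; assumes the
// caller has obtained a 64-bit device address `addr` of a `MyData` record,
// e.g. passed in through a uniform or push constant; the names `MyData` and
// `addr` are illustrative only):
//
//     ConstBufferPointer<MyData> p = ConstBufferPointer<MyData>.fromUInt(addr);
//     if (p.isValid())
//     {
//         MyData first = p.get(); // load the pointee at the given alignment
//         MyData third = p[2];    // indexed load, advancing by natural stride
//     }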