VM_SSEFunc.h
/*
* Copyright (c) 2018 Side Effects Software Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
* COMMENTS:
* SIMD wrapper functions for SSE instructions
*/
#pragma once
#ifndef __VM_SSEFunc__
#define __VM_SSEFunc__
#include "../SYS/SYS_Types.h"
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable:4799)
#endif
#define CPU_HAS_SIMD_INSTR 1
#define VM_SSE_STYLE 1
#include <emmintrin.h>
typedef __m128 v4sf;
typedef __m128i v4si;
#if defined(__SSE4_1__)
#define VM_SSE41_STYLE 1
#include <smmintrin.h>
#endif
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
// Plain casting (no conversion)
// MSVC has problems casting between __m128 and __m128i, so we implement a
// custom casting routine specifically for Windows.
#if defined(_MSC_VER)
static SYS_FORCE_INLINE v4sf
vm_v4sf(const v4si &a)
{
    union {
        v4si ival;
        v4sf fval;
    };
    ival = a;
    return fval;
}

static SYS_FORCE_INLINE v4si
vm_v4si(const v4sf &a)
{
    union {
        v4si ival;
        v4sf fval;
    };
    fval = a;
    return ival;
}
#define V4SF(A) vm_v4sf(A)
#define V4SI(A) vm_v4si(A)
#else
#define V4SF(A) (v4sf)A
#define V4SI(A) (v4si)A
#endif
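// Illustrative sketch (not part of the original header): V4SF()/V4SI()
// reinterpret the same 128 bits without any numeric conversion, e.g.
//
//   v4si bits = _mm_set1_epi32(0x3F800000);  // bit pattern of 1.0f
//   v4sf ones = V4SF(bits);                  // each lane reads as 1.0f
//   v4si back = V4SI(ones);                  // round-trips the identical bits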
#define VM_SHUFFLE_MASK(a0,a1, b0,b1) ((b1)<<6|(b0)<<4 | (a1)<<2|(a0))
template <int mask>
static SYS_FORCE_INLINE v4sf
vm_shuffle(const v4sf &a, const v4sf &b)
{
    return _mm_shuffle_ps(a, b, mask);
}

template <int mask>
static SYS_FORCE_INLINE v4si
vm_shuffle(const v4si &a, const v4si &b)
{
    return V4SI(_mm_shuffle_ps(V4SF(a), V4SF(b), mask));
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a, const T &b)
{
    return vm_shuffle<VM_SHUFFLE_MASK(A,B,C,D)>(a, b);
}

template <int mask, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<mask>(a, a);
}

template <int A, int B, int C, int D, typename T>
static SYS_FORCE_INLINE T
vm_shuffle(const T &a)
{
    return vm_shuffle<A,B,C,D>(a, a);
}
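// Illustrative sketch (not part of the original header): with the index form,
// lanes A and B are taken from the first operand and C and D from the second,
// so the one-operand overload is an arbitrary permutation of a single vector.
//
//   v4sf v = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
//   v4sf r = vm_shuffle<3,2,1,0>(v);          // reversed: {3, 2, 1, 0}
//   // For two vectors a and b: vm_shuffle<0,1,0,1>(a, b) == {a0, a1, b0, b1}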
#if defined(VM_SSE41_STYLE)
static SYS_FORCE_INLINE v4si
vm_insert(const v4si v, int32 a, int n)
{
    switch (n)
    {
        case 0: return _mm_insert_epi32(v, a, 0);
        case 1: return _mm_insert_epi32(v, a, 1);
        case 2: return _mm_insert_epi32(v, a, 2);
        case 3: return _mm_insert_epi32(v, a, 3);
    }
    return v;
}

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    switch (n)
    {
        case 0: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,0,0));
        case 1: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,1,0));
        case 2: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,2,0));
        case 3: return _mm_insert_ps(v, _mm_set_ss(a), _MM_MK_INSERTPS_NDX(0,3,0));
    }
    return v;
}

static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    switch (n)
    {
        case 0: return _mm_extract_epi32(v, 0);
        case 1: return _mm_extract_epi32(v, 1);
        case 2: return _mm_extract_epi32(v, 2);
        case 3: return _mm_extract_epi32(v, 3);
    }
    return 0;
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    SYS_FPRealUnionF tmp;
    switch (n)
    {
        case 0: tmp.ival = _mm_extract_ps(v, 0); break;
        case 1: tmp.ival = _mm_extract_ps(v, 1); break;
        case 2: tmp.ival = _mm_extract_ps(v, 2); break;
        case 3: tmp.ival = _mm_extract_ps(v, 3); break;
    }
    return tmp.fval;
}
#else
static SYS_FORCE_INLINE v4si
vm_insert(const v4si v, int32 a, int n)
{
    union { v4si vector; int32 comp[4]; };
    vector = v;
    comp[n] = a;
    return vector;
}

static SYS_FORCE_INLINE v4sf
vm_insert(const v4sf v, float a, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    comp[n] = a;
    return vector;
}

static SYS_FORCE_INLINE int
vm_extract(const v4si v, int n)
{
    union { v4si vector; int32 comp[4]; };
    vector = v;
    return comp[n];
}

static SYS_FORCE_INLINE float
vm_extract(const v4sf v, int n)
{
    union { v4sf vector; float comp[4]; };
    vector = v;
    return comp[n];
}
#endif
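// Illustrative sketch (not part of the original header): both code paths give
// the same per-lane set/get behaviour, e.g.
//
//   v4sf  v = _mm_setzero_ps();
//   v       = vm_insert(v, 2.5f, 3);          // lane 3 becomes 2.5f
//   float x = vm_extract(v, 3);               // x == 2.5f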
static SYS_FORCE_INLINE v4sf
vm_splats(float a)
{
    return _mm_set1_ps(a);
}

static SYS_FORCE_INLINE v4si
vm_splats(uint32 a)
{
    SYS_FPRealUnionF tmp;
    tmp.uval = a;
    return V4SI(vm_splats(tmp.fval));
}

static SYS_FORCE_INLINE v4si
vm_splats(int32 a)
{
    SYS_FPRealUnionF tmp;
    tmp.ival = a;
    return V4SI(vm_splats(tmp.fval));
}

static SYS_FORCE_INLINE v4sf
vm_splats(float a, float b, float c, float d)
{
    return vm_shuffle<0,2,0,2>(
        vm_shuffle<0>(_mm_set_ss(a), _mm_set_ss(b)),
        vm_shuffle<0>(_mm_set_ss(c), _mm_set_ss(d)));
}

static SYS_FORCE_INLINE v4si
vm_splats(uint32 a, uint32 b, uint32 c, uint32 d)
{
    SYS_FPRealUnionF af, bf, cf, df;
    af.uval = a;
    bf.uval = b;
    cf.uval = c;
    df.uval = d;
    return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
}

static SYS_FORCE_INLINE v4si
vm_splats(int32 a, int32 b, int32 c, int32 d)
{
    SYS_FPRealUnionF af, bf, cf, df;
    af.ival = a;
    bf.ival = b;
    cf.ival = c;
    df.ival = d;
    return V4SI(vm_splats(af.fval, bf.fval, cf.fval, df.fval));
}
static SYS_FORCE_INLINE v4si
vm_load(const int32 v[4])
{
    return V4SI(_mm_loadu_ps((const float *)v));
}

static SYS_FORCE_INLINE v4sf
vm_load(const float v[4])
{
    return _mm_loadu_ps(v);
}

static SYS_FORCE_INLINE void
vm_store(float dst[4], v4sf value)
{
    _mm_storeu_ps(dst, value);
}
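// Illustrative sketch (not part of the original header): the loads and stores
// above are unaligned, so plain stack arrays are fine, e.g.
//
//   float in[4] = { 1.f, 2.f, 3.f, 4.f }, out[4];
//   v4sf  v = _mm_add_ps(vm_load(in), vm_splats(10.0f));
//   vm_store(out, v);                         // out == {11, 12, 13, 14}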
static SYS_FORCE_INLINE v4sf
vm_negate(v4sf a)
{
    return _mm_sub_ps(_mm_setzero_ps(), a);
}

static SYS_FORCE_INLINE v4sf
vm_abs(v4sf a)
{
    return _mm_max_ps(a, vm_negate(a));
}
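// The "fast" helpers below use _mm_rcp_ps / _mm_rsqrt_ps, which are only
// ~12-bit approximations: vm_fdiv() and vm_fsqrt() trade accuracy for speed,
// so prefer VM_DIV / VM_SQRT when full precision matters.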
static SYS_FORCE_INLINE v4sf
vm_fdiv(v4sf a, v4sf b)
{
    return _mm_mul_ps(a, _mm_rcp_ps(b));
}

static SYS_FORCE_INLINE v4sf
vm_fsqrt(v4sf a)
{
    return _mm_rcp_ps(_mm_rsqrt_ps(a));
}

static SYS_FORCE_INLINE v4sf
vm_madd(v4sf a, v4sf b, v4sf c)
{
    return _mm_add_ps(_mm_mul_ps(a, b), c);
}
static const v4si theSSETrue = vm_splats(0xFFFFFFFF);

static SYS_FORCE_INLINE bool
vm_allbits(const v4si &a)
{
    return _mm_movemask_ps(V4SF(_mm_cmpeq_epi32(a, theSSETrue))) == 0xF;
}
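// Illustrative sketch (not part of the original header): vm_allbits() reports
// whether every lane of a comparison result is set, e.g.
//
//   v4si a = vm_splats(7), b = vm_splats(7);
//   bool all_equal = vm_allbits(_mm_cmpeq_epi32(a, b));   // true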
#define VM_EXTRACT vm_extract
#define VM_INSERT vm_insert
#define VM_SPLATS vm_splats
#define VM_LOAD vm_load
#define VM_STORE vm_store
#define VM_CMPLT(A,B) V4SI(_mm_cmplt_ps(A,B))
#define VM_CMPLE(A,B) V4SI(_mm_cmple_ps(A,B))
#define VM_CMPGT(A,B) V4SI(_mm_cmpgt_ps(A,B))
#define VM_CMPGE(A,B) V4SI(_mm_cmpge_ps(A,B))
#define VM_CMPEQ(A,B) V4SI(_mm_cmpeq_ps(A,B))
#define VM_CMPNE(A,B) V4SI(_mm_cmpneq_ps(A,B))
#define VM_ICMPLT _mm_cmplt_epi32
#define VM_ICMPGT _mm_cmpgt_epi32
#define VM_ICMPEQ _mm_cmpeq_epi32
#define VM_IADD _mm_add_epi32
#define VM_ISUB _mm_sub_epi32
#define VM_ADD _mm_add_ps
#define VM_SUB _mm_sub_ps
#define VM_MUL _mm_mul_ps
#define VM_DIV _mm_div_ps
#define VM_SQRT _mm_sqrt_ps
#define VM_ISQRT _mm_rsqrt_ps
#define VM_INVERT _mm_rcp_ps
#define VM_ABS vm_abs
#define VM_FDIV vm_fdiv
#define VM_NEG vm_negate
#define VM_FSQRT vm_fsqrt
#define VM_MADD vm_madd
#define VM_MIN _mm_min_ps
#define VM_MAX _mm_max_ps
#define VM_AND _mm_and_si128
#define VM_ANDNOT _mm_andnot_si128
#define VM_OR _mm_or_si128
#define VM_XOR _mm_xor_si128
#define VM_ALLBITS vm_allbits
#define VM_SHUFFLE vm_shuffle
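// Illustrative sketch (not part of the original header; src and dst are
// hypothetical float[4] buffers): the VM_* aliases compose like ordinary
// functions, e.g. clamping 2*x + 1 to be non-negative:
//
//   v4sf x = VM_LOAD(src);
//   v4sf y = VM_MADD(x, VM_SPLATS(2.0f), VM_SPLATS(1.0f));   // 2*x + 1
//   VM_STORE(dst, VM_MAX(y, VM_SPLATS(0.0f)));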
// Float to integer conversions
#define VM_SSE_ROUND_MASK 0x6000
#define VM_SSE_ROUND_ZERO 0x6000
#define VM_SSE_ROUND_UP 0x4000
#define VM_SSE_ROUND_DOWN 0x2000
#define VM_SSE_ROUND_NEAR 0x0000
#define GETROUND() (_mm_getcsr()&VM_SSE_ROUND_MASK)
#define SETROUND(x) (_mm_setcsr(x|(_mm_getcsr()&~VM_SSE_ROUND_MASK)))
// The P functions must be invoked before FLOOR, and the E functions
// afterwards to restore the previous rounding state.
#define VM_P_FLOOR() uint rounding = GETROUND(); \
SETROUND(VM_SSE_ROUND_DOWN);
#define VM_FLOOR _mm_cvtps_epi32
#define VM_INT _mm_cvttps_epi32
#define VM_E_FLOOR() SETROUND(rounding);
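// Illustrative sketch (not part of the original header): the required
// bracketing for a floor conversion (pos is a hypothetical v4sf):
//
//   VM_P_FLOOR();                  // save MXCSR and switch to round-down
//   v4si cell = VM_FLOOR(pos);     // per-lane floor(pos) as int32
//   VM_E_FLOOR();                  // restore the previous rounding mode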
// Integer to float conversion
#define VM_IFLOAT _mm_cvtepi32_ps
#endif