diff --git a/include/bx/allocator.h b/include/bx/allocator.h index da59cc6..9091033 100644 --- a/include/bx/allocator.h +++ b/include/bx/allocator.h @@ -7,7 +7,6 @@ #define BX_ALLOCATOR_H_HEADER_GUARD #include "bx.h" -#include "uint32_t.h" #define BX_NEW(_allocator, _type) BX_PLACEMENT_NEW(bx::alloc(_allocator, sizeof(_type) ), _type) #define BX_ALIGNED_NEW(_allocator, _type, _align) BX_PLACEMENT_NEW(bx::alloc(_allocator, sizeof(_type), _align), _type) diff --git a/include/bx/bx.h b/include/bx/bx.h index 3a167b7..6b7fa8e 100644 --- a/include/bx/bx.h +++ b/include/bx/bx.h @@ -399,6 +399,53 @@ namespace bx , uint32_t _numStrides ); + /// Greatest common divisor. + /// + BX_CONSTEXPR_FUNC uint32_t gcd(uint32_t _a, uint32_t _b); + + /// Least common multiple. + /// + BX_CONSTEXPR_FUNC uint32_t lcm(uint32_t _a, uint32_t _b); + + /// Align to arbitrary stride. + /// + BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride); + + /// Align to arbitrary stride and Min bytes. + /// + template + BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride); + + /// Returns true if value is aligned to _align boundary. + /// + template + BX_CONSTEXPR_FUNC bool isAligned(Ty _a, size_t _align); + + template<> + BX_CONSTEXPR_FUNC bool isAligned(const void* _ptr, size_t _align); + + /// Aligns _a down to nearest multiple of _align. + /// + template + BX_CONSTEXPR_FUNC Ty alignDown(Ty _a, size_t _align); + + template + BX_CONSTEXPR_FUNC Ty* alignDown(Ty* _ptr, size_t _align); + + template + BX_CONSTEXPR_FUNC const Ty* alignDown(const Ty* _ptr, size_t _align); + + /// Aligns _a up to nearest multiple of _align. 
+ /// + template + BX_CONSTEXPR_FUNC Ty alignUp(Ty _a, size_t _align); + + template + BX_CONSTEXPR_FUNC Ty* alignUp(Ty* _ptr, size_t _align); + + template + BX_CONSTEXPR_FUNC const Ty* alignUp(const Ty* _ptr, size_t _align); + } // namespace bx #include "inline/bx.inl" diff --git a/include/bx/float4x4_t.h b/include/bx/float4x4_t.h index 351532a..ce026bd 100644 --- a/include/bx/float4x4_t.h +++ b/include/bx/float4x4_t.h @@ -17,10 +17,10 @@ namespace bx }; /// Multiplies vector `_a` with matrix `_b` ignoring W component of vector `_a`. - simd128_t simd_mul_xyz1(simd128_t _a, const float4x4_t* _b); + simd128_t simd128_mul_xyz1(simd128_t _a, const float4x4_t* _b); /// Multiplies vector `_a` with matrix `_b`. - simd128_t simd_mul(simd128_t _a, const float4x4_t* _b); + simd128_t simd128_mul(simd128_t _a, const float4x4_t* _b); /// Multiplies two matrices. void float4x4_mul(float4x4_t* _result, const float4x4_t* _a, const float4x4_t* _b); diff --git a/include/bx/handlealloc.h b/include/bx/handlealloc.h index 22c21b7..1ab6b46 100644 --- a/include/bx/handlealloc.h +++ b/include/bx/handlealloc.h @@ -8,7 +8,7 @@ #include "bx.h" #include "allocator.h" -#include "uint32_t.h" +#include "simd_t.h" namespace bx { diff --git a/include/bx/inline/bx.inl b/include/bx/inline/bx.inl index afadbb4..6c2da7a 100644 --- a/include/bx/inline/bx.inl +++ b/include/bx/inline/bx.inl @@ -227,6 +227,9 @@ namespace bx return __builtin_is_constant_evaluated(); } + BX_PRAGMA_DIAGNOSTIC_PUSH(); + BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wpsabi"); + template inline constexpr Ty bitCast(const FromT& _from) { @@ -246,6 +249,8 @@ namespace bx return __builtin_bit_cast(Ty, _from); } + BX_PRAGMA_DIAGNOSTIC_POP(); + template requires (isInteger< Ty>() || isFloatingPoint< Ty>() ) && (isInteger() || isFloatingPoint() ) @@ -322,4 +327,104 @@ namespace bx constexpr float kFloatInfinity = bitCast(kFloatExponentMask); constexpr double kDoubleInfinity = bitCast(kDoubleExponentMask); + inline BX_CONSTEXPR_FUNC 
uint32_t gcd(uint32_t _a, uint32_t _b) + { + do + { + const uint32_t tmp = _a % _b; + _a = _b; + _b = tmp; + } + while (_b); + + return _a; + } + + inline BX_CONSTEXPR_FUNC uint32_t lcm(uint32_t _a, uint32_t _b) + { + return _a * (_b / gcd(_a, _b) ); + } + + inline BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride) + { + const uint32_t mod = _offset % _stride; + const uint32_t add = _stride - mod; + const uint32_t tmp = (0 == mod) ? 0 : add; + const uint32_t result = _offset + tmp; + + return result; + } + + template + inline BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride) + { + const uint32_t align = lcm(Min, _stride); + const uint32_t mod = _offset % align; + const uint32_t tmp0 = (0 == mod) ? 0 : align; + const uint32_t tmp1 = _offset + tmp0; + const uint32_t result = tmp1 - mod; + + return result; + } + + template + inline BX_CONSTEXPR_FUNC bool isAligned(Ty _a, size_t _align) + { + const size_t mask = max(1, _align) - 1; + return 0 == (size_t(_a) & mask); + } + + template<> + inline BX_CONSTEXPR_FUNC bool isAligned(const void* _ptr, size_t _align) + { + const uintptr_t addr = bitCast(_ptr); + return isAligned(addr, _align); + } + + template + inline BX_CONSTEXPR_FUNC Ty alignDown(Ty _a, size_t _align) + { + const size_t mask = max(1, _align) - 1; + return Ty(size_t(_a) & ~mask); + } + + template + inline BX_CONSTEXPR_FUNC Ty* alignDown(Ty* _ptr, size_t _align) + { + uintptr_t addr = bitCast(_ptr); + addr = alignDown(addr, _align); + return bitCast(addr); + } + + template + inline BX_CONSTEXPR_FUNC const Ty* alignDown(const Ty* _ptr, size_t _align) + { + uintptr_t addr = bitCast(_ptr); + addr = alignDown(addr, _align); + return bitCast(addr); + } + + template + inline BX_CONSTEXPR_FUNC Ty alignUp(Ty _a, size_t _align) + { + const size_t mask = max(1, _align) - 1; + return Ty( (size_t(_a) + mask) & ~mask); + } + + template + inline BX_CONSTEXPR_FUNC Ty* alignUp(Ty* _ptr, size_t _align) + { + uintptr_t addr = 
bitCast(_ptr); + addr = alignUp(addr, _align); + return bitCast(addr); + } + + template + inline BX_CONSTEXPR_FUNC const Ty* alignUp(const Ty* _ptr, size_t _align) + { + uintptr_t addr = bitCast(_ptr); + addr = alignUp(addr, _align); + return bitCast(addr); + } + } // namespace bx diff --git a/include/bx/inline/float4x4_t.inl b/include/bx/inline/float4x4_t.inl index aa57d5a..1e7a3cb 100644 --- a/include/bx/inline/float4x4_t.inl +++ b/include/bx/inline/float4x4_t.inl @@ -9,31 +9,31 @@ namespace bx { - BX_SIMD_FORCE_INLINE simd128_t simd_mul_xyz1(simd128_t _a, const float4x4_t* _b) + BX_SIMD_FORCE_INLINE simd128_t simd128_mul_xyz1(simd128_t _a, const float4x4_t* _b) { - const simd128_t xxxx = simd_swiz_xxxx(_a); - const simd128_t yyyy = simd_swiz_yyyy(_a); - const simd128_t zzzz = simd_swiz_zzzz(_a); - const simd128_t col0 = simd_mul(_b->col[0], xxxx); - const simd128_t col1 = simd_mul(_b->col[1], yyyy); - const simd128_t col2 = simd_madd(_b->col[2], zzzz, col0); - const simd128_t col3 = simd_add(_b->col[3], col1); - const simd128_t result = simd_add(col2, col3); + const simd128_t xxxx = simd128_x32_swiz_xxxx(_a); + const simd128_t yyyy = simd128_x32_swiz_yyyy(_a); + const simd128_t zzzz = simd128_x32_swiz_zzzz(_a); + const simd128_t col0 = simd128_f32_mul(_b->col[0], xxxx); + const simd128_t col1 = simd128_f32_mul(_b->col[1], yyyy); + const simd128_t col2 = simd128_f32_madd(_b->col[2], zzzz, col0); + const simd128_t col3 = simd128_f32_add(_b->col[3], col1); + const simd128_t result = simd128_f32_add(col2, col3); return result; } - BX_SIMD_FORCE_INLINE simd128_t simd_mul(simd128_t _a, const float4x4_t* _b) + BX_SIMD_FORCE_INLINE simd128_t simd128_mul(simd128_t _a, const float4x4_t* _b) { - const simd128_t xxxx = simd_swiz_xxxx(_a); - const simd128_t yyyy = simd_swiz_yyyy(_a); - const simd128_t zzzz = simd_swiz_zzzz(_a); - const simd128_t wwww = simd_swiz_wwww(_a); - const simd128_t col0 = simd_mul(_b->col[0], xxxx); - const simd128_t col1 = simd_mul(_b->col[1], 
yyyy); - const simd128_t col2 = simd_madd(_b->col[2], zzzz, col0); - const simd128_t col3 = simd_madd(_b->col[3], wwww, col1); - const simd128_t result = simd_add(col2, col3); + const simd128_t xxxx = simd128_x32_swiz_xxxx(_a); + const simd128_t yyyy = simd128_x32_swiz_yyyy(_a); + const simd128_t zzzz = simd128_x32_swiz_zzzz(_a); + const simd128_t wwww = simd128_x32_swiz_wwww(_a); + const simd128_t col0 = simd128_f32_mul(_b->col[0], xxxx); + const simd128_t col1 = simd128_f32_mul(_b->col[1], yyyy); + const simd128_t col2 = simd128_f32_madd(_b->col[2], zzzz, col0); + const simd128_t col3 = simd128_f32_madd(_b->col[3], wwww, col1); + const simd128_t result = simd128_f32_add(col2, col3); return result; } @@ -41,10 +41,10 @@ namespace bx BX_SIMD_INLINE void float4x4_mul(float4x4_t* _result, const float4x4_t* _a, const float4x4_t* _b) { #if BX_SIMD_SUPPORTED - _result->col[0] = simd_mul(_a->col[0], _b); - _result->col[1] = simd_mul(_a->col[1], _b); - _result->col[2] = simd_mul(_a->col[2], _b); - _result->col[3] = simd_mul(_a->col[3], _b); + _result->col[0] = simd128_mul(_a->col[0], _b); + _result->col[1] = simd128_mul(_a->col[1], _b); + _result->col[2] = simd128_mul(_a->col[2], _b); + _result->col[3] = simd128_mul(_a->col[3], _b); #else const float* aa = (const float*)_a; const float* bb = (const float*)_b; @@ -137,103 +137,103 @@ namespace bx BX_SIMD_FORCE_INLINE void float4x4_transpose(float4x4_t* _result, const float4x4_t* _mtx) { - const simd128_t aibj = simd_shuf_xAyB(_mtx->col[0], _mtx->col[2]); // aibj - const simd128_t emfn = simd_shuf_xAyB(_mtx->col[1], _mtx->col[3]); // emfn - const simd128_t ckdl = simd_shuf_zCwD(_mtx->col[0], _mtx->col[2]); // ckdl - const simd128_t gohp = simd_shuf_zCwD(_mtx->col[1], _mtx->col[3]); // gohp - _result->col[0] = simd_shuf_xAyB(aibj, emfn); // aeim - _result->col[1] = simd_shuf_zCwD(aibj, emfn); // bfjn - _result->col[2] = simd_shuf_xAyB(ckdl, gohp); // cgko - _result->col[3] = simd_shuf_zCwD(ckdl, gohp); // dhlp + const 
simd128_t aibj = simd128_x32_shuf_xAyB(_mtx->col[0], _mtx->col[2]); // aibj + const simd128_t emfn = simd128_x32_shuf_xAyB(_mtx->col[1], _mtx->col[3]); // emfn + const simd128_t ckdl = simd128_x32_shuf_zCwD(_mtx->col[0], _mtx->col[2]); // ckdl + const simd128_t gohp = simd128_x32_shuf_zCwD(_mtx->col[1], _mtx->col[3]); // gohp + _result->col[0] = simd128_x32_shuf_xAyB(aibj, emfn); // aeim + _result->col[1] = simd128_x32_shuf_zCwD(aibj, emfn); // bfjn + _result->col[2] = simd128_x32_shuf_xAyB(ckdl, gohp); // cgko + _result->col[3] = simd128_x32_shuf_zCwD(ckdl, gohp); // dhlp } BX_SIMD_INLINE void float4x4_inverse(float4x4_t* _result, const float4x4_t* _a) { - const simd128_t tmp0 = simd_shuf_xAzC(_a->col[0], _a->col[1]); - const simd128_t tmp1 = simd_shuf_xAzC(_a->col[2], _a->col[3]); - const simd128_t tmp2 = simd_shuf_yBwD(_a->col[0], _a->col[1]); - const simd128_t tmp3 = simd_shuf_yBwD(_a->col[2], _a->col[3]); - const simd128_t t0 = simd_shuf_xyAB(tmp0, tmp1); - const simd128_t t1 = simd_shuf_xyAB(tmp3, tmp2); - const simd128_t t2 = simd_shuf_zwCD(tmp0, tmp1); - const simd128_t t3 = simd_shuf_zwCD(tmp3, tmp2); + const simd128_t tmp0 = simd128_x32_shuf_xAzC(_a->col[0], _a->col[1]); + const simd128_t tmp1 = simd128_x32_shuf_xAzC(_a->col[2], _a->col[3]); + const simd128_t tmp2 = simd128_x32_shuf_yBwD(_a->col[0], _a->col[1]); + const simd128_t tmp3 = simd128_x32_shuf_yBwD(_a->col[2], _a->col[3]); + const simd128_t t0 = simd128_x32_shuf_xyAB(tmp0, tmp1); + const simd128_t t1 = simd128_x32_shuf_xyAB(tmp3, tmp2); + const simd128_t t2 = simd128_x32_shuf_zwCD(tmp0, tmp1); + const simd128_t t3 = simd128_x32_shuf_zwCD(tmp3, tmp2); - const simd128_t t23 = simd_mul(t2, t3); - const simd128_t t23_yxwz = simd_swiz_yxwz(t23); - const simd128_t t23_wzyx = simd_swiz_wzyx(t23); + const simd128_t t23 = simd128_f32_mul(t2, t3); + const simd128_t t23_yxwz = simd128_x32_swiz_yxwz(t23); + const simd128_t t23_wzyx = simd128_x32_swiz_wzyx(t23); simd128_t cof0, cof1, cof2, cof3; - const 
simd128_t zero = simd_zero(); - cof0 = simd_nmsub(t1, t23_yxwz, zero); - cof0 = simd_madd(t1, t23_wzyx, cof0); + const simd128_t zero = simd128_zero(); + cof0 = simd128_f32_nmsub(t1, t23_yxwz, zero); + cof0 = simd128_f32_madd(t1, t23_wzyx, cof0); - cof1 = simd_nmsub(t0, t23_yxwz, zero); - cof1 = simd_madd(t0, t23_wzyx, cof1); - cof1 = simd_swiz_zwxy(cof1); + cof1 = simd128_f32_nmsub(t0, t23_yxwz, zero); + cof1 = simd128_f32_madd(t0, t23_wzyx, cof1); + cof1 = simd128_x32_swiz_zwxy(cof1); - const simd128_t t12 = simd_mul(t1, t2); - const simd128_t t12_yxwz = simd_swiz_yxwz(t12); - const simd128_t t12_wzyx = simd_swiz_wzyx(t12); + const simd128_t t12 = simd128_f32_mul(t1, t2); + const simd128_t t12_yxwz = simd128_x32_swiz_yxwz(t12); + const simd128_t t12_wzyx = simd128_x32_swiz_wzyx(t12); - cof0 = simd_madd(t3, t12_yxwz, cof0); - cof0 = simd_nmsub(t3, t12_wzyx, cof0); + cof0 = simd128_f32_madd(t3, t12_yxwz, cof0); + cof0 = simd128_f32_nmsub(t3, t12_wzyx, cof0); - cof3 = simd_mul(t0, t12_yxwz); - cof3 = simd_nmsub(t0, t12_wzyx, cof3); - cof3 = simd_swiz_zwxy(cof3); + cof3 = simd128_f32_mul(t0, t12_yxwz); + cof3 = simd128_f32_nmsub(t0, t12_wzyx, cof3); + cof3 = simd128_x32_swiz_zwxy(cof3); - const simd128_t t1_zwxy = simd_swiz_zwxy(t1); - const simd128_t t2_zwxy = simd_swiz_zwxy(t2); + const simd128_t t1_zwxy = simd128_x32_swiz_zwxy(t1); + const simd128_t t2_zwxy = simd128_x32_swiz_zwxy(t2); - const simd128_t t13 = simd_mul(t1_zwxy, t3); - const simd128_t t13_yxwz = simd_swiz_yxwz(t13); - const simd128_t t13_wzyx = simd_swiz_wzyx(t13); + const simd128_t t13 = simd128_f32_mul(t1_zwxy, t3); + const simd128_t t13_yxwz = simd128_x32_swiz_yxwz(t13); + const simd128_t t13_wzyx = simd128_x32_swiz_wzyx(t13); - cof0 = simd_madd(t2_zwxy, t13_yxwz, cof0); - cof0 = simd_nmsub(t2_zwxy, t13_wzyx, cof0); + cof0 = simd128_f32_madd(t2_zwxy, t13_yxwz, cof0); + cof0 = simd128_f32_nmsub(t2_zwxy, t13_wzyx, cof0); - cof2 = simd_mul(t0, t13_yxwz); - cof2 = simd_nmsub(t0, t13_wzyx, cof2); - 
cof2 = simd_swiz_zwxy(cof2); + cof2 = simd128_f32_mul(t0, t13_yxwz); + cof2 = simd128_f32_nmsub(t0, t13_wzyx, cof2); + cof2 = simd128_x32_swiz_zwxy(cof2); - const simd128_t t01 = simd_mul(t0, t1); - const simd128_t t01_yxwz = simd_swiz_yxwz(t01); - const simd128_t t01_wzyx = simd_swiz_wzyx(t01); + const simd128_t t01 = simd128_f32_mul(t0, t1); + const simd128_t t01_yxwz = simd128_x32_swiz_yxwz(t01); + const simd128_t t01_wzyx = simd128_x32_swiz_wzyx(t01); - cof2 = simd_nmsub(t3, t01_yxwz, cof2); - cof2 = simd_madd(t3, t01_wzyx, cof2); + cof2 = simd128_f32_nmsub(t3, t01_yxwz, cof2); + cof2 = simd128_f32_madd(t3, t01_wzyx, cof2); - cof3 = simd_madd(t2_zwxy, t01_yxwz, cof3); - cof3 = simd_nmsub(t2_zwxy, t01_wzyx, cof3); + cof3 = simd128_f32_madd(t2_zwxy, t01_yxwz, cof3); + cof3 = simd128_f32_nmsub(t2_zwxy, t01_wzyx, cof3); - const simd128_t t03 = simd_mul(t0, t3); - const simd128_t t03_yxwz = simd_swiz_yxwz(t03); - const simd128_t t03_wzyx = simd_swiz_wzyx(t03); + const simd128_t t03 = simd128_f32_mul(t0, t3); + const simd128_t t03_yxwz = simd128_x32_swiz_yxwz(t03); + const simd128_t t03_wzyx = simd128_x32_swiz_wzyx(t03); - cof1 = simd_nmsub(t2_zwxy, t03_yxwz, cof1); - cof1 = simd_madd(t2_zwxy, t03_wzyx, cof1); + cof1 = simd128_f32_nmsub(t2_zwxy, t03_yxwz, cof1); + cof1 = simd128_f32_madd(t2_zwxy, t03_wzyx, cof1); - cof2 = simd_madd(t1, t03_yxwz, cof2); - cof2 = simd_nmsub(t1, t03_wzyx, cof2); + cof2 = simd128_f32_madd(t1, t03_yxwz, cof2); + cof2 = simd128_f32_nmsub(t1, t03_wzyx, cof2); - const simd128_t t02 = simd_mul(t0, t2_zwxy); - const simd128_t t02_yxwz = simd_swiz_yxwz(t02); - const simd128_t t02_wzyx = simd_swiz_wzyx(t02); + const simd128_t t02 = simd128_f32_mul(t0, t2_zwxy); + const simd128_t t02_yxwz = simd128_x32_swiz_yxwz(t02); + const simd128_t t02_wzyx = simd128_x32_swiz_wzyx(t02); - cof1 = simd_madd(t3, t02_yxwz, cof1); - cof1 = simd_nmsub(t3, t02_wzyx, cof1); + cof1 = simd128_f32_madd(t3, t02_yxwz, cof1); + cof1 = simd128_f32_nmsub(t3, t02_wzyx, cof1); 
- cof3 = simd_nmsub(t1, t02_yxwz, cof3); - cof3 = simd_madd(t1, t02_wzyx, cof3); + cof3 = simd128_f32_nmsub(t1, t02_yxwz, cof3); + cof3 = simd128_f32_madd(t1, t02_wzyx, cof3); - const simd128_t det = simd_dot(t0, cof0); - const simd128_t invdet = simd_rcp(det); + const simd128_t det = simd128_f32_dot(t0, cof0); + const simd128_t invdet = simd128_f32_rcp(det); - _result->col[0] = simd_mul(cof0, invdet); - _result->col[1] = simd_mul(cof1, invdet); - _result->col[2] = simd_mul(cof2, invdet); - _result->col[3] = simd_mul(cof3, invdet); + _result->col[0] = simd128_f32_mul(cof0, invdet); + _result->col[1] = simd128_f32_mul(cof1, invdet); + _result->col[2] = simd128_f32_mul(cof2, invdet); + _result->col[3] = simd128_f32_mul(cof3, invdet); } } // namespace bx diff --git a/include/bx/inline/handlealloc.inl b/include/bx/inline/handlealloc.inl index 0735a90..fa5cd54 100644 --- a/include/bx/inline/handlealloc.inl +++ b/include/bx/inline/handlealloc.inl @@ -7,6 +7,8 @@ # error "Must be included from bx/handlealloc.h!" 
#endif // BX_HANDLE_ALLOC_H_HEADER_GUARD +#include + namespace bx { inline HandleAlloc::HandleAlloc(uint16_t _maxHandles) @@ -597,19 +599,25 @@ namespace bx template inline uint32_t HandleHashMapT::mix(uint32_t _x) const { - const uint32_t tmp0 = uint32_mul(_x, UINT32_C(2246822519) ); - const uint32_t tmp1 = uint32_rol(tmp0, 13); - const uint32_t result = uint32_mul(tmp1, UINT32_C(2654435761) ); - return result; + const simd32_t x = simd32_splat(_x); + const simd32_t c0 = simd32_splat(2246822519u); + const simd32_t tmp0 = simd32_u32_mul(x, c0); + const simd32_t tmp1 = simd32_x32_rol(tmp0, 13); + const simd32_t c1 = simd32_splat(2654435761u); + const simd32_t result = simd32_u32_mul(tmp1, c1); + return result.u32; } template inline uint64_t HandleHashMapT::mix(uint64_t _x) const { - const uint64_t tmp0 = uint64_mul(_x, UINT64_C(14029467366897019727) ); - const uint64_t tmp1 = uint64_rol(tmp0, 31); - const uint64_t result = uint64_mul(tmp1, UINT64_C(11400714785074694791) ); - return result; + const simd64_t x = simd64_splat(_x); + const simd64_t c0 = simd64_splat(uint64_t(14029467366897019727ull) ); + const simd64_t tmp0 = simd64_u64_mul(x, c0); + const simd64_t tmp1 = simd64_x64_rol(tmp0, 31); + const simd64_t c1 = simd64_splat(uint64_t(11400714785074694791ull) ); + const simd64_t result = simd64_u64_mul(tmp1, c1); + return result.u64; } template diff --git a/include/bx/inline/math.inl b/include/bx/inline/math.inl index 25af7c8..aa262ca 100644 --- a/include/bx/inline/math.inl +++ b/include/bx/inline/math.inl @@ -10,7 +10,6 @@ #endif // BX_MATH_H_HEADER_GUARD #include -#include #if BX_COMPILER_MSVC extern "C" unsigned char _BitScanReverse(unsigned long* _Index, unsigned long _Mask); @@ -65,11 +64,14 @@ namespace bx // Reference(s): // - http://archive.fo/2012.12.08-212402/http://stereopsis.com/radix.html // - const uint32_t tmp0 = uint32_sra(_value, 31); - const uint32_t tmp1 = uint32_neg(tmp0); - const uint32_t mask = uint32_or(tmp1, kFloatSignMask); - const 
uint32_t result = uint32_xor(_value, mask); - return result; + const simd32_t signMask = simd32_splat(kFloatSignMask); + const simd32_t value = simd32_splat(_value); + const simd32_t tmp0 = simd32_x32_sra(value, 31); + const simd32_t tmp1 = simd32_i32_neg(tmp0); + const simd32_t mask = simd32_or(tmp1, signMask); + const simd32_t result = simd32_xor(value, mask); + + return result.u32; } inline BX_CONSTEXPR_FUNC bool isNan(float _f) @@ -199,6 +201,51 @@ namespace bx return _a - _b; } + template + inline BX_CONSTEXPR_FUNC Ty satAdd(Ty _a, Ty _b) + { + static_assert(isInteger(), "Type Ty must be an integer type."); + + using UTy = MakeUnsignedType; + + const UTy ua = UTy(_a); + const UTy ub = UTy(_b); + const UTy sum = UTy(ua + ub); + + if constexpr (isSigned() ) + { + const UTy signBit = UTy(UTy(1) << (sizeof(Ty)*8 - 1) ); + const UTy overflow = UTy(~(ua ^ ub) & (ua ^ sum) & signBit); + const Ty satVal = (ua & signBit) ? LimitsT::min : LimitsT::max; + return 0 != overflow ? satVal : Ty(sum); + } + + return sum < ua ? LimitsT::max : Ty(sum); + } + + template + inline BX_CONSTEXPR_FUNC Ty satSub(Ty _a, Ty _b) + { + static_assert(isInteger(), "Type Ty must be an integer type."); + + using UTy = MakeUnsignedType; + + const UTy ua = UTy(_a); + const UTy ub = UTy(_b); + const UTy diff = UTy(ua - ub); + + if constexpr (isSigned() ) + { + const UTy signBit = UTy(UTy(1) << (sizeof(Ty)*8 - 1) ); + const UTy overflow = UTy( (ua ^ ub) & (ua ^ diff) & signBit); + const Ty satVal = (ua & signBit) ? LimitsT::min : LimitsT::max; + + return 0 != overflow ? satVal : Ty(diff); + } + + return ua > ub ? 
Ty(diff) : Ty(0); + } + inline BX_CONSTEXPR_FUNC float mul(float _a, float _b) { return _a * _b; @@ -394,16 +441,18 @@ namespace bx inline BX_CONSTEXPR_FUNC float ldexp(float _a, int32_t _b) { - const uint32_t ftob = floatToBits(_a); - const uint32_t masked = uint32_and(ftob, kFloatSignMask | kFloatExponentMask); - const uint32_t expsign0 = uint32_sra(masked, kFloatExponentBitShift); - const uint32_t tmp = uint32_iadd(expsign0, _b); - const uint32_t expsign1 = uint32_sll(tmp, kFloatExponentBitShift); - const uint32_t mantissa = uint32_and(ftob, kFloatMantissaMask); - const uint32_t bits = uint32_or(mantissa, expsign1); - const float result = bitsToFloat(bits); + const simd32_t ftob = simd32_splat(floatToBits(_a)); + const simd32_t signexpmask = simd32_splat(kFloatSignMask | kFloatExponentMask); + const simd32_t mantmask = simd32_splat(kFloatMantissaMask); + const simd32_t b = simd32_splat(_b); + const simd32_t masked = simd32_and(ftob, signexpmask); + const simd32_t expsign0 = simd32_x32_sra(masked, kFloatExponentBitShift); + const simd32_t tmp = simd32_i32_add(expsign0, b); + const simd32_t expsign1 = simd32_x32_sll(tmp, kFloatExponentBitShift); + const simd32_t mantissa = simd32_and(ftob, mantmask); + const simd32_t bits = simd32_or(mantissa, expsign1); - return result; + return bitsToFloat(bits.u32); } inline BX_CONSTEXPR_FUNC float log(float _a) @@ -418,15 +467,19 @@ namespace bx return -kFloatInfinity; } - const uint32_t ftob = floatToBits(_a); + const simd32_t ftob = simd32_splat(floatToBits(_a)); + const simd32_t expmask = simd32_splat(kFloatExponentMask); + const simd32_t signmantmask = simd32_splat(kFloatSignMask | kFloatMantissaMask); + const simd32_t half = simd32_splat(UINT32_C(0x3f000000)); - const uint32_t masked0 = uint32_and(ftob, kFloatExponentMask); - const uint32_t exp0 = uint32_srl(masked0, kFloatExponentBitShift); - int32_t exp = int32_t(exp0 - 0x7e); + const simd32_t masked0 = simd32_and(ftob, expmask); + const simd32_t exp0 = 
simd32_x32_srl(masked0, kFloatExponentBitShift); - const uint32_t masked1 = uint32_and(ftob, kFloatSignMask | kFloatMantissaMask); - const uint32_t bits = uint32_or(masked1, UINT32_C(0x3f000000) ); - float ff = bitsToFloat(bits); + int32_t exp = int32_t(exp0.u32) - 0x7e; + + const simd32_t masked1 = simd32_and(ftob, signmantmask); + const simd32_t bits = simd32_or(masked1, half); + float ff = bitsToFloat(bits.u32); if (ff < kSqrt2*0.5f) { @@ -434,13 +487,13 @@ namespace bx --exp; } - constexpr float kLogC0 = 6.666666666666735130e-01f; - constexpr float kLogC1 = 3.999999999940941908e-01f; - constexpr float kLogC2 = 2.857142874366239149e-01f; - constexpr float kLogC3 = 2.222219843214978396e-01f; - constexpr float kLogC4 = 1.818357216161805012e-01f; - constexpr float kLogC5 = 1.531383769920937332e-01f; - constexpr float kLogC6 = 1.479819860511658591e-01f; + constexpr float kLogC0 = 6.666666666666735130e-01f; + constexpr float kLogC1 = 3.999999999940941908e-01f; + constexpr float kLogC2 = 2.857142874366239149e-01f; + constexpr float kLogC3 = 2.222219843214978396e-01f; + constexpr float kLogC4 = 1.818357216161805012e-01f; + constexpr float kLogC5 = 1.531383769920937332e-01f; + constexpr float kLogC6 = 1.479819860511658591e-01f; constexpr float kLogNat2Lo = 1.90821492927058770002e-10f; ff -= 1.0f; @@ -541,23 +594,23 @@ namespace bx #if BX_COMPILER_GCC || BX_COMPILER_CLANG return __builtin_popcount(_val); #else - const uint32_t tmp0 = uint32_srl(_val, 1); - const uint32_t tmp1 = uint32_and(tmp0, 0x55555555); - const uint32_t tmp2 = uint32_sub(_val, tmp1); - const uint32_t tmp3 = uint32_and(tmp2, 0xc30c30c3); - const uint32_t tmp4 = uint32_srl(tmp2, 2); - const uint32_t tmp5 = uint32_and(tmp4, 0xc30c30c3); - const uint32_t tmp6 = uint32_srl(tmp2, 4); - const uint32_t tmp7 = uint32_and(tmp6, 0xc30c30c3); - const uint32_t tmp8 = uint32_add(tmp3, tmp5); - const uint32_t tmp9 = uint32_add(tmp7, tmp8); - const uint32_t tmpA = uint32_srl(tmp9, 6); - const uint32_t tmpB = 
uint32_add(tmp9, tmpA); - const uint32_t tmpC = uint32_srl(tmpB, 12); - const uint32_t tmpD = uint32_srl(tmpB, 24); - const uint32_t tmpE = uint32_add(tmpB, tmpC); - const uint32_t tmpF = uint32_add(tmpD, tmpE); - const uint32_t result = uint32_and(tmpF, 0x3f); + const uint32_t tmp0 = (_val >> 1); + const uint32_t tmp1 = (tmp0 & 0x55555555); + const uint32_t tmp2 = (_val - tmp1); + const uint32_t tmp3 = (tmp2 & 0xc30c30c3); + const uint32_t tmp4 = (tmp2 >> 2); + const uint32_t tmp5 = (tmp4 & 0xc30c30c3); + const uint32_t tmp6 = (tmp2 >> 4); + const uint32_t tmp7 = (tmp6 & 0xc30c30c3); + const uint32_t tmp8 = (tmp3 + tmp5); + const uint32_t tmp9 = (tmp7 + tmp8); + const uint32_t tmpA = (tmp9 >> 6); + const uint32_t tmpB = (tmp9 + tmpA); + const uint32_t tmpC = (tmpB >> 12); + const uint32_t tmpD = (tmpB >> 24); + const uint32_t tmpE = (tmpB + tmpC); + const uint32_t tmpF = (tmpD + tmpE); + const uint32_t result = (tmpF & 0x3f); return uint8_t(result); #endif // BX_COMPILER_* @@ -607,20 +660,21 @@ namespace bx ; } # endif // BX_COMPILER_MSVC - const uint32_t tmp0 = uint32_srl(_val, 1); - const uint32_t tmp1 = uint32_or(tmp0, _val); - const uint32_t tmp2 = uint32_srl(tmp1, 2); - const uint32_t tmp3 = uint32_or(tmp2, tmp1); - const uint32_t tmp4 = uint32_srl(tmp3, 4); - const uint32_t tmp5 = uint32_or(tmp4, tmp3); - const uint32_t tmp6 = uint32_srl(tmp5, 8); - const uint32_t tmp7 = uint32_or(tmp6, tmp5); - const uint32_t tmp8 = uint32_srl(tmp7, 16); - const uint32_t tmp9 = uint32_or(tmp8, tmp7); - const uint32_t tmpA = uint32_not(tmp9); - const uint32_t result = uint32_cntbits(tmpA); + const simd32_t val = simd32_splat(_val); + const simd32_t tmp0 = simd32_x32_srl(val, 1); + const simd32_t tmp1 = simd32_or(tmp0, val); + const simd32_t tmp2 = simd32_x32_srl(tmp1, 2); + const simd32_t tmp3 = simd32_or(tmp2, tmp1); + const simd32_t tmp4 = simd32_x32_srl(tmp3, 4); + const simd32_t tmp5 = simd32_or(tmp4, tmp3); + const simd32_t tmp6 = simd32_x32_srl(tmp5, 8); + const 
simd32_t tmp7 = simd32_or(tmp6, tmp5); + const simd32_t tmp8 = simd32_x32_srl(tmp7, 16); + const simd32_t tmp9 = simd32_or(tmp8, tmp7); + const simd32_t tmpA = simd32_not(tmp9); + const simd32_t result = simd32_x32_cntbits(tmpA); - return uint8_t(result); + return uint8_t(result.u32); #endif // BX_COMPILER_* } @@ -676,12 +730,14 @@ namespace bx ; } # endif // BX_COMPILER_MSVC - const uint32_t tmp0 = uint32_not(_val); - const uint32_t tmp1 = uint32_dec(_val); - const uint32_t tmp2 = uint32_and(tmp0, tmp1); - const uint32_t result = uint32_cntbits(tmp2); + const simd32_t val = simd32_splat(_val); + const simd32_t one = simd32_splat(1); + const simd32_t tmp0 = simd32_not(val); + const simd32_t tmp1 = simd32_u32_sub(val, one); + const simd32_t tmp2 = simd32_and(tmp0, tmp1); + const simd32_t result = simd32_x32_cntbits(tmp2); - return uint8_t(result); + return uint8_t(result.u32); #endif // BX_COMPILER_* } @@ -778,15 +834,11 @@ namespace bx return kFloatInfinity; } - const simd128_t aa = simd_splat(_a); -#if BX_SIMD_NEON - const simd128_t rsqrta = simd_rsqrt_nr(aa); -#else - const simd128_t rsqrta = simd_rsqrt_ni(aa); -#endif // BX_SIMD_NEON + const simd128_t aa = simd_splat(_a); + const simd128_t rsqrta = simd_f32_rsqrt(aa); float result = 0.0f; - simd_stx(&result, rsqrta); + simd_x32_st1(&result, rsqrta); return result; } @@ -812,11 +864,11 @@ namespace bx return 0.0f; } - const simd128_t aa = simd_splat(_a); - const simd128_t sqrt = simd_sqrt(aa); + const simd128_t aa = simd_splat(_a); + const simd128_t sqrt = simd_f32_sqrt(aa); float result = 0.0f; - simd_stx(&result, sqrt); + simd_x32_st1(&result, sqrt); return result; } @@ -1914,4 +1966,18 @@ namespace bx return result; } + inline BX_CONST_FUNC uint16_t halfFromFloat(float _a) + { + const simd32_t a = { .u32 = bitCast(_a) }; + const simd32_t result = simd_f16_fromf32_ni(a); + return uint16_t(result.u32); + } + + inline BX_CONST_FUNC float halfToFloat(uint16_t _a) + { + const simd32_t a = simd32_splat(uint32_t(_a) 
); + const simd32_t result = simd_f16_tof32_ni(a); + return bitCast(result.u32); + } + } // namespace bx diff --git a/include/bx/inline/readerwriter.inl b/include/bx/inline/readerwriter.inl index 356f278..1e754fd 100644 --- a/include/bx/inline/readerwriter.inl +++ b/include/bx/inline/readerwriter.inl @@ -128,7 +128,7 @@ namespace bx } int64_t remainder = m_top-m_pos; - int32_t size = uint32_min(_size, uint32_t(min(remainder, INT32_MAX) ) ); + int32_t size = int32_t(max(0, min(_size, remainder, INT32_MAX) ) ); m_pos += size; if (size != _size) { @@ -173,7 +173,7 @@ namespace bx BX_ASSERT(NULL != _err, "Reader/Writer interface calling functions must handle errors."); int64_t remainder = m_top-m_pos; - int32_t size = uint32_min(_size, uint32_t(min(remainder, INT32_MAX) ) ); + int32_t size = int32_t(max(0, min(_size, remainder, INT32_MAX) ) ); memCopy(_data, &m_data[m_pos], size); m_pos += size; if (size != _size) @@ -245,7 +245,7 @@ namespace bx } int64_t remainder = m_size-m_pos; - int32_t size = uint32_min(_size, uint32_t(min(remainder, INT32_MAX) ) ); + int32_t size = int32_t(max(0, min(_size, remainder, INT32_MAX) ) ); memCopy(&m_data[m_pos], _data, size); m_pos += size; m_top = max(m_top, m_pos); @@ -301,16 +301,16 @@ namespace bx { BX_ERROR_SCOPE(_err); - const uint32_t tmp0 = uint32_sels(64 - _size, 64, _size); - const uint32_t tmp1 = uint32_sels(256 - _size, 256, tmp0); - const uint32_t blockSize = uint32_sels(1024 - _size, 1024, tmp1); + const uint32_t tmp0 = simd32_sels(simd32_splat(64 - _size), simd32_splat( 64), simd32_splat(_size)).u32; + const uint32_t tmp1 = simd32_sels(simd32_splat(256 - _size), simd32_splat( 256), simd32_splat(tmp0 )).u32; + const uint32_t blockSize = simd32_sels(simd32_splat(1024 - _size), simd32_splat(1024), simd32_splat(tmp1 )).u32; uint8_t* temp = (uint8_t*)BX_STACK_ALLOC(blockSize); memSet(temp, _byte, blockSize); int32_t size = 0; while (0 < _size && _err->isOk() ) { - int32_t bytes = write(_writer, temp, uint32_min(blockSize, 
_size), _err); + int32_t bytes = write(_writer, temp, min(blockSize, _size), _err); size += bytes; _size -= bytes; } diff --git a/include/bx/inline/ringbuffer.inl b/include/bx/inline/ringbuffer.inl index 51d6dbe..d50e286 100644 --- a/include/bx/inline/ringbuffer.inl +++ b/include/bx/inline/ringbuffer.inl @@ -62,48 +62,67 @@ namespace bx inline uint32_t RingBufferControl::consume(uint32_t _size) { - const uint32_t maxSize = distance(m_read, m_current); - const uint32_t sizeNoSign = uint32_and(_size, 0x7fffffff); - const uint32_t test = uint32_sub(sizeNoSign, maxSize); - const uint32_t size = uint32_sels(test, _size, maxSize); - const uint32_t advance = uint32_add(m_read, size); - const uint32_t read = uint32_mod(advance, m_size); - m_read = read; - return size; + const simd32_t maxSize = simd32_splat(distance(m_read, m_current) ); + const simd32_t size = simd32_splat(_size); + const simd32_t signMask = simd32_splat(0x7fffffffu); + const simd32_t sizeNoSign = simd32_and(size, signMask); + const simd32_t test = simd32_u32_sub(sizeNoSign, maxSize); + const simd32_t result = simd32_sels(test, size, maxSize); + const simd32_t readVal = simd32_splat(m_read); + const simd32_t advance = simd32_u32_add(readVal, result); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t read = simd32_u32_mod(advance, sizeVal); + m_read = read.u32; + return result.u32; } inline uint32_t RingBufferControl::reserve(uint32_t _size, bool _mustSucceed) { - const uint32_t dist = distance(m_write, m_read)-1; - const uint32_t maxSize = uint32_sels(dist, m_size-1, dist); - const uint32_t sizeNoSign = uint32_and(_size, 0x7fffffff); - const uint32_t test = uint32_sub(sizeNoSign, maxSize); - const uint32_t size = uint32_sels(test, _size, _mustSucceed ? 
0 : maxSize); - const uint32_t advance = uint32_add(m_write, size); - const uint32_t write = uint32_mod(advance, m_size); - m_write = write; - return size; + const simd32_t distVal = simd32_splat(distance(m_write, m_read) ); + const simd32_t one = simd32_splat(1u); + const simd32_t dist = simd32_u32_sub(distVal, one); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t sizem1 = simd32_u32_sub(sizeVal, one); + const simd32_t maxSize = simd32_sels(dist, sizem1, dist); + const simd32_t size = simd32_splat(_size); + const simd32_t signMask = simd32_splat(0x7fffffffu); + const simd32_t sizeNoSign = simd32_and(size, signMask); + const simd32_t test = simd32_u32_sub(sizeNoSign, maxSize); + const simd32_t zero = simd32_splat(0u); + const simd32_t fail = _mustSucceed ? zero : maxSize; + const simd32_t result = simd32_sels(test, size, fail); + const simd32_t writeVal = simd32_splat(m_write); + const simd32_t advance = simd32_u32_add(writeVal, result); + const simd32_t write = simd32_u32_mod(advance, sizeVal); + m_write = write.u32; + return result.u32; } inline uint32_t RingBufferControl::commit(uint32_t _size) { - const uint32_t maxSize = distance(m_current, m_write); - const uint32_t sizeNoSign = uint32_and(_size, 0x7fffffff); - const uint32_t test = uint32_sub(sizeNoSign, maxSize); - const uint32_t size = uint32_sels(test, _size, maxSize); - const uint32_t advance = uint32_add(m_current, size); - const uint32_t current = uint32_mod(advance, m_size); - m_current = current; - return size; + const simd32_t maxSize = simd32_splat(distance(m_current, m_write) ); + const simd32_t size = simd32_splat(_size); + const simd32_t signMask = simd32_splat(0x7fffffffu); + const simd32_t sizeNoSign = simd32_and(size, signMask); + const simd32_t test = simd32_u32_sub(sizeNoSign, maxSize); + const simd32_t result = simd32_sels(test, size, maxSize); + const simd32_t currentVal = simd32_splat(m_current); + const simd32_t advance = simd32_u32_add(currentVal, result); + const 
simd32_t sizeVal = simd32_splat(m_size); + const simd32_t current = simd32_u32_mod(advance, sizeVal); + m_current = current.u32; + return result.u32; } inline uint32_t RingBufferControl::distance(uint32_t _from, uint32_t _to) const { - const uint32_t diff = uint32_sub(_to, _from); - const uint32_t le = uint32_add(m_size, diff); - const uint32_t result = uint32_sels(diff, le, diff); - - return result; + const simd32_t to = simd32_splat(_to); + const simd32_t from = simd32_splat(_from); + const simd32_t diff = simd32_u32_sub(to, from); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t le = simd32_u32_add(sizeVal, diff); + const simd32_t result = simd32_sels(diff, le, diff); + return result.u32; } inline void RingBufferControl::reset() @@ -166,52 +185,71 @@ namespace bx inline uint32_t SpScRingBufferControl::consume(uint32_t _size) { - const uint32_t maxSize = distance(m_read, m_current); - const uint32_t sizeNoSign = uint32_and(_size, 0x7fffffff); - const uint32_t test = uint32_sub(sizeNoSign, maxSize); - const uint32_t size = uint32_sels(test, _size, maxSize); - const uint32_t advance = uint32_add(m_read, size); - const uint32_t read = uint32_mod(advance, m_size); - m_read = read; - return size; + const simd32_t maxSize = simd32_splat(distance(m_read, m_current) ); + const simd32_t size = simd32_splat(_size); + const simd32_t signMask = simd32_splat(0x7fffffffu); + const simd32_t sizeNoSign = simd32_and(size, signMask); + const simd32_t test = simd32_u32_sub(sizeNoSign, maxSize); + const simd32_t result = simd32_sels(test, size, maxSize); + const simd32_t readVal = simd32_splat(m_read); + const simd32_t advance = simd32_u32_add(readVal, result); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t read = simd32_u32_mod(advance, sizeVal); + m_read = read.u32; + return result.u32; } inline uint32_t SpScRingBufferControl::reserve(uint32_t _size, bool _mustSucceed) { - const uint32_t dist = distance(m_write, m_read)-1; - const uint32_t 
maxSize = uint32_sels(dist, m_size-1, dist); - const uint32_t sizeNoSign = uint32_and(_size, 0x7fffffff); - const uint32_t test = uint32_sub(sizeNoSign, maxSize); - const uint32_t size = uint32_sels(test, _size, _mustSucceed ? 0 : maxSize); - const uint32_t advance = uint32_add(m_write, size); - const uint32_t write = uint32_mod(advance, m_size); - m_write = write; - return size; + const simd32_t distVal = simd32_splat(distance(m_write, m_read) ); + const simd32_t one = simd32_splat(1u); + const simd32_t dist = simd32_u32_sub(distVal, one); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t sizem1 = simd32_u32_sub(sizeVal, one); + const simd32_t maxSize = simd32_sels(dist, sizem1, dist); + const simd32_t size = simd32_splat(_size); + const simd32_t signMask = simd32_splat(0x7fffffffu); + const simd32_t sizeNoSign = simd32_and(size, signMask); + const simd32_t test = simd32_u32_sub(sizeNoSign, maxSize); + const simd32_t zero = simd32_splat(0u); + const simd32_t fail = _mustSucceed ? 
zero : maxSize; + const simd32_t result = simd32_sels(test, size, fail); + const simd32_t writeVal = simd32_splat(m_write); + const simd32_t advance = simd32_u32_add(writeVal, result); + const simd32_t write = simd32_u32_mod(advance, sizeVal); + m_write = write.u32; + return result.u32; } inline uint32_t SpScRingBufferControl::commit(uint32_t _size) { - const uint32_t maxSize = distance(m_current, m_write); - const uint32_t sizeNoSign = uint32_and(_size, 0x7fffffff); - const uint32_t test = uint32_sub(sizeNoSign, maxSize); - const uint32_t size = uint32_sels(test, _size, maxSize); - const uint32_t advance = uint32_add(m_current, size); - const uint32_t current = uint32_mod(advance, m_size); + const simd32_t maxSize = simd32_splat(distance(m_current, m_write) ); + const simd32_t size = simd32_splat(_size); + const simd32_t signMask = simd32_splat(0x7fffffffu); + const simd32_t sizeNoSign = simd32_and(size, signMask); + const simd32_t test = simd32_u32_sub(sizeNoSign, maxSize); + const simd32_t result = simd32_sels(test, size, maxSize); + const simd32_t currentVal = simd32_splat(m_current); + const simd32_t advance = simd32_u32_add(currentVal, result); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t current = simd32_u32_mod(advance, sizeVal); // must commit all memory writes before moving m_current pointer // once m_current pointer moves data is used by consumer thread memoryBarrier(); - m_current = current; - return size; + m_current = current.u32; + return result.u32; } inline uint32_t SpScRingBufferControl::distance(uint32_t _from, uint32_t _to) const { - const uint32_t diff = uint32_sub(_to, _from); - const uint32_t le = uint32_add(m_size, diff); - const uint32_t result = uint32_sels(diff, le, diff); - - return result; + const simd32_t to = simd32_splat(_to); + const simd32_t from = simd32_splat(_from); + const simd32_t diff = simd32_u32_sub(to, from); + const simd32_t sizeVal = simd32_splat(m_size); + const simd32_t le = simd32_u32_add(sizeVal, 
diff); + const simd32_t result = simd32_sels(diff, le, diff); + return result.u32; } inline void SpScRingBufferControl::reset() diff --git a/include/bx/inline/simd128_langext.inl b/include/bx/inline/simd128_langext.inl deleted file mode 100644 index 3539120..0000000 --- a/include/bx/inline/simd128_langext.inl +++ /dev/null @@ -1,676 +0,0 @@ -/* - * Copyright 2010-2026 Branimir Karadzic. All rights reserved. - * License: https://github.com/bkaradzic/bx/blob/master/LICENSE - */ - -#ifndef BX_SIMD_T_H_HEADER_GUARD -# error "Must be included from bx/simd_t.h!" -#endif // BX_SIMD_T_H_HEADER_GUARD - -namespace bx -{ - BX_CONSTEXPR_FUNC float sqrtRef(float); - -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template<> \ - BX_SIMD_FORCE_INLINE simd128_langext_t simd_swiz_##_x##_y##_z##_w(simd128_langext_t _a) \ - { \ - simd128_langext_t result; \ - result.vf = __builtin_shufflevector(_a.vf, _a.vf, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \ - return result; \ - } - -#include "simd128_swizzle.inl" - -#undef BX_SIMD128_IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask) \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_langext_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return 0 != (tmp&(_mask) ); \ - } \ - \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_langext_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return (_mask) == (tmp&(_mask) ); \ - } - -BX_SIMD128_IMPLEMENT_TEST(x , 0x1); -BX_SIMD128_IMPLEMENT_TEST(y , 0x2); -BX_SIMD128_IMPLEMENT_TEST(xy , 0x3); -BX_SIMD128_IMPLEMENT_TEST(z , 0x4); -BX_SIMD128_IMPLEMENT_TEST(xz , 0x5); 
-BX_SIMD128_IMPLEMENT_TEST(yz , 0x6); -BX_SIMD128_IMPLEMENT_TEST(xyz , 0x7); -BX_SIMD128_IMPLEMENT_TEST(w , 0x8); -BX_SIMD128_IMPLEMENT_TEST(xw , 0x9); -BX_SIMD128_IMPLEMENT_TEST(yw , 0xa); -BX_SIMD128_IMPLEMENT_TEST(xyw , 0xb); -BX_SIMD128_IMPLEMENT_TEST(zw , 0xc); -BX_SIMD128_IMPLEMENT_TEST(xzw , 0xd); -BX_SIMD128_IMPLEMENT_TEST(yzw , 0xe); -BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf); - -#undef BX_SIMD128_IMPLEMENT_TEST - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_xyAB(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 1, 4, 5); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_ABxy(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 4, 5, 0, 1); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_CDzw(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 7, 2, 3); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_zwCD(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 2, 3, 6, 7); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_xAyB(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 1, 5); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_AxBy(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 0, 4); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_zCwD(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = 
__builtin_shufflevector(_a.vf, _b.vf, 2, 6, 3, 7); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_CzDw(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 6, 2, 7, 3); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_xAzC(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 0, 4, 2, 6); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_shuf_yBwD(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = __builtin_shufflevector(_a.vf, _b.vf, 1, 5, 3, 7); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_x(simd128_langext_t _a) - { - return _a.fxyzw[0]; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_y(simd128_langext_t _a) - { - return _a.fxyzw[1]; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_z(simd128_langext_t _a) - { - return _a.fxyzw[2]; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_w(simd128_langext_t _a) - { - return _a.fxyzw[3]; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_ld(const void* _ptr) - { - const uint32_t* input = reinterpret_cast(_ptr); - simd128_langext_t result; - result.uxyzw[0] = input[0]; - result.uxyzw[1] = input[1]; - result.uxyzw[2] = input[2]; - result.uxyzw[3] = input[3]; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_langext_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - template<> - BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_langext_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - } - - template<> - BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_langext_t _a) - { - 
uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_ld(float _x, float _y, float _z, float _w) - { - simd128_langext_t result; - result.vf = (float __attribute__((vector_size(16)))){ _x, _y, _z, _w }; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - simd128_langext_t result; - result.vu = (uint32_t __attribute__((vector_size(16)))){ _x, _y, _z, _w }; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_splat(const void* _ptr) - { - const uint32_t val = *reinterpret_cast(_ptr); - simd128_langext_t result; - result.vu = (uint32_t __attribute__((vector_size(16)))){ val, val, val, val }; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_splat(float _a) - { - return simd_ld(_a, _a, _a, _a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_isplat(uint32_t _a) - { - return simd_ild(_a, _a, _a, _a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_zero() - { - return simd_ild(0, 0, 0, 0); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_itof(simd128_langext_t _a) - { - simd128_langext_t result; - result.vf = __builtin_convertvector(_a.vi, float __attribute__((vector_size(16))) ); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_ftoi(simd128_langext_t _a) - { - simd128_langext_t result; - result.vi = __builtin_convertvector(_a.vf, int32_t __attribute__((vector_size(16))) ); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_round(simd128_langext_t _a) - { - const simd128_langext_t tmp = simd_ftoi(_a); - const simd128_langext_t result = simd_itof(tmp); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t 
simd_add(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = _a.vf + _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_sub(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = _a.vf - _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_mul(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = _a.vf * _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_div(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vf = _a.vf / _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_sqrt(simd128_langext_t _a) - { - simd128_langext_t result; - result.vf[0] = sqrtRef(_a.vf[0]); - result.vf[1] = sqrtRef(_a.vf[1]); - result.vf[2] = sqrtRef(_a.vf[2]); - result.vf[3] = sqrtRef(_a.vf[3]); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_rsqrt_est(simd128_langext_t _a) - { - simd128_langext_t result; - result.vf[0] = 1.0f / sqrtRef(_a.vf[0]); - result.vf[1] = 1.0f / sqrtRef(_a.vf[1]); - result.vf[2] = 1.0f / sqrtRef(_a.vf[2]); - result.vf[3] = 1.0f / sqrtRef(_a.vf[3]); - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpeq(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vf == _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpneq(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vf != _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmplt(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vf < _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmple(simd128_langext_t _a, simd128_langext_t 
_b) - { - simd128_langext_t result; - result.vi = _a.vf <= _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpgt(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vf > _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_cmpge(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vf >= _b.vf; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_and(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vu = _a.vu & _b.vu; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_andc(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vu = _a.vu & ~_b.vu; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_or(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vu = _a.vu | _b.vu; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_xor(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vu = _a.vu ^ _b.vu; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_sll(simd128_langext_t _a, int _count) - { - simd128_langext_t result; - const simd128_langext_t count = simd_isplat(_count); - result.vu = _a.vu << count.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_srl(simd128_langext_t _a, int _count) - { - simd128_langext_t result; - const simd128_langext_t count = simd_isplat(_count); - result.vu = _a.vu >> count.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_sra(simd128_langext_t _a, int _count) - { - simd128_langext_t result; - const simd128_langext_t count = simd_isplat(_count); - result.vi = _a.vi >> count.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE 
simd128_langext_t simd_icmpeq(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vi == _b.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_icmplt(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vi < _b.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_icmpgt(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vi > _b.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_iadd(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vi + _b.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_isub(simd128_langext_t _a, simd128_langext_t _b) - { - simd128_langext_t result; - result.vi = _a.vi - _b.vi; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_rcp(simd128_langext_t _a) - { - return simd_rcp_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_orx(simd128_langext_t _a) - { - return simd_orx_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_orc(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_orc_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_neg(simd128_langext_t _a) - { - return simd_neg_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_madd(simd128_langext_t _a, simd128_langext_t _b, simd128_langext_t _c) - { - return simd_madd_ni(_a, _b, _c); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_nmsub(simd128_langext_t _a, simd128_langext_t _b, simd128_langext_t _c) - { - return simd_nmsub_ni(_a, _b, _c); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_div_nr(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_div_nr_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t 
simd_selb(simd128_langext_t _mask, simd128_langext_t _a, simd128_langext_t _b) - { - return simd_selb_ni(_mask, _a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_sels(simd128_langext_t _test, simd128_langext_t _a, simd128_langext_t _b) - { - return simd_sels_ni(_test, _a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_not(simd128_langext_t _a) - { - return simd_not_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_abs(simd128_langext_t _a) - { - return simd_abs_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_clamp(simd128_langext_t _a, simd128_langext_t _min, simd128_langext_t _max) - { - return simd_clamp_ni(_a, _min, _max); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_lerp(simd128_langext_t _a, simd128_langext_t _b, simd128_langext_t _s) - { - return simd_lerp_ni(_a, _b, _s); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_rcp_est(simd128_langext_t _a) - { - return simd_rcp_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_rsqrt(simd128_langext_t _a) - { - return simd_rsqrt_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_rsqrt_nr(simd128_langext_t _a) - { - return simd_rsqrt_nr_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_rsqrt_carmack(simd128_langext_t _a) - { - return simd_rsqrt_carmack_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_sqrt_nr(simd128_langext_t _a) - { - return simd_sqrt_nr_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_log2(simd128_langext_t _a) - { - return simd_log2_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_exp2(simd128_langext_t _a) - { - return simd_exp2_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_pow(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_pow_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE 
simd128_langext_t simd_cross3(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_cross3_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_normalize3(simd128_langext_t _a) - { - return simd_normalize3_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_dot3(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_dot3_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_dot(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_dot_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_ceil(simd128_langext_t _a) - { - return simd_ceil_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_floor(simd128_langext_t _a) - { - return simd_floor_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_min(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_min_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_max(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_max_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_imin(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_imin_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_langext_t simd_imax(simd128_langext_t _a, simd128_langext_t _b) - { - return simd_imax_ni(_a, _b); - } - - typedef simd128_langext_t simd128_t; - -} // namespace bx diff --git a/include/bx/inline/simd128_neon.inl b/include/bx/inline/simd128_neon.inl index c1e079b..f11d054 100644 --- a/include/bx/inline/simd128_neon.inl +++ b/include/bx/inline/simd128_neon.inl @@ -11,22 +11,22 @@ namespace bx { #if BX_COMPILER_CLANG -# define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _a, _i0, _i1, _i2, _i3 ) -# define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) +# define BX_SIMD_SHUFFLE_A(_a, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _a, _i0, _i1, _i2, 
_i3 ) +# define BX_SIMD_SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shufflevector(_a, _b, _i0, _i1, _i2, _i3 ) #else -# define SHUFFLE_A(_a, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) -# define SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) +# define BX_SIMD_SHUFFLE_A(_a, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, (uint32x4_t){ _i0, _i1, _i2, _i3 }) +# define BX_SIMD_SHUFFLE_AB(_a, _b, _i0, _i1, _i2, _i3) __builtin_shuffle(_a, _b, (uint32x4_t){ _i0, _i1, _i2, _i3 }) #endif #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 #define ELEMw 3 -#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template<> \ - BX_SIMD_FORCE_INLINE simd128_neon_t simd_swiz_##_x##_y##_z##_w(simd128_neon_t _a) \ - { \ - return SHUFFLE_A(_a, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w ); \ +#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_swiz_##_x##_y##_z##_w(simd128_neon_t _a) \ + { \ + return BX_SIMD_SHUFFLE_A(_a, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \ } #include "simd128_swizzle.inl" @@ -37,687 +37,1407 @@ namespace bx #undef ELEMy #undef ELEMx -#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _swizzle) \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_neon_t _test) \ - { \ - const simd128_neon_t tmp0 = simd_swiz_##_swizzle(_test); \ - return simd_test_any_ni(tmp0); \ - } \ - \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_neon_t _test) \ - { \ - const simd128_neon_t tmp0 = simd_swiz_##_swizzle(_test); \ - return simd_test_all_ni(tmp0); \ +#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _swizzle) \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd128_test_any_##_xyzw(simd128_neon_t _test) \ + { \ + const simd128_neon_t tmp0 = simd128_x32_swiz_##_swizzle(_test); \ + return simd128_test_any_ni(tmp0); \ + } \ + \ + template<> \ + BX_SIMD_FORCE_INLINE bool 
simd128_test_all_##_xyzw(simd128_neon_t _test) \ + { \ + const simd128_neon_t tmp0 = simd128_x32_swiz_##_swizzle(_test); \ + return simd128_test_all_ni(tmp0); \ } -BX_SIMD128_IMPLEMENT_TEST(x, xxxx); -BX_SIMD128_IMPLEMENT_TEST(y, yyyy); -BX_SIMD128_IMPLEMENT_TEST(xy, xyyy); -BX_SIMD128_IMPLEMENT_TEST(z, zzzz); -BX_SIMD128_IMPLEMENT_TEST(xz, xzzz); -BX_SIMD128_IMPLEMENT_TEST(yz, yzzz); -BX_SIMD128_IMPLEMENT_TEST(xyz, xyzz); -BX_SIMD128_IMPLEMENT_TEST(w, wwww); -BX_SIMD128_IMPLEMENT_TEST(xw, xwww); -BX_SIMD128_IMPLEMENT_TEST(yw, ywww); -BX_SIMD128_IMPLEMENT_TEST(xyw, xyww); -BX_SIMD128_IMPLEMENT_TEST(zw, zwww); -BX_SIMD128_IMPLEMENT_TEST(xzw, xzww); -BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); + BX_SIMD128_IMPLEMENT_TEST(x, xxxx); + BX_SIMD128_IMPLEMENT_TEST(y, yyyy); + BX_SIMD128_IMPLEMENT_TEST(xy, xyyy); + BX_SIMD128_IMPLEMENT_TEST(z, zzzz); + BX_SIMD128_IMPLEMENT_TEST(xz, xzzz); + BX_SIMD128_IMPLEMENT_TEST(yz, yzzz); + BX_SIMD128_IMPLEMENT_TEST(xyz, xyzz); + BX_SIMD128_IMPLEMENT_TEST(w, wwww); + BX_SIMD128_IMPLEMENT_TEST(xw, xwww); + BX_SIMD128_IMPLEMENT_TEST(yw, ywww); + BX_SIMD128_IMPLEMENT_TEST(xyw, xyww); + BX_SIMD128_IMPLEMENT_TEST(zw, zwww); + BX_SIMD128_IMPLEMENT_TEST(xzw, xzww); + BX_SIMD128_IMPLEMENT_TEST(yzw, yzww); + #undef BX_SIMD128_IMPLEMENT_TEST template<> - BX_SIMD_FORCE_INLINE bool simd_test_any_xyzw(simd128_neon_t _test) + BX_SIMD_FORCE_INLINE bool simd128_test_any_xyzw(simd128_neon_t _test) { - return simd_test_any_ni(_test); + return simd128_test_any_ni(_test); } template<> - BX_SIMD_FORCE_INLINE bool simd_test_all_xyzw(simd128_neon_t _test) + BX_SIMD_FORCE_INLINE bool simd128_test_all_xyzw(simd128_neon_t _test) { - return simd_test_all_ni(_test); + return simd128_test_all_ni(_test); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE bool simd128_test_zero(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 0, 1, 4, 5 ); +#if BX_ARCH_64BIT + const uint32x4_t 
ai = vreinterpretq_u32_f32(_a); + const uint32x4_t bi = vreinterpretq_u32_f32(_b); + const uint32x4_t masked = vandq_u32(ai, bi); + return 0 == vmaxvq_u32(masked); +#else + return simd128_test_zero_ni(_a, _b); +#endif // BX_ARCH_64BIT } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_xyAB(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 4, 5, 0, 1 ); + return BX_SIMD_SHUFFLE_AB(_a, _b, 0, 1, 4, 5); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_ABxy(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 6, 7, 2, 3 ); + return BX_SIMD_SHUFFLE_AB(_a, _b, 4, 5, 0, 1); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_CDzw(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 2, 3, 6, 7 ); + return BX_SIMD_SHUFFLE_AB(_a, _b, 6, 7, 2, 3); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_zwCD(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 0, 4, 1, 5 ); + return BX_SIMD_SHUFFLE_AB(_a, _b, 2, 3, 6, 7); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_AxBy(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_xAyB(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 4, 0, 5, 1 ); + return BX_SIMD_SHUFFLE_AB(_a, _b, 0, 4, 1, 5); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_AxBy(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 2, 6, 3, 7 ); + return BX_SIMD_SHUFFLE_AB(_a, _b, 4, 0, 
5, 1); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_zCwD(simd128_neon_t _a, simd128_neon_t _b) { - return SHUFFLE_AB(_a, _b, 6, 2, 7, 3 ); - } -#undef SHUFFLE_A -#undef SHUFFLE_AB - - template<> - BX_SIMD_FORCE_INLINE float simd_x(simd128_neon_t _a) - { - return vgetq_lane_f32(_a, 0); + return BX_SIMD_SHUFFLE_AB(_a, _b, 2, 6, 3, 7); } template<> - BX_SIMD_FORCE_INLINE float simd_y(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_CzDw(simd128_neon_t _a, simd128_neon_t _b) { - return vgetq_lane_f32(_a, 1); + return BX_SIMD_SHUFFLE_AB(_a, _b, 6, 2, 7, 3); } template<> - BX_SIMD_FORCE_INLINE float simd_z(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_xzAC(simd128_neon_t _a, simd128_neon_t _b) { - return vgetq_lane_f32(_a, 2); + return BX_SIMD_SHUFFLE_AB(_a, _b, 0, 2, 4, 6); } template<> - BX_SIMD_FORCE_INLINE float simd_w(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_ywBD(simd128_neon_t _a, simd128_neon_t _b) { - return vgetq_lane_f32(_a, 3); + return BX_SIMD_SHUFFLE_AB(_a, _b, 1, 3, 5, 7); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_ld(const void* _ptr) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_xxAA(simd128_neon_t _a, simd128_neon_t _b) { - return vld1q_f32( (const float32_t*)_ptr); + return BX_SIMD_SHUFFLE_AB(_a, _b, 0, 0, 4, 4); } template<> - BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_yyBB(simd128_neon_t _a, simd128_neon_t _b) { - vst1q_f32( (float32_t*)_ptr, _a); + return BX_SIMD_SHUFFLE_AB(_a, _b, 1, 1, 5, 5); } template<> - BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_zzCC(simd128_neon_t _a, simd128_neon_t _b) { - vst1q_lane_f32( (float32_t*)_ptr, _a, 0); + return BX_SIMD_SHUFFLE_AB(_a, _b, 2, 2, 
6, 6); } template<> - BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_wwDD(simd128_neon_t _a, simd128_neon_t _b) { - vst1q_f32( (float32_t*)_ptr, _a); + return BX_SIMD_SHUFFLE_AB(_a, _b, 3, 3, 7, 7); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_ld(float _x, float _y, float _z, float _w) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_xAzC(simd128_neon_t _a, simd128_neon_t _b) + { + return BX_SIMD_SHUFFLE_AB(_a, _b, 0, 4, 2, 6); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_shuf_yBwD(simd128_neon_t _a, simd128_neon_t _b) + { + return BX_SIMD_SHUFFLE_AB(_a, _b, 1, 5, 3, 7); + } + +#undef BX_SIMD_SHUFFLE_A +#undef BX_SIMD_SHUFFLE_AB + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_x(simd128_neon_t _a) { return vgetq_lane_f32(_a, 0); } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_y(simd128_neon_t _a) { return vgetq_lane_f32(_a, 1); } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_z(simd128_neon_t _a) { return vgetq_lane_f32(_a, 2); } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_w(simd128_neon_t _a) { return vgetq_lane_f32(_a, 3); } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_ld(const void* _ptr) + { + return vld1q_f32(reinterpret_cast(_ptr)); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_ldu(const void* _ptr) + { + return simd128_ld(_ptr); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_st(void* _ptr, simd128_neon_t _a) + { + vst1q_f32(reinterpret_cast(_ptr), _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_stu(void* _ptr, simd128_neon_t _a) + { + simd128_st(_ptr, _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_x32_st1(void* _ptr, simd128_neon_t _a) + { + vst1q_lane_f32(reinterpret_cast(_ptr), _a, 0); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_stream(void* _ptr, simd128_neon_t _a) + { + vst1q_f32(reinterpret_cast(_ptr), _a); + } + + 
template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_ld(float _x, float _y, float _z, float _w) { const float32_t val[4] = {_x, _y, _z, _w}; - return simd_ld(val); + return vld1q_f32(val); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_ld(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { - const uint32_t val[4] = {_x, _y, _z, _w}; - const uint32x4_t tmp = vld1q_u32(val); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; + const uint32_t val[4] = {_x, _y, _z, _w}; + const uint32x4_t tmp = vld1q_u32(val); + return vreinterpretq_f32_u32(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_splat(const void* _ptr) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_ld(int32_t _x, int32_t _y, int32_t _z, int32_t _w) { - const simd128_neon_t tmp0 = vld1q_f32( (const float32_t*)_ptr); - const float32x2_t tmp1 = vget_low_f32(tmp0); - const simd128_neon_t result = vdupq_lane_f32(tmp1, 0); - - return result; + const int32_t val[4] = {_x, _y, _z, _w}; + const int32x4_t tmp = vld1q_s32(val); + return vreinterpretq_f32_s32(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_splat(float _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_splat(float _a) { return vdupq_n_f32(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_isplat(uint32_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_splat(int32_t _a) { - const int32x4_t tmp = vdupq_n_s32(_a); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp); - - return result; + const int32x4_t tmp = vdupq_n_s32(_a); + return vreinterpretq_f32_s32(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_zero() + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_splat(uint32_t _a) { - return simd_isplat(0); + const int32x4_t tmp = vdupq_n_s32(int32_t(_a)); + return vreinterpretq_f32_s32(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_itof(simd128_neon_t 
_a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_splat(double _a) { - const int32x4_t itof = vreinterpretq_s32_f32(_a); - const simd128_neon_t result = vcvtq_f32_s32(itof); - - return result; + const float64x2_t tmp = vdupq_n_f64(_a); + return vreinterpretq_f32_f64(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_ftoi(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_zero() { - const int32x4_t ftoi = vcvtq_s32_f32(_a); - const simd128_neon_t result = vreinterpretq_f32_s32(ftoi); - - return result; + return simd128_splat(0); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_add(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_itof(simd128_neon_t _a) + { + const int32x4_t itof = vreinterpretq_s32_f32(_a); + return vcvtq_f32_s32(itof); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_ftoi_trunc(simd128_neon_t _a) + { + const int32x4_t ftoi = vcvtq_s32_f32(_a); + return vreinterpretq_f32_s32(ftoi); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_ftoi_round(simd128_neon_t _a) + { +#if BX_ARCH_64BIT + const int32x4_t ftoi = vcvtnq_s32_f32(_a); + return vreinterpretq_f32_s32(ftoi); +#else + return simd_f32_ftoi_round_ni(_a); +#endif // BX_ARCH_64BIT + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_round(simd128_neon_t _a) + { +#if BX_ARCH_64BIT + return vrndnq_f32(_a); +#else + return simd_f32_round_ni(_a); +#endif + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_ceil(simd128_neon_t _a) + { +#if BX_ARCH_64BIT + return vrndpq_f32(_a); +#else + return simd_f32_ceil_ni(_a); +#endif + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_floor(simd128_neon_t _a) + { +#if BX_ARCH_64BIT + return vrndmq_f32(_a); +#else + return simd_f32_floor_ni(_a); +#endif + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_add(simd128_neon_t _a, simd128_neon_t _b) { return vaddq_f32(_a, _b); } template<> - 
BX_SIMD_FORCE_INLINE simd128_neon_t simd_sub(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_sub(simd128_neon_t _a, simd128_neon_t _b) { return vsubq_f32(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_mul(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_mul(simd128_neon_t _a, simd128_neon_t _b) { return vmulq_f32(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_rcp_est(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_div(simd128_neon_t _a, simd128_neon_t _b) + { +#if BX_ARCH_64BIT + return vdivq_f32(_a, _b); +#else + return simd_f32_div_nr_ni(_a, _b); +#endif + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_rcp_est(simd128_neon_t _a) { return vrecpeq_f32(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_rsqrt_est(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_rcp(simd128_neon_t _a) + { + return simd_f32_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_sqrt(simd128_neon_t _a) + { +#if BX_ARCH_64BIT + return vsqrtq_f32(_a); +#else + const simd128_neon_t rsqrt = simd128_f32_rsqrt_est(_a); + return simd128_f32_mul(_a, rsqrt); +#endif + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_rsqrt_est(simd128_neon_t _a) { return vrsqrteq_f32(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_rsqrt(simd128_neon_t _a) { - const uint32x4_t tmp = vceqq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; + return simd_f32_rsqrt_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpneq(simd128_neon_t _a, simd128_neon_t _b) - { - return simd_cmpneq_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmplt(simd128_neon_t _a, simd128_neon_t _b) - { - const uint32x4_t tmp = 
vcltq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmple(simd128_neon_t _a, simd128_neon_t _b) - { - const uint32x4_t tmp = vcleq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpgt(simd128_neon_t _a, simd128_neon_t _b) - { - const uint32x4_t tmp = vcgtq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cmpge(simd128_neon_t _a, simd128_neon_t _b) - { - const uint32x4_t tmp = vcgeq_f32(_a, _b); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_min(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_min(simd128_neon_t _a, simd128_neon_t _b) { return vminq_f32(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_max(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_max(simd128_neon_t _a, simd128_neon_t _b) { return vmaxq_f32(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_and(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cmpeq(simd128_neon_t _a, simd128_neon_t _b) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vandq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint32x4_t result = vceqq_f32(_a, _b); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_andc(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cmplt(simd128_neon_t _a, simd128_neon_t _b) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); 
- const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vbicq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint32x4_t result = vcltq_f32(_a, _b); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_or(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cmple(simd128_neon_t _a, simd128_neon_t _b) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vorrq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint32x4_t result = vcleq_f32(_a, _b); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_xor(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cmpgt(simd128_neon_t _a, simd128_neon_t _b) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = veorq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint32x4_t result = vcgtq_f32(_a, _b); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_sll(simd128_neon_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cmpge(simd128_neon_t _a, simd128_neon_t _b) { -#if !BX_COMPILER_CLANG - if (__builtin_constant_p(_count) ) - { - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const uint32x4_t tmp1 = vshlq_n_u32(tmp0, _count); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); + const uint32x4_t result = vcgeq_f32(_a, _b); + return vreinterpretq_f32_u32(result); + } - return result; - } -#endif - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_add(simd128_neon_t _a, simd128_neon_t _b) + { + 
const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = vreinterpretq_s32_f32(_b); + const int32x4_t add = vaddq_s32(a, b); + return vreinterpretq_f32_s32(add); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = vreinterpretq_s32_f32(_b); + const int32x4_t sub = vsubq_s32(a, b); + return vreinterpretq_f32_s32(sub); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_neg(simd128_neon_t _a) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t result = vnegq_s32(a); + return vreinterpretq_f32_s32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_abs(simd128_neon_t _a) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t result = vabsq_s32(a); + return vreinterpretq_f32_s32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_min(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = vreinterpretq_s32_f32(_b); + const int32x4_t result = vminq_s32(a, b); + return vreinterpretq_f32_s32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_max(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = vreinterpretq_s32_f32(_b); + const int32x4_t result = vmaxq_s32(a, b); + return vreinterpretq_f32_s32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = vreinterpretq_s32_f32(_b); + const uint32x4_t result = vceqq_s32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_cmplt(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = 
vreinterpretq_s32_f32(_b); + const uint32x4_t result = vcltq_s32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_cmpgt(simd128_neon_t _a, simd128_neon_t _b) + { + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t b = vreinterpretq_s32_f32(_b); + const uint32x4_t result = vcgtq_s32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i32_clamp(simd128_neon_t _a, simd128_neon_t _min, simd128_neon_t _max) + { + return simd128_i32_min(simd128_i32_max(_a, _min), _max); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_add(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vaddq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vsubq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_mul(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vmulq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_min(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vminq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_max(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = 
vreinterpretq_u32_f32(_b); + const uint32x4_t result = vmaxq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_clamp(simd128_neon_t _a, simd128_neon_t _min, simd128_neon_t _max) + { + return simd128_u32_min(simd128_u32_max(_a, _min), _max); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vceqq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_cmplt(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vcltq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u32_cmpgt(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vcgtq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i16_add(simd128_neon_t _a, simd128_neon_t _b) + { + const int16x8_t a = vreinterpretq_s16_f32(_a); + const int16x8_t b = vreinterpretq_s16_f32(_b); + const int16x8_t result = vaddq_s16(a, b); + return vreinterpretq_f32_s16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i16_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const int16x8_t a = vreinterpretq_s16_f32(_a); + const int16x8_t b = vreinterpretq_s16_f32(_b); + const int16x8_t result = vsubq_s16(a, b); + return vreinterpretq_f32_s16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i16_mullo(simd128_neon_t _a, simd128_neon_t _b) + { + const int16x8_t a = vreinterpretq_s16_f32(_a); + const int16x8_t b = 
vreinterpretq_s16_f32(_b); + const int16x8_t result = vmulq_s16(a, b); + return vreinterpretq_f32_s16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i16_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + const int16x8_t a = vreinterpretq_s16_f32(_a); + const int16x8_t b = vreinterpretq_s16_f32(_b); + const uint16x8_t result = vceqq_s16(a, b); + return vreinterpretq_f32_u16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x16_sll(simd128_neon_t _a, int _count) + { + const uint16x8_t a = vreinterpretq_u16_f32(_a); + const int16x8_t shift = vdupq_n_s16((int16_t)_count); + const uint16x8_t result = vshlq_u16(a, shift); + return vreinterpretq_f32_u16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x16_srl(simd128_neon_t _a, int _count) + { + const uint16x8_t a = vreinterpretq_u16_f32(_a); + const int16x8_t shift = vdupq_n_s16((int16_t)-_count); + const uint16x8_t result = vshlq_u16(a, shift); + return vreinterpretq_f32_u16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_splat(int16_t _a) + { + const int16x8_t result = vdupq_n_s16(_a); + return vreinterpretq_f32_s16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_splat(uint16_t _a) + { + const uint16x8_t result = vdupq_n_u16(_a); + return vreinterpretq_f32_u16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i8_add(simd128_neon_t _a, simd128_neon_t _b) + { + const int8x16_t a = vreinterpretq_s8_f32(_a); + const int8x16_t b = vreinterpretq_s8_f32(_b); + const int8x16_t result = vaddq_s8(a, b); + return vreinterpretq_f32_s8(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i8_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const int8x16_t a = vreinterpretq_s8_f32(_a); + const int8x16_t b = vreinterpretq_s8_f32(_b); + const int8x16_t result = vsubq_s8(a, b); + return vreinterpretq_f32_s8(result); + } + + template<> + BX_SIMD_FORCE_INLINE 
simd128_neon_t simd128_u8_satadd(simd128_neon_t _a, simd128_neon_t _b) + { + const uint8x16_t a = vreinterpretq_u8_f32(_a); + const uint8x16_t b = vreinterpretq_u8_f32(_b); + const uint8x16_t result = vqaddq_u8(a, b); + return vreinterpretq_f32_u8(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u8_satsub(simd128_neon_t _a, simd128_neon_t _b) + { + const uint8x16_t a = vreinterpretq_u8_f32(_a); + const uint8x16_t b = vreinterpretq_u8_f32(_b); + const uint8x16_t result = vqsubq_u8(a, b); + return vreinterpretq_f32_u8(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u16_satadd(simd128_neon_t _a, simd128_neon_t _b) + { + const uint16x8_t a = vreinterpretq_u16_f32(_a); + const uint16x8_t b = vreinterpretq_u16_f32(_b); + const uint16x8_t result = vqaddq_u16(a, b); + return vreinterpretq_f32_u16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u16_satsub(simd128_neon_t _a, simd128_neon_t _b) + { + const uint16x8_t a = vreinterpretq_u16_f32(_a); + const uint16x8_t b = vreinterpretq_u16_f32(_b); + const uint16x8_t result = vqsubq_u16(a, b); + return vreinterpretq_f32_u16(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_and(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vandq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_andc(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vbicq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_or(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = 
vorrq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_xor(simd128_neon_t _a, simd128_neon_t _b) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = veorq_u32(a, b); + return vreinterpretq_f32_u32(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_sll(simd128_neon_t _a, int _count) + { + const uint32x4_t a = vreinterpretq_u32_f32(_a); const int32x4_t shift = vdupq_n_s32(_count); - const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; + const uint32x4_t result = vshlq_u32(a, shift); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_srl(simd128_neon_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_srl(simd128_neon_t _a, int _count) { -#if !BX_COMPILER_CLANG - if (__builtin_constant_p(_count) ) - { - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); - const uint32x4_t tmp1 = vshrq_n_u32(tmp0, _count); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; - } -#endif - const uint32x4_t tmp0 = vreinterpretq_u32_f32(_a); + const uint32x4_t a = vreinterpretq_u32_f32(_a); const int32x4_t shift = vdupq_n_s32(-_count); - const uint32x4_t tmp1 = vshlq_u32(tmp0, shift); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp1); - - return result; + const uint32x4_t result = vshlq_u32(a, shift); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_sra(simd128_neon_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_sra(simd128_neon_t _a, int _count) { -#if !BX_COMPILER_CLANG - if (__builtin_constant_p(_count) ) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vshrq_n_s32(tmp0, _count); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp1); - - 
return result; - } -#endif - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); + const int32x4_t a = vreinterpretq_s32_f32(_a); const int32x4_t shift = vdupq_n_s32(-_count); - const int32x4_t tmp1 = vshlq_s32(tmp0, shift); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp1); - - return result; + const int32x4_t result = vshlq_s32(a, shift); + return vreinterpretq_f32_s32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmpeq(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_sll(simd128_neon_t _a, simd128_neon_t _count) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vceqq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp2); - - return result; + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const int32x4_t c = vreinterpretq_s32_f32(_count); + const uint32x4_t result = vshlq_u32(a, c); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmplt(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_srl(simd128_neon_t _a, simd128_neon_t _count) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 = vcltq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp2); - - return result; + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const int32x4_t c = vreinterpretq_s32_f32(_count); + const int32x4_t cneg = vnegq_s32(c); + const uint32x4_t result = vshlq_u32(a, cneg); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_icmpgt(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x32_sra(simd128_neon_t _a, simd128_neon_t _count) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const uint32x4_t tmp2 
= vcgtq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_u32(tmp2); - - return result; + const int32x4_t a = vreinterpretq_s32_f32(_a); + const int32x4_t c = vreinterpretq_s32_f32(_count); + const int32x4_t cneg = vnegq_s32(c); + const int32x4_t result = vshlq_s32(a, cneg); + return vreinterpretq_f32_s32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_imin(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x8_shuffle(simd128_neon_t _a, simd128_neon_t _indices) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vminq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint8x16_t a = vreinterpretq_u8_f32(_a); + const uint8x16_t indices = vreinterpretq_u8_f32(_indices); + const uint8x16_t result = vqtbl1q_u8(a, indices); + return vreinterpretq_f32_u8(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_imax(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_x8_shuffle(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _indices) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vmaxq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint8x16x2_t tbl = { { vreinterpretq_u8_f32(_a), vreinterpretq_u8_f32(_b) } }; + const uint8x16_t indices = vreinterpretq_u8_f32(_indices); + const uint8x16_t result = vqtbl2q_u8(tbl, indices); + return vreinterpretq_f32_u8(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_iadd(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_selb(simd128_neon_t _mask, simd128_neon_t _a, simd128_neon_t _b) { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t 
tmp2 = vaddq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; + const uint32x4_t mask = vreinterpretq_u32_f32(_mask); + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vbslq_u32(mask, a, b); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_isub(simd128_neon_t _a, simd128_neon_t _b) - { - const int32x4_t tmp0 = vreinterpretq_s32_f32(_a); - const int32x4_t tmp1 = vreinterpretq_s32_f32(_b); - const int32x4_t tmp2 = vsubq_s32(tmp0, tmp1); - const simd128_neon_t result = vreinterpretq_f32_s32(tmp2); - - return result; - } - - template<> - BX_SIMD_INLINE simd128_neon_t simd_shuf_xAzC(simd128_neon_t _a, simd128_neon_t _b) - { - return simd_shuf_xAzC_ni(_a, _b); - } - - template<> - BX_SIMD_INLINE simd128_neon_t simd_shuf_yBwD(simd128_neon_t _a, simd128_neon_t _b) - { - return simd_shuf_yBwD_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_rcp(simd128_neon_t _a) - { - const simd128_neon_t tmp0 = simd_rcp_est(_a); - const simd128_neon_t tmp1 = vrecpsq_f32(_a, tmp0); - const simd128_neon_t tmp2 = simd_mul(tmp0, tmp1); - const simd128_neon_t tmp3 = vrecpsq_f32(_a, tmp2); - const simd128_neon_t result = simd_mul(tmp2, tmp3); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_orx(simd128_neon_t _a) - { - return simd_orx_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_orc(simd128_neon_t _a, simd128_neon_t _b) - { - return simd_orc_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_neg(simd128_neon_t _a) - { - return simd_neg_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_madd(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) - { - return simd_madd_ni(_a, _b, _c); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_nmsub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t 
_c) - { - return simd_nmsub_ni(_a, _b, _c); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_div_nr(simd128_neon_t _a, simd128_neon_t _b) - { - return simd_div_nr_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_div(simd128_neon_t _a, simd128_neon_t _b) - { - return simd_div_nr_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_selb(simd128_neon_t _mask, simd128_neon_t _a, simd128_neon_t _b) - { - return simd_selb_ni(_mask, _a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_sels(simd128_neon_t _test, simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_sels(simd128_neon_t _test, simd128_neon_t _a, simd128_neon_t _b) { return simd_sels_ni(_test, _a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_not(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_not(simd128_neon_t _a) { - return simd_not_ni(_a); + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t result = vmvnq_u32(a); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_abs(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_orc(simd128_neon_t _a, simd128_neon_t _b) { - return simd_abs_ni(_a); + const uint32x4_t a = vreinterpretq_u32_f32(_a); + const uint32x4_t b = vreinterpretq_u32_f32(_b); + const uint32x4_t result = vornq_u32(a, b); + return vreinterpretq_f32_u32(result); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_clamp(simd128_neon_t _a, simd128_neon_t _min, simd128_neon_t _max) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_orx(simd128_neon_t _a) { - return simd_clamp_ni(_a, _min, _max); + return simd128_orx_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_lerp(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _s) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_madd(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) { - return simd_lerp_ni(_a, _b, _s); +#if 
BX_ARCH_64BIT + return vfmaq_f32(_c, _a, _b); +#else + return simd_f32_madd_ni(_a, _b, _c); +#endif } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_rsqrt(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_msub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) { - const simd128_neon_t tmp0 = simd_rsqrt_est(_a); - const simd128_neon_t tmp1 = simd_mul(_a, tmp0); - const simd128_neon_t tmp2 = vrsqrtsq_f32(tmp1, tmp0); - const simd128_neon_t tmp3 = simd_mul(tmp0, tmp2); - const simd128_neon_t tmp4 = simd_mul(_a, tmp3); - const simd128_neon_t tmp5 = vrsqrtsq_f32(tmp4, tmp3); - const simd128_neon_t result = simd_mul(tmp3, tmp5); - - return result; + return simd_f32_msub_ni(_a, _b, _c); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_rsqrt_nr(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_nmsub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) { - return simd_rsqrt(_a); +#if BX_ARCH_64BIT + return vfmsq_f32(_c, _a, _b); +#else + return simd_f32_nmsub_ni(_a, _b, _c); +#endif } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_rsqrt_carmack(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE int simd128_x32_signbitsmask(simd128_neon_t _a) { - return simd_rsqrt_carmack_ni(_a); + return simd_x32_signbitsmask_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_sqrt(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE int simd128_x8_signbitsmask(simd128_neon_t _a) { - const simd128_neon_t rsqrt = simd_rsqrt(_a); - const simd128_neon_t result = simd_mul(_a, rsqrt); - - return result; + return simd_x8_signbitsmask_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_sqrt_nr(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_neg(simd128_neon_t _a) { - return simd_sqrt(_a); + return vnegq_f32(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_log2(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_abs(simd128_neon_t _a) { - return simd_log2_ni(_a); + return 
vabsq_f32(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_exp2(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_clamp(simd128_neon_t _a, simd128_neon_t _min, simd128_neon_t _max) { - return simd_exp2_ni(_a); + return simd_f32_clamp_ni(_a, _min, _max); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_pow(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_lerp(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _s) { - return simd_pow_ni(_a, _b); + return simd_f32_lerp_ni(_a, _b, _s); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_cross3(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_sqrt_nr(simd128_neon_t _a) { - return simd_cross3_ni(_a, _b); + return simd_f32_sqrt_nr_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_normalize3(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_rsqrt_nr(simd128_neon_t _a) { - return simd_normalize3_ni(_a); + return simd_f32_rsqrt_nr_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_dot3(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_rsqrt_carmack(simd128_neon_t _a) { - return simd_dot3_ni(_a, _b); + return simd_f32_rsqrt_carmack_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_dot(simd128_neon_t _a, simd128_neon_t _b) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_div_nr(simd128_neon_t _a, simd128_neon_t _b) { - return simd_dot_ni(_a, _b); + return simd_f32_div_nr_ni(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_ceil(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_dot3(simd128_neon_t _a, simd128_neon_t _b) { - return simd_ceil_ni(_a); + return simd128_f32_dot3_ni(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_neon_t simd_floor(simd128_neon_t _a) + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_dot(simd128_neon_t _a, simd128_neon_t _b) { - return 
simd_floor_ni(_a); + return simd128_f32_dot_ni(_a, _b); } - typedef simd128_neon_t simd128_t; + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cross3(simd128_neon_t _a, simd128_neon_t _b) + { + return simd128_f32_cross3_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_normalize3(simd128_neon_t _a) + { + return simd128_f32_normalize3_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f32_cmpneq(simd128_neon_t _a, simd128_neon_t _b) + { + return simd_f32_cmpneq_ni(_a, _b); + } + +#if BX_ARCH_64BIT + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_add(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t result = vaddq_f64(a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t result = vsubq_f64(a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_mul(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t result = vmulq_f64(a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_div(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t result = vdivq_f64(a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_min(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + 
const float64x2_t result = vminq_f64(a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_max(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t result = vmaxq_f64(a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_madd(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t c = vreinterpretq_f64_f32(_c); + const float64x2_t result = vmlaq_f64(c, a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_nmsub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const float64x2_t c = vreinterpretq_f64_f32(_c); + const float64x2_t result = vfmsq_f64(c, a, b); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_neg(simd128_neon_t _a) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t result = vnegq_f64(a); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_abs(simd128_neon_t _a) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t result = vabsq_f64(a); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_clamp(simd128_neon_t _a, simd128_neon_t _min, simd128_neon_t _max) + { + return simd128_f64_min(simd128_f64_max(_a, _min), _max); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_lerp(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _s) + { + return simd_f64_lerp_ni(_a, _b, _s); + } + + template<> + 
BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_rcp(simd128_neon_t _a) + { + return simd_f64_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_sqrt(simd128_neon_t _a) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t result = vsqrtq_f64(a); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_rsqrt(simd128_neon_t _a) + { + return simd_f64_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_round(simd128_neon_t _a) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t result = vrndnq_f64(a); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_ceil(simd128_neon_t _a) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t result = vrndpq_f64(a); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_floor(simd128_neon_t _a) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t result = vrndmq_f64(a); + return vreinterpretq_f32_f64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const uint64x2_t result = vceqq_f64(a, b); + return vreinterpretq_f32_u64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpneq(simd128_neon_t _a, simd128_neon_t _b) + { + return simd_f64_cmpneq_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmplt(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const uint64x2_t result = vcltq_f64(a, b); + return vreinterpretq_f32_u64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t 
simd128_f64_cmple(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const uint64x2_t result = vcleq_f64(a, b); + return vreinterpretq_f32_u64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpgt(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const uint64x2_t result = vcgtq_f64(a, b); + return vreinterpretq_f32_u64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpge(simd128_neon_t _a, simd128_neon_t _b) + { + const float64x2_t a = vreinterpretq_f64_f32(_a); + const float64x2_t b = vreinterpretq_f64_f32(_b); + const uint64x2_t result = vcgeq_f64(a, b); + return vreinterpretq_f32_u64(result); + } + +#else + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_add(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2], r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] + b[0]; r[1] = a[1] + b[1]; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_sub(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2], r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] - b[0]; r[1] = a[1] - b[1]; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_mul(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2], r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] * b[0]; r[1] = a[1] * b[1]; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_div(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2], r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] / b[0]; r[1] = a[1] / b[1]; + return vld1q_f32((const float*)r); + } + + template<> + 
BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_min(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2], r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] < b[0] ? a[0] : b[0]; r[1] = a[1] < b[1] ? a[1] : b[1]; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_max(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2], r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] > b[0] ? a[0] : b[0]; r[1] = a[1] > b[1] ? a[1] : b[1]; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_madd(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) + { + return simd_f64_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_nmsub(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _c) + { + return simd_f64_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_neg(simd128_neon_t _a) + { + return simd_f64_neg_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_abs(simd128_neon_t _a) + { + return simd_f64_abs_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_clamp(simd128_neon_t _a, simd128_neon_t _min, simd128_neon_t _max) + { + return simd128_f64_min(simd128_f64_max(_a, _min), _max); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_lerp(simd128_neon_t _a, simd128_neon_t _b, simd128_neon_t _s) + { + return simd_f64_lerp_ni(_a, _b, _s); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_rcp(simd128_neon_t _a) + { + return simd_f64_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_sqrt(simd128_neon_t _a) + { + double a[2], r[2]; + vst1q_f32((float*)a, _a); + r[0] = (double)sqrt((float)a[0]); r[1] = (double)sqrt((float)a[1]); + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t 
simd128_f64_rsqrt(simd128_neon_t _a) + { + return simd_f64_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_round(simd128_neon_t _a) + { + double a[2], r[2]; + vst1q_f32((float*)a, _a); + r[0] = (double)round((float)a[0]); r[1] = (double)round((float)a[1]); + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_ceil(simd128_neon_t _a) + { + double a[2], r[2]; + vst1q_f32((float*)a, _a); + r[0] = (double)ceil((float)a[0]); r[1] = (double)ceil((float)a[1]); + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_floor(simd128_neon_t _a) + { + double a[2], r[2]; + vst1q_f32((float*)a, _a); + r[0] = (double)floor((float)a[0]); r[1] = (double)floor((float)a[1]); + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpeq(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2]; uint64_t r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] == b[0] ? UINT64_MAX : 0; r[1] = a[1] == b[1] ? UINT64_MAX : 0; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpneq(simd128_neon_t _a, simd128_neon_t _b) + { + return simd_f64_cmpneq_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmplt(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2]; uint64_t r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] < b[0] ? UINT64_MAX : 0; r[1] = a[1] < b[1] ? UINT64_MAX : 0; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmple(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2]; uint64_t r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] <= b[0] ? UINT64_MAX : 0; r[1] = a[1] <= b[1] ? 
UINT64_MAX : 0; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpgt(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2]; uint64_t r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] > b[0] ? UINT64_MAX : 0; r[1] = a[1] > b[1] ? UINT64_MAX : 0; + return vld1q_f32((const float*)r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_f64_cmpge(simd128_neon_t _a, simd128_neon_t _b) + { + double a[2], b[2]; uint64_t r[2]; + vst1q_f32((float*)a, _a); vst1q_f32((float*)b, _b); + r[0] = a[0] >= b[0] ? UINT64_MAX : 0; r[1] = a[1] >= b[1] ? UINT64_MAX : 0; + return vld1q_f32((const float*)r); + } + +#endif // BX_ARCH_64BIT + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i64_add(simd128_neon_t _a, simd128_neon_t _b) + { + const int64x2_t a = vreinterpretq_s64_f32(_a); + const int64x2_t b = vreinterpretq_s64_f32(_b); + const int64x2_t result = vaddq_s64(a, b); + return vreinterpretq_f32_s64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_i64_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const int64x2_t a = vreinterpretq_s64_f32(_a); + const int64x2_t b = vreinterpretq_s64_f32(_b); + const int64x2_t result = vsubq_s64(a, b); + return vreinterpretq_f32_s64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u64_add(simd128_neon_t _a, simd128_neon_t _b) + { + const uint64x2_t a = vreinterpretq_u64_f32(_a); + const uint64x2_t b = vreinterpretq_u64_f32(_b); + const uint64x2_t result = vaddq_u64(a, b); + return vreinterpretq_f32_u64(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_neon_t simd128_u64_sub(simd128_neon_t _a, simd128_neon_t _b) + { + const uint64x2_t a = vreinterpretq_u64_f32(_a); + const uint64x2_t b = vreinterpretq_u64_f32(_b); + const uint64x2_t result = vsubq_u64(a, b); + return vreinterpretq_f32_u64(result); + } } // namespace bx diff --git a/include/bx/inline/simd128_ref.inl 
b/include/bx/inline/simd128_ref.inl index 5aa8732..917d4a5 100644 --- a/include/bx/inline/simd128_ref.inl +++ b/include/bx/inline/simd128_ref.inl @@ -9,23 +9,43 @@ namespace bx { - BX_CONSTEXPR_FUNC float sqrt(float); - BX_CONSTEXPR_FUNC float rsqrt(float); + BX_ALIGN_DECL(16, struct) simd128_f32_ref_t { float f32[4]; }; + BX_ALIGN_DECL(16, struct) simd128_f64_ref_t { double f64[2]; }; + BX_ALIGN_DECL(16, struct) simd128_i8_ref_t { int8_t i8[16]; }; + BX_ALIGN_DECL(16, struct) simd128_i16_ref_t { int16_t i16[8]; }; + BX_ALIGN_DECL(16, struct) simd128_i32_ref_t { int32_t i32[4]; }; + BX_ALIGN_DECL(16, struct) simd128_i64_ref_t { int64_t i64[2]; }; + BX_ALIGN_DECL(16, struct) simd128_u8_ref_t { uint8_t u8[16]; }; + BX_ALIGN_DECL(16, struct) simd128_u16_ref_t { uint16_t u16[8]; }; + BX_ALIGN_DECL(16, struct) simd128_u64_ref_t { uint64_t u64[2]; }; + +#if BX_SIMD_LANGEXT + typedef float simd128_f32_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef double simd128_f64_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef int8_t simd128_i8_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef int16_t simd128_i16_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef int32_t simd128_i32_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef int64_t simd128_i64_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef uint8_t simd128_u8_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef uint16_t simd128_u16_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef uint32_t simd128_u32_langext_t __attribute__((__vector_size__(16), __aligned__(16))); + typedef uint64_t simd128_u64_langext_t __attribute__((__vector_size__(16), __aligned__(16))); +#endif // BX_SIMD_LANGEXT #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 #define ELEMw 3 -#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template<> \ - 
BX_SIMD_FORCE_INLINE simd128_ref_t simd_swiz_##_x##_y##_z##_w(simd128_ref_t _a) \ - { \ - simd128_ref_t result; \ - result.ixyzw[0] = _a.ixyzw[ELEM##_x]; \ - result.ixyzw[1] = _a.ixyzw[ELEM##_y]; \ - result.ixyzw[2] = _a.ixyzw[ELEM##_z]; \ - result.ixyzw[3] = _a.ixyzw[ELEM##_w]; \ - return result; \ +#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_swiz_##_x##_y##_z##_w(simd128_ref_t _a) \ + { \ + simd128_ref_t result; \ + result.u32[0] = _a.u32[ELEM##_x]; \ + result.u32[1] = _a.u32[ELEM##_y]; \ + result.u32[2] = _a.u32[ELEM##_z]; \ + result.u32[3] = _a.u32[ELEM##_w]; \ + return result; \ } #include "simd128_swizzle.inl" @@ -36,798 +56,1829 @@ namespace bx #undef ELEMy #undef ELEMx -#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask) \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_ref_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return 0 != (tmp&(_mask) ); \ - } \ - \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_ref_t _test) \ - { \ - uint32_t tmp = ( (_test.uxyzw[3]>>31)<<3) \ - | ( (_test.uxyzw[2]>>31)<<2) \ - | ( (_test.uxyzw[1]>>31)<<1) \ - | ( _test.uxyzw[0]>>31) \ - ; \ - return (_mask) == (tmp&(_mask) ); \ +#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask) \ + template<> \ + inline BX_CONSTEXPR_FUNC bool simd128_test_any_##_xyzw(simd128_ref_t _test) \ + { \ + uint32_t tmp = ( (_test.u32[3]>>31)<<3) \ + | ( (_test.u32[2]>>31)<<2) \ + | ( (_test.u32[1]>>31)<<1) \ + | ( _test.u32[0]>>31) \ + ; \ + return 0 != (tmp&(_mask) ); \ + } \ + \ + template<> \ + inline BX_CONSTEXPR_FUNC bool simd128_test_all_##_xyzw(simd128_ref_t _test) \ + { \ + uint32_t tmp = ( (_test.u32[3]>>31)<<3) \ + | ( (_test.u32[2]>>31)<<2) \ + | ( (_test.u32[1]>>31)<<1) \ + | ( _test.u32[0]>>31) \ + ; \ + return (_mask) == (tmp&(_mask) ); \ } 
-BX_SIMD128_IMPLEMENT_TEST(x , 0x1) -BX_SIMD128_IMPLEMENT_TEST(y , 0x2) -BX_SIMD128_IMPLEMENT_TEST(xy , 0x3) -BX_SIMD128_IMPLEMENT_TEST(z , 0x4) -BX_SIMD128_IMPLEMENT_TEST(xz , 0x5) -BX_SIMD128_IMPLEMENT_TEST(yz , 0x6) -BX_SIMD128_IMPLEMENT_TEST(xyz , 0x7) -BX_SIMD128_IMPLEMENT_TEST(w , 0x8) -BX_SIMD128_IMPLEMENT_TEST(xw , 0x9) -BX_SIMD128_IMPLEMENT_TEST(yw , 0xa) -BX_SIMD128_IMPLEMENT_TEST(xyw , 0xb) -BX_SIMD128_IMPLEMENT_TEST(zw , 0xc) -BX_SIMD128_IMPLEMENT_TEST(xzw , 0xd) -BX_SIMD128_IMPLEMENT_TEST(yzw , 0xe) -BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf) + BX_SIMD128_IMPLEMENT_TEST(x , 0x1) + BX_SIMD128_IMPLEMENT_TEST(y , 0x2) + BX_SIMD128_IMPLEMENT_TEST(xy , 0x3) + BX_SIMD128_IMPLEMENT_TEST(z , 0x4) + BX_SIMD128_IMPLEMENT_TEST(xz , 0x5) + BX_SIMD128_IMPLEMENT_TEST(yz , 0x6) + BX_SIMD128_IMPLEMENT_TEST(xyz , 0x7) + BX_SIMD128_IMPLEMENT_TEST(w , 0x8) + BX_SIMD128_IMPLEMENT_TEST(xw , 0x9) + BX_SIMD128_IMPLEMENT_TEST(yw , 0xa) + BX_SIMD128_IMPLEMENT_TEST(xyw , 0xb) + BX_SIMD128_IMPLEMENT_TEST(zw , 0xc) + BX_SIMD128_IMPLEMENT_TEST(xzw , 0xd) + BX_SIMD128_IMPLEMENT_TEST(yzw , 0xe) + BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf) #undef BX_SIMD128_IMPLEMENT_TEST template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xyAB(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC bool simd128_test_zero(simd128_ref_t _a, simd128_ref_t _b) + { + return 0 == ( (_a.u32[0] & _b.u32[0]) + | (_a.u32[1] & _b.u32[1]) + | (_a.u32[2] & _b.u32[2]) + | (_a.u32[3] & _b.u32[3]) + ); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_xyAB(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1]; - result.uxyzw[2] = _b.uxyzw[0]; - result.uxyzw[3] = _b.uxyzw[1]; + result.u32[0] = _a.u32[0]; + result.u32[1] = _a.u32[1]; + result.u32[2] = _b.u32[0]; + result.u32[3] = _b.u32[1]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_ABxy(simd128_ref_t _a, simd128_ref_t _b) + inline 
BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_ABxy(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _b.uxyzw[0]; - result.uxyzw[1] = _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[0]; - result.uxyzw[3] = _a.uxyzw[1]; + result.u32[0] = _b.u32[0]; + result.u32[1] = _b.u32[1]; + result.u32[2] = _a.u32[0]; + result.u32[3] = _a.u32[1]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CDzw(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_CDzw(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _b.uxyzw[2]; - result.uxyzw[1] = _b.uxyzw[3]; - result.uxyzw[2] = _a.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3]; + result.u32[0] = _b.u32[2]; + result.u32[1] = _b.u32[3]; + result.u32[2] = _a.u32[2]; + result.u32[3] = _a.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zwCD(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_zwCD(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[2]; - result.uxyzw[1] = _a.uxyzw[3]; - result.uxyzw[2] = _b.uxyzw[2]; - result.uxyzw[3] = _b.uxyzw[3]; + result.u32[0] = _a.u32[2]; + result.u32[1] = _a.u32[3]; + result.u32[2] = _b.u32[2]; + result.u32[3] = _b.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xAyB(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_xAyB(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0]; - result.uxyzw[1] = _b.uxyzw[0]; - result.uxyzw[2] = _a.uxyzw[1]; - result.uxyzw[3] = _b.uxyzw[1]; + result.u32[0] = _a.u32[0]; + result.u32[1] = _b.u32[0]; + result.u32[2] = _a.u32[1]; + result.u32[3] = _b.u32[1]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_AxBy(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t 
simd128_x32_shuf_AxBy(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[0]; - result.uxyzw[2] = _b.uxyzw[1]; - result.uxyzw[3] = _a.uxyzw[1]; + result.u32[0] = _b.u32[0]; + result.u32[1] = _a.u32[0]; + result.u32[2] = _b.u32[1]; + result.u32[3] = _a.u32[1]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_zCwD(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_zCwD(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[2]; - result.uxyzw[1] = _b.uxyzw[2]; - result.uxyzw[2] = _a.uxyzw[3]; - result.uxyzw[3] = _b.uxyzw[3]; + result.u32[0] = _a.u32[2]; + result.u32[1] = _b.u32[2]; + result.u32[2] = _a.u32[3]; + result.u32[3] = _b.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_CzDw(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_CzDw(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _b.uxyzw[2]; - result.uxyzw[1] = _a.uxyzw[2]; - result.uxyzw[2] = _b.uxyzw[3]; - result.uxyzw[3] = _a.uxyzw[3]; + result.u32[0] = _b.u32[2]; + result.u32[1] = _a.u32[2]; + result.u32[2] = _b.u32[3]; + result.u32[3] = _a.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE float simd_x(simd128_ref_t _a) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_xzAC(simd128_ref_t _a, simd128_ref_t _b) { - return _a.fxyzw[0]; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_y(simd128_ref_t _a) - { - return _a.fxyzw[1]; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_z(simd128_ref_t _a) - { - return _a.fxyzw[2]; - } - - template<> - BX_SIMD_FORCE_INLINE float simd_w(simd128_ref_t _a) - { - return _a.fxyzw[3]; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(const void* _ptr) - { - const uint32_t* input = reinterpret_cast(_ptr); simd128_ref_t result; - result.uxyzw[0] = input[0]; - 
result.uxyzw[1] = input[1]; - result.uxyzw[2] = input[2]; - result.uxyzw[3] = input[3]; + result.u32[0] = _a.u32[0]; + result.u32[1] = _a.u32[2]; + result.u32[2] = _b.u32[0]; + result.u32[3] = _b.u32[2]; return result; } template<> - BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_ref_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - template<> - BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_ref_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - } - - template<> - BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_ref_t _a) - { - uint32_t* result = reinterpret_cast(_ptr); - result[0] = _a.uxyzw[0]; - result[1] = _a.uxyzw[1]; - result[2] = _a.uxyzw[2]; - result[3] = _a.uxyzw[3]; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_ld(float _x, float _y, float _z, float _w) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_ywBD(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.fxyzw[0] = _x; - result.fxyzw[1] = _y; - result.fxyzw[2] = _z; - result.fxyzw[3] = _w; + result.u32[0] = _a.u32[1]; + result.u32[1] = _a.u32[3]; + result.u32[2] = _b.u32[1]; + result.u32[3] = _b.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_xxAA(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.uxyzw[0] = _x; - result.uxyzw[1] = _y; - result.uxyzw[2] = _z; - result.uxyzw[3] = _w; + result.u32[0] = _a.u32[0]; + result.u32[1] = _a.u32[0]; + result.u32[2] = _b.u32[0]; + result.u32[3] = _b.u32[0]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(const void* _ptr) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_yyBB(simd128_ref_t _a, simd128_ref_t _b) { - const uint32_t val = 
*reinterpret_cast(_ptr); simd128_ref_t result; - result.uxyzw[0] = val; - result.uxyzw[1] = val; - result.uxyzw[2] = val; - result.uxyzw[3] = val; + result.u32[0] = _a.u32[1]; + result.u32[1] = _a.u32[1]; + result.u32[2] = _b.u32[1]; + result.u32[3] = _b.u32[1]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_splat(float _a) - { - return simd_ld(_a, _a, _a, _a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_isplat(uint32_t _a) - { - return simd_ild(_a, _a, _a, _a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_zero() - { - return simd_ild(0, 0, 0, 0); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_itof(simd128_ref_t _a) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_zzCC(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.fxyzw[0] = (float)_a.ixyzw[0]; - result.fxyzw[1] = (float)_a.ixyzw[1]; - result.fxyzw[2] = (float)_a.ixyzw[2]; - result.fxyzw[3] = (float)_a.ixyzw[3]; + result.u32[0] = _a.u32[2]; + result.u32[1] = _a.u32[2]; + result.u32[2] = _b.u32[2]; + result.u32[3] = _b.u32[2]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_ftoi(simd128_ref_t _a) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_wwDD(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.ixyzw[0] = (int)_a.fxyzw[0]; - result.ixyzw[1] = (int)_a.fxyzw[1]; - result.ixyzw[2] = (int)_a.fxyzw[2]; - result.ixyzw[3] = (int)_a.fxyzw[3]; + result.u32[0] = _a.u32[3]; + result.u32[1] = _a.u32[3]; + result.u32[2] = _b.u32[3]; + result.u32[3] = _b.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_round(simd128_ref_t _a) - { - return simd_round_ni(_a); - } + inline BX_CONSTEXPR_FUNC float simd128_f32_x(simd128_ref_t _a) { return bitCast(_a).f32[0]; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_add(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC float simd128_f32_y(simd128_ref_t _a) { return bitCast(_a).f32[1]; } + + 
template<> + inline BX_CONSTEXPR_FUNC float simd128_f32_z(simd128_ref_t _a) { return bitCast(_a).f32[2]; } + + template<> + inline BX_CONSTEXPR_FUNC float simd128_f32_w(simd128_ref_t _a) { return bitCast(_a).f32[3]; } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_ld(const void* _ptr) { simd128_ref_t result; - result.fxyzw[0] = _a.fxyzw[0] + _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] + _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] + _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] + _b.fxyzw[3]; + memCopy(&result, _ptr, sizeof(simd128_ref_t) ); return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_sub(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_ldu(const void* _ptr) + { + return simd128_ld(_ptr); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_st(void* _ptr, simd128_ref_t _a) + { + memCopy(_ptr, &_a, sizeof(simd128_ref_t) ); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_stu(void* _ptr, simd128_ref_t _a) + { + simd128_st(_ptr, _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_x32_st1(void* _ptr, simd128_ref_t _a) + { + float* result = reinterpret_cast(_ptr); + *result = bitCast(_a).f32[0]; + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_stream(void* _ptr, simd128_ref_t _a) + { + simd128_ref_t* result = reinterpret_cast(_ptr); + *result = _a; + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_ld(float _x, float _y, float _z, float _w) + { + const simd128_f32_ref_t result = { { _x, _y, _z, _w } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_ld(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { simd128_ref_t result; - result.fxyzw[0] = _a.fxyzw[0] - _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] - _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] - _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] - _b.fxyzw[3]; + result.u32[0] = _x; + result.u32[1] = _y; + result.u32[2] = _z; + result.u32[3] = _w; return result; } 
template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_mul(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_ld(int32_t _x, int32_t _y, int32_t _z, int32_t _w) { simd128_ref_t result; - result.fxyzw[0] = _a.fxyzw[0] * _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] * _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] * _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] * _b.fxyzw[3]; + result.u32[0] = uint32_t(_x); + result.u32[1] = uint32_t(_y); + result.u32[2] = uint32_t(_z); + result.u32[3] = uint32_t(_w); return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_div(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_splat(float _a) { + return simd128_ld(_a, _a, _a, _a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_splat(int32_t _a) + { + return simd128_ld(_a, _a, _a, _a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_splat(uint32_t _a) + { + return simd128_ld(_a, _a, _a, _a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_splat(double _a) + { + const simd128_f64_ref_t result = { { _a, _a } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_zero() + { + return simd128_ld(0u, 0u, 0u, 0u); + } + + template<> + inline BX_CONST_FUNC simd128_ref_t simd128_i32_itof(simd128_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_f32_langext_t conv = __builtin_convertvector(a, simd128_f32_langext_t); + const simd128_ref_t result = bitCast(conv); + return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_f32_ref_t result = { { (float)a.i32[0], (float)a.i32[1], (float)a.i32[2], (float)a.i32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONST_FUNC simd128_ref_t simd128_f32_ftoi_trunc(simd128_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const 
simd128_i32_langext_t conv = __builtin_convertvector(a, simd128_i32_langext_t); + const simd128_ref_t result = bitCast(conv); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_i32_ref_t result = { { (int32_t)a.f32[0], (int32_t)a.f32[1], (int32_t)a.f32[2], (int32_t)a.f32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_ftoi_round(simd128_ref_t _a) + { + const simd128_f32_ref_t a = bitCast(_a); + const simd128_i32_ref_t result = { { (int32_t)round(a.f32[0]), (int32_t)round(a.f32[1]), (int32_t)round(a.f32[2]), (int32_t)round(a.f32[3]) } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const simd128_f32_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + const simd128_f32_ref_t result = { { a.f32[0] + b.f32[0], a.f32[1] + b.f32[1], a.f32[2] + b.f32[2], a.f32[3] + b.f32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const simd128_f32_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + const simd128_f32_ref_t result = { { a.f32[0] - b.f32[0], a.f32[1] - b.f32[1], a.f32[2] - b.f32[2], a.f32[3] - b.f32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_mul(simd128_ref_t _a, 
simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const simd128_f32_langext_t prod = a * b; + const simd128_ref_t result = bitCast(prod); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + const simd128_f32_ref_t result = { { a.f32[0] * b.f32[0], a.f32[1] * b.f32[1], a.f32[2] * b.f32[2], a.f32[3] * b.f32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_div(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const simd128_f32_langext_t quot = a / b; + const simd128_ref_t result = bitCast(quot); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + const simd128_f32_ref_t result = { { a.f32[0] / b.f32[0], a.f32[1] / b.f32[1], a.f32[2] / b.f32[2], a.f32[3] / b.f32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_rcp_est(simd128_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t one = {1.0f, 1.0f, 1.0f, 1.0f}; + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t quot = one / a; + const simd128_ref_t result = bitCast(quot); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t result = { { 1.0f / a.f32[0], 1.0f / a.f32[1], 1.0f / a.f32[2], 1.0f / a.f32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_sqrt(simd128_ref_t _a) + { + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t result = { { + sqrt(a.f32[0]), + sqrt(a.f32[1]), + sqrt(a.f32[2]), + sqrt(a.f32[3]), + } }; + return bitCast(result); + } + + template<> + 
BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_rsqrt_est(simd128_ref_t _a) + { + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t result = { { + 1.0f / sqrt(a.f32[0]), + 1.0f / sqrt(a.f32[1]), + 1.0f / sqrt(a.f32[2]), + 1.0f / sqrt(a.f32[3]), + } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_min(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + const simd128_f32_ref_t result = { { a.f32[0] < b.f32[0] ? a.f32[0] : b.f32[0], a.f32[1] < b.f32[1] ? a.f32[1] : b.f32[1], a.f32[2] < b.f32[2] ? a.f32[2] : b.f32[2], a.f32[3] < b.f32[3] ? a.f32[3] : b.f32[3] } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_max(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + const simd128_f32_ref_t result = { { a.f32[0] > b.f32[0] ? a.f32[0] : b.f32[0], a.f32[1] > b.f32[1] ? a.f32[1] : b.f32[1], a.f32[2] > b.f32[2] ? a.f32[2] : b.f32[2], a.f32[3] > b.f32[3] ? 
a.f32[3] : b.f32[3] } }; + return bitCast(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_round(simd128_ref_t _a) + { + return simd_f32_round_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_ceil(simd128_ref_t _a) + { + return simd_f32_ceil_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_floor(simd128_ref_t _a) + { + return simd_f32_floor_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_cmpeq(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const auto mask = a == b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); simd128_ref_t result; - result.fxyzw[0] = _a.fxyzw[0] / _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] / _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] / _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] / _b.fxyzw[3]; + result.u32[0] = a.f32[0] == b.f32[0] ? 0xffffffff : 0; + result.u32[1] = a.f32[1] == b.f32[1] ? 0xffffffff : 0; + result.u32[2] = a.f32[2] == b.f32[2] ? 0xffffffff : 0; + result.u32[3] = a.f32[3] == b.f32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_cmplt(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const auto mask = a < b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.f32[0] < b.f32[0] ? 0xffffffff : 0; + result.u32[1] = a.f32[1] < b.f32[1] ? 0xffffffff : 0; + result.u32[2] = a.f32[2] < b.f32[2] ? 0xffffffff : 0; + result.u32[3] = a.f32[3] < b.f32[3] ? 
0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_cmple(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const auto mask = a <= b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.f32[0] <= b.f32[0] ? 0xffffffff : 0; + result.u32[1] = a.f32[1] <= b.f32[1] ? 0xffffffff : 0; + result.u32[2] = a.f32[2] <= b.f32[2] ? 0xffffffff : 0; + result.u32[3] = a.f32[3] <= b.f32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_cmpgt(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const auto mask = a > b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.f32[0] > b.f32[0] ? 0xffffffff : 0; + result.u32[1] = a.f32[1] > b.f32[1] ? 0xffffffff : 0; + result.u32[2] = a.f32[2] > b.f32[2] ? 0xffffffff : 0; + result.u32[3] = a.f32[3] > b.f32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f32_cmpge(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f32_langext_t a = bitCast(_a); + const simd128_f32_langext_t b = bitCast(_b); + const auto mask = a >= b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f32_ref_t a = bitCast(_a); + const simd128_f32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.f32[0] >= b.f32[0] ? 
0xffffffff : 0; + result.u32[1] = a.f32[1] >= b.f32[1] ? 0xffffffff : 0; + result.u32[2] = a.f32[2] >= b.f32[2] ? 0xffffffff : 0; + result.u32[3] = a.f32[3] >= b.f32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_i32_langext_t b = bitCast(_b); + const simd128_i32_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + const simd128_i32_ref_t result = { { a.i32[0] + b.i32[0], a.i32[1] + b.i32[1], a.i32[2] + b.i32[2], a.i32[3] + b.i32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_i32_langext_t b = bitCast(_b); + const simd128_i32_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + const simd128_i32_ref_t result = { { a.i32[0] - b.i32[0], a.i32[1] - b.i32[1], a.i32[2] - b.i32[2], a.i32[3] - b.i32[3] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_i32_neg(simd128_ref_t _a) + { + return simd_i32_neg_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_i32_abs(simd128_ref_t _a) + { + return simd_i32_abs_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_min(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + const simd128_i32_ref_t result = { { + a.i32[0] < b.i32[0] ? 
a.i32[0] : b.i32[0], + a.i32[1] < b.i32[1] ? a.i32[1] : b.i32[1], + a.i32[2] < b.i32[2] ? a.i32[2] : b.i32[2], + a.i32[3] < b.i32[3] ? a.i32[3] : b.i32[3], + } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_max(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + const simd128_i32_ref_t result = { { + a.i32[0] > b.i32[0] ? a.i32[0] : b.i32[0], + a.i32[1] > b.i32[1] ? a.i32[1] : b.i32[1], + a.i32[2] > b.i32[2] ? a.i32[2] : b.i32[2], + a.i32[3] > b.i32[3] ? a.i32[3] : b.i32[3], + } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_cmpeq(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_i32_langext_t b = bitCast(_b); + const auto mask = a == b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.i32[0] == b.i32[0] ? 0xffffffff : 0; + result.u32[1] = a.i32[1] == b.i32[1] ? 0xffffffff : 0; + result.u32[2] = a.i32[2] == b.i32[2] ? 0xffffffff : 0; + result.u32[3] = a.i32[3] == b.i32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_cmplt(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_i32_langext_t b = bitCast(_b); + const auto mask = a < b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.i32[0] < b.i32[0] ? 0xffffffff : 0; + result.u32[1] = a.i32[1] < b.i32[1] ? 0xffffffff : 0; + result.u32[2] = a.i32[2] < b.i32[2] ? 
0xffffffff : 0; + result.u32[3] = a.i32[3] < b.i32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_cmpgt(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_i32_langext_t b = bitCast(_b); + const auto mask = a > b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t b = bitCast(_b); + simd128_ref_t result; + result.u32[0] = a.i32[0] > b.i32[0] ? 0xffffffff : 0; + result.u32[1] = a.i32[1] > b.i32[1] ? 0xffffffff : 0; + result.u32[2] = a.i32[2] > b.i32[2] ? 0xffffffff : 0; + result.u32[3] = a.i32[3] > b.i32[3] ? 0xffffffff : 0; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i32_clamp(simd128_ref_t _a, simd128_ref_t _min, simd128_ref_t _max) + { + const simd128_ref_t hi = simd128_i32_max(_a, _min); + const simd128_ref_t result = simd128_i32_min(hi, _max); return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_rcp_est(simd128_ref_t _a) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + simd128_ref_t result; + result.u32[0] = _a.u32[0] + _b.u32[0]; + result.u32[1] = _a.u32[1] + _b.u32[1]; + result.u32[2] = _a.u32[2] + _b.u32[2]; + result.u32[3] = _a.u32[3] + _b.u32[3]; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t diff = a - 
b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + simd128_ref_t result; + result.u32[0] = _a.u32[0] - _b.u32[0]; + result.u32[1] = _a.u32[1] - _b.u32[1]; + result.u32[2] = _a.u32[2] - _b.u32[2]; + result.u32[3] = _a.u32[3] - _b.u32[3]; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_mul(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t prod = a * b; + const simd128_ref_t result = bitCast(prod); + return result; +#else + simd128_ref_t result; + result.u32[0] = _a.u32[0] * _b.u32[0]; + result.u32[1] = _a.u32[1] * _b.u32[1]; + result.u32[2] = _a.u32[2] * _b.u32[2]; + result.u32[3] = _a.u32[3] * _b.u32[3]; + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_min(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.fxyzw[0] = 1.0f / _a.fxyzw[0]; - result.fxyzw[1] = 1.0f / _a.fxyzw[1]; - result.fxyzw[2] = 1.0f / _a.fxyzw[2]; - result.fxyzw[3] = 1.0f / _a.fxyzw[3]; + result.u32[0] = _a.u32[0] < _b.u32[0] ? _a.u32[0] : _b.u32[0]; + result.u32[1] = _a.u32[1] < _b.u32[1] ? _a.u32[1] : _b.u32[1]; + result.u32[2] = _a.u32[2] < _b.u32[2] ? _a.u32[2] : _b.u32[2]; + result.u32[3] = _a.u32[3] < _b.u32[3] ? _a.u32[3] : _b.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_sqrt(simd128_ref_t _a) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_max(simd128_ref_t _a, simd128_ref_t _b) { simd128_ref_t result; - result.fxyzw[0] = sqrt(_a.fxyzw[0]); - result.fxyzw[1] = sqrt(_a.fxyzw[1]); - result.fxyzw[2] = sqrt(_a.fxyzw[2]); - result.fxyzw[3] = sqrt(_a.fxyzw[3]); + result.u32[0] = _a.u32[0] > _b.u32[0] ? _a.u32[0] : _b.u32[0]; + result.u32[1] = _a.u32[1] > _b.u32[1] ? _a.u32[1] : _b.u32[1]; + result.u32[2] = _a.u32[2] > _b.u32[2] ? 
_a.u32[2] : _b.u32[2]; + result.u32[3] = _a.u32[3] > _b.u32[3] ? _a.u32[3] : _b.u32[3]; return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_est(simd128_ref_t _a) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_clamp(simd128_ref_t _a, simd128_ref_t _min, simd128_ref_t _max) { - simd128_ref_t result; - result.fxyzw[0] = rsqrt(_a.fxyzw[0]); - result.fxyzw[1] = rsqrt(_a.fxyzw[1]); - result.fxyzw[2] = rsqrt(_a.fxyzw[2]); - result.fxyzw[3] = rsqrt(_a.fxyzw[3]); + const simd128_ref_t hi = simd128_u32_max(_a, _min); + const simd128_ref_t result = simd128_u32_min(hi, _max); return result; } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpeq(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_cmpeq(simd128_ref_t _a, simd128_ref_t _b) { - simd128_ref_t result; - result.ixyzw[0] = _a.fxyzw[0] == _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] == _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] == _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] == _b.fxyzw[3] ? 0xffffffff : 0x0; +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const auto mask = a == b; + const simd128_ref_t result = bitCast(mask); return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpneq(simd128_ref_t _a, simd128_ref_t _b) - { +#else simd128_ref_t result; - result.ixyzw[0] = _a.fxyzw[0] != _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] != _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] != _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] != _b.fxyzw[3] ? 0xffffffff : 0x0; + result.u32[0] = _a.u32[0] == _b.u32[0] ? 0xffffffff : 0; + result.u32[1] = _a.u32[1] == _b.u32[1] ? 0xffffffff : 0; + result.u32[2] = _a.u32[2] == _b.u32[2] ? 0xffffffff : 0; + result.u32[3] = _a.u32[3] == _b.u32[3] ? 
0xffffffff : 0; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmplt(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_cmplt(simd128_ref_t _a, simd128_ref_t _b) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const auto mask = a < b; + const simd128_ref_t result = bitCast(mask); + return result; +#else simd128_ref_t result; - result.ixyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 0xffffffff : 0x0; + result.u32[0] = _a.u32[0] < _b.u32[0] ? 0xffffffff : 0; + result.u32[1] = _a.u32[1] < _b.u32[1] ? 0xffffffff : 0; + result.u32[2] = _a.u32[2] < _b.u32[2] ? 0xffffffff : 0; + result.u32[3] = _a.u32[3] < _b.u32[3] ? 0xffffffff : 0; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmple(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u32_cmpgt(simd128_ref_t _a, simd128_ref_t _b) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const auto mask = a > b; + const simd128_ref_t result = bitCast(mask); + return result; +#else simd128_ref_t result; - result.ixyzw[0] = _a.fxyzw[0] <= _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] <= _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] <= _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] <= _b.fxyzw[3] ? 0xffffffff : 0x0; + result.u32[0] = _a.u32[0] > _b.u32[0] ? 0xffffffff : 0; + result.u32[1] = _a.u32[1] > _b.u32[1] ? 0xffffffff : 0; + result.u32[2] = _a.u32[2] > _b.u32[2] ? 0xffffffff : 0; + result.u32[3] = _a.u32[3] > _b.u32[3] ? 
0xffffffff : 0; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpgt(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i16_add(simd128_ref_t _a, simd128_ref_t _b) { +#if BX_SIMD_LANGEXT + const simd128_i16_langext_t a = bitCast(_a); + const simd128_i16_langext_t b = bitCast(_b); + const simd128_i16_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_i16_ref_t a = bitCast(_a); + const simd128_i16_ref_t b = bitCast(_b); + simd128_i16_ref_t result; + for (int ii = 0; ii < 8; ++ii) { result.i16[ii] = a.i16[ii] + b.i16[ii]; } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i16_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i16_langext_t a = bitCast(_a); + const simd128_i16_langext_t b = bitCast(_b); + const simd128_i16_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_i16_ref_t a = bitCast(_a); + const simd128_i16_ref_t b = bitCast(_b); + simd128_i16_ref_t result; + for (int ii = 0; ii < 8; ++ii) { result.i16[ii] = a.i16[ii] - b.i16[ii]; } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i16_mullo(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i16_langext_t a = bitCast(_a); + const simd128_i16_langext_t b = bitCast(_b); + const simd128_i16_langext_t prod = a * b; + const simd128_ref_t result = bitCast(prod); + return result; +#else + const simd128_i16_ref_t a = bitCast(_a); + const simd128_i16_ref_t b = bitCast(_b); + simd128_i16_ref_t result; + for (int ii = 0; ii < 8; ++ii) { result.i16[ii] = (int16_t)(a.i16[ii] * b.i16[ii]); } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t 
simd128_i16_cmpeq(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i16_langext_t a = bitCast(_a); + const simd128_i16_langext_t b = bitCast(_b); + const auto mask = a == b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_i16_ref_t a = bitCast(_a); + const simd128_i16_ref_t b = bitCast(_b); + simd128_u16_ref_t result; + for (int ii = 0; ii < 8; ++ii) { result.u16[ii] = a.i16[ii] == b.i16[ii] ? uint16_t(0xffff) : uint16_t(0); } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x16_sll(simd128_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd128_u16_langext_t a = bitCast(_a); + const simd128_u16_langext_t shifted = a << _count; + const simd128_ref_t result = bitCast(shifted); + return result; +#else + const simd128_u16_ref_t a = bitCast(_a); + simd128_u16_ref_t result; + for (int ii = 0; ii < 8; ++ii) { result.u16[ii] = uint16_t(a.u16[ii] << _count); } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x16_srl(simd128_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd128_u16_langext_t a = bitCast(_a); + const simd128_u16_langext_t shifted = a >> _count; + const simd128_ref_t result = bitCast(shifted); + return result; +#else + const simd128_u16_ref_t a = bitCast(_a); + simd128_u16_ref_t result; + for (int ii = 0; ii < 8; ++ii) { result.u16[ii] = uint16_t(a.u16[ii] >> _count); } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_splat(int16_t _a) + { + const simd128_i16_ref_t result = { { _a, _a, _a, _a, _a, _a, _a, _a } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_splat(uint16_t _a) + { + const simd128_u16_ref_t result = { { _a, _a, _a, _a, _a, _a, _a, _a } }; + return bitCast(result); + } + + template<> + inline 
BX_CONSTEXPR_FUNC simd128_ref_t simd128_i8_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i8_langext_t a = bitCast(_a); + const simd128_i8_langext_t b = bitCast(_b); + const simd128_i8_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_i8_ref_t a = bitCast(_a); + const simd128_i8_ref_t b = bitCast(_b); + simd128_i8_ref_t result; + for (int ii = 0; ii < 16; ++ii) { result.i8[ii] = a.i8[ii] + b.i8[ii]; } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i8_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i8_langext_t a = bitCast(_a); + const simd128_i8_langext_t b = bitCast(_b); + const simd128_i8_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_i8_ref_t a = bitCast(_a); + const simd128_i8_ref_t b = bitCast(_b); + simd128_i8_ref_t result; + for (int ii = 0; ii < 16; ++ii) { result.i8[ii] = a.i8[ii] - b.i8[ii]; } + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u8_satadd(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_u8_ref_t a = bitCast(_a); + const simd128_u8_ref_t b = bitCast(_b); + simd128_u8_ref_t result; + for (int ii = 0; ii < 16; ++ii) + { + uint16_t sum = (uint16_t)a.u8[ii] + (uint16_t)b.u8[ii]; + result.u8[ii] = sum > 255 ? 255 : (uint8_t)sum; + } + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u8_satsub(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_u8_ref_t a = bitCast(_a); + const simd128_u8_ref_t b = bitCast(_b); + simd128_u8_ref_t result; + for (int ii = 0; ii < 16; ++ii) + { + result.u8[ii] = a.u8[ii] > b.u8[ii] ? 
(uint8_t)(a.u8[ii] - b.u8[ii]) : 0; + } + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u16_satadd(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_u16_ref_t a = bitCast(_a); + const simd128_u16_ref_t b = bitCast(_b); + simd128_u16_ref_t result; + for (int ii = 0; ii < 8; ++ii) + { + uint32_t sum = (uint32_t)a.u16[ii] + (uint32_t)b.u16[ii]; + result.u16[ii] = sum > 65535 ? 65535 : (uint16_t)sum; + } + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u16_satsub(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_u16_ref_t a = bitCast(_a); + const simd128_u16_ref_t b = bitCast(_b); + simd128_u16_ref_t result; + for (int ii = 0; ii < 8; ++ii) + { + result.u16[ii] = a.u16[ii] > b.u16[ii] ? (uint16_t)(a.u16[ii] - b.u16[ii]) : 0; + } + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_and(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t masked = a & b; + const simd128_ref_t result = bitCast(masked); + return result; +#else simd128_ref_t result; - result.ixyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? 
0xffffffff : 0x0; + result.u32[0] = _a.u32[0] & _b.u32[0]; + result.u32[1] = _a.u32[1] & _b.u32[1]; + result.u32[2] = _a.u32[2] & _b.u32[2]; + result.u32[3] = _a.u32[3] & _b.u32[3]; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cmpge(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_andc(simd128_ref_t _a, simd128_ref_t _b) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t notb = ~b; + const simd128_u32_langext_t masked = a & notb; + const simd128_ref_t result = bitCast(masked); + return result; +#else simd128_ref_t result; - result.ixyzw[0] = _a.fxyzw[0] >= _b.fxyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.fxyzw[1] >= _b.fxyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.fxyzw[2] >= _b.fxyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.fxyzw[3] >= _b.fxyzw[3] ? 0xffffffff : 0x0; + result.u32[0] = _a.u32[0] & ~_b.u32[0]; + result.u32[1] = _a.u32[1] & ~_b.u32[1]; + result.u32[2] = _a.u32[2] & ~_b.u32[2]; + result.u32[3] = _a.u32[3] & ~_b.u32[3]; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_min(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_or(simd128_ref_t _a, simd128_ref_t _b) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t masked = a | b; + const simd128_ref_t result = bitCast(masked); + return result; +#else simd128_ref_t result; - result.fxyzw[0] = _a.fxyzw[0] < _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] < _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] < _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] < _b.fxyzw[3] ? 
_a.fxyzw[3] : _b.fxyzw[3]; + result.u32[0] = _a.u32[0] | _b.u32[0]; + result.u32[1] = _a.u32[1] | _b.u32[1]; + result.u32[2] = _a.u32[2] | _b.u32[2]; + result.u32[3] = _a.u32[3] | _b.u32[3]; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_max(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_xor(simd128_ref_t _a, simd128_ref_t _b) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t b = bitCast(_b); + const simd128_u32_langext_t masked = a ^ b; + const simd128_ref_t result = bitCast(masked); + return result; +#else simd128_ref_t result; - result.fxyzw[0] = _a.fxyzw[0] > _b.fxyzw[0] ? _a.fxyzw[0] : _b.fxyzw[0]; - result.fxyzw[1] = _a.fxyzw[1] > _b.fxyzw[1] ? _a.fxyzw[1] : _b.fxyzw[1]; - result.fxyzw[2] = _a.fxyzw[2] > _b.fxyzw[2] ? _a.fxyzw[2] : _b.fxyzw[2]; - result.fxyzw[3] = _a.fxyzw[3] > _b.fxyzw[3] ? _a.fxyzw[3] : _b.fxyzw[3]; + result.u32[0] = _a.u32[0] ^ _b.u32[0]; + result.u32[1] = _a.u32[1] ^ _b.u32[1]; + result.u32[2] = _a.u32[2] ^ _b.u32[2]; + result.u32[3] = _a.u32[3] ^ _b.u32[3]; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_and(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_sll(simd128_ref_t _a, int _count) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t shifted = a << _count; + const simd128_ref_t result = bitCast(shifted); + return result; +#else simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0] & _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] & _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] & _b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] & _b.uxyzw[3]; + result.u32[0] = _a.u32[0] << _count; + result.u32[1] = _a.u32[1] << _count; + result.u32[2] = _a.u32[2] << _count; + result.u32[3] = _a.u32[3] << _count; return result; +#endif // BX_SIMD_LANGEXT } template<> - 
BX_SIMD_FORCE_INLINE simd128_ref_t simd_andc(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_srl(simd128_ref_t _a, int _count) { +#if BX_SIMD_LANGEXT + const simd128_u32_langext_t a = bitCast(_a); + const simd128_u32_langext_t shifted = a >> _count; + const simd128_ref_t result = bitCast(shifted); + return result; +#else simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0] & ~_b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] & ~_b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] & ~_b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] & ~_b.uxyzw[3]; + result.u32[0] = _a.u32[0] >> _count; + result.u32[1] = _a.u32[1] >> _count; + result.u32[2] = _a.u32[2] >> _count; + result.u32[3] = _a.u32[3] >> _count; return result; +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_or(simd128_ref_t _a, simd128_ref_t _b) + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_sra(simd128_ref_t _a, int _count) { - simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0] | _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] | _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] | _b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] | _b.uxyzw[3]; +#if BX_SIMD_LANGEXT + const simd128_i32_langext_t a = bitCast(_a); + const simd128_i32_langext_t shifted = a >> _count; + const simd128_ref_t result = bitCast(shifted); return result; +#else + const simd128_i32_ref_t a = bitCast(_a); + const simd128_i32_ref_t result = { { a.i32[0] >> _count, a.i32[1] >> _count, a.i32[2] >> _count, a.i32[3] >> _count } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_xor(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_x32_sll(simd128_ref_t _a, simd128_ref_t _count) { - simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0] ^ _b.uxyzw[0]; - result.uxyzw[1] = _a.uxyzw[1] ^ _b.uxyzw[1]; - result.uxyzw[2] = _a.uxyzw[2] ^ _b.uxyzw[2]; - result.uxyzw[3] = _a.uxyzw[3] ^ _b.uxyzw[3]; - return 
result; + return simd_x32_sll_ni(_a, _count); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_sll(simd128_ref_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_x32_srl(simd128_ref_t _a, simd128_ref_t _count) { - simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0] << _count; - result.uxyzw[1] = _a.uxyzw[1] << _count; - result.uxyzw[2] = _a.uxyzw[2] << _count; - result.uxyzw[3] = _a.uxyzw[3] << _count; - return result; + return simd_x32_srl_ni(_a, _count); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_srl(simd128_ref_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_x32_sra(simd128_ref_t _a, simd128_ref_t _count) { - simd128_ref_t result; - result.uxyzw[0] = _a.uxyzw[0] >> _count; - result.uxyzw[1] = _a.uxyzw[1] >> _count; - result.uxyzw[2] = _a.uxyzw[2] >> _count; - result.uxyzw[3] = _a.uxyzw[3] >> _count; - return result; + return simd_x32_sra_ni(_a, _count); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_sra(simd128_ref_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_x8_shuffle(simd128_ref_t _a, simd128_ref_t _indices) { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] >> _count; - result.ixyzw[1] = _a.ixyzw[1] >> _count; - result.ixyzw[2] = _a.ixyzw[2] >> _count; - result.ixyzw[3] = _a.ixyzw[3] >> _count; - return result; + return simd_x8_shuffle_ni(_a, _indices); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpeq(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_x8_shuffle(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _indices) { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] == _b.ixyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.ixyzw[1] == _b.ixyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.ixyzw[2] == _b.ixyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.ixyzw[3] == _b.ixyzw[3] ? 
0xffffffff : 0x0; - return result; + return simd_x8_shuffle_ni(_a, _b, _indices); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmplt(simd128_ref_t _a, simd128_ref_t _b) - { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? 0xffffffff : 0x0; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_icmpgt(simd128_ref_t _a, simd128_ref_t _b) - { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? 0xffffffff : 0x0; - result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? 0xffffffff : 0x0; - result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? 0xffffffff : 0x0; - result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? 0xffffffff : 0x0; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_imin(simd128_ref_t _a, simd128_ref_t _b) - { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] < _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] < _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] < _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] < _b.ixyzw[3] ? _a.ixyzw[3] : _b.ixyzw[3]; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_imax(simd128_ref_t _a, simd128_ref_t _b) - { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] > _b.ixyzw[0] ? _a.ixyzw[0] : _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] > _b.ixyzw[1] ? _a.ixyzw[1] : _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] > _b.ixyzw[2] ? _a.ixyzw[2] : _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] > _b.ixyzw[3] ? 
_a.ixyzw[3] : _b.ixyzw[3]; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_iadd(simd128_ref_t _a, simd128_ref_t _b) - { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] + _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] + _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] + _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] + _b.ixyzw[3]; - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_isub(simd128_ref_t _a, simd128_ref_t _b) - { - simd128_ref_t result; - result.ixyzw[0] = _a.ixyzw[0] - _b.ixyzw[0]; - result.ixyzw[1] = _a.ixyzw[1] - _b.ixyzw[1]; - result.ixyzw[2] = _a.ixyzw[2] - _b.ixyzw[2]; - result.ixyzw[3] = _a.ixyzw[3] - _b.ixyzw[3]; - return result; - } - - BX_SIMD_FORCE_INLINE simd128_t simd_zero() - { - return simd_zero(); - } - - BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr) - { - return simd_ld(_ptr); - } - - BX_SIMD_FORCE_INLINE simd128_t simd_ld(float _x, float _y, float _z, float _w) - { - return simd_ld(_x, _y, _z, _w); - } - - BX_SIMD_FORCE_INLINE simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) - { - return simd_ild(_x, _y, _z, _w); - } - - BX_SIMD_FORCE_INLINE simd128_t simd_splat(const void* _ptr) - { - return simd_splat(_ptr); - } - - BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a) - { - return simd_splat(_a); - } - - BX_SIMD_FORCE_INLINE simd128_t simd_isplat(uint32_t _a) - { - return simd_isplat(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_xAzC(simd128_ref_t _a, simd128_ref_t _b) - { - return simd_shuf_xAzC_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_shuf_yBwD(simd128_ref_t _a, simd128_ref_t _b) - { - return simd_shuf_yBwD_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_rcp(simd128_ref_t _a) - { - return simd_rcp_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_orx(simd128_ref_t _a) - { - return simd_orx_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE 
simd128_ref_t simd_orc(simd128_ref_t _a, simd128_ref_t _b) - { - return simd_orc_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_neg(simd128_ref_t _a) - { - return simd_neg_ni(_a); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_madd(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) - { - return simd_madd_ni(_a, _b, _c); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_nmsub(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) - { - return simd_nmsub_ni(_a, _b, _c); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_div_nr(simd128_ref_t _a, simd128_ref_t _b) - { - return simd_div_nr_ni(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_selb(simd128_ref_t _mask, simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_selb(simd128_ref_t _mask, simd128_ref_t _a, simd128_ref_t _b) { return simd_selb_ni(_mask, _a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_sels(simd128_ref_t _test, simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_sels(simd128_ref_t _test, simd128_ref_t _a, simd128_ref_t _b) { return simd_sels_ni(_test, _a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_not(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_not(simd128_ref_t _a) { return simd_not_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_abs(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_orc(simd128_ref_t _a, simd128_ref_t _b) { - return simd_abs_ni(_a); + return simd_orc_ni(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_clamp(simd128_ref_t _a, simd128_ref_t _min, simd128_ref_t _max) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_orx(simd128_ref_t _a) { - return simd_clamp_ni(_a, _min, _max); + return simd128_orx_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_lerp(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _s) + BX_SIMD_FORCE_INLINE simd128_ref_t 
simd128_f32_madd(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) { - return simd_lerp_ni(_a, _b, _s); + return simd_f32_madd_ni(_a, _b, _c); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_msub(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) { - return simd_rsqrt_ni(_a); + return simd_f32_msub_ni(_a, _b, _c); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_nr(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_nmsub(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) { - return simd_rsqrt_nr_ni(_a); + return simd_f32_nmsub_ni(_a, _b, _c); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_rsqrt_carmack(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE int simd128_x32_signbitsmask(simd128_ref_t _a) { - return simd_rsqrt_carmack_ni(_a); + return simd_x32_signbitsmask_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_sqrt_nr(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE int simd128_x8_signbitsmask(simd128_ref_t _a) { - return simd_sqrt_nr_ni(_a); + return simd_x8_signbitsmask_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_log2(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_neg(simd128_ref_t _a) { - return simd_log2_ni(_a); + return simd_f32_neg_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_exp2(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_abs(simd128_ref_t _a) { - return simd_exp2_ni(_a); + return simd_f32_abs_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_pow(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_clamp(simd128_ref_t _a, simd128_ref_t _min, simd128_ref_t _max) { - return simd_pow_ni(_a, _b); + return simd_f32_clamp_ni(_a, _min, _max); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_cross3(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_lerp(simd128_ref_t _a, 
simd128_ref_t _b, simd128_ref_t _s) { - return simd_cross3_ni(_a, _b); + return simd_f32_lerp_ni(_a, _b, _s); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_normalize3(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_rcp(simd128_ref_t _a) { - return simd_normalize3_ni(_a); + return simd_f32_rcp_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_dot3(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_sqrt_nr(simd128_ref_t _a) { - return simd_dot3_ni(_a, _b); + return simd_f32_sqrt_nr_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_dot(simd128_ref_t _a, simd128_ref_t _b) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_rsqrt(simd128_ref_t _a) { - return simd_dot_ni(_a, _b); + return simd_f32_rsqrt_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_ceil(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_rsqrt_nr(simd128_ref_t _a) { - return simd_ceil_ni(_a); + return simd_f32_rsqrt_nr_ni(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_ref_t simd_floor(simd128_ref_t _a) + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_rsqrt_carmack(simd128_ref_t _a) { - return simd_floor_ni(_a); + return simd_f32_rsqrt_carmack_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_div_nr(simd128_ref_t _a, simd128_ref_t _b) + { + return simd_f32_div_nr_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_dot3(simd128_ref_t _a, simd128_ref_t _b) + { + return simd128_f32_dot3_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_dot(simd128_ref_t _a, simd128_ref_t _b) + { + return simd128_f32_dot_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_cross3(simd128_ref_t _a, simd128_ref_t _b) + { + return simd128_f32_cross3_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_normalize3(simd128_ref_t _a) + { + return simd128_f32_normalize3_ni(_a); + } 
+ + template<> + BX_SIMD_FORCE_INLINE simd128_ref_t simd128_f32_cmpneq(simd128_ref_t _a, simd128_ref_t _b) + { + return simd_f32_cmpneq_ni(_a, _b); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_xAzC(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_ref_t xAyB = simd128_x32_shuf_xAyB(_a, _b); + const simd128_ref_t zCwD = simd128_x32_shuf_zCwD(_a, _b); + const simd128_ref_t result = simd128_x32_shuf_xyAB(xAyB, zCwD); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_x32_shuf_yBwD(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_ref_t xAyB = simd128_x32_shuf_xAyB(_a, _b); + const simd128_ref_t zCwD = simd128_x32_shuf_zCwD(_a, _b); + const simd128_ref_t result = simd128_x32_shuf_zwCD(xAyB, zCwD); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t result = { { a.f64[0] + b.f64[0], a.f64[1] + b.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t result = { { a.f64[0] - b.f64[0], a.f64[1] - b.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t 
simd128_f64_mul(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t prod = a * b; + const simd128_ref_t result = bitCast(prod); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t result = { { a.f64[0] * b.f64[0], a.f64[1] * b.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_div(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t quot = a / b; + const simd128_ref_t result = bitCast(quot); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t result = { { a.f64[0] / b.f64[0], a.f64[1] / b.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_min(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t result = { { a.f64[0] < b.f64[0] ? a.f64[0] : b.f64[0], a.f64[1] < b.f64[1] ? a.f64[1] : b.f64[1] } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_max(simd128_ref_t _a, simd128_ref_t _b) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t result = { { a.f64[0] > b.f64[0] ? a.f64[0] : b.f64[0], a.f64[1] > b.f64[1] ? 
a.f64[1] : b.f64[1] } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_madd(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t c = bitCast(_c); + const simd128_f64_langext_t prod = a * b; + const simd128_f64_langext_t sum = prod + c; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t c = bitCast(_c); + const simd128_f64_ref_t result = { { a.f64[0] * b.f64[0] + c.f64[0], a.f64[1] * b.f64[1] + c.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_nmsub(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t c = bitCast(_c); + const simd128_f64_langext_t prod = a * b; + const simd128_f64_langext_t diff = c - prod; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t c = bitCast(_c); + const simd128_f64_ref_t result = { { c.f64[0] - a.f64[0] * b.f64[0], c.f64[1] - a.f64[1] * b.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_neg(simd128_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t neg = -a; + const simd128_ref_t result = bitCast(neg); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { -a.f64[0], -a.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> 
+ inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_abs(simd128_ref_t _a) + { + const simd128_ref_t a_neg = simd128_f64_neg(_a); + const simd128_ref_t result = simd128_f64_max(a_neg, _a); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_clamp(simd128_ref_t _a, simd128_ref_t _min, simd128_ref_t _max) + { + const simd128_ref_t hi = simd128_f64_max(_a, _min); + const simd128_ref_t result = simd128_f64_min(hi, _max); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_lerp(simd128_ref_t _a, simd128_ref_t _b, simd128_ref_t _s) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const simd128_f64_langext_t s = bitCast(_s); + const simd128_f64_langext_t diff = b - a; + const simd128_f64_langext_t scaled = diff * s; + const simd128_f64_langext_t sum = a + scaled; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_f64_ref_t s = bitCast(_s); + const simd128_f64_ref_t result = { { a.f64[0] + (b.f64[0] - a.f64[0]) * s.f64[0], a.f64[1] + (b.f64[1] - a.f64[1]) * s.f64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_rcp(simd128_ref_t _a) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { 1.0 / a.f64[0], 1.0 / a.f64[1] } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_sqrt(simd128_ref_t _a) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { simd_sqrt(a.f64[0]), simd_sqrt(a.f64[1]) } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_rsqrt(simd128_ref_t _a) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { 1.0 / 
simd_sqrt(a.f64[0]), 1.0 / simd_sqrt(a.f64[1]) } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_round(simd128_ref_t _a) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { simd_round(a.f64[0]), simd_round(a.f64[1]) } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_ceil(simd128_ref_t _a) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { simd_ceil(a.f64[0]), simd_ceil(a.f64[1]) } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_floor(simd128_ref_t _a) + { + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t result = { { simd_floor(a.f64[0]), simd_floor(a.f64[1]) } }; + return bitCast(result); + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_cmpeq(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const auto mask = a == b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.f64[0] == b.f64[0] ? UINT64_MAX : 0, a.f64[1] == b.f64[1] ? UINT64_MAX : 0 } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_cmpneq(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const auto mask = a != b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.f64[0] != b.f64[0] ? UINT64_MAX : 0, a.f64[1] != b.f64[1] ? 
UINT64_MAX : 0 } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_cmplt(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const auto mask = a < b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.f64[0] < b.f64[0] ? UINT64_MAX : 0, a.f64[1] < b.f64[1] ? UINT64_MAX : 0 } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_cmple(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const auto mask = a <= b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.f64[0] <= b.f64[0] ? UINT64_MAX : 0, a.f64[1] <= b.f64[1] ? UINT64_MAX : 0 } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_cmpgt(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const auto mask = a > b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.f64[0] > b.f64[0] ? UINT64_MAX : 0, a.f64[1] > b.f64[1] ? 
UINT64_MAX : 0 } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_f64_cmpge(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_f64_langext_t a = bitCast(_a); + const simd128_f64_langext_t b = bitCast(_b); + const auto mask = a >= b; + const simd128_ref_t result = bitCast(mask); + return result; +#else + const simd128_f64_ref_t a = bitCast(_a); + const simd128_f64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.f64[0] >= b.f64[0] ? UINT64_MAX : 0, a.f64[1] >= b.f64[1] ? UINT64_MAX : 0 } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i64_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i64_langext_t a = bitCast(_a); + const simd128_i64_langext_t b = bitCast(_b); + const simd128_i64_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_i64_ref_t a = bitCast(_a); + const simd128_i64_ref_t b = bitCast(_b); + const simd128_i64_ref_t result = { { a.i64[0] + b.i64[0], a.i64[1] + b.i64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_i64_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_i64_langext_t a = bitCast(_a); + const simd128_i64_langext_t b = bitCast(_b); + const simd128_i64_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_i64_ref_t a = bitCast(_a); + const simd128_i64_ref_t b = bitCast(_b); + const simd128_i64_ref_t result = { { a.i64[0] - b.i64[0], a.i64[1] - b.i64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u64_add(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_u64_langext_t a = bitCast(_a); + const 
simd128_u64_langext_t b = bitCast(_b); + const simd128_u64_langext_t sum = a + b; + const simd128_ref_t result = bitCast(sum); + return result; +#else + const simd128_u64_ref_t a = bitCast(_a); + const simd128_u64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.u64[0] + b.u64[0], a.u64[1] + b.u64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd128_ref_t simd128_u64_sub(simd128_ref_t _a, simd128_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd128_u64_langext_t a = bitCast(_a); + const simd128_u64_langext_t b = bitCast(_b); + const simd128_u64_langext_t diff = a - b; + const simd128_ref_t result = bitCast(diff); + return result; +#else + const simd128_u64_ref_t a = bitCast(_a); + const simd128_u64_ref_t b = bitCast(_b); + const simd128_u64_ref_t result = { { a.u64[0] - b.u64[0], a.u64[1] - b.u64[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT } } // namespace bx diff --git a/include/bx/inline/simd128_sse.inl b/include/bx/inline/simd128_sse.inl index 016eb54..5ae6ab3 100644 --- a/include/bx/inline/simd128_sse.inl +++ b/include/bx/inline/simd128_sse.inl @@ -9,15 +9,16 @@ namespace bx { + #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 #define ELEMw 3 -#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template<> \ - BX_SIMD_FORCE_INLINE simd128_sse_t simd_swiz_##_x##_y##_z##_w(simd128_sse_t _a) \ - { \ - return _mm_shuffle_ps( _a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x ) ); \ +#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_swiz_##_x##_y##_z##_w(simd128_sse_t _a) \ + { \ + return _mm_shuffle_ps(_a, _a, _MM_SHUFFLE(ELEM##_w, ELEM##_z, ELEM##_y, ELEM##_x)); \ } #include "simd128_swizzle.inl" @@ -28,627 +29,1164 @@ namespace bx #undef ELEMy #undef ELEMx -#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask) \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(simd128_sse_t _test) \ - 
{ \ - return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \ - } \ - \ - template<> \ - BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(simd128_sse_t _test) \ - { \ - return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \ +#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask) \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd128_test_any_##_xyzw(simd128_sse_t _test)\ + { \ + return 0x0 != (_mm_movemask_ps(_test)&(_mask) ); \ + } \ + \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd128_test_all_##_xyzw(simd128_sse_t _test)\ + { \ + return (_mask) == (_mm_movemask_ps(_test)&(_mask) ); \ } -BX_SIMD128_IMPLEMENT_TEST(x , 0x1) -BX_SIMD128_IMPLEMENT_TEST(y , 0x2) -BX_SIMD128_IMPLEMENT_TEST(xy , 0x3) -BX_SIMD128_IMPLEMENT_TEST(z , 0x4) -BX_SIMD128_IMPLEMENT_TEST(xz , 0x5) -BX_SIMD128_IMPLEMENT_TEST(yz , 0x6) -BX_SIMD128_IMPLEMENT_TEST(xyz , 0x7) -BX_SIMD128_IMPLEMENT_TEST(w , 0x8) -BX_SIMD128_IMPLEMENT_TEST(xw , 0x9) -BX_SIMD128_IMPLEMENT_TEST(yw , 0xa) -BX_SIMD128_IMPLEMENT_TEST(xyw , 0xb) -BX_SIMD128_IMPLEMENT_TEST(zw , 0xc) -BX_SIMD128_IMPLEMENT_TEST(xzw , 0xd) -BX_SIMD128_IMPLEMENT_TEST(yzw , 0xe) -BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf) + BX_SIMD128_IMPLEMENT_TEST(x , 0x1) + BX_SIMD128_IMPLEMENT_TEST(y , 0x2) + BX_SIMD128_IMPLEMENT_TEST(xy , 0x3) + BX_SIMD128_IMPLEMENT_TEST(z , 0x4) + BX_SIMD128_IMPLEMENT_TEST(xz , 0x5) + BX_SIMD128_IMPLEMENT_TEST(yz , 0x6) + BX_SIMD128_IMPLEMENT_TEST(xyz , 0x7) + BX_SIMD128_IMPLEMENT_TEST(w , 0x8) + BX_SIMD128_IMPLEMENT_TEST(xw , 0x9) + BX_SIMD128_IMPLEMENT_TEST(yw , 0xa) + BX_SIMD128_IMPLEMENT_TEST(xyw , 0xb) + BX_SIMD128_IMPLEMENT_TEST(zw , 0xc) + BX_SIMD128_IMPLEMENT_TEST(xzw , 0xd) + BX_SIMD128_IMPLEMENT_TEST(yzw , 0xe) + BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf) #undef BX_SIMD128_IMPLEMENT_TEST template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_xyAB(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE bool simd128_test_zero(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i ai = _mm_castps_si128(_a); + const __m128i bi = 
_mm_castps_si128(_b); + return 0 != _mm_testz_si128(ai, bi); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_xyAB(simd128_sse_t _a, simd128_sse_t _b) { return _mm_movelh_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_ABxy(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_ABxy(simd128_sse_t _a, simd128_sse_t _b) { return _mm_movelh_ps(_b, _a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_CDzw(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_CDzw(simd128_sse_t _a, simd128_sse_t _b) { return _mm_movehl_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_zwCD(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_zwCD(simd128_sse_t _a, simd128_sse_t _b) { return _mm_movehl_ps(_b, _a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_xAyB(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_xAyB(simd128_sse_t _a, simd128_sse_t _b) { return _mm_unpacklo_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_AxBy(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_AxBy(simd128_sse_t _a, simd128_sse_t _b) { return _mm_unpacklo_ps(_b, _a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_zCwD(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_zCwD(simd128_sse_t _a, simd128_sse_t _b) { return _mm_unpackhi_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_shuf_CzDw(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_CzDw(simd128_sse_t _a, simd128_sse_t _b) { return _mm_unpackhi_ps(_b, _a); } template<> - BX_SIMD_FORCE_INLINE float simd_x(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_xzAC(simd128_sse_t _a, simd128_sse_t _b) + { + return 
_mm_shuffle_ps(_a, _b, _MM_SHUFFLE(2, 0, 2, 0) ); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_ywBD(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_shuffle_ps(_a, _b, _MM_SHUFFLE(3, 1, 3, 1) ); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_xxAA(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_shuffle_ps(_a, _b, _MM_SHUFFLE(0, 0, 0, 0) ); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_yyBB(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_shuffle_ps(_a, _b, _MM_SHUFFLE(1, 1, 1, 1) ); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_zzCC(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_shuffle_ps(_a, _b, _MM_SHUFFLE(2, 2, 2, 2) ); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_wwDD(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_shuffle_ps(_a, _b, _MM_SHUFFLE(3, 3, 3, 3) ); + } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_x(simd128_sse_t _a) { return _mm_cvtss_f32(_a); } template<> - BX_SIMD_FORCE_INLINE float simd_y(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE float simd128_f32_y(simd128_sse_t _a) { - const simd128_sse_t yyyy = simd_swiz_yyyy(_a); - const float result = _mm_cvtss_f32(yyyy); - - return result; + const simd128_sse_t tmp = simd128_x32_swiz_yyyy(_a); + return _mm_cvtss_f32(tmp); } template<> - BX_SIMD_FORCE_INLINE float simd_z(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE float simd128_f32_z(simd128_sse_t _a) { - const simd128_sse_t zzzz = simd_swiz_zzzz(_a); - const float result = _mm_cvtss_f32(zzzz); - - return result; + const simd128_sse_t tmp = simd128_x32_swiz_zzzz(_a); + return _mm_cvtss_f32(tmp); } template<> - BX_SIMD_FORCE_INLINE float simd_w(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE float simd128_f32_w(simd128_sse_t _a) { - const simd128_sse_t wwww = simd_swiz_wwww(_a); - const float result = _mm_cvtss_f32(wwww); - - return result; + const simd128_sse_t tmp = simd128_x32_swiz_wwww(_a); + 
return _mm_cvtss_f32(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_ld(const void* _ptr) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_ld(const void* _ptr) { return _mm_load_ps(reinterpret_cast(_ptr) ); } template<> - BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_ldu(const void* _ptr) + { + return _mm_loadu_ps(reinterpret_cast(_ptr) ); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_st(void* _ptr, simd128_sse_t _a) { _mm_store_ps(reinterpret_cast(_ptr), _a); } template<> - BX_SIMD_FORCE_INLINE void simd_stx(void* _ptr, simd128_sse_t _a) + BX_SIMD_FORCE_INLINE void simd128_stu(void* _ptr, simd128_sse_t _a) + { + _mm_storeu_ps(reinterpret_cast(_ptr), _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_x32_st1(void* _ptr, simd128_sse_t _a) { _mm_store_ss(reinterpret_cast(_ptr), _a); } template<> - BX_SIMD_FORCE_INLINE void simd_stream(void* _ptr, simd128_sse_t _a) + BX_SIMD_FORCE_INLINE void simd128_stream(void* _ptr, simd128_sse_t _a) { _mm_stream_ps(reinterpret_cast(_ptr), _a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_ld(float _x, float _y, float _z, float _w) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_ld(float _x, float _y, float _z, float _w) { return _mm_set_ps(_w, _z, _y, _x); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_ld(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) { - const __m128i set = _mm_set_epi32(_w, _z, _y, _x); - const simd128_sse_t result = _mm_castsi128_ps(set); - - return result; + const __m128i tmp = _mm_set_epi32(int32_t(_w), int32_t(_z), int32_t(_y), int32_t(_x)); + return _mm_castsi128_ps(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_splat(const void* _ptr) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_ld(int32_t _x, int32_t _y, int32_t _z, int32_t _w) { - const simd128_sse_t x___ = 
_mm_load_ss(reinterpret_cast(_ptr) ); - const simd128_sse_t result = simd_swiz_xxxx(x___); - - return result; + const __m128i tmp = _mm_set_epi32(_w, _z, _y, _x); + return _mm_castsi128_ps(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_splat(float _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_splat(float _a) { return _mm_set1_ps(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_isplat(uint32_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_splat(int32_t _a) { - const __m128i splat = _mm_set1_epi32(_a); - const simd128_sse_t result = _mm_castsi128_ps(splat); - - return result; + const __m128i tmp = _mm_set1_epi32(_a); + return _mm_castsi128_ps(tmp); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_zero() + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_splat(uint32_t _a) + { + const __m128i tmp = _mm_set1_epi32(int32_t(_a)); + return _mm_castsi128_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_splat(double _a) + { + const __m128d tmp = _mm_set1_pd(_a); + return _mm_castpd_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_zero() { return _mm_setzero_ps(); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_itof(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_itof(simd128_sse_t _a) { - const __m128i itof = _mm_castps_si128(_a); - const simd128_sse_t result = _mm_cvtepi32_ps(itof); - + const __m128i a = _mm_castps_si128(_a); + const __m128 result = _mm_cvtepi32_ps(a); return result; } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_ftoi(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_ftoi_trunc(simd128_sse_t _a) { - const __m128i ftoi = _mm_cvtps_epi32(_a); - const simd128_sse_t result = _mm_castsi128_ps(ftoi); - - return result; + const __m128i result = _mm_cvttps_epi32(_a); + return _mm_castsi128_ps(result); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_round(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t 
simd128_f32_ftoi_round(simd128_sse_t _a) { -#if defined(__SSE4_1__) - return _mm_round_ps(_a, _MM_FROUND_NINT); -#else - const __m128i round = _mm_cvtps_epi32(_a); - const simd128_sse_t result = _mm_cvtepi32_ps(round); - - return result; -#endif // defined(__SSE4_1__) + const __m128i result = _mm_cvtps_epi32(_a); + return _mm_castsi128_ps(result); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_add(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_round(simd128_sse_t _a) + { + return _mm_round_ps(_a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_ceil(simd128_sse_t _a) + { + return _mm_ceil_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_floor(simd128_sse_t _a) + { + return _mm_floor_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_add(simd128_sse_t _a, simd128_sse_t _b) { return _mm_add_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_sub(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_sub(simd128_sse_t _a, simd128_sse_t _b) { return _mm_sub_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_mul(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_mul(simd128_sse_t _a, simd128_sse_t _b) { return _mm_mul_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_div(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_div(simd128_sse_t _a, simd128_sse_t _b) { return _mm_div_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_rcp_est(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_rcp_est(simd128_sse_t _a) { return _mm_rcp_ps(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_sqrt(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_sqrt(simd128_sse_t _a) { return _mm_sqrt_ps(_a); } template<> - 
BX_SIMD_FORCE_INLINE simd128_sse_t simd_rsqrt_est(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_rsqrt_est(simd128_sse_t _a) { return _mm_rsqrt_ps(_a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_dot3(simd128_sse_t _a, simd128_sse_t _b) - { -#if defined(__SSE4_1__) - return _mm_dp_ps(_a, _b, 0x77); -#else - return simd_dot3_ni(_a, _b); -#endif // defined(__SSE4__) - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_dot(simd128_sse_t _a, simd128_sse_t _b) - { -#if defined(__SSE4_1__) - return _mm_dp_ps(_a, _b, 0xFF); -#else - return simd_dot_ni(_a, _b); -#endif // defined(__SSE4__) - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpeq(simd128_sse_t _a, simd128_sse_t _b) - { - return _mm_cmpeq_ps(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpneq(simd128_sse_t _a, simd128_sse_t _b) - { - return _mm_cmpneq_ps(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmplt(simd128_sse_t _a, simd128_sse_t _b) - { - return _mm_cmplt_ps(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmple(simd128_sse_t _a, simd128_sse_t _b) - { - return _mm_cmple_ps(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpgt(simd128_sse_t _a, simd128_sse_t _b) - { - return _mm_cmpgt_ps(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_cmpge(simd128_sse_t _a, simd128_sse_t _b) - { - return _mm_cmpge_ps(_a, _b); - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_min(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_min(simd128_sse_t _a, simd128_sse_t _b) { return _mm_min_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_max(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_max(simd128_sse_t _a, simd128_sse_t _b) { return _mm_max_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_and(simd128_sse_t _a, simd128_sse_t _b) + 
BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_dot3(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_dp_ps(_a, _b, 0x77); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_dot(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_dp_ps(_a, _b, 0xff); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpeq_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cmplt(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmplt_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cmple(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmple_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cmpgt(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpgt_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cmpge(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpge_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i add = _mm_add_epi32(a, b); + return _mm_castsi128_ps(add); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_sub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i sub = _mm_sub_epi32(a, b); + return _mm_castsi128_ps(sub); + } + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_neg(simd128_sse_t _a) + { + return simd_i32_neg_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_abs(simd128_sse_t _a) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i result = _mm_abs_epi32(a); + return _mm_castsi128_ps(result); + } + // SSE4.1 — always available with SSE4.2 minspec + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t 
simd128_i32_min(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_min_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_max(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_max_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_cmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_cmpeq_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_cmplt(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_cmplt_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_cmpgt(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_cmpgt_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i32_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max) + { + const simd128_sse_t lo = simd128_i32_max(_a, _min); + const simd128_sse_t result = simd128_i32_min(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_add_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_sub(simd128_sse_t _a, simd128_sse_t _b) + { + 
const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_sub_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_mul(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_mullo_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_min(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_min_epu32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_max(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_max_epu32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max) + { + const simd128_sse_t lo = simd128_u32_max(_a, _min); + const simd128_sse_t result = simd128_u32_min(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_cmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_cmpeq_epi32(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_cmplt(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_u32_cmplt_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u32_cmpgt(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_u32_cmpgt_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i16_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + 
const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_add_epi16(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i16_sub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_sub_epi16(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i16_mullo(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_mullo_epi16(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i16_cmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_cmpeq_epi16(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x16_sll(simd128_sse_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i result = _mm_slli_epi16(a, _count); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x16_srl(simd128_sse_t _a, int _count) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i result = _mm_srli_epi16(a, _count); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_splat(int16_t _a) + { + const __m128i tmp = _mm_set1_epi16(_a); + return _mm_castsi128_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_splat(uint16_t _a) + { + const __m128i tmp = _mm_set1_epi16(int16_t(_a)); + return _mm_castsi128_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i8_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_add_epi8(a, b); 
+ return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i8_sub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_sub_epi8(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u8_satadd(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_adds_epu8(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u8_satsub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_subs_epu8(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u16_satadd(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_adds_epu16(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u16_satsub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_subs_epu16(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_and(simd128_sse_t _a, simd128_sse_t _b) { return _mm_and_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_andc(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_andc(simd128_sse_t _a, simd128_sse_t _b) { return _mm_andnot_ps(_b, _a); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_or(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_or(simd128_sse_t _a, simd128_sse_t _b) { return _mm_or_ps(_a, _b); } template<> - 
BX_SIMD_FORCE_INLINE simd128_sse_t simd_xor(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_xor(simd128_sse_t _a, simd128_sse_t _b) { return _mm_xor_ps(_a, _b); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_sll(simd128_sse_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_sll(simd128_sse_t _a, int _count) { - const __m128i a = _mm_castps_si128(_a); - const __m128i shift = _mm_slli_epi32(a, _count); - const simd128_sse_t result = _mm_castsi128_ps(shift); - - return result; + const __m128i a = _mm_castps_si128(_a); + const __m128i result = _mm_slli_epi32(a, _count); + return _mm_castsi128_ps(result); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_srl(simd128_sse_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_srl(simd128_sse_t _a, int _count) { - const __m128i a = _mm_castps_si128(_a); - const __m128i shift = _mm_srli_epi32(a, _count); - const simd128_sse_t result = _mm_castsi128_ps(shift); - - return result; + const __m128i a = _mm_castps_si128(_a); + const __m128i result = _mm_srli_epi32(a, _count); + return _mm_castsi128_ps(result); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_sra(simd128_sse_t _a, int _count) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_sra(simd128_sse_t _a, int _count) { - const __m128i a = _mm_castps_si128(_a); - const __m128i shift = _mm_srai_epi32(a, _count); - const simd128_sse_t result = _mm_castsi128_ps(shift); - - return result; + const __m128i a = _mm_castps_si128(_a); + const __m128i result = _mm_srai_epi32(a, _count); + return _mm_castsi128_ps(result); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmpeq(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_sll(simd128_sse_t _a, simd128_sse_t _count) { - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_cmpeq_epi32(tmp0, tmp1); - const simd128_sse_t result = _mm_castsi128_ps(tmp2); - 
- return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmplt(simd128_sse_t _a, simd128_sse_t _b) - { - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_cmplt_epi32(tmp0, tmp1); - const simd128_sse_t result = _mm_castsi128_ps(tmp2); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_icmpgt(simd128_sse_t _a, simd128_sse_t _b) - { - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_cmpgt_epi32(tmp0, tmp1); - const simd128_sse_t result = _mm_castsi128_ps(tmp2); - - return result; - } - - template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_imin(simd128_sse_t _a, simd128_sse_t _b) - { -#if defined(__SSE4_1__) - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_min_epi32(tmp0, tmp1); - const simd128_sse_t result = _mm_castsi128_ps(tmp2); - - return result; +#if BX_SIMD_AVX2 + const __m128i a = _mm_castps_si128(_a); + const __m128i c = _mm_castps_si128(_count); + const __m128i result = _mm_sllv_epi32(a, c); + return _mm_castsi128_ps(result); #else - return simd_imin_ni(_a, _b); -#endif // defined(__SSE4_1__) + return simd_x32_sll_ni(_a, _count); +#endif // BX_SIMD_AVX2 } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_imax(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_srl(simd128_sse_t _a, simd128_sse_t _count) { -#if defined(__SSE4_1__) - const __m128i tmp0 = _mm_castps_si128(_a); - const __m128i tmp1 = _mm_castps_si128(_b); - const __m128i tmp2 = _mm_max_epi32(tmp0, tmp1); - const simd128_sse_t result = _mm_castsi128_ps(tmp2); - - return result; +#if BX_SIMD_AVX2 + const __m128i a = _mm_castps_si128(_a); + const __m128i c = _mm_castps_si128(_count); + const __m128i result = _mm_srlv_epi32(a, c); + return _mm_castsi128_ps(result); #else - return simd_imax_ni(_a, _b); -#endif // 
defined(__SSE4_1__) + return simd_x32_srl_ni(_a, _count); +#endif // BX_SIMD_AVX2 } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_iadd(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_sra(simd128_sse_t _a, simd128_sse_t _count) + { + // Note: _mm_srav_epi32 (VPSRAVD) is an AVX2 intrinsic (only the epi16/epi64 forms need AVX-512), but this specialization always uses the _ni decomposition. TODO: consider a BX_SIMD_AVX2 path as in simd128_x32_sll/srl. + return simd_x32_sra_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_selb(simd128_sse_t _mask, simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_blendv_ps(_b, _a, _mask); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_sels(simd128_sse_t _test, simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_blendv_ps(_b, _a, _test); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x8_shuffle(simd128_sse_t _a, simd128_sse_t _indices) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i indices = _mm_castps_si128(_indices); + const __m128i result = _mm_shuffle_epi8(a, indices); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x8_shuffle(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _indices) { const __m128i a = _mm_castps_si128(_a); const __m128i b = _mm_castps_si128(_b); - const __m128i add = _mm_add_epi32(a, b); - const simd128_sse_t result = _mm_castsi128_ps(add); - - return result; + const __m128i indices = _mm_castps_si128(_indices); + // PSHUFB ignores bits 4..6 of the index byte; only bits 0..3 select and + // bit 7 zeroes. Shuffle from a and b independently, then blend per-byte + // based on bit 4 of the original index. + const __m128i pa = _mm_shuffle_epi8(a, indices); + const __m128i pb = _mm_shuffle_epi8(b, indices); + const __m128i bit4 = _mm_set1_epi8(0x10); + const __m128i masked = _mm_and_si128(indices, bit4); + // Convert "bit4 set" into a per-byte high-bit mask for blendv_epi8. 
+ const __m128i pickb = _mm_cmpeq_epi8(masked, bit4); + const __m128i result = _mm_blendv_epi8(pa, pb, pickb); + return _mm_castsi128_ps(result); } template<> - BX_SIMD_FORCE_INLINE simd128_sse_t simd_isub(simd128_sse_t _a, simd128_sse_t _b) - { - const __m128i a = _mm_castps_si128(_a); - const __m128i b = _mm_castps_si128(_b); - const __m128i sub = _mm_sub_epi32(a, b); - const simd128_sse_t result = _mm_castsi128_ps(sub); - - return result; - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_shuf_xAzC(simd128_sse_t _a, simd128_sse_t _b) - { - return simd_shuf_xAzC_ni(_a, _b); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_shuf_yBwD(simd128_sse_t _a, simd128_sse_t _b) - { - return simd_shuf_yBwD_ni(_a, _b); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_rcp(simd128_sse_t _a) - { - return simd_rcp_ni(_a); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_orx(simd128_sse_t _a) - { - return simd_orx_ni(_a); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_orc(simd128_sse_t _a, simd128_sse_t _b) - { - return simd_orc_ni(_a, _b); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_neg(simd128_sse_t _a) - { - return simd_neg_ni(_a); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_madd(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) - { - return simd_madd_ni(_a, _b, _c); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_nmsub(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) - { - return simd_nmsub_ni(_a, _b, _c); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_div_nr(simd128_sse_t _a, simd128_sse_t _b) - { - return simd_div_nr_ni(_a, _b); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_selb(simd128_sse_t _mask, simd128_sse_t _a, simd128_sse_t _b) - { - return simd_selb_ni(_mask, _a, _b); - } - - template<> - BX_SIMD_INLINE simd128_sse_t simd_sels(simd128_sse_t _test, simd128_sse_t _a, simd128_sse_t _b) - { - return simd_sels_ni(_test, _a, _b); - } - - template<> - BX_SIMD_INLINE simd128_sse_t 
simd_not(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_not(simd128_sse_t _a) { return simd_not_ni(_a); } template<> - BX_SIMD_INLINE simd128_sse_t simd_abs(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_orc(simd128_sse_t _a, simd128_sse_t _b) { - return simd_abs_ni(_a); + return simd_orc_ni(_a, _b); } template<> - BX_SIMD_INLINE simd128_sse_t simd_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_orx(simd128_sse_t _a) { - return simd_clamp_ni(_a, _min, _max); + return simd128_orx_ni(_a); } template<> - BX_SIMD_INLINE simd128_sse_t simd_lerp(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _s) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_madd(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) { - return simd_lerp_ni(_a, _b, _s); + return simd_f32_madd_ni(_a, _b, _c); } template<> - BX_SIMD_INLINE simd128_sse_t simd_rsqrt(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_msub(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) { - return simd_rsqrt_ni(_a); + return simd_f32_msub_ni(_a, _b, _c); } template<> - BX_SIMD_INLINE simd128_sse_t simd_rsqrt_nr(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_nmsub(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) { -#if BX_COMPILER_MSVC - return simd_rsqrt_ni(_a); -#else - return simd_rsqrt_nr_ni(_a); -#endif // BX_COMPILER_MSVC + return simd_f32_nmsub_ni(_a, _b, _c); } template<> - BX_SIMD_INLINE simd128_sse_t simd_rsqrt_carmack(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE int simd128_x32_signbitsmask(simd128_sse_t _a) { - return simd_rsqrt_carmack_ni(_a); + return _mm_movemask_ps(_a); } template<> - BX_SIMD_INLINE simd128_sse_t simd_sqrt_nr(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE int simd128_x8_signbitsmask(simd128_sse_t _a) { - return simd_sqrt_nr_ni(_a); + const __m128i ai = _mm_castps_si128(_a); + const int result = _mm_movemask_epi8(ai); + return result; } template<> - BX_SIMD_INLINE 
simd128_sse_t simd_log2(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_neg(simd128_sse_t _a) { - return simd_log2_ni(_a); + const __m128i signi = _mm_set1_epi32(int32_t(kFloatSignMask) ); + const __m128 signf = _mm_castsi128_ps(signi); + return _mm_xor_ps(_a, signf); } template<> - BX_SIMD_INLINE simd128_sse_t simd_exp2(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_abs(simd128_sse_t _a) { - return simd_exp2_ni(_a); + const __m128i maski = _mm_set1_epi32(int32_t(kFloatExponentMask | kFloatMantissaMask) ); + const __m128 maskf = _mm_castsi128_ps(maski); + return _mm_and_ps(_a, maskf); } template<> - BX_SIMD_INLINE simd128_sse_t simd_pow(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max) { - return simd_pow_ni(_a, _b); + const simd128_sse_t lo = _mm_max_ps(_a, _min); + const simd128_sse_t result = _mm_min_ps(lo, _max); + return result; } template<> - BX_SIMD_INLINE simd128_sse_t simd_cross3(simd128_sse_t _a, simd128_sse_t _b) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_lerp(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _s) { - return simd_cross3_ni(_a, _b); + return simd_f32_lerp_ni(_a, _b, _s); } template<> - BX_SIMD_INLINE simd128_sse_t simd_normalize3(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_rcp(simd128_sse_t _a) { - return simd_normalize3_ni(_a); + return simd_f32_rcp_ni(_a); } template<> - BX_SIMD_INLINE simd128_sse_t simd_ceil(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_sqrt_nr(simd128_sse_t _a) { - return simd_ceil_ni(_a); + return simd_f32_sqrt_nr_ni(_a); } template<> - BX_SIMD_INLINE simd128_sse_t simd_floor(simd128_sse_t _a) + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_rsqrt(simd128_sse_t _a) { - return simd_floor_ni(_a); + return simd_f32_rsqrt_ni(_a); } - typedef simd128_sse_t simd128_t; + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t 
simd128_f32_rsqrt_nr(simd128_sse_t _a) + { + return simd_f32_rsqrt_nr_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_rsqrt_carmack(simd128_sse_t _a) + { + return simd_f32_rsqrt_carmack_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_div_nr(simd128_sse_t _a, simd128_sse_t _b) + { + return simd_f32_div_nr_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cross3(simd128_sse_t _a, simd128_sse_t _b) + { + return simd128_f32_cross3_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_normalize3(simd128_sse_t _a) + { + return simd128_f32_normalize3_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f32_cmpneq(simd128_sse_t _a, simd128_sse_t _b) + { + return _mm_cmpneq_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_xAzC(simd128_sse_t _a, simd128_sse_t _b) + { + const simd128_sse_t lo = _mm_unpacklo_ps(_a, _b); + const simd128_sse_t hi = _mm_unpackhi_ps(_a, _b); + const simd128_sse_t result = _mm_movelh_ps(lo, hi); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_x32_shuf_yBwD(simd128_sse_t _a, simd128_sse_t _b) + { + const simd128_sse_t lo = _mm_unpacklo_ps(_a, _b); + const simd128_sse_t hi = _mm_unpackhi_ps(_a, _b); + const simd128_sse_t result = _mm_movehl_ps(hi, lo); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_add_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_sub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_sub_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t 
simd128_f64_mul(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_mul_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_div(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_div_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_min(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_min_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_max(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_max_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_madd(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) + { + return simd_f64_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_nmsub(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _c) + { + return simd_f64_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_neg(simd128_sse_t _a) + { + const __m128i signi = _mm_set1_epi64x(int64_t(kDoubleSignMask) ); + const __m128d signd = _mm_castsi128_pd(signi); + const __m128d a = _mm_castps_pd(_a); + const __m128d r = _mm_xor_pd(a, signd); + return _mm_castpd_ps(r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_abs(simd128_sse_t _a) + { + const __m128i maski = _mm_set1_epi64x(int64_t(kDoubleExponentMask | kDoubleMantissaMask) ); + const __m128d maskd = _mm_castsi128_pd(maski); + const __m128d a = _mm_castps_pd(_a); + const __m128d r = _mm_and_pd(a, maskd); + return 
_mm_castpd_ps(r); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_clamp(simd128_sse_t _a, simd128_sse_t _min, simd128_sse_t _max) + { + const simd128_sse_t lo = simd128_f64_max(_a, _min); + const simd128_sse_t result = simd128_f64_min(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_lerp(simd128_sse_t _a, simd128_sse_t _b, simd128_sse_t _s) + { + return simd_f64_lerp_ni(_a, _b, _s); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_rcp(simd128_sse_t _a) + { + return simd_f64_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_sqrt(simd128_sse_t _a) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d result = _mm_sqrt_pd(a); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_rsqrt(simd128_sse_t _a) + { + return simd_f64_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_round(simd128_sse_t _a) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d result = _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_ceil(simd128_sse_t _a) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d result = _mm_ceil_pd(a); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_floor(simd128_sse_t _a) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d result = _mm_floor_pd(a); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_cmpeq(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_cmpeq_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_cmpneq(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = 
_mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_cmpneq_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_cmplt(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_cmplt_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_cmple(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_cmple_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_cmpgt(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_cmpgt_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_f64_cmpge(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128d a = _mm_castps_pd(_a); + const __m128d b = _mm_castps_pd(_b); + const __m128d result = _mm_cmpge_pd(a, b); + return _mm_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i64_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_add_epi64(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_i64_sub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_sub_epi64(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u64_add(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = 
_mm_add_epi64(a, b); + return _mm_castsi128_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_sse_t simd128_u64_sub(simd128_sse_t _a, simd128_sse_t _b) + { + const __m128i a = _mm_castps_si128(_a); + const __m128i b = _mm_castps_si128(_b); + const __m128i result = _mm_sub_epi64(a, b); + return _mm_castsi128_ps(result); + } } // namespace bx diff --git a/include/bx/inline/simd128_swizzle.inl b/include/bx/inline/simd128_swizzle.inl index 34ac39b..ed22f7d 100644 --- a/include/bx/inline/simd128_swizzle.inl +++ b/include/bx/inline/simd128_swizzle.inl @@ -3,9 +3,9 @@ * License: https://github.com/bkaradzic/bx/blob/master/LICENSE */ -#ifndef BX_SIMD_T_H_HEADER_GUARD -# error "xmacro file, must be included from simd_*.h" -#endif // BX_FLOAT4_T_H_HEADER_GUARD +#if !defined(BX_SIMD_T_H_HEADER_GUARD) +# error "xmacro file, must be included from simd_t.h" +#endif // BX_SIMD_T_H_HEADER_GUARD // included from float4_t.h BX_SIMD128_IMPLEMENT_SWIZZLE(x, x, x, x) diff --git a/include/bx/inline/simd128_wasm.inl b/include/bx/inline/simd128_wasm.inl new file mode 100644 index 0000000..f4059c8 --- /dev/null +++ b/include/bx/inline/simd128_wasm.inl @@ -0,0 +1,1001 @@ +/* + * Copyright 2010-2026 Branimir Karadzic. All rights reserved. + * License: https://github.com/bkaradzic/bx/blob/master/LICENSE + */ + +#ifndef BX_SIMD_T_H_HEADER_GUARD +# error "Must be included from bx/simd_t.h!" 
+#endif // BX_SIMD_T_H_HEADER_GUARD + +namespace bx +{ + +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 +#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template<> \ + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_swiz_##_x##_y##_z##_w(simd128_wasm_t _a) \ + { \ + return wasm_i32x4_shuffle(_a, _a, ELEM##_x, ELEM##_y, ELEM##_z, ELEM##_w); \ + } + +#include "simd128_swizzle.inl" + +#undef BX_SIMD128_IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + +#define BX_SIMD128_IMPLEMENT_TEST(_xyzw, _mask) \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd128_test_any_##_xyzw(simd128_wasm_t _test) \ + { \ + return 0x0 != (wasm_i32x4_bitmask(_test)&(_mask) ); \ + } \ + \ + template<> \ + BX_SIMD_FORCE_INLINE bool simd128_test_all_##_xyzw(simd128_wasm_t _test) \ + { \ + return (_mask) == (wasm_i32x4_bitmask(_test)&(_mask) ); \ + } + + BX_SIMD128_IMPLEMENT_TEST(x , 0x1) + BX_SIMD128_IMPLEMENT_TEST(y , 0x2) + BX_SIMD128_IMPLEMENT_TEST(xy , 0x3) + BX_SIMD128_IMPLEMENT_TEST(z , 0x4) + BX_SIMD128_IMPLEMENT_TEST(xz , 0x5) + BX_SIMD128_IMPLEMENT_TEST(yz , 0x6) + BX_SIMD128_IMPLEMENT_TEST(xyz , 0x7) + BX_SIMD128_IMPLEMENT_TEST(w , 0x8) + BX_SIMD128_IMPLEMENT_TEST(xw , 0x9) + BX_SIMD128_IMPLEMENT_TEST(yw , 0xa) + BX_SIMD128_IMPLEMENT_TEST(xyw , 0xb) + BX_SIMD128_IMPLEMENT_TEST(zw , 0xc) + BX_SIMD128_IMPLEMENT_TEST(xzw , 0xd) + BX_SIMD128_IMPLEMENT_TEST(yzw , 0xe) + BX_SIMD128_IMPLEMENT_TEST(xyzw , 0xf) + +#undef BX_SIMD128_IMPLEMENT_TEST + + template<> + BX_SIMD_FORCE_INLINE bool simd128_test_zero(simd128_wasm_t _a, simd128_wasm_t _b) + { + const simd128_wasm_t masked = wasm_v128_and(_a, _b); + const bool result = !wasm_v128_any_true(masked); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_xyAB(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 0, 1, 4, 5); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_ABxy(simd128_wasm_t _a, 
simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 4, 5, 0, 1); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_CDzw(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 6, 7, 2, 3); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_zwCD(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 2, 3, 6, 7); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_xAyB(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 0, 4, 1, 5); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_AxBy(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 4, 0, 5, 1); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_zCwD(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 2, 6, 3, 7); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_CzDw(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 6, 2, 7, 3); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_xzAC(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 0, 2, 4, 6); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_ywBD(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 1, 3, 5, 7); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_xxAA(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 0, 0, 4, 4); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_yyBB(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 1, 1, 5, 5); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_zzCC(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 2, 2, 6, 6); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t 
simd128_x32_shuf_wwDD(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 3, 3, 7, 7); + } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_x(simd128_wasm_t _a) + { + return wasm_f32x4_extract_lane(_a, 0); + } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_y(simd128_wasm_t _a) + { + return wasm_f32x4_extract_lane(_a, 1); + } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_z(simd128_wasm_t _a) + { + return wasm_f32x4_extract_lane(_a, 2); + } + + template<> + BX_SIMD_FORCE_INLINE float simd128_f32_w(simd128_wasm_t _a) + { + return wasm_f32x4_extract_lane(_a, 3); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_ld(const void* _ptr) + { + return wasm_v128_load(_ptr); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_ldu(const void* _ptr) + { + return wasm_v128_load(_ptr); /* same intrinsic as simd128_ld above */ + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_st(void* _ptr, simd128_wasm_t _a) + { + wasm_v128_store(_ptr, _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_stu(void* _ptr, simd128_wasm_t _a) + { + wasm_v128_store(_ptr, _a); /* same intrinsic as simd128_st above */ + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_x32_st1(void* _ptr, simd128_wasm_t _a) /* Stores only lane 0 (x) of _a as one 32-bit float at _ptr. */ + { + float* result = reinterpret_cast<float*>(_ptr); /* untyped destination viewed as float storage */ + *result = wasm_f32x4_extract_lane(_a, 0); + } + + template<> + BX_SIMD_FORCE_INLINE void simd128_stream(void* _ptr, simd128_wasm_t _a) + { + wasm_v128_store(_ptr, _a); /* plain store; no separate streaming intrinsic is used here */ + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_ld(float _x, float _y, float _z, float _w) + { + return wasm_f32x4_make(_x, _y, _z, _w); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_ld(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + return wasm_i32x4_make(int32_t(_x), int32_t(_y), int32_t(_z), int32_t(_w)); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_ld(int32_t _x, int32_t _y, int32_t _z, int32_t _w) + { + return wasm_i32x4_make(_x, _y, _z, _w); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t
simd128_splat(float _a) + { + return wasm_f32x4_splat(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_splat(int32_t _a) + { + return wasm_i32x4_splat(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_splat(uint32_t _a) + { + return wasm_i32x4_splat(int32_t(_a)); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_splat(double _a) + { + return wasm_f64x2_splat(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_zero() + { + return wasm_i32x4_splat(0); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_itof(simd128_wasm_t _a) + { + return wasm_f32x4_convert_i32x4(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_ftoi_trunc(simd128_wasm_t _a) + { + return wasm_i32x4_trunc_sat_f32x4(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_ftoi_round(simd128_wasm_t _a) + { + const simd128_wasm_t rounded = wasm_f32x4_nearest(_a); + const simd128_wasm_t result = wasm_i32x4_trunc_sat_f32x4(rounded); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_round(simd128_wasm_t _a) + { + return wasm_f32x4_nearest(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_ceil(simd128_wasm_t _a) + { + return wasm_f32x4_ceil(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_floor(simd128_wasm_t _a) + { + return wasm_f32x4_floor(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_mul(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_mul(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_div(simd128_wasm_t _a, simd128_wasm_t _b) + { 
+ return wasm_f32x4_div(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_rcp_est(simd128_wasm_t _a) + { + const simd128_wasm_t one = wasm_f32x4_splat(1.0f); + const simd128_wasm_t result = wasm_f32x4_div(one, _a); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_sqrt(simd128_wasm_t _a) + { + return wasm_f32x4_sqrt(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_rsqrt_est(simd128_wasm_t _a) + { + const simd128_wasm_t one = wasm_f32x4_splat(1.0f); + const simd128_wasm_t sqrt = wasm_f32x4_sqrt(_a); + const simd128_wasm_t result = wasm_f32x4_div(one, sqrt); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_min(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_pmin(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_max(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_pmax(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_dot3(simd128_wasm_t _a, simd128_wasm_t _b) + { + return simd128_f32_dot3_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_dot(simd128_wasm_t _a, simd128_wasm_t _b) + { + return simd128_f32_dot_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cmpeq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_eq(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cmplt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_lt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cmple(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_le(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cmpgt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_gt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cmpge(simd128_wasm_t _a, simd128_wasm_t _b) + { + return 
wasm_f32x4_ge(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_neg(simd128_wasm_t _a) + { + return wasm_i32x4_neg(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_abs(simd128_wasm_t _a) + { + return wasm_i32x4_abs(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_min(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_min(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_max(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_max(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_cmpeq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_eq(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_cmplt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_lt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_cmpgt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_gt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i32_clamp(simd128_wasm_t _a, simd128_wasm_t _min, simd128_wasm_t _max) + { + const simd128_wasm_t lo = wasm_i32x4_max(_a, _min); + const simd128_wasm_t result = wasm_i32x4_min(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_mul(simd128_wasm_t _a, simd128_wasm_t _b) + { + 
return wasm_i32x4_mul(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_min(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u32x4_min(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_max(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u32x4_max(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_clamp(simd128_wasm_t _a, simd128_wasm_t _min, simd128_wasm_t _max) + { + const simd128_wasm_t lo = wasm_u32x4_max(_a, _min); + const simd128_wasm_t result = wasm_u32x4_min(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_cmpeq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_eq(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_cmplt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u32x4_lt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u32_cmpgt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u32x4_gt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i16_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i16x8_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i16_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i16x8_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i16_mullo(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i16x8_mul(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i16_cmpeq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i16x8_eq(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x16_sll(simd128_wasm_t _a, int _count) + { + return wasm_i16x8_shl(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x16_srl(simd128_wasm_t _a, int _count) + { + return wasm_u16x8_shr(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t 
simd128_splat(int16_t _a) + { + return wasm_i16x8_splat(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_splat(uint16_t _a) + { + return wasm_i16x8_splat(int16_t(_a)); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i8_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i8x16_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i8_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i8x16_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u8_satadd(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u8x16_add_sat(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u8_satsub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u8x16_sub_sat(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u16_satadd(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u16x8_add_sat(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u16_satsub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_u16x8_sub_sat(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_and(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_v128_and(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_andc(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_v128_andnot(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_or(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_v128_or(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_xor(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_v128_xor(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_sll(simd128_wasm_t _a, int _count) + { + return wasm_i32x4_shl(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_srl(simd128_wasm_t _a, int _count) + { + return wasm_u32x4_shr(_a, _count); + } + + template<> + 
BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_sra(simd128_wasm_t _a, int _count) + { + return wasm_i32x4_shr(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_sll(simd128_wasm_t _a, simd128_wasm_t _count) + { + return simd_x32_sll_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_srl(simd128_wasm_t _a, simd128_wasm_t _count) + { + return simd_x32_srl_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_sra(simd128_wasm_t _a, simd128_wasm_t _count) + { + return simd_x32_sra_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x8_shuffle(simd128_wasm_t _a, simd128_wasm_t _indices) + { + return wasm_i8x16_swizzle(_a, _indices); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x8_shuffle(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _indices) + { + return simd_x8_shuffle_ni(_a, _b, _indices); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_selb(simd128_wasm_t _mask, simd128_wasm_t _a, simd128_wasm_t _b) /* Bitwise select: bits of _a where _mask is 1, bits of _b where it is 0. */ + { + return wasm_v128_bitselect(_a, _b, _mask); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_sels(simd128_wasm_t _test, simd128_wasm_t _a, simd128_wasm_t _b) /* Per-lane select keyed on the sign bit of each 32-bit lane of _test. */ + { + return wasm_v128_bitselect(_a, _b, wasm_i32x4_shr(_test, 31) ); /* arithmetic shift broadcasts the sign bit across the lane, so whole lanes are selected; identical result for all-ones/all-zero compare masks. NOTE(review): confirm sels is specified as select-by-sign. */ + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_not(simd128_wasm_t _a) + { + return wasm_v128_not(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_orc(simd128_wasm_t _a, simd128_wasm_t _b) + { + const simd128_wasm_t notb = wasm_v128_not(_b); + const simd128_wasm_t result = wasm_v128_or(_a, notb); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_orx(simd128_wasm_t _a) + { + return simd128_orx_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_madd(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _c) + { + return simd_f32_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE
simd128_wasm_t simd128_f32_msub(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _c) + { + return simd_f32_msub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_nmsub(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _c) + { + return simd_f32_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE int simd128_x32_signbitsmask(simd128_wasm_t _a) + { + return wasm_i32x4_bitmask(_a); + } + + template<> + BX_SIMD_FORCE_INLINE int simd128_x8_signbitsmask(simd128_wasm_t _a) + { + return wasm_i8x16_bitmask(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_neg(simd128_wasm_t _a) + { + return wasm_f32x4_neg(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_abs(simd128_wasm_t _a) + { + return wasm_f32x4_abs(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_clamp(simd128_wasm_t _a, simd128_wasm_t _min, simd128_wasm_t _max) + { + const simd128_wasm_t lo = wasm_f32x4_pmax(_a, _min); + const simd128_wasm_t result = wasm_f32x4_pmin(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_lerp(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _s) + { + return simd_f32_lerp_ni(_a, _b, _s); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_rcp(simd128_wasm_t _a) + { + return simd_f32_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_sqrt_nr(simd128_wasm_t _a) + { + return simd_f32_sqrt_nr_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_rsqrt(simd128_wasm_t _a) + { + return simd_f32_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_rsqrt_nr(simd128_wasm_t _a) + { + return simd_f32_rsqrt_nr_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_rsqrt_carmack(simd128_wasm_t _a) + { + return simd_f32_rsqrt_carmack_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t 
simd128_f32_div_nr(simd128_wasm_t _a, simd128_wasm_t _b) + { + return simd_f32_div_nr_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cross3(simd128_wasm_t _a, simd128_wasm_t _b) + { + return simd128_f32_cross3_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_normalize3(simd128_wasm_t _a) + { + return simd128_f32_normalize3_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f32_cmpneq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f32x4_ne(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_xAzC(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 0, 4, 2, 6); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_x32_shuf_yBwD(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i32x4_shuffle(_a, _b, 1, 5, 3, 7); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_mul(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_mul(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_div(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_div(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_min(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_pmin(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_max(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_pmax(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_madd(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _c) + { + return simd_f64_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE 
simd128_wasm_t simd128_f64_nmsub(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _c) + { + return simd_f64_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_neg(simd128_wasm_t _a) + { + return wasm_f64x2_neg(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_abs(simd128_wasm_t _a) + { + return wasm_f64x2_abs(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_clamp(simd128_wasm_t _a, simd128_wasm_t _min, simd128_wasm_t _max) + { + const simd128_wasm_t lo = wasm_f64x2_pmax(_a, _min); + const simd128_wasm_t result = wasm_f64x2_pmin(lo, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_lerp(simd128_wasm_t _a, simd128_wasm_t _b, simd128_wasm_t _s) + { + return simd_f64_lerp_ni(_a, _b, _s); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_rcp(simd128_wasm_t _a) + { + return simd_f64_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_sqrt(simd128_wasm_t _a) + { + return wasm_f64x2_sqrt(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_rsqrt(simd128_wasm_t _a) + { + return simd_f64_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_round(simd128_wasm_t _a) + { + return wasm_f64x2_nearest(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_ceil(simd128_wasm_t _a) + { + return wasm_f64x2_ceil(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_floor(simd128_wasm_t _a) + { + return wasm_f64x2_floor(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_cmpeq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_eq(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_cmpneq(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_ne(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_cmplt(simd128_wasm_t _a, 
simd128_wasm_t _b) + { + return wasm_f64x2_lt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_cmple(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_le(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_cmpgt(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_gt(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_f64_cmpge(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_f64x2_ge(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i64_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i64x2_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_i64_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i64x2_sub(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u64_add(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i64x2_add(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd128_wasm_t simd128_u64_sub(simd128_wasm_t _a, simd128_wasm_t _b) + { + return wasm_i64x2_sub(_a, _b); + } + +} // namespace bx diff --git a/include/bx/inline/simd256_avx.inl b/include/bx/inline/simd256_avx.inl index 8be840e..ad904a3 100644 --- a/include/bx/inline/simd256_avx.inl +++ b/include/bx/inline/simd256_avx.inl @@ -9,66 +9,1638 @@ namespace bx { + template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_ld(const void* _ptr) + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_ld(const void* _ptr) { - return _mm256_load_ps(reinterpret_cast<const float*>(_ptr) ); + return _mm256_load_ps(reinterpret_cast<const float*>(_ptr)); /* loads eight floats via the aligned-load intrinsic; the untyped pointer is viewed as const float storage */ } template<> - BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd256_avx_t _a) + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_ldu(const void* _ptr) + { + return _mm256_loadu_ps(reinterpret_cast<const float*>(_ptr)); /* same load through the unaligned-load intrinsic */ + } + + template<> + BX_SIMD_FORCE_INLINE void simd256_st(void* _ptr, simd256_avx_t _a) { _mm256_store_ps(reinterpret_cast<float*>(_ptr), _a); } template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_ld(float _x, float
_y, float _z, float _w, float _A, float _B, float _C, float _D) + BX_SIMD_FORCE_INLINE void simd256_stu(void* _ptr, simd256_avx_t _a) { - return _mm256_set_ps(_D, _C, _B, _A, _w, _z, _y, _x); + _mm256_storeu_ps(reinterpret_cast(_ptr), _a); } template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _A, uint32_t _B, uint32_t _C, uint32_t _D) + BX_SIMD_FORCE_INLINE void simd256_x32_st1(void* _ptr, simd256_avx_t _a) { - const __m256i set = _mm256_set_epi32(_D, _C, _B, _A, _w, _z, _y, _x); - const simd256_avx_t result = _mm256_castsi256_ps(set); - - return result; + const __m128 lo = _mm256_castps256_ps128(_a); + _mm_store_ss(reinterpret_cast(_ptr), lo); } template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_splat(float _a) + BX_SIMD_FORCE_INLINE void simd256_stream(void* _ptr, simd256_avx_t _a) + { + _mm256_stream_ps(reinterpret_cast(_ptr), _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_splat(float _a) { return _mm256_set1_ps(_a); } template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_isplat(uint32_t _a) + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_splat(uint32_t _a) { - const __m256i splat = _mm256_set1_epi32(_a); - const simd256_avx_t result = _mm256_castsi256_ps(splat); + const __m256i tmp = _mm256_set1_epi32(int32_t(_a)); + return _mm256_castsi256_ps(tmp); + } + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_splat(double _a) + { + const __m256d tmp = _mm256_set1_pd(_a); + return _mm256_castpd_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_splat(int32_t _a) + { + const __m256i tmp = _mm256_set1_epi32(_a); + return _mm256_castsi256_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_splat(int16_t _a) + { + const __m256i tmp = _mm256_set1_epi16(_a); + return _mm256_castsi256_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_splat(uint16_t _a) + { + const __m256i tmp = _mm256_set1_epi16(int16_t(_a)); + return 
_mm256_castsi256_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_ld(float _x0, float _x1, float _x2, float _x3, float _x4, float _x5, float _x6, float _x7) + { + return _mm256_setr_ps(_x0, _x1, _x2, _x3, _x4, _x5, _x6, _x7); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_ld(uint32_t _x0, uint32_t _x1, uint32_t _x2, uint32_t _x3, uint32_t _x4, uint32_t _x5, uint32_t _x6, uint32_t _x7) + { + const __m256i tmp = _mm256_setr_epi32(int32_t(_x0), int32_t(_x1), int32_t(_x2), int32_t(_x3), int32_t(_x4), int32_t(_x5), int32_t(_x6), int32_t(_x7)); + return _mm256_castsi256_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_ld(int32_t _x0, int32_t _x1, int32_t _x2, int32_t _x3, int32_t _x4, int32_t _x5, int32_t _x6, int32_t _x7) + { + const __m256i tmp = _mm256_setr_epi32(_x0, _x1, _x2, _x3, _x4, _x5, _x6, _x7); + return _mm256_castsi256_ps(tmp); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_zero() + { + return _mm256_setzero_ps(); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_itof(simd256_avx_t _a) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256 result = _mm256_cvtepi32_ps(a); return result; } template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_itof(simd256_avx_t _a) + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_ftoi_trunc(simd256_avx_t _a) { - const __m256i itof = _mm256_castps_si256(_a); - const simd256_avx_t result = _mm256_cvtepi32_ps(itof); + const __m256i result = _mm256_cvttps_epi32(_a); + return _mm256_castsi256_ps(result); + } + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_ftoi_round(simd256_avx_t _a) + { + const __m256i result = _mm256_cvtps_epi32(_a); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_add(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_add_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_sub(simd256_avx_t _a, 
simd256_avx_t _b) + { + return _mm256_sub_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_mul(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_mul_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_div(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_div_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_rcp_est(simd256_avx_t _a) + { + return _mm256_rcp_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_sqrt(simd256_avx_t _a) + { + return _mm256_sqrt_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_rsqrt_est(simd256_avx_t _a) + { + return _mm256_rsqrt_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_madd(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _c) + { + return simd_f32_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_msub(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _c) + { + return simd_f32_msub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_nmsub(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _c) + { + return simd_f32_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_neg(simd256_avx_t _a) + { + const __m256i signi = _mm256_set1_epi32(int32_t(kFloatSignMask) ); + const __m256 signf = _mm256_castsi256_ps(signi); + return _mm256_xor_ps(_a, signf); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_abs(simd256_avx_t _a) + { + const __m256i maski = _mm256_set1_epi32(int32_t(kFloatExponentMask | kFloatMantissaMask) ); + const __m256 maskf = _mm256_castsi256_ps(maski); + return _mm256_and_ps(_a, maskf); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_rcp(simd256_avx_t _a) + { + return simd_f32_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_rsqrt(simd256_avx_t _a) + { + return 
simd_f32_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_clamp(simd256_avx_t _a, simd256_avx_t _min, simd256_avx_t _max) + { + const simd256_avx_t maxed = _mm256_max_ps(_a, _min); + const simd256_avx_t result = _mm256_min_ps(maxed, _max); return result; } template<> - BX_SIMD_FORCE_INLINE simd256_avx_t simd_ftoi(simd256_avx_t _a) + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_lerp(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _s) { - const __m256i ftoi = _mm256_cvtps_epi32(_a); - const simd256_avx_t result = _mm256_castsi256_ps(ftoi); + return simd_f32_lerp_ni(_a, _b, _s); + } + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_min(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_min_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_max(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_max_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_round(simd256_avx_t _a) + { + return _mm256_round_ps(_a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); /* round to nearest, floating-point exceptions suppressed */ + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_ceil(simd256_avx_t _a) + { + return _mm256_ceil_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_floor(simd256_avx_t _a) + { + return _mm256_floor_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_cmp_ps(_a, _b, _CMP_EQ_OQ); /* ordered: a NaN operand compares not-equal (lane = 0) */ + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_cmpneq(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_cmp_ps(_a, _b, _CMP_NEQ_UQ); /* unordered: a NaN operand compares not-equal (lane = all ones), matching IEEE != and SSE _mm_cmpneq_ps */ + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_cmplt(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_cmp_ps(_a, _b, _CMP_LT_OQ); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_cmple(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_cmp_ps(_a, _b, _CMP_LE_OQ); + } + + template<> + 
BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_cmpgt(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_cmp_ps(_a, _b, _CMP_GT_OQ); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f32_cmpge(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_cmp_ps(_a, _b, _CMP_GE_OQ); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_and(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_and_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_andc(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_andnot_ps(_b, _a); /* andnot computes (~first) & second, so operands are swapped to yield _a & ~_b */ + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_or(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_or_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_xor(simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_xor_ps(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_selb(simd256_avx_t _mask, simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_or_ps(_mm256_and_ps(_mask, _a), _mm256_andnot_ps(_mask, _b)); /* bitwise select: (_mask & _a) | (~_mask & _b); blendv would test only each lane's sign bit */ + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_not(simd256_avx_t _a) + { + return simd_not_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_orc(simd256_avx_t _a, simd256_avx_t _b) + { + return simd_orc_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_sels(simd256_avx_t _test, simd256_avx_t _a, simd256_avx_t _b) + { + return _mm256_blendv_ps(_b, _a, _test); /* per 32-bit lane: picks _a where the sign bit of _test is set, else _b */ + } + +#if BX_SIMD_AVX2 + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sum = _mm256_add_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(sum); return result; } - typedef simd256_avx_t simd256_t; + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = 
_mm256_castps_si256(_b); + const __m256i diff = _mm256_sub_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(diff); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_neg(simd256_avx_t _a) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i zero = _mm256_setzero_si256(); + const __m256i diff = _mm256_sub_epi32(zero, a); + const __m256 result = _mm256_castsi256_ps(diff); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_abs(simd256_avx_t _a) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i absa = _mm256_abs_epi32(a); + const __m256 result = _mm256_castsi256_ps(absa); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_min(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i minv = _mm256_min_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(minv); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_max(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i maxv = _mm256_max_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(maxv); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i cmp = _mm256_cmpeq_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(cmp); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_cmpgt(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i cmp = _mm256_cmpgt_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(cmp); + return result; + } + + template<> + 
BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_cmplt(simd256_avx_t _a, simd256_avx_t _b) + { + return simd256_i32_cmpgt(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_clamp(simd256_avx_t _a, simd256_avx_t _min, simd256_avx_t _max) + { + const simd256_avx_t maxed = simd256_i32_max(_a, _min); + const simd256_avx_t result = simd256_i32_min(maxed, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sum = _mm256_add_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(sum); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i diff = _mm256_sub_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(diff); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_mul(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i prod = _mm256_mullo_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(prod); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_min(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i minv = _mm256_min_epu32(a, b); + const __m256 result = _mm256_castsi256_ps(minv); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_max(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i maxv = _mm256_max_epu32(a, b); + const __m256 result = _mm256_castsi256_ps(maxv); + return result; + } + + 
template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_clamp(simd256_avx_t _a, simd256_avx_t _min, simd256_avx_t _max) + { + const simd256_avx_t maxed = simd256_u32_max(_a, _min); + const simd256_avx_t result = simd256_u32_min(maxed, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i cmp = _mm256_cmpeq_epi32(a, b); + const __m256 result = _mm256_castsi256_ps(cmp); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_cmplt(simd256_avx_t _a, simd256_avx_t _b) + { + return simd_u32_cmplt_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_cmpgt(simd256_avx_t _a, simd256_avx_t _b) + { + return simd_u32_cmpgt_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sll(simd256_avx_t _a, int _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i shifted = _mm256_slli_epi32(a, _count); + const __m256 result = _mm256_castsi256_ps(shifted); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_srl(simd256_avx_t _a, int _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i shifted = _mm256_srli_epi32(a, _count); + const __m256 result = _mm256_castsi256_ps(shifted); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sra(simd256_avx_t _a, int _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i shifted = _mm256_srai_epi32(a, _count); + const __m256 result = _mm256_castsi256_ps(shifted); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sll(simd256_avx_t _a, simd256_avx_t _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i c = _mm256_castps_si256(_count); + const __m256i result = _mm256_sllv_epi32(a, c); + return 
_mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_srl(simd256_avx_t _a, simd256_avx_t _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i c = _mm256_castps_si256(_count); + const __m256i result = _mm256_srlv_epi32(a, c); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sra(simd256_avx_t _a, simd256_avx_t _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i c = _mm256_castps_si256(_count); + const __m256i result = _mm256_srav_epi32(a, c); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sum = _mm256_add_epi16(a, b); + const __m256 result = _mm256_castsi256_ps(sum); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i diff = _mm256_sub_epi16(a, b); + const __m256 result = _mm256_castsi256_ps(diff); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_mullo(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i prod = _mm256_mullo_epi16(a, b); + const __m256 result = _mm256_castsi256_ps(prod); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i cmp = _mm256_cmpeq_epi16(a, b); + const __m256 result = _mm256_castsi256_ps(cmp); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x16_sll(simd256_avx_t _a, int _count) 
+ { + const __m256i a = _mm256_castps_si256(_a); + const __m256i shifted = _mm256_slli_epi16(a, _count); + const __m256 result = _mm256_castsi256_ps(shifted); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x16_srl(simd256_avx_t _a, int _count) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i shifted = _mm256_srli_epi16(a, _count); + const __m256 result = _mm256_castsi256_ps(shifted); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i8_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sum = _mm256_add_epi8(a, b); + const __m256 result = _mm256_castsi256_ps(sum); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i8_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i diff = _mm256_sub_epi8(a, b); + const __m256 result = _mm256_castsi256_ps(diff); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u8_satadd(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sat = _mm256_adds_epu8(a, b); + const __m256 result = _mm256_castsi256_ps(sat); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u8_satsub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sat = _mm256_subs_epu8(a, b); + const __m256 result = _mm256_castsi256_ps(sat); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u16_satadd(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sat = _mm256_adds_epu16(a, b); + const __m256 result = 
_mm256_castsi256_ps(sat); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u16_satsub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i sat = _mm256_subs_epu16(a, b); + const __m256 result = _mm256_castsi256_ps(sat); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x8_shuffle(simd256_avx_t _a, simd256_avx_t _indices) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i indices = _mm256_castps_si256(_indices); + const __m256i result = _mm256_shuffle_epi8(a, indices); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x8_shuffle(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _indices) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i indices = _mm256_castps_si256(_indices); + const __m256i pa = _mm256_shuffle_epi8(a, indices); + const __m256i pb = _mm256_shuffle_epi8(b, indices); + const __m256i bit4 = _mm256_set1_epi8(0x10); + const __m256i masked = _mm256_and_si256(indices, bit4); + const __m256i pickb = _mm256_cmpeq_epi8(masked, bit4); + const __m256i result = _mm256_blendv_epi8(pa, pb, pickb); + return _mm256_castsi256_ps(result); + } + +#else // !BX_SIMD_AVX2 + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i sumlo = _mm_add_epi32(aloi, bloi); + const __m128i sumhi = _mm_add_epi32(ahii, bhii); + const __m128 rlo = 
_mm_castsi128_ps(sumlo); + const __m128 rhi = _mm_castsi128_ps(sumhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i difflo = _mm_sub_epi32(aloi, bloi); + const __m128i diffhi = _mm_sub_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(difflo); + const __m128 rhi = _mm_castsi128_ps(diffhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_neg(simd256_avx_t _a) + { + const __m128i zero = _mm_setzero_si128(); + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i difflo = _mm_sub_epi32(zero, aloi); + const __m128i diffhi = _mm_sub_epi32(zero, ahii); + const __m128 rlo = _mm_castsi128_ps(difflo); + const __m128 rhi = _mm_castsi128_ps(diffhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_abs(simd256_avx_t _a) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i abslo = 
_mm_abs_epi32(aloi); + const __m128i abshi = _mm_abs_epi32(ahii); + const __m128 rlo = _mm_castsi128_ps(abslo); + const __m128 rhi = _mm_castsi128_ps(abshi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_min(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i minvlo = _mm_min_epi32(aloi, bloi); + const __m128i minvhi = _mm_min_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(minvlo); + const __m128 rhi = _mm_castsi128_ps(minvhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_max(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i maxvlo = _mm_max_epi32(aloi, bloi); + const __m128i maxvhi = _mm_max_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(maxvlo); + const __m128 rhi = _mm_castsi128_ps(maxvhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t 
simd256_i32_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i cmplo = _mm_cmpeq_epi32(aloi, bloi); + const __m128i cmphi = _mm_cmpeq_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(cmplo); + const __m128 rhi = _mm_castsi128_ps(cmphi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_cmpgt(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i cmplo = _mm_cmpgt_epi32(aloi, bloi); + const __m128i cmphi = _mm_cmpgt_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(cmplo); + const __m128 rhi = _mm_castsi128_ps(cmphi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_cmplt(simd256_avx_t _a, simd256_avx_t _b) + { + return simd256_i32_cmpgt(_b, _a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i32_clamp(simd256_avx_t _a, simd256_avx_t _min, simd256_avx_t _max) + { + const simd256_avx_t maxed = simd256_i32_max(_a, _min); + const simd256_avx_t result = simd256_i32_min(maxed, 
_max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i sumlo = _mm_add_epi32(aloi, bloi); + const __m128i sumhi = _mm_add_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(sumlo); + const __m128 rhi = _mm_castsi128_ps(sumhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i difflo = _mm_sub_epi32(aloi, bloi); + const __m128i diffhi = _mm_sub_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(difflo); + const __m128 rhi = _mm_castsi128_ps(diffhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_mul(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i 
aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i prodlo = _mm_mullo_epi32(aloi, bloi); + const __m128i prodhi = _mm_mullo_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(prodlo); + const __m128 rhi = _mm_castsi128_ps(prodhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_min(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i minvlo = _mm_min_epu32(aloi, bloi); + const __m128i minvhi = _mm_min_epu32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(minvlo); + const __m128 rhi = _mm_castsi128_ps(minvhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_max(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i maxvlo = _mm_max_epu32(aloi, bloi); + const __m128i maxvhi = _mm_max_epu32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(maxvlo); + const __m128 rhi = 
_mm_castsi128_ps(maxvhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_clamp(simd256_avx_t _a, simd256_avx_t _min, simd256_avx_t _max) + { + const simd256_avx_t maxed = simd256_u32_max(_a, _min); + const simd256_avx_t result = simd256_u32_min(maxed, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i cmplo = _mm_cmpeq_epi32(aloi, bloi); + const __m128i cmphi = _mm_cmpeq_epi32(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(cmplo); + const __m128 rhi = _mm_castsi128_ps(cmphi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_cmplt(simd256_avx_t _a, simd256_avx_t _b) + { + return simd_u32_cmplt_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u32_cmpgt(simd256_avx_t _a, simd256_avx_t _b) + { + return simd_u32_cmpgt_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sll(simd256_avx_t _a, int _count) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i shiftedlo = _mm_slli_epi32(aloi, _count); + const __m128i shiftedhi = _mm_slli_epi32(ahii, _count); + const __m128 rlo = 
_mm_castsi128_ps(shiftedlo); + const __m128 rhi = _mm_castsi128_ps(shiftedhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_srl(simd256_avx_t _a, int _count) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i shiftedlo = _mm_srli_epi32(aloi, _count); + const __m128i shiftedhi = _mm_srli_epi32(ahii, _count); + const __m128 rlo = _mm_castsi128_ps(shiftedlo); + const __m128 rhi = _mm_castsi128_ps(shiftedhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sra(simd256_avx_t _a, int _count) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i shiftedlo = _mm_srai_epi32(aloi, _count); + const __m128i shiftedhi = _mm_srai_epi32(ahii, _count); + const __m128 rlo = _mm_castsi128_ps(shiftedlo); + const __m128 rhi = _mm_castsi128_ps(shiftedhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sll(simd256_avx_t _a, simd256_avx_t _count) + { + return simd_x32_sll_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_srl(simd256_avx_t _a, simd256_avx_t _count) + { + return simd_x32_srl_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x32_sra(simd256_avx_t _a, simd256_avx_t _count) + { + return simd_x32_sra_ni(_a, _count); + } + + template<> + 
BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x8_shuffle(simd256_avx_t _a, simd256_avx_t _indices) + { + // No AVX2: do PSHUFB on each 128-bit half independently. + const __m128 aloF = _mm256_castps256_ps128(_a); + const __m128 ahiF = _mm256_extractf128_ps(_a, 1); + const __m128 iloF = _mm256_castps256_ps128(_indices); + const __m128 ihiF = _mm256_extractf128_ps(_indices, 1); + const __m128i alo = _mm_castps_si128(aloF); + const __m128i ahi = _mm_castps_si128(ahiF); + const __m128i ilo = _mm_castps_si128(iloF); + const __m128i ihi = _mm_castps_si128(ihiF); + const __m128i shuflo = _mm_shuffle_epi8(alo, ilo); + const __m128i shufhi = _mm_shuffle_epi8(ahi, ihi); + const __m128 rlo = _mm_castsi128_ps(shuflo); + const __m128 rhi = _mm_castsi128_ps(shufhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x8_shuffle(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _indices) + { + return simd_x8_shuffle_ni(_a, _b, _indices); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i sumlo = _mm_add_epi16(aloi, bloi); + const __m128i sumhi = _mm_add_epi16(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(sumlo); + const __m128 rhi = _mm_castsi128_ps(sumhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_sub(simd256_avx_t _a, 
simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i difflo = _mm_sub_epi16(aloi, bloi); + const __m128i diffhi = _mm_sub_epi16(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(difflo); + const __m128 rhi = _mm_castsi128_ps(diffhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_mullo(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i prodlo = _mm_mullo_epi16(aloi, bloi); + const __m128i prodhi = _mm_mullo_epi16(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(prodlo); + const __m128 rhi = _mm_castsi128_ps(prodhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i16_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = 
_mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i cmplo = _mm_cmpeq_epi16(aloi, bloi); + const __m128i cmphi = _mm_cmpeq_epi16(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(cmplo); + const __m128 rhi = _mm_castsi128_ps(cmphi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x16_sll(simd256_avx_t _a, int _count) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i shiftedlo = _mm_slli_epi16(aloi, _count); + const __m128i shiftedhi = _mm_slli_epi16(ahii, _count); + const __m128 rlo = _mm_castsi128_ps(shiftedlo); + const __m128 rhi = _mm_castsi128_ps(shiftedhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_x16_srl(simd256_avx_t _a, int _count) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i shiftedlo = _mm_srli_epi16(aloi, _count); + const __m128i shiftedhi = _mm_srli_epi16(ahii, _count); + const __m128 rlo = _mm_castsi128_ps(shiftedlo); + const __m128 rhi = _mm_castsi128_ps(shiftedhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i8_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 
1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i sumlo = _mm_add_epi8(aloi, bloi); + const __m128i sumhi = _mm_add_epi8(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(sumlo); + const __m128 rhi = _mm_castsi128_ps(sumhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i8_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i difflo = _mm_sub_epi8(aloi, bloi); + const __m128i diffhi = _mm_sub_epi8(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(difflo); + const __m128 rhi = _mm_castsi128_ps(diffhi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u8_satadd(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i satlo = _mm_adds_epu8(aloi, bloi); + const __m128i sathi = _mm_adds_epu8(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(satlo); + const __m128 rhi 
= _mm_castsi128_ps(sathi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u8_satsub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i satlo = _mm_subs_epu8(aloi, bloi); + const __m128i sathi = _mm_subs_epu8(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(satlo); + const __m128 rhi = _mm_castsi128_ps(sathi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u16_satadd(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = _mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i satlo = _mm_adds_epu16(aloi, bloi); + const __m128i sathi = _mm_adds_epu16(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(satlo); + const __m128 rhi = _mm_castsi128_ps(sathi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u16_satsub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m128 alo = _mm256_castps256_ps128(_a); + const __m128 ahi = 
_mm256_extractf128_ps(_a, 1); + const __m128 blo = _mm256_castps256_ps128(_b); + const __m128 bhi = _mm256_extractf128_ps(_b, 1); + const __m128i aloi = _mm_castps_si128(alo); + const __m128i ahii = _mm_castps_si128(ahi); + const __m128i bloi = _mm_castps_si128(blo); + const __m128i bhii = _mm_castps_si128(bhi); + const __m128i satlo = _mm_subs_epu16(aloi, bloi); + const __m128i sathi = _mm_subs_epu16(ahii, bhii); + const __m128 rlo = _mm_castsi128_ps(satlo); + const __m128 rhi = _mm_castsi128_ps(sathi); + const __m256 rlo256 = _mm256_castps128_ps256(rlo); + const __m256 result = _mm256_insertf128_ps(rlo256, rhi, 1); + return result; + } + +#endif // BX_SIMD_AVX2 + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_add_pd(a, b); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_sub_pd(a, b); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_mul(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_mul_pd(a, b); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_div(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_div_pd(a, b); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_min(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = 
_mm256_min_pd(a, b); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_max(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_max_pd(a, b); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_madd(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _c) + { + return simd_f64_madd_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_nmsub(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _c) + { + return simd_f64_nmsub_ni(_a, _b, _c); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_neg(simd256_avx_t _a) + { + const __m256i signi = _mm256_set1_epi64x(int64_t(kDoubleSignMask) ); + const __m256d signd = _mm256_castsi256_pd(signi); + const __m256d a = _mm256_castps_pd(_a); + const __m256d r = _mm256_xor_pd(a, signd); + return _mm256_castpd_ps(r); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_abs(simd256_avx_t _a) + { + const __m256i maski = _mm256_set1_epi64x(int64_t(kDoubleExponentMask | kDoubleMantissaMask) ); + const __m256d maskd = _mm256_castsi256_pd(maski); + const __m256d a = _mm256_castps_pd(_a); + const __m256d r = _mm256_and_pd(a, maskd); + return _mm256_castpd_ps(r); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_clamp(simd256_avx_t _a, simd256_avx_t _min, simd256_avx_t _max) + { + const simd256_avx_t maxed = simd256_f64_max(_a, _min); + const simd256_avx_t result = simd256_f64_min(maxed, _max); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_lerp(simd256_avx_t _a, simd256_avx_t _b, simd256_avx_t _s) + { + return simd_f64_lerp_ni(_a, _b, _s); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_rcp(simd256_avx_t _a) + { + return simd_f64_rcp_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t 
simd256_f64_sqrt(simd256_avx_t _a) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d result = _mm256_sqrt_pd(a); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_rsqrt(simd256_avx_t _a) + { + return simd_f64_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_round(simd256_avx_t _a) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d result = _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_ceil(simd256_avx_t _a) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d result = _mm256_ceil_pd(a); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_floor(simd256_avx_t _a) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d result = _mm256_floor_pd(a); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_cmpeq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_cmp_pd(a, b, _CMP_EQ_OQ); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_cmpneq(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_cmp_pd(a, b, _CMP_NEQ_UQ); /* unordered predicate: a lane holding NaN reports not-equal, matching operator!= and the complement of cmpeq */ + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_cmplt(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_cmp_pd(a, b, _CMP_LT_OQ); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_cmple(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = 
_mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_cmp_pd(a, b, _CMP_LE_OQ); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_cmpgt(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_cmp_pd(a, b, _CMP_GT_OQ); + return _mm256_castpd_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_f64_cmpge(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256d a = _mm256_castps_pd(_a); + const __m256d b = _mm256_castps_pd(_b); + const __m256d result = _mm256_cmp_pd(a, b, _CMP_GE_OQ); + return _mm256_castpd_ps(result); + } + +#if BX_SIMD_AVX2 + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i64_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i result = _mm256_add_epi64(a, b); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_i64_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i result = _mm256_sub_epi64(a, b); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u64_add(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i result = _mm256_add_epi64(a, b); + return _mm256_castsi256_ps(result); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_avx_t simd256_u64_sub(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i a = _mm256_castps_si256(_a); + const __m256i b = _mm256_castps_si256(_b); + const __m256i result = _mm256_sub_epi64(a, b); + return _mm256_castsi256_ps(result); + } + +#endif // BX_SIMD_AVX2 + + template<> + BX_SIMD_FORCE_INLINE bool 
simd256_test_any(simd256_avx_t _test) + { + return 0 != _mm256_movemask_ps(_test); + } + + template<> + BX_SIMD_FORCE_INLINE bool simd256_test_all(simd256_avx_t _test) + { + return 0xff == _mm256_movemask_ps(_test); + } + + template<> + BX_SIMD_FORCE_INLINE bool simd256_test_zero(simd256_avx_t _a, simd256_avx_t _b) + { + const __m256i ai = _mm256_castps_si256(_a); + const __m256i bi = _mm256_castps_si256(_b); + return 0 != _mm256_testz_si256(ai, bi); + } + + template<> + BX_SIMD_FORCE_INLINE int simd256_x32_signbitsmask(simd256_avx_t _a) + { + return _mm256_movemask_ps(_a); + } + + template<> + BX_SIMD_FORCE_INLINE int simd256_x8_signbitsmask(simd256_avx_t _a) + { +#if BX_SIMD_AVX2 + const __m256i ai = _mm256_castps_si256(_a); + const int result = _mm256_movemask_epi8(ai); + return result; +#else + const __m128 lo = _mm256_castps256_ps128(_a); + const __m128 hi = _mm256_extractf128_ps(_a, 1); + const __m128i loi = _mm_castps_si128(lo); + const __m128i hii = _mm_castps_si128(hi); + const int maskLo = _mm_movemask_epi8(loi); + const int maskHi = _mm_movemask_epi8(hii); + const int result = maskLo | (maskHi << 16); + return result; +#endif + } + + typedef simd256_avx_t simd256_t; } // namespace bx diff --git a/include/bx/inline/simd256_ref.inl b/include/bx/inline/simd256_ref.inl index 433da5c..b1bcf5f 100644 --- a/include/bx/inline/simd256_ref.inl +++ b/include/bx/inline/simd256_ref.inl @@ -9,76 +9,1545 @@ namespace bx { + // 256-bit reference delegates to two 128-bit operations. 
+ +#if BX_SIMD_LANGEXT + typedef float simd256_f32_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef double simd256_f64_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef int8_t simd256_i8_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef int16_t simd256_i16_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef int32_t simd256_i32_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef int64_t simd256_i64_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef uint8_t simd256_u8_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef uint16_t simd256_u16_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef uint32_t simd256_u32_langext_t __attribute__((__vector_size__(32), __aligned__(32))); + typedef uint64_t simd256_u64_langext_t __attribute__((__vector_size__(32), __aligned__(32))); +#endif // BX_SIMD_LANGEXT + +#if !BX_SIMD_AVX + template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_ld(const void* _ptr) + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_ld(const void* _ptr) { - const simd256_ref_t::type* ptr = reinterpret_cast(_ptr); simd256_ref_t result; - result.simd128_0 = simd_ld(&ptr[0]); - result.simd128_1 = simd_ld(&ptr[1]); + memCopy(&result, _ptr, sizeof(simd256_ref_t) ); return result; } template<> - BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd256_ref_t& _a) + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_ldu(const void* _ptr) { - simd256_ref_t* result = reinterpret_cast(_ptr); - simd_st(&result[0], _a.simd128_0); - simd_st(&result[1], _a.simd128_1); + return simd256_ld(_ptr); } template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_ld(float _x, float _y, float _z, float _w, float _a, float _b, float _c, float _d) + BX_SIMD_FORCE_INLINE void simd256_st(void* _ptr, simd256_ref_t _a) + { + memCopy(_ptr, &_a, sizeof(simd256_ref_t) ); + } + + template<> + BX_SIMD_FORCE_INLINE void 
simd256_stu(void* _ptr, simd256_ref_t _a) + { + simd256_st(_ptr, _a); + } + + template<> + BX_SIMD_FORCE_INLINE void simd256_stream(void* _ptr, simd256_ref_t _a) + { + simd256_st(_ptr, _a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_ld(float _x0, float _x1, float _x2, float _x3, float _x4, float _x5, float _x6, float _x7) { simd256_ref_t result; - result.simd128_0 = simd_ld(_x, _y, _z, _w); - result.simd128_1 = simd_ld(_a, _b, _c, _d); + result.lo = simd128_ld(_x0, _x1, _x2, _x3); + result.hi = simd128_ld(_x4, _x5, _x6, _x7); return result; } template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _a, uint32_t _b, uint32_t _c, uint32_t _d) + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_ld(int32_t _x0, int32_t _x1, int32_t _x2, int32_t _x3, int32_t _x4, int32_t _x5, int32_t _x6, int32_t _x7) { simd256_ref_t result; - result.simd128_0 = simd_ild(_x, _y, _z, _w); - result.simd128_1 = simd_ild(_a, _b, _c, _d); + result.lo = simd128_ld(_x0, _x1, _x2, _x3); + result.hi = simd128_ld(_x4, _x5, _x6, _x7); return result; } template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_splat(float _a) + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_ld(uint32_t _x0, uint32_t _x1, uint32_t _x2, uint32_t _x3, uint32_t _x4, uint32_t _x5, uint32_t _x6, uint32_t _x7) { simd256_ref_t result; - result.simd128_0 = simd_splat(_a); - result.simd128_1 = simd_splat(_a); + result.lo = simd128_ld(_x0, _x1, _x2, _x3); + result.hi = simd128_ld(_x4, _x5, _x6, _x7); return result; } template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_isplat(uint32_t _a) + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_splat(float _a) { simd256_ref_t result; - result.simd128_0 = simd_isplat(_a); - result.simd128_1 = simd_isplat(_a); + result.lo = simd128_splat(_a); + result.hi = result.lo; return result; } template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_itof(simd256_ref_t _a) + inline BX_CONSTEXPR_FUNC simd256_ref_t 
simd256_splat(int32_t _a) { simd256_ref_t result; - result.simd128_0 = simd_itof(_a.simd128_0); - result.simd128_1 = simd_itof(_a.simd128_1); + result.lo = simd128_splat(_a); + result.hi = result.lo; return result; } template<> - BX_SIMD_FORCE_INLINE simd256_ref_t simd_ftoi(simd256_ref_t _a) + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_splat(uint32_t _a) { simd256_ref_t result; - result.simd128_0 = simd_ftoi(_a.simd128_0); - result.simd128_1 = simd_ftoi(_a.simd128_1); + result.lo = simd128_splat(_a); + result.hi = result.lo; return result; } + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_splat(double _a) + { + simd256_ref_t result; + result.lo = simd128_splat(_a); + result.hi = result.lo; + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_splat(int16_t _a) + { + simd256_ref_t result; + result.lo = simd128_splat(_a); + result.hi = result.lo; + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_splat(uint16_t _a) + { + simd256_ref_t result; + result.lo = simd128_splat(_a); + result.hi = result.lo; + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_zero() + { + simd256_ref_t result; + result.lo = simd128_zero(); + result.hi = result.lo; + return result; + } + + template<> + inline BX_CONST_FUNC simd256_ref_t simd256_i32_itof(simd256_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_f32_langext_t converted = __builtin_convertvector(a, simd256_f32_langext_t); + const simd256_ref_t result = bitCast(converted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i32_itof(_a.lo); + result.hi = simd128_i32_itof(_a.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONST_FUNC simd256_ref_t simd256_f32_ftoi_trunc(simd256_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_i32_langext_t converted = 
__builtin_convertvector(a, simd256_i32_langext_t); + const simd256_ref_t result = bitCast(converted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_ftoi_trunc(_a.lo); + result.hi = simd128_f32_ftoi_trunc(_a.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_ftoi_round(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f32_ftoi_round(_a.lo); + result.hi = simd128_f32_ftoi_round(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_add(_a.lo, _b.lo); + result.hi = simd128_f32_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_sub(_a.lo, _b.lo); + result.hi = simd128_f32_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_mul(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t prod = a * b; + const simd256_ref_t result = bitCast(prod); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_mul(_a.lo, _b.lo); + result.hi = simd128_f32_mul(_a.hi, _b.hi); + return result; 
+#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_div(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t quot = a / b; + const simd256_ref_t result = bitCast(quot); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_div(_a.lo, _b.lo); + result.hi = simd128_f32_div(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_madd(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t c = bitCast(_c); + const simd256_f32_langext_t prod = a * b; + const simd256_f32_langext_t sum = prod + c; + const simd256_ref_t result = bitCast(sum); + return result; +#else + return simd_f32_madd_ni(_a, _b, _c); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_msub(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t c = bitCast(_c); + const simd256_f32_langext_t prod = a * b; + const simd256_f32_langext_t diff = prod - c; + const simd256_ref_t result = bitCast(diff); + return result; +#else + return simd_f32_msub_ni(_a, _b, _c); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_nmsub(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t c = bitCast(_c); + const simd256_f32_langext_t prod = a * b; + const simd256_f32_langext_t diff = c - prod; + const simd256_ref_t result = 
bitCast(diff); + return result; +#else + return simd_f32_nmsub_ni(_a, _b, _c); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_neg(simd256_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t neg = -a; + const simd256_ref_t result = bitCast(neg); + return result; +#else + return simd_f32_neg_ni(_a); +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_abs(simd256_ref_t _a) + { + return simd_f32_abs_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_min(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_f32_min(_a.lo, _b.lo); + result.hi = simd128_f32_min(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_max(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_f32_max(_a.lo, _b.lo); + result.hi = simd128_f32_max(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_clamp(simd256_ref_t _a, simd256_ref_t _min, simd256_ref_t _max) + { + const simd256_ref_t maxed = simd256_f32_max(_a, _min); + const simd256_ref_t result = simd256_f32_min(maxed, _max); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_lerp(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _s) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t s = bitCast(_s); + const simd256_f32_langext_t diff = b - a; + const simd256_f32_langext_t scaled = diff * s; + const simd256_f32_langext_t sum = a + scaled; + const simd256_ref_t result = bitCast(sum); + return result; +#else + return simd_f32_lerp_ni(_a, _b, _s); +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_rcp(simd256_ref_t _a) + { + 
return simd_f32_rcp_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_rcp_est(simd256_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t one = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t quot = one / a; + const simd256_ref_t result = bitCast(quot); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_rcp_est(_a.lo); + result.hi = simd128_f32_rcp_est(_a.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_sqrt(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f32_sqrt(_a.lo); + result.hi = simd128_f32_sqrt(_a.hi); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_rsqrt(simd256_ref_t _a) + { + return simd_f32_rsqrt_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_rsqrt_est(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f32_rsqrt_est(_a.lo); + result.hi = simd128_f32_rsqrt_est(_a.hi); + return result; + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_round(simd256_ref_t _a) + { + return simd_f32_round_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_ceil(simd256_ref_t _a) + { + return simd_f32_ceil_ni(_a); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_f32_floor(simd256_ref_t _a) + { + return simd_f32_floor_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_cmpeq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t cmp = a == b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_cmpeq(_a.lo, _b.lo); + result.hi = simd128_f32_cmpeq(_a.hi, _b.hi); + return result; +#endif // 
BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_cmpneq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t cmp = a != b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + return simd_f32_cmpneq_ni(_a, _b); +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_cmplt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t cmp = a < b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_cmplt(_a.lo, _b.lo); + result.hi = simd128_f32_cmplt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f32_cmpgt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f32_langext_t a = bitCast(_a); + const simd256_f32_langext_t b = bitCast(_b); + const simd256_f32_langext_t cmp = a > b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f32_cmpgt(_a.lo, _b.lo); + result.hi = simd128_f32_cmpgt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t b = bitCast(_b); + const simd256_i32_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i32_add(_a.lo, _b.lo); + result.hi = simd128_i32_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC 
simd256_ref_t simd256_i32_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t b = bitCast(_b); + const simd256_i32_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i32_sub(_a.lo, _b.lo); + result.hi = simd128_i32_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_neg(simd256_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t neg = -a; + const simd256_ref_t result = bitCast(neg); + return result; +#else + return simd_i32_neg_ni(_a); +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_i32_abs(simd256_ref_t _a) + { + return simd_i32_abs_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_min(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_i32_min(_a.lo, _b.lo); + result.hi = simd128_i32_min(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_max(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_i32_max(_a.lo, _b.lo); + result.hi = simd128_i32_max(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_clamp(simd256_ref_t _a, simd256_ref_t _min, simd256_ref_t _max) + { + const simd256_ref_t maxed = simd256_i32_max(_a, _min); + const simd256_ref_t result = simd256_i32_min(maxed, _max); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_cmpeq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t b = bitCast(_b); + const simd256_i32_langext_t cmp = a == b; + const simd256_ref_t result = 
bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i32_cmpeq(_a.lo, _b.lo); + result.hi = simd128_i32_cmpeq(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_cmplt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t b = bitCast(_b); + const simd256_i32_langext_t cmp = a < b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i32_cmplt(_a.lo, _b.lo); + result.hi = simd128_i32_cmplt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i32_cmpgt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t b = bitCast(_b); + const simd256_i32_langext_t cmp = a > b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i32_cmpgt(_a.lo, _b.lo); + result.hi = simd128_i32_cmpgt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u32_add(_a.lo, _b.lo); + result.hi = simd128_u32_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t diff = a - b; + const 
simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u32_sub(_a.lo, _b.lo); + result.hi = simd128_u32_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_mul(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t prod = a * b; + const simd256_ref_t result = bitCast(prod); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u32_mul(_a.lo, _b.lo); + result.hi = simd128_u32_mul(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_min(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_u32_min(_a.lo, _b.lo); + result.hi = simd128_u32_min(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_max(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_u32_max(_a.lo, _b.lo); + result.hi = simd128_u32_max(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_clamp(simd256_ref_t _a, simd256_ref_t _min, simd256_ref_t _max) + { + const simd256_ref_t maxed = simd256_u32_max(_a, _min); + const simd256_ref_t result = simd256_u32_min(maxed, _max); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_cmpeq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t cmp = a == b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u32_cmpeq(_a.lo, _b.lo); + result.hi = simd128_u32_cmpeq(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT 
+ } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_cmplt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t cmp = a < b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u32_cmplt(_a.lo, _b.lo); + result.hi = simd128_u32_cmplt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u32_cmpgt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t cmp = a > b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u32_cmpgt(_a.lo, _b.lo); + result.hi = simd128_u32_cmpgt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_and(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t masked = a & b; + const simd256_ref_t result = bitCast(masked); + return result; +#else + simd256_ref_t result; + result.lo = simd128_and(_a.lo, _b.lo); + result.hi = simd128_and(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_or(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t ored = a | b; + const simd256_ref_t result = bitCast(ored); + return result; +#else + simd256_ref_t result; + result.lo = simd128_or(_a.lo, _b.lo); + result.hi = simd128_or(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + 
} + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_xor(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t xored = a ^ b; + const simd256_ref_t result = bitCast(xored); + return result; +#else + simd256_ref_t result; + result.lo = simd128_xor(_a.lo, _b.lo); + result.hi = simd128_xor(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_not(simd256_ref_t _a) + { + return simd_not_ni(_a); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_andc(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t b = bitCast(_b); + const simd256_u32_langext_t notb = ~b; + const simd256_u32_langext_t masked = a & notb; + const simd256_ref_t result = bitCast(masked); + return result; +#else + return simd_andc_ni(_a, _b); +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_orc(simd256_ref_t _a, simd256_ref_t _b) + { + return simd_orc_ni(_a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_selb(simd256_ref_t _mask, simd256_ref_t _a, simd256_ref_t _b) + { + return simd_selb_ni(_mask, _a, _b); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_sels(simd256_ref_t _test, simd256_ref_t _a, simd256_ref_t _b) + { + return simd_sels_ni(_test, _a, _b); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_x32_sll(simd256_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t shifted = a << _count; + const simd256_ref_t result = bitCast(shifted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_x32_sll(_a.lo, _count); + result.hi = simd128_x32_sll(_a.hi, _count); + return result; +#endif // BX_SIMD_LANGEXT + } + 
+ template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_x32_srl(simd256_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd256_u32_langext_t a = bitCast(_a); + const simd256_u32_langext_t shifted = a >> _count; + const simd256_ref_t result = bitCast(shifted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_x32_srl(_a.lo, _count); + result.hi = simd128_x32_srl(_a.hi, _count); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_x32_sra(simd256_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd256_i32_langext_t a = bitCast(_a); + const simd256_i32_langext_t shifted = a >> _count; + const simd256_ref_t result = bitCast(shifted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_x32_sra(_a.lo, _count); + result.hi = simd128_x32_sra(_a.hi, _count); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_x32_sll(simd256_ref_t _a, simd256_ref_t _count) + { + return simd_x32_sll_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_x32_srl(simd256_ref_t _a, simd256_ref_t _count) + { + return simd_x32_srl_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_x32_sra(simd256_ref_t _a, simd256_ref_t _count) + { + return simd_x32_sra_ni(_a, _count); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_x8_shuffle(simd256_ref_t _a, simd256_ref_t _indices) + { + return simd_x8_shuffle_ni(_a, _indices); + } + + template<> + BX_SIMD_FORCE_INLINE simd256_ref_t simd256_x8_shuffle(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _indices) + { + return simd_x8_shuffle_ni(_a, _b, _indices); + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t sum 
= a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_add(_a.lo, _b.lo); + result.hi = simd128_f64_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_sub(_a.lo, _b.lo); + result.hi = simd128_f64_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_mul(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t prod = a * b; + const simd256_ref_t result = bitCast(prod); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_mul(_a.lo, _b.lo); + result.hi = simd128_f64_mul(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_div(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t quot = a / b; + const simd256_ref_t result = bitCast(quot); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_div(_a.lo, _b.lo); + result.hi = simd128_f64_div(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_min(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_f64_min(_a.lo, _b.lo); + result.hi = simd128_f64_min(_a.hi, _b.hi); + return result; + } + + 
template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_max(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_f64_max(_a.lo, _b.lo); + result.hi = simd128_f64_max(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_madd(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t c = bitCast(_c); + const simd256_f64_langext_t prod = a * b; + const simd256_f64_langext_t sum = prod + c; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_madd(_a.lo, _b.lo, _c.lo); + result.hi = simd128_f64_madd(_a.hi, _b.hi, _c.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_nmsub(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _c) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t c = bitCast(_c); + const simd256_f64_langext_t prod = a * b; + const simd256_f64_langext_t diff = c - prod; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_nmsub(_a.lo, _b.lo, _c.lo); + result.hi = simd128_f64_nmsub(_a.hi, _b.hi, _c.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_neg(simd256_ref_t _a) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t neg = -a; + const simd256_ref_t result = bitCast(neg); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_neg(_a.lo); + result.hi = simd128_f64_neg(_a.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t 
simd256_f64_abs(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_abs(_a.lo); + result.hi = simd128_f64_abs(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_clamp(simd256_ref_t _a, simd256_ref_t _min, simd256_ref_t _max) + { + const simd256_ref_t maxed = simd256_f64_max(_a, _min); + const simd256_ref_t result = simd256_f64_min(maxed, _max); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_lerp(simd256_ref_t _a, simd256_ref_t _b, simd256_ref_t _s) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t s = bitCast(_s); + const simd256_f64_langext_t diff = b - a; + const simd256_f64_langext_t scaled = diff * s; + const simd256_f64_langext_t sum = a + scaled; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_lerp(_a.lo, _b.lo, _s.lo); + result.hi = simd128_f64_lerp(_a.hi, _b.hi, _s.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_rcp(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_rcp(_a.lo); + result.hi = simd128_f64_rcp(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_sqrt(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_sqrt(_a.lo); + result.hi = simd128_f64_sqrt(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_rsqrt(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_rsqrt(_a.lo); + result.hi = simd128_f64_rsqrt(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_round(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_round(_a.lo); + result.hi = simd128_f64_round(_a.hi); + return result; + } + + 
template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_ceil(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_ceil(_a.lo); + result.hi = simd128_f64_ceil(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_floor(simd256_ref_t _a) + { + simd256_ref_t result; + result.lo = simd128_f64_floor(_a.lo); + result.hi = simd128_f64_floor(_a.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_cmpeq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t cmp = a == b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_cmpeq(_a.lo, _b.lo); + result.hi = simd128_f64_cmpeq(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_cmpneq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t cmp = a != b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_cmpneq(_a.lo, _b.lo); + result.hi = simd128_f64_cmpneq(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_cmplt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t cmp = a < b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_cmplt(_a.lo, _b.lo); + result.hi = simd128_f64_cmplt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline 
BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_cmple(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t cmp = a <= b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_cmple(_a.lo, _b.lo); + result.hi = simd128_f64_cmple(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_cmpgt(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t cmp = a > b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_cmpgt(_a.lo, _b.lo); + result.hi = simd128_f64_cmpgt(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_f64_cmpge(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_f64_langext_t a = bitCast(_a); + const simd256_f64_langext_t b = bitCast(_b); + const simd256_f64_langext_t cmp = a >= b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_f64_cmpge(_a.lo, _b.lo); + result.hi = simd128_f64_cmpge(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i64_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i64_langext_t a = bitCast(_a); + const simd256_i64_langext_t b = bitCast(_b); + const simd256_i64_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i64_add(_a.lo, _b.lo); + result.hi = simd128_i64_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } 
+ + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i64_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i64_langext_t a = bitCast(_a); + const simd256_i64_langext_t b = bitCast(_b); + const simd256_i64_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i64_sub(_a.lo, _b.lo); + result.hi = simd128_i64_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u64_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u64_langext_t a = bitCast(_a); + const simd256_u64_langext_t b = bitCast(_b); + const simd256_u64_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u64_add(_a.lo, _b.lo); + result.hi = simd128_u64_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u64_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_u64_langext_t a = bitCast(_a); + const simd256_u64_langext_t b = bitCast(_b); + const simd256_u64_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_u64_sub(_a.lo, _b.lo); + result.hi = simd128_u64_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i16_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i16_langext_t a = bitCast(_a); + const simd256_i16_langext_t b = bitCast(_b); + const simd256_i16_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i16_add(_a.lo, _b.lo); + result.hi = simd128_i16_add(_a.hi, _b.hi); + return result; +#endif // 
BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i16_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i16_langext_t a = bitCast(_a); + const simd256_i16_langext_t b = bitCast(_b); + const simd256_i16_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i16_sub(_a.lo, _b.lo); + result.hi = simd128_i16_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i16_mullo(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i16_langext_t a = bitCast(_a); + const simd256_i16_langext_t b = bitCast(_b); + const simd256_i16_langext_t prod = a * b; + const simd256_ref_t result = bitCast(prod); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i16_mullo(_a.lo, _b.lo); + result.hi = simd128_i16_mullo(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i16_cmpeq(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i16_langext_t a = bitCast(_a); + const simd256_i16_langext_t b = bitCast(_b); + const simd256_i16_langext_t cmp = a == b; + const simd256_ref_t result = bitCast(cmp); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i16_cmpeq(_a.lo, _b.lo); + result.hi = simd128_i16_cmpeq(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_x16_sll(simd256_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd256_u16_langext_t a = bitCast(_a); + const simd256_u16_langext_t shifted = a << _count; + const simd256_ref_t result = bitCast(shifted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_x16_sll(_a.lo, _count); + result.hi = simd128_x16_sll(_a.hi, _count); + return result; +#endif // BX_SIMD_LANGEXT 
+ } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_x16_srl(simd256_ref_t _a, int _count) + { +#if BX_SIMD_LANGEXT + const simd256_u16_langext_t a = bitCast(_a); + const simd256_u16_langext_t shifted = a >> _count; + const simd256_ref_t result = bitCast(shifted); + return result; +#else + simd256_ref_t result; + result.lo = simd128_x16_srl(_a.lo, _count); + result.hi = simd128_x16_srl(_a.hi, _count); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i8_add(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i8_langext_t a = bitCast(_a); + const simd256_i8_langext_t b = bitCast(_b); + const simd256_i8_langext_t sum = a + b; + const simd256_ref_t result = bitCast(sum); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i8_add(_a.lo, _b.lo); + result.hi = simd128_i8_add(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_i8_sub(simd256_ref_t _a, simd256_ref_t _b) + { +#if BX_SIMD_LANGEXT + const simd256_i8_langext_t a = bitCast(_a); + const simd256_i8_langext_t b = bitCast(_b); + const simd256_i8_langext_t diff = a - b; + const simd256_ref_t result = bitCast(diff); + return result; +#else + simd256_ref_t result; + result.lo = simd128_i8_sub(_a.lo, _b.lo); + result.hi = simd128_i8_sub(_a.hi, _b.hi); + return result; +#endif // BX_SIMD_LANGEXT + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u8_satadd(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_u8_satadd(_a.lo, _b.lo); + result.hi = simd128_u8_satadd(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u8_satsub(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_u8_satsub(_a.lo, _b.lo); + result.hi = simd128_u8_satsub(_a.hi, _b.hi); + return result; + } + + template<> + inline 
BX_CONSTEXPR_FUNC simd256_ref_t simd256_u16_satadd(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_u16_satadd(_a.lo, _b.lo); + result.hi = simd128_u16_satadd(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC simd256_ref_t simd256_u16_satsub(simd256_ref_t _a, simd256_ref_t _b) + { + simd256_ref_t result; + result.lo = simd128_u16_satsub(_a.lo, _b.lo); + result.hi = simd128_u16_satsub(_a.hi, _b.hi); + return result; + } + + template<> + inline BX_CONSTEXPR_FUNC bool simd256_test_any(simd256_ref_t _test) + { + return simd128_test_any_xyzw(_test.lo) || simd128_test_any_xyzw(_test.hi); + } + + template<> + inline BX_CONSTEXPR_FUNC bool simd256_test_all(simd256_ref_t _test) + { + return simd128_test_all_xyzw(_test.lo) && simd128_test_all_xyzw(_test.hi); + } + + template<> + inline BX_CONSTEXPR_FUNC bool simd256_test_zero(simd256_ref_t _a, simd256_ref_t _b) + { + return simd128_test_zero(_a.lo, _b.lo) && simd128_test_zero(_a.hi, _b.hi); + } + + template<> + BX_SIMD_FORCE_INLINE int simd256_x32_signbitsmask(simd256_ref_t _a) + { + float tmp[8]; + simd128_st(&tmp[0], _a.lo); + simd128_st(&tmp[4], _a.hi); + const uint32_t* bits = reinterpret_cast(tmp); + int result = 0; + for (int ii = 0; ii < 8; ++ii) + { + result |= ((bits[ii] >> 31) << ii); + } + return result; + } + + template<> + BX_SIMD_FORCE_INLINE int simd256_x8_signbitsmask(simd256_ref_t _a) + { + float tmp[8]; + simd128_st(&tmp[0], _a.lo); + simd128_st(&tmp[4], _a.hi); + const uint8_t* bytes = reinterpret_cast(tmp); + int result = 0; + for (int ii = 0; ii < 32; ++ii) + { + result |= ((bytes[ii] >> 7) << ii); + } + return result; + } + + template<> + BX_SIMD_FORCE_INLINE void simd256_x32_st1(void* _ptr, simd256_ref_t _a) + { + simd128_x32_st1(_ptr, _a.lo); + } + +#endif // !BX_SIMD_AVX + } // namespace bx diff --git a/include/bx/inline/simd32_ref.inl b/include/bx/inline/simd32_ref.inl new file mode 100644 index 0000000..0569438 --- /dev/null +++ 
b/include/bx/inline/simd32_ref.inl @@ -0,0 +1,839 @@ +/* + * Copyright 2010-2026 Branimir Karadzic. All rights reserved. + * License: https://github.com/bkaradzic/bx/blob/master/LICENSE + */ + +#ifndef BX_SIMD_T_H_HEADER_GUARD +# error "Must be included from bx/simd_t.h!" +#endif // BX_SIMD_T_H_HEADER_GUARD + +namespace bx +{ + BX_ALIGN_DECL(4, struct) simd32_f32_ref_t { float f32; }; + BX_ALIGN_DECL(4, struct) simd32_i32_ref_t { int32_t i32; }; + + BX_SIMD_FORCE_INLINE simd32_t simd32_ld(const void* _ptr) + { + simd32_t result; + memCopy(&result, _ptr, sizeof(simd32_t) ); + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_ld(float _x) + { + const simd32_f32_ref_t result = { .f32 = _x }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_ld(int32_t _x) + { + return { .u32 = uint32_t(_x) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_ld(uint32_t _x) + { + return { .u32 = _x }; + } + + BX_SIMD_FORCE_INLINE void simd32_st(void* _ptr, simd32_t _a) + { + memCopy(_ptr, &_a, sizeof(simd32_t) ); + } + + BX_SIMD_FORCE_INLINE void simd32_x32_st1(void* _ptr, simd32_t _a) + { + simd32_t* result = reinterpret_cast(_ptr); + *result = _a; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_zero() + { + return { .u32 = 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(float _a) + { + const simd32_f32_ref_t result = { .f32 = _a }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(uint32_t _a) + { + return { .u32 = _a }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(int32_t _a) + { + return { .u32 = uint32_t(_a) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(uint16_t _a) + { + const uint32_t val = uint32_t(_a); + return { .u32 = (val << 16) | val }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(int16_t _a) + { + return simd32_splat(uint16_t(_a) ); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(uint8_t _a) + { + const uint32_t val = uint32_t(_a); + return { .u32 = 
(val << 24) | (val << 16) | (val << 8) | val }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_splat(int8_t _a) + { + return simd32_splat(uint8_t(_a) ); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_add(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t result = { .f32 = a.f32 + b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_sub(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t result = { .f32 = a.f32 - b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_mul(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t result = { .f32 = a.f32 * b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_div(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t result = { .f32 = a.f32 / b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_madd(simd32_t _a, simd32_t _b, simd32_t _c) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t c = bitCast(_c); + const simd32_f32_ref_t result = { .f32 = a.f32 * b.f32 + c.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_msub(simd32_t _a, simd32_t _b, simd32_t _c) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t c = bitCast(_c); + const simd32_f32_ref_t result = { .f32 = a.f32 * b.f32 - c.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_nmsub(simd32_t _a, simd32_t _b, simd32_t _c) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = 
bitCast(_b); + const simd32_f32_ref_t c = bitCast(_c); + const simd32_f32_ref_t result = { .f32 = c.f32 - a.f32 * b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC int simd32_x32_signbitsmask(simd32_t _a) + { + return int(_a.u32 >> 31); + } + + inline BX_CONSTEXPR_FUNC int simd32_x8_signbitsmask(simd32_t _a) + { + return int( (_a.u32 >> 7) & 1) + | (int( (_a.u32 >> 15) & 1) << 1) + | (int( (_a.u32 >> 23) & 1) << 2) + | (int( _a.u32 >> 31) << 3) + ; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x8_shuffle(simd32_t _a, simd32_t _indices) + { + uint32_t out = 0; + for (uint32_t ii = 0; ii < 4; ++ii) + { + const uint32_t idx = (_indices.u32 >> (ii*8) ) & 0xffu; + const uint32_t byte = (idx & 0x80u) + ? 0u + : (_a.u32 >> ( (idx & 0x03u) * 8) ) & 0xffu; + out |= byte << (ii*8); + } + return { .u32 = out }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x8_shuffle(simd32_t _a, simd32_t _b, simd32_t _indices) + { + uint32_t out = 0; + for (uint32_t ii = 0; ii < 4; ++ii) + { + const uint32_t idx = (_indices.u32 >> (ii*8) ) & 0xffu; + uint32_t byte = 0; + if (0 == (idx & 0x80u) ) + { + const uint32_t sel = idx & 0x07u; + const uint32_t src = sel < 4u ? _a.u32 : _b.u32; + byte = (src >> ( (sel & 0x03u) * 8) ) & 0xffu; + } + out |= byte << (ii*8); + } + return { .u32 = out }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_neg(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = -a.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_abs(simd32_t _a) + { + return { .u32 = _a.u32 & (kFloatExponentMask | kFloatMantissaMask) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_min(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t result = { .f32 = a.f32 < b.f32 ? 
a.f32 : b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_max(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t result = { .f32 = a.f32 > b.f32 ? a.f32 : b.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_clamp(simd32_t _a, simd32_t _min, simd32_t _max) + { + return simd32_f32_min(simd32_f32_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_lerp(simd32_t _a, simd32_t _b, simd32_t _s) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + const simd32_f32_ref_t s = bitCast(_s); + const simd32_f32_ref_t result = { .f32 = a.f32 + (b.f32 - a.f32) * s.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_rcp(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = 1.0f / a.f32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_sqrt(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = sqrt(a.f32) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_rsqrt(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = 1.0f / sqrt(a.f32) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_round(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = round(a.f32) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_ceil(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = ceil(a.f32) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_floor(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = floor(a.f32) 
}; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpeq(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + return { .u32 = a.f32 == b.f32 ? UINT32_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpneq(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + return { .u32 = a.f32 != b.f32 ? UINT32_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmplt(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + return { .u32 = a.f32 < b.f32 ? UINT32_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmple(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + return { .u32 = a.f32 <= b.f32 ? UINT32_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpgt(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + return { .u32 = a.f32 > b.f32 ? UINT32_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpge(simd32_t _a, simd32_t _b) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_f32_ref_t b = bitCast(_b); + return { .u32 = a.f32 >= b.f32 ? 
UINT32_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_add(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 + _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_sub(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 - _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_mul(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 * _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_div(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 / _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_mod(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 % _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_min(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 < _b.u32 ? _a.u32 : _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_max(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 > _b.u32 ? _a.u32 : _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_clamp(simd32_t _a, simd32_t _min, simd32_t _max) + { + return simd32_u32_min(simd32_u32_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_add(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + const simd32_i32_ref_t result = { .i32 = a.i32 + b.i32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_sub(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + const simd32_i32_ref_t result = { .i32 = a.i32 - b.i32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_neg(simd32_t _a) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t result = { .i32 = -a.i32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_abs(simd32_t _a) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t result = { .i32 = a.i32 < 0 ? 
-a.i32 : a.i32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_min(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + const simd32_i32_ref_t result = { .i32 = a.i32 < b.i32 ? a.i32 : b.i32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_max(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + const simd32_i32_ref_t result = { .i32 = a.i32 > b.i32 ? a.i32 : b.i32 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_clamp(simd32_t _a, simd32_t _min, simd32_t _max) + { + return simd32_i32_min(simd32_i32_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpeq(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(-(_a.u32 == _b.u32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpneq(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(-(_a.u32 != _b.u32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmplt(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(-(_a.u32 < _b.u32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmple(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(-(_a.u32 <= _b.u32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpgt(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(-(_a.u32 > _b.u32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpge(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(-(_a.u32 >= _b.u32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_cmpeq(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + return { .u32 = uint32_t(-(a.i32 == b.i32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_cmplt(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + return { 
.u32 = uint32_t(-(a.i32 < b.i32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_cmpgt(simd32_t _a, simd32_t _b) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t b = bitCast(_b); + return { .u32 = uint32_t(-(a.i32 > b.i32)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i16_add(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(uint16_t(int16_t(_a.u32) + int16_t(_b.u32))) + | (uint32_t(uint16_t(int16_t(_a.u32 >> 16) + int16_t(_b.u32 >> 16))) << 16) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i16_sub(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(uint16_t(int16_t(_a.u32) - int16_t(_b.u32))) + | (uint32_t(uint16_t(int16_t(_a.u32 >> 16) - int16_t(_b.u32 >> 16))) << 16) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i16_cmpeq(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(int16_t(_a.u32) == int16_t(_b.u32) ? uint16_t(0xffff) : uint16_t(0)) + | (uint32_t(int16_t(_a.u32 >> 16) == int16_t(_b.u32 >> 16) ? uint16_t(0xffff) : uint16_t(0)) << 16) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x16_sll(simd32_t _a, int _count) + { + return { .u32 = uint32_t(uint16_t(uint16_t(_a.u32) << _count)) + | (uint32_t(uint16_t(uint16_t(_a.u32 >> 16) << _count)) << 16) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x16_srl(simd32_t _a, int _count) + { + return { .u32 = uint32_t(uint16_t(uint16_t(_a.u32) >> _count)) + | (uint32_t(uint16_t(uint16_t(_a.u32 >> 16) >> _count)) << 16) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i8_add(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(uint8_t(int8_t(_a.u32 ) + int8_t(_b.u32 ) ) ) + | (uint32_t(uint8_t(int8_t(_a.u32 >> 8) + int8_t(_b.u32 >> 8) ) ) << 8) + | (uint32_t(uint8_t(int8_t(_a.u32 >> 16) + int8_t(_b.u32 >> 16) ) ) << 16) + | (uint32_t(uint8_t(int8_t(_a.u32 >> 24) + int8_t(_b.u32 >> 24) ) ) << 24) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i8_sub(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(uint8_t(int8_t(_a.u32 ) - 
int8_t(_b.u32 ) ) ) + | (uint32_t(uint8_t(int8_t(_a.u32 >> 8) - int8_t(_b.u32 >> 8) ) ) << 8) + | (uint32_t(uint8_t(int8_t(_a.u32 >> 16) - int8_t(_b.u32 >> 16) ) ) << 16) + | (uint32_t(uint8_t(int8_t(_a.u32 >> 24) - int8_t(_b.u32 >> 24) ) ) << 24) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u8_satadd(simd32_t _a, simd32_t _b) + { + simd32_t result{ .u32 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const uint16_t sum = uint16_t(uint8_t(_a.u32 >> (ii * 8))) + uint8_t(_b.u32 >> (ii * 8)); + result.u32 |= uint32_t(sum > 255 ? 255 : uint8_t(sum)) << (ii * 8); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u8_satsub(simd32_t _a, simd32_t _b) + { + simd32_t result{ .u32 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const uint8_t a = uint8_t(_a.u32 >> (ii * 8)); + const uint8_t b = uint8_t(_b.u32 >> (ii * 8)); + result.u32 |= uint32_t(a > b ? uint8_t(a - b) : uint8_t(0)) << (ii * 8); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u16_satadd(simd32_t _a, simd32_t _b) + { + simd32_t result{ .u32 = 0 }; + for (int ii = 0; ii < 2; ++ii) + { + const uint32_t sum = uint32_t(uint16_t(_a.u32 >> (ii * 16))) + uint16_t(_b.u32 >> (ii * 16)); + result.u32 |= uint32_t(sum > 65535 ? 65535 : uint16_t(sum)) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u16_satsub(simd32_t _a, simd32_t _b) + { + simd32_t result{ .u32 = 0 }; + for (int ii = 0; ii < 2; ++ii) + { + const uint16_t a = uint16_t(_a.u32 >> (ii * 16)); + const uint16_t b = uint16_t(_b.u32 >> (ii * 16)); + result.u32 |= uint32_t(a > b ? 
uint16_t(a - b) : uint16_t(0)) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_and(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 & _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_andc(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 & ~_b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_or(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 | _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_orc(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 | ~_b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_xor(simd32_t _a, simd32_t _b) + { + return { .u32 = _a.u32 ^ _b.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_not(simd32_t _a) + { + return { .u32 = ~_a.u32 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_sll(simd32_t _a, int _count) + { + return { .u32 = _a.u32 << _count }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_srl(simd32_t _a, int _count) + { + return { .u32 = _a.u32 >> _count }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_sra(simd32_t _a, int _count) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t result = { .i32 = a.i32 >> _count }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_sll(simd32_t _a, simd32_t _count) + { + return { .u32 = _a.u32 << (_count.u32 & 31u) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_srl(simd32_t _a, simd32_t _count) + { + return { .u32 = _a.u32 >> (_count.u32 & 31u) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_sra(simd32_t _a, simd32_t _count) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_i32_ref_t result = { .i32 = a.i32 >> int32_t(_count.u32 & 31u) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_selb(simd32_t _mask, simd32_t _a, simd32_t _b) + { + return { .u32 = (_a.u32 & _mask.u32) | (_b.u32 & ~_mask.u32) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_sels(simd32_t _test, simd32_t 
_a, simd32_t _b) + { + const simd32_i32_ref_t test = bitCast(_test); + const uint32_t mask = (uint32_t)(test.i32 >> 31); + return { .u32 = (_a.u32 & mask) | (_b.u32 & ~mask) }; + } + + inline BX_CONSTEXPR_FUNC bool simd32_test(simd32_t _test) + { + return 0 != (_test.u32 >> 31); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_xorl(simd32_t _a, simd32_t _b) + { + return { .u32 = uint32_t(!_a.u32 != !_b.u32) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_rol(simd32_t _a, int _count) + { + return { .u32 = (_a.u32 << _count) | (_a.u32 >> (32 - _count)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_ror(simd32_t _a, int _count) + { + return { .u32 = (_a.u32 >> _count) | (_a.u32 << (32 - _count)) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_setnz(simd32_t _a) + { + return { .u32 = uint32_t(-!!_a.u32) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_satadd(simd32_t _a, simd32_t _b) + { + const uint32_t add = _a.u32 + _b.u32; + return { .u32 = add | -(add < _a.u32) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_satsub(simd32_t _a, simd32_t _b) + { + const uint32_t sub = _a.u32 - _b.u32; + return { .u32 = sub & -(sub <= _a.u32) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_satmul(simd32_t _a, simd32_t _b) + { + const uint64_t mul = (uint64_t)_a.u32 * (uint64_t)_b.u32; + const uint32_t hi = uint32_t(mul >> 32); + return { .u32 = uint32_t(mul) | uint32_t(-!!hi) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_incwrap(simd32_t _val, simd32_t _min, simd32_t _max) + { + simd32_t inc{ .u32 = _val.u32 + 1 }; + const uint32_t diff = _max.u32 - _val.u32; + simd32_t mask{ .u32 = uint32_t(int32_t(diff | uint32_t(-int32_t(diff))) >> 31) }; + return simd32_selb(mask, inc, _min); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_decwrap(simd32_t _val, simd32_t _min, simd32_t _max) + { + simd32_t dec{ .u32 = _val.u32 - 1 }; + const uint32_t diff = _min.u32 - _val.u32; + simd32_t mask{ .u32 = uint32_t(int32_t(diff | 
uint32_t(-int32_t(diff))) >> 31) }; + return simd32_selb(mask, dec, _max); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_cntbits(simd32_t _a) + { + simd32_t result; +#if BX_COMPILER_GCC || BX_COMPILER_CLANG + result.u32 = __builtin_popcount(_a.u32); +#else + uint32_t val = _a.u32; + val = val - ((val >> 1) & 0x55555555u); + val = (val & 0x33333333u) + ((val >> 2) & 0x33333333u); + val = (val + (val >> 4)) & 0x0f0f0f0fu; + result.u32 = (val * 0x01010101u) >> 24; +#endif // BX_COMPILER_* + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_cntlz(simd32_t _a) + { + simd32_t result; +#if BX_COMPILER_GCC || BX_COMPILER_CLANG + result.u32 = 0 == _a.u32 ? 32 : __builtin_clz(_a.u32); +#else + uint32_t val = _a.u32; + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + simd32_t tmp{ .u32 = ~val }; + result = simd32_x32_cntbits(tmp); +#endif // BX_COMPILER_* + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_cnttz(simd32_t _a) + { + simd32_t result; +#if BX_COMPILER_GCC || BX_COMPILER_CLANG + result.u32 = 0 == _a.u32 ? 32 : __builtin_ctz(_a.u32); +#else + simd32_t tmp{ .u32 = ~_a.u32 & (_a.u32 - 1) }; + result = simd32_x32_cntbits(tmp); +#endif // BX_COMPILER_* + return result; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_ffs(simd32_t _a) + { + const simd32_t tz = simd32_x32_cnttz(_a); + return { .u32 = 0 == _a.u32 ? 
0 : tz.u32 + 1 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_part1by1(simd32_t _a) + { + uint32_t val = _a.u32 & 0xffff; + val = (val ^ (val << 8)) & 0x00ff00ff; + val = (val ^ (val << 4)) & 0x0f0f0f0f; + val = (val ^ (val << 2)) & 0x33333333; + val = (val ^ (val << 1)) & 0x55555555; + return { .u32 = val }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_x32_part1by2(simd32_t _a) + { + uint32_t val = _a.u32 & 0x3ff; + val = (val ^ (val << 16) ) & 0xff0000ff; + val = (val ^ (val << 8) ) & 0x0300f00f; + val = (val ^ (val << 4) ) & 0x030c30c3; + val = (val ^ (val << 2) ) & 0x09249249; + return { .u32 = val }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_testpow2(simd32_t _a) + { + const uint32_t dec = _a.u32 - 1; + return { .u32 = uint32_t(-(int32_t( (_a.u32 ^ dec) >> 1 == dec) ) ) }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_nextpow2(simd32_t _a) + { + uint32_t val = _a.u32 - 1; + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return { .u32 = val + 1 }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_gcd(simd32_t _a, simd32_t _b) + { + uint32_t a = _a.u32, b = _b.u32; + /* Euclid's algorithm; b is tested before the modulo so gcd(x, 0) yields x instead of dividing by zero. */ while (0 != b) { const uint32_t tmp = a % b; a = b; b = tmp; } + return { .u32 = a }; + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_u32_lcm(simd32_t _a, simd32_t _b) + { + /* lcm(0, 0) is defined as 0; for any other input the gcd is non-zero, so the division is safe. */ return { .u32 = 0 == (_a.u32 | _b.u32) ? 0u : _a.u32 * (_b.u32 / simd32_u32_gcd(_a, _b).u32) }; + } + + BX_SIMD_FORCE_INLINE simd32_t simd32_ldu(const void* _ptr) + { + return simd32_ld(_ptr); + } + + BX_SIMD_FORCE_INLINE void simd32_stu(void* _ptr, simd32_t _a) + { + simd32_st(_ptr, _a); + } + + inline BX_CONSTEXPR_FUNC bool simd32_test_any(simd32_t _test) + { + return 0 != _test.u32; + } + + inline BX_CONSTEXPR_FUNC bool simd32_test_all(simd32_t _test) + { + return 0 != _test.u32; + } + + inline BX_CONSTEXPR_FUNC bool simd32_test_zero(simd32_t _a, simd32_t _b) + { + return 0 == (_a.u32 & _b.u32); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_ftoi_trunc(simd32_t _a) + 
{ + const simd32_f32_ref_t a = bitCast(_a); + const simd32_i32_ref_t result = { .i32 = int32_t(a.f32) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_f32_ftoi_round(simd32_t _a) + { + const simd32_f32_ref_t a = bitCast(_a); + const simd32_i32_ref_t result = { .i32 = int32_t(round(a.f32) ) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd32_t simd32_i32_itof(simd32_t _a) + { + const simd32_i32_ref_t a = bitCast(_a); + const simd32_f32_ref_t result = { .f32 = float(a.i32) }; + return bitCast(result); + } + +} // namespace bx diff --git a/include/bx/inline/simd64_ref.inl b/include/bx/inline/simd64_ref.inl new file mode 100644 index 0000000..b5a1d95 --- /dev/null +++ b/include/bx/inline/simd64_ref.inl @@ -0,0 +1,1296 @@ +/* + * Copyright 2010-2026 Branimir Karadzic. All rights reserved. + * License: https://github.com/bkaradzic/bx/blob/master/LICENSE + */ + +#ifndef BX_SIMD_T_H_HEADER_GUARD +# error "Must be included from bx/simd_t.h!" +#endif // BX_SIMD_T_H_HEADER_GUARD + +namespace bx +{ + BX_ALIGN_DECL(8, struct) simd64_f64_ref_t { double f64; }; + BX_ALIGN_DECL(8, struct) simd64_i64_ref_t { int64_t i64; }; + BX_ALIGN_DECL(8, struct) simd64_f32_ref_t { float f32[2]; }; + BX_ALIGN_DECL(8, struct) simd64_i32_ref_t { int32_t i32[2]; }; + BX_ALIGN_DECL(8, struct) simd64_u32_ref_t { uint32_t u32[2]; }; + +#if BX_SIMD_LANGEXT + typedef float simd64_f32_langext_t __attribute__((__vector_size__(8), __aligned__(8))); + typedef int32_t simd64_i32_langext_t __attribute__((__vector_size__(8), __aligned__(8))); + typedef uint32_t simd64_u32_langext_t __attribute__((__vector_size__(8), __aligned__(8))); +#endif // BX_SIMD_LANGEXT + + inline BX_CONSTEXPR_FUNC double simd_floor(double _x) + { + const uint64_t xi = bitCast(_x); + const int32_t exp = int32_t((xi >> 52) & 0x7ff) - 1023; + + if (exp < 0) + { + return (xi >> 63) ? 
-1.0 : 0.0; + } + + if (exp >= 52) + { + return _x; + } + + const uint64_t mask = (UINT64_C(1) << (52 - exp)) - 1; + + if (0 == (xi & mask)) + { + return _x; + } + + const uint64_t trunc = xi & ~mask; + return (xi >> 63) + ? bitCast(trunc) - 1.0 + : bitCast(trunc) + ; + } + + inline BX_CONSTEXPR_FUNC double simd_ceil(double _x) + { + return -simd_floor(-_x); + } + + inline BX_CONSTEXPR_FUNC double simd_round(double _x) + { + return (bitCast(_x) >> 63) + ? -simd_floor(-_x + 0.5) + : simd_floor( _x + 0.5) + ; + } + + inline BX_CONSTEXPR_FUNC double simd_sqrt(double _x) + { + if (_x <= 0.0) + { + return 0.0; + } + + uint64_t ii = bitCast(_x); + ii = UINT64_C(0x5fe6eb50c7b537a9) - (ii >> 1); + + double yy = bitCast(ii); + yy = yy * (1.5 - 0.5 * _x * yy * yy); + yy = yy * (1.5 - 0.5 * _x * yy * yy); + yy = yy * (1.5 - 0.5 * _x * yy * yy); + yy = yy * (1.5 - 0.5 * _x * yy * yy); + + return _x * yy; + } + + BX_SIMD_FORCE_INLINE simd64_t simd64_ld(const void* _ptr) + { + simd64_t result; + memCopy(&result, _ptr, sizeof(simd64_t) ); + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_ld(float _x, float _y) + { + return { .u64 = uint64_t(bitCast(_x)) | (uint64_t(bitCast(_y)) << 32) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_ld(uint32_t _x, uint32_t _y) + { + return { .u64 = uint64_t(_x) | (uint64_t(_y) << 32) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_ld(int32_t _x, int32_t _y) + { + return { .u64 = uint64_t(uint32_t(_x)) | (uint64_t(uint32_t(_y)) << 32) }; + } + + BX_SIMD_FORCE_INLINE void simd64_st(void* _ptr, simd64_t _a) + { + memCopy(_ptr, &_a, sizeof(simd64_t) ); + } + + BX_SIMD_FORCE_INLINE void simd64_x32_st1(void* _ptr, simd64_t _a) + { + uint32_t* result = reinterpret_cast(_ptr); + *result = uint32_t(_a.u64); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_zero() + { + return { .u64 = 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(float _a) + { + const uint32_t bits = bitCast(_a); + return { .u64 = uint64_t(bits) | 
(uint64_t(bits) << 32) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint32_t _a) + { + return { .u64 = uint64_t(_a) | (uint64_t(_a) << 32) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(int32_t _a) + { + return simd64_splat(uint32_t(_a) ); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint64_t _a) + { + return { .u64 = _a }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(int64_t _a) + { + return { .u64 = uint64_t(_a) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(double _a) + { + const simd64_f64_ref_t result = { .f64 = _a }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint16_t _a) + { + const uint64_t val = uint64_t(_a); + return { .u64 = (val << 48) | (val << 32) | (val << 16) | val }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(int16_t _a) + { + return simd64_splat(uint16_t(_a) ); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint8_t _a) + { + const uint64_t val = uint64_t(_a); + const uint64_t tmp0 = (val << 8 ) | val; + const uint64_t tmp1 = (tmp0 << 16) | tmp0; + const uint64_t tmp2 = (tmp1 << 32) | tmp1; + return { .u64 = tmp2 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_splat(int8_t _a) + { + return simd64_splat(uint8_t(_a) ); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_add(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t result = { .f64 = a.f64 + b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_sub(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t result = { .f64 = a.f64 - b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_mul(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t result = { .f64 = 
a.f64 * b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_div(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t result = { .f64 = a.f64 / b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_madd(simd64_t _a, simd64_t _b, simd64_t _c) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t c = bitCast(_c); + const simd64_f64_ref_t result = { .f64 = a.f64 * b.f64 + c.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_nmsub(simd64_t _a, simd64_t _b, simd64_t _c) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t c = bitCast(_c); + const simd64_f64_ref_t result = { .f64 = c.f64 - a.f64 * b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_neg(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = -a.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_abs(simd64_t _a) + { + return { .u64 = _a.u64 & (kDoubleExponentMask | kDoubleMantissaMask) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_min(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t result = { .f64 = a.f64 < b.f64 ? a.f64 : b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_max(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t result = { .f64 = a.f64 > b.f64 ? 
a.f64 : b.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_clamp(simd64_t _a, simd64_t _min, simd64_t _max) + { + return simd64_f64_min(simd64_f64_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_lerp(simd64_t _a, simd64_t _b, simd64_t _s) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + const simd64_f64_ref_t s = bitCast(_s); + const simd64_f64_ref_t result = { .f64 = a.f64 + (b.f64 - a.f64) * s.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_rcp(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = 1.0 / a.f64 }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_sqrt(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = simd_sqrt(a.f64) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_rsqrt(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = 1.0 / simd_sqrt(a.f64) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_round(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = simd_round(a.f64) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_ceil(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = simd_ceil(a.f64) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_floor(simd64_t _a) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t result = { .f64 = simd_floor(a.f64) }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpeq(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + return { .u64 = a.f64 == b.f64 ? 
UINT64_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpneq(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + return { .u64 = a.f64 != b.f64 ? UINT64_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmplt(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + return { .u64 = a.f64 < b.f64 ? UINT64_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmple(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + return { .u64 = a.f64 <= b.f64 ? UINT64_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpgt(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + return { .u64 = a.f64 > b.f64 ? UINT64_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpge(simd64_t _a, simd64_t _b) + { + const simd64_f64_ref_t a = bitCast(_a); + const simd64_f64_ref_t b = bitCast(_b); + return { .u64 = a.f64 >= b.f64 ? 
UINT64_MAX : 0 }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_add(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) + bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t result = { { a.f32[0] + b.f32[0], a.f32[1] + b.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_sub(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) - bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t result = { { a.f32[0] - b.f32[0], a.f32[1] - b.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_mul(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) * bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t result = { { a.f32[0] * b.f32[0], a.f32[1] * b.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_div(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) / bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t result = { { a.f32[0] / b.f32[0], a.f32[1] / b.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_min(simd64_t _a, simd64_t _b) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t result = { { a.f32[0] < b.f32[0] ? a.f32[0] : b.f32[0], a.f32[1] < b.f32[1] ? 
a.f32[1] : b.f32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_max(simd64_t _a, simd64_t _b) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t result = { { a.f32[0] > b.f32[0] ? a.f32[0] : b.f32[0], a.f32[1] > b.f32[1] ? a.f32[1] : b.f32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_madd(simd64_t _a, simd64_t _b, simd64_t _c) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) * bitCast(_b) + bitCast(_c)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t c = bitCast(_c); + const simd64_f32_ref_t result = { { a.f32[0] * b.f32[0] + c.f32[0], a.f32[1] * b.f32[1] + c.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_msub(simd64_t _a, simd64_t _b, simd64_t _c) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) * bitCast(_b) - bitCast(_c)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t c = bitCast(_c); + const simd64_f32_ref_t result = { { a.f32[0] * b.f32[0] - c.f32[0], a.f32[1] * b.f32[1] - c.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_nmsub(simd64_t _a, simd64_t _b, simd64_t _c) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_c) - bitCast(_a) * bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t c = bitCast(_c); + const simd64_f32_ref_t result = { { c.f32[0] - a.f32[0] * b.f32[0], c.f32[1] - a.f32[1] * b.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC int simd64_x32_signbitsmask(simd64_t _a) + { + return int(uint32_t(_a.u64) >> 31) | (int(uint32_t(_a.u64 >> 32) >> 31) << 1); + } + + inline BX_CONSTEXPR_FUNC int 
simd64_x8_signbitsmask(simd64_t _a) + { + int result = 0; + for (int ii = 0; ii < 8; ++ii) + { + result |= (int((_a.u64 >> (ii * 8 + 7)) & 1) << ii); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x8_shuffle(simd64_t _a, simd64_t _indices) + { + uint64_t out = 0; + for (uint64_t ii = 0; ii < 8; ++ii) + { + const uint64_t idx = (_indices.u64 >> (ii*8) ) & 0xffu; + const uint64_t byte = (idx & 0x80u) + ? uint64_t(0) + : (_a.u64 >> ( (idx & 0x07u) * 8) ) & 0xffu; + out |= byte << (ii*8); + } + return { .u64 = out }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x8_shuffle(simd64_t _a, simd64_t _b, simd64_t _indices) + { + uint64_t out = 0; + for (uint64_t ii = 0; ii < 8; ++ii) + { + const uint64_t idx = (_indices.u64 >> (ii*8) ) & 0xffu; + uint64_t byte = 0; + if (0 == (idx & 0x80u) ) + { + const uint64_t sel = idx & 0x0fu; + const uint64_t src = sel < 8u ? _a.u64 : _b.u64; + byte = (src >> ( (sel & 0x07u) * 8) ) & 0xffu; + } + out |= byte << (ii*8); + } + return { .u64 = out }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_neg(simd64_t _a) + { +#if BX_SIMD_LANGEXT + return bitCast(-bitCast(_a)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { -a.f32[0], -a.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_abs(simd64_t _a) + { + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t result = { { a.u32[0] & (kFloatExponentMask | kFloatMantissaMask), a.u32[1] & (kFloatExponentMask | kFloatMantissaMask) } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_clamp(simd64_t _a, simd64_t _min, simd64_t _max) + { + return simd64_f32_min(simd64_f32_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_lerp(simd64_t _a, simd64_t _b, simd64_t _s) + { +#if BX_SIMD_LANGEXT + const simd64_f32_langext_t a = bitCast(_a); + return bitCast(a + (bitCast(_b) - a) * bitCast(_s)); +#else + 
const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_f32_ref_t s = bitCast(_s); + const simd64_f32_ref_t result = { { a.f32[0] + (b.f32[0] - a.f32[0]) * s.f32[0], a.f32[1] + (b.f32[1] - a.f32[1]) * s.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_rcp(simd64_t _a) + { +#if BX_SIMD_LANGEXT + const simd64_f32_langext_t one = {1.0f, 1.0f}; + return bitCast(one / bitCast(_a)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { 1.0f / a.f32[0], 1.0f / a.f32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_sqrt(simd64_t _a) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { sqrt(a.f32[0]), sqrt(a.f32[1]) } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_rsqrt(simd64_t _a) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { 1.0f / sqrt(a.f32[0]), 1.0f / sqrt(a.f32[1]) } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_round(simd64_t _a) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { round(a.f32[0]), round(a.f32[1]) } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_ceil(simd64_t _a) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { ceil(a.f32[0]), ceil(a.f32[1]) } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_floor(simd64_t _a) + { + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t result = { { floor(a.f32[0]), floor(a.f32[1]) } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpeq(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) == bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const 
simd64_f32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.f32[0] == b.f32[0] ? 0xffffffff : 0u, a.f32[1] == b.f32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpneq(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) != bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.f32[0] != b.f32[0] ? 0xffffffff : 0u, a.f32[1] != b.f32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmplt(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) < bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.f32[0] < b.f32[0] ? 0xffffffff : 0u, a.f32[1] < b.f32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmple(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) <= bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.f32[0] <= b.f32[0] ? 0xffffffff : 0u, a.f32[1] <= b.f32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpgt(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) > bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.f32[0] > b.f32[0] ? 0xffffffff : 0u, a.f32[1] > b.f32[1] ? 
0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpge(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) >= bitCast(_b)); +#else + const simd64_f32_ref_t a = bitCast(_a); + const simd64_f32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.f32[0] >= b.f32[0] ? 0xffffffff : 0u, a.f32[1] >= b.f32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_add(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) + bitCast(_b)); +#else + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_i32_ref_t result = { { a.i32[0] + b.i32[0], a.i32[1] + b.i32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_sub(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) - bitCast(_b)); +#else + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_i32_ref_t result = { { a.i32[0] - b.i32[0], a.i32[1] - b.i32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_neg(simd64_t _a) + { +#if BX_SIMD_LANGEXT + return bitCast(-bitCast(_a)); +#else + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t result = { { -a.i32[0], -a.i32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_abs(simd64_t _a) + { + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t result = { { a.i32[0] < 0 ? -a.i32[0] : a.i32[0], a.i32[1] < 0 ? 
-a.i32[1] : a.i32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_min(simd64_t _a, simd64_t _b) + { + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_i32_ref_t result = { { a.i32[0] < b.i32[0] ? a.i32[0] : b.i32[0], a.i32[1] < b.i32[1] ? a.i32[1] : b.i32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_max(simd64_t _a, simd64_t _b) + { + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_i32_ref_t result = { { a.i32[0] > b.i32[0] ? a.i32[0] : b.i32[0], a.i32[1] > b.i32[1] ? a.i32[1] : b.i32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_clamp(simd64_t _a, simd64_t _min, simd64_t _max) + { + return simd64_i32_min(simd64_i32_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_cmpeq(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) == bitCast(_b)); +#else + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.i32[0] == b.i32[0] ? 0xffffffff : 0u, a.i32[1] == b.i32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_cmplt(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) < bitCast(_b)); +#else + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.i32[0] < b.i32[0] ? 0xffffffff : 0u, a.i32[1] < b.i32[1] ? 
0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_cmpgt(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) > bitCast(_b)); +#else + const simd64_i32_ref_t a = bitCast(_a); + const simd64_i32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.i32[0] > b.i32[0] ? 0xffffffff : 0u, a.i32[1] > b.i32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_add(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) + bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] + b.u32[0], a.u32[1] + b.u32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_sub(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) - bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] - b.u32[0], a.u32[1] - b.u32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_mul(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) * bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] * b.u32[0], a.u32[1] * b.u32[1] } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_min(simd64_t _a, simd64_t _b) + { + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] < b.u32[0] ? a.u32[0] : b.u32[0], a.u32[1] < b.u32[1] ? 
a.u32[1] : b.u32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_max(simd64_t _a, simd64_t _b) + { + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] > b.u32[0] ? a.u32[0] : b.u32[0], a.u32[1] > b.u32[1] ? a.u32[1] : b.u32[1] } }; + return bitCast(result); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_clamp(simd64_t _a, simd64_t _min, simd64_t _max) + { + return simd64_u32_min(simd64_u32_max(_a, _min), _max); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpeq(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) == bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] == b.u32[0] ? 0xffffffff : 0u, a.u32[1] == b.u32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpneq(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) != bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] != b.u32[0] ? 0xffffffff : 0u, a.u32[1] != b.u32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmplt(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) < bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] < b.u32[0] ? 0xffffffff : 0u, a.u32[1] < b.u32[1] ? 
0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmple(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) <= bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] <= b.u32[0] ? 0xffffffff : 0u, a.u32[1] <= b.u32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpgt(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) > bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] > b.u32[0] ? 0xffffffff : 0u, a.u32[1] > b.u32[1] ? 0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpge(simd64_t _a, simd64_t _b) + { +#if BX_SIMD_LANGEXT + return bitCast(bitCast(_a) >= bitCast(_b)); +#else + const simd64_u32_ref_t a = bitCast(_a); + const simd64_u32_ref_t b = bitCast(_b); + const simd64_u32_ref_t result = { { a.u32[0] >= b.u32[0] ? 0xffffffff : 0u, a.u32[1] >= b.u32[1] ? 
0xffffffff : 0u } }; + return bitCast(result); +#endif // BX_SIMD_LANGEXT + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i16_add(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const int16_t av = int16_t(_a.u64 >> (ii * 16)); + const int16_t bv = int16_t(_b.u64 >> (ii * 16)); + result.u64 |= uint64_t(uint16_t(int16_t(av + bv))) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i16_sub(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const int16_t av = int16_t(_a.u64 >> (ii * 16)); + const int16_t bv = int16_t(_b.u64 >> (ii * 16)); + result.u64 |= uint64_t(uint16_t(int16_t(av - bv))) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i16_mullo(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const int16_t av = int16_t(_a.u64 >> (ii * 16)); + const int16_t bv = int16_t(_b.u64 >> (ii * 16)); + result.u64 |= uint64_t(uint16_t(int16_t(av * bv))) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i16_cmpeq(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const int16_t av = int16_t(_a.u64 >> (ii * 16)); + const int16_t bv = int16_t(_b.u64 >> (ii * 16)); + result.u64 |= uint64_t(av == bv ? 
uint16_t(0xffff) : uint16_t(0)) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x16_sll(simd64_t _a, int _count) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const uint16_t av = uint16_t(_a.u64 >> (ii * 16)); + result.u64 |= uint64_t(uint16_t(av << _count)) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x16_srl(simd64_t _a, int _count) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 4; ++ii) + { + const uint16_t av = uint16_t(_a.u64 >> (ii * 16)); + result.u64 |= uint64_t(uint16_t(av >> _count)) << (ii * 16); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i8_add(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 8; ++ii) + { + result.u64 |= uint64_t(uint8_t(int8_t(_a.u64 >> (ii * 8)) + int8_t(_b.u64 >> (ii * 8)))) << (ii * 8); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_i8_sub(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 8; ++ii) + { + result.u64 |= uint64_t(uint8_t(int8_t(_a.u64 >> (ii * 8)) - int8_t(_b.u64 >> (ii * 8)))) << (ii * 8); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u8_satadd(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 8; ++ii) + { + const uint16_t sum = uint16_t(uint8_t(_a.u64 >> (ii * 8))) + uint8_t(_b.u64 >> (ii * 8)); + result.u64 |= uint64_t(sum > 255 ? 255 : uint8_t(sum)) << (ii * 8); + } + return result; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_u8_satsub(simd64_t _a, simd64_t _b) + { + simd64_t result{ .u64 = 0 }; + for (int ii = 0; ii < 8; ++ii) + { + const uint8_t a = uint8_t(_a.u64 >> (ii * 8)); + const uint8_t b = uint8_t(_b.u64 >> (ii * 8)); + result.u64 |= uint64_t(a > b ? 
uint8_t(a - b) : uint8_t(0)) << (ii * 8);
+		}
+		return result;
+	}
+
+	/// Per-lane unsigned 16-bit add with saturation: each of the four 16-bit
+	/// lanes is summed in 32 bits and clamped to 65535.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_u16_satadd(simd64_t _a, simd64_t _b)
+	{
+		simd64_t result{ .u64 = 0 };
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			// Widen to 32 bits so the carry out of bit 15 is still visible.
+			const uint32_t sum = uint32_t(uint16_t(_a.u64 >> (ii * 16))) + uint16_t(_b.u64 >> (ii * 16));
+			result.u64 |= uint64_t(sum > 65535 ? 65535 : uint16_t(sum)) << (ii * 16);
+		}
+		return result;
+	}
+
+	/// Per-lane unsigned 16-bit subtract with saturation: a lane yields
+	/// `a - b` when `a > b` and 0 otherwise.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_u16_satsub(simd64_t _a, simd64_t _b)
+	{
+		simd64_t result{ .u64 = 0 };
+		for (int ii = 0; ii < 4; ++ii)
+		{
+			const uint16_t a = uint16_t(_a.u64 >> (ii * 16));
+			const uint16_t b = uint16_t(_b.u64 >> (ii * 16));
+			result.u64 |= uint64_t(a > b ? uint16_t(a - b) : uint16_t(0)) << (ii * 16);
+		}
+		return result;
+	}
+
+	/// Unsigned 64-bit add of the `u64` members (wraps on overflow).
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_u64_add(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 + _b.u64 };
+	}
+
+	/// Unsigned 64-bit subtract of the `u64` members (wraps on underflow).
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_u64_sub(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 - _b.u64 };
+	}
+
+	/// Unsigned 64-bit multiply of the `u64` members, keeping the low 64 bits.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_u64_mul(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 * _b.u64 };
+	}
+
+	/// Signed 64-bit add.
+	///
+	/// Computed on the unsigned `u64` view so that overflow wraps modulo 2^64
+	/// instead of being undefined behaviour (signed overflow would also make the
+	/// call ill-formed during constant evaluation). Whenever the signed sum is
+	/// representable the resulting bit pattern is the same as a signed add.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_i64_add(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 + _b.u64 };
+	}
+
+	/// Signed 64-bit subtract.
+	///
+	/// Computed on the unsigned `u64` view for the same reason as
+	/// simd64_i64_add: the result wraps modulo 2^64 rather than overflowing.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_i64_sub(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 - _b.u64 };
+	}
+
+	/// Bitwise AND of `_a` and `_b`.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_and(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 & _b.u64 };
+	}
+
+	/// Bitwise AND of `_a` with the complement of `_b` (`_a & ~_b`).
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_andc(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 & ~_b.u64 };
+	}
+
+	/// Bitwise OR of `_a` and `_b`.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_or(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 | 
_b.u64 };
+	}
+
+	/// Bitwise OR of `_a` with the complement of `_b` (`_a | ~_b`).
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_orc(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 | ~_b.u64 };
+	}
+
+	/// Bitwise XOR of `_a` and `_b`.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_xor(simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = _a.u64 ^ _b.u64 };
+	}
+
+	/// Bitwise complement of `_a`.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_not(simd64_t _a)
+	{
+		return { .u64 = ~_a.u64 };
+	}
+
+	/// Logical left shift of the whole 64-bit value by `_count` bits.
+	/// `_count` is passed to the shift operator unmodified.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_sll(simd64_t _a, int _count)
+	{
+		return { .u64 = _a.u64 << _count };
+	}
+
+	/// Logical right shift of the whole 64-bit value by `_count` bits.
+	/// `_count` is passed to the shift operator unmodified.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_srl(simd64_t _a, int _count)
+	{
+		return { .u64 = _a.u64 >> _count };
+	}
+
+	/// Right shift of the 64-bit value through its signed `i64` view, by
+	/// `_count` bits.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_sra(simd64_t _a, int _count)
+	{
+		// NOTE(review): bitCast appears here without an explicit destination
+		// type; presumably a template argument was lost in extraction - confirm
+		// against the upstream file.
+		const simd64_i64_ref_t a = bitCast(_a);
+		const simd64_i64_ref_t result = { a.i64 >> _count };
+		return bitCast(result);
+	}
+
+	/// Bit select: takes bits from `_a` where `_mask` is 1 and from `_b` where
+	/// `_mask` is 0.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_selb(simd64_t _mask, simd64_t _a, simd64_t _b)
+	{
+		return { .u64 = (_a.u64 & _mask.u64) | (_b.u64 & ~_mask.u64) };
+	}
+
+	/// Sign select per 32-bit lane: each lane of `_test` is shifted right by 31
+	/// as a signed value to build a lane mask, which then picks the lane from
+	/// `_a` (mask bits set) or from `_b` (mask bits clear).
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_sels(simd64_t _test, simd64_t _a, simd64_t _b)
+	{
+		const simd64_i32_ref_t test = bitCast(_test);
+		const simd64_u32_ref_t a = bitCast(_a);
+		const simd64_u32_ref_t b = bitCast(_b);
+		const uint32_t m0 = uint32_t(test.i32[0] >> 31);
+		const uint32_t m1 = uint32_t(test.i32[1] >> 31);
+		const simd64_u32_ref_t result = { { (a.u32[0] & m0) | (b.u32[0] & ~m0), (a.u32[1] & m1) | (b.u32[1] & ~m1) } };
+		return bitCast(result);
+	}
+
+	/// True when bit 31 of the low 32 bits of `_test` is set.
+	inline BX_CONSTEXPR_FUNC bool simd64_test_any_x(simd64_t _test)
+	{
+		return 0 != (uint32_t(_test.u64) >> 31);
+	}
+
+	/// True when bit 31 of the high 32 bits of `_test` is set.
+	inline BX_CONSTEXPR_FUNC bool simd64_test_any_y(simd64_t _test)
+	{
+		return 0 != (uint32_t(_test.u64 >> 32) >> 31);
+	}
+
+	/// True when bit 31 is set in at least one 32-bit half of `_test`.
+	inline BX_CONSTEXPR_FUNC bool simd64_test_any_xy(simd64_t _test)
+	{
+		return 0 != ((uint32_t(_test.u64) | uint32_t(_test.u64 >> 32)) >> 31);
+	}
+
+	/// True when bit 31 of the low 32 bits of `_test` is set (same expression as
+	/// simd64_test_any_x, since only one lane is examined).
+	inline BX_CONSTEXPR_FUNC bool simd64_test_all_x(simd64_t _test)
+	{
+		return 0 != (uint32_t(_test.u64) >> 31);
+	}
+
+	/// True when bit 31 of the high 32 bits of `_test` is set.
+	inline BX_CONSTEXPR_FUNC bool 
simd64_test_all_y(simd64_t _test) + { + return 0 != (uint32_t(_test.u64 >> 32) >> 31); + } + + inline BX_CONSTEXPR_FUNC bool simd64_test_all_xy(simd64_t _test) + { + return 0 != ((uint32_t(_test.u64) & uint32_t(_test.u64 >> 32)) >> 31); + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_rol(simd64_t _a, int _count) + { + return { .u64 = (_a.u64 << _count) | (_a.u64 >> (64 - _count)) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_ror(simd64_t _a, int _count) + { + return { .u64 = (_a.u64 >> _count) | (_a.u64 << (64 - _count)) }; + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_cntbits(simd64_t _a) + { +#if BX_COMPILER_GCC || BX_COMPILER_CLANG + return { .u64 = uint64_t(__builtin_popcountll(_a.u64) ) }; +#else + const simd32_t lo { .u32 = uint32_t(_a.u64) }; + const simd32_t hi { .u32 = uint32_t(_a.u64 >> 32) }; + const simd32_t cntLo = simd32_x32_cntbits(lo); + const simd32_t cntHi = simd32_x32_cntbits(hi); + return { .u64 = uint64_t(cntLo.u32) + uint64_t(cntHi.u32) }; +#endif // BX_COMPILER_* + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_cntlz(simd64_t _a) + { +#if BX_COMPILER_GCC || BX_COMPILER_CLANG + return { .u64 = 0 == _a.u64 ? 64 : uint64_t(__builtin_clzll(_a.u64) ) }; +#else + const simd32_t lo { .u32 = uint32_t(_a.u64) }; + const simd32_t hi { .u32 = uint32_t(_a.u64 >> 32) }; + const simd32_t cntHi = simd32_x32_cntlz(hi); + const simd32_t cntLo = simd32_x32_cntlz(lo); + return { .u64 = uint32_t(_a.u64 >> 32) + ? uint64_t(cntHi.u32) + : uint64_t(cntLo.u32) + 32 + }; +#endif // BX_COMPILER_* + } + + inline BX_CONSTEXPR_FUNC simd64_t simd64_x64_cnttz(simd64_t _a) + { +#if BX_COMPILER_GCC || BX_COMPILER_CLANG + return { .u64 = 0 == _a.u64 ? 64 : uint64_t(__builtin_ctzll(_a.u64) ) }; +#else + const simd32_t lo { .u32 = uint32_t(_a.u64) }; + const simd32_t hi { .u32 = uint32_t(_a.u64 >> 32) }; + const simd32_t cntLo = simd32_x32_cnttz(lo); + const simd32_t cntHi = simd32_x32_cnttz(hi); + return { .u64 = uint32_t(_a.u64) + ? 
uint64_t(cntLo.u32)
+			: uint64_t(cntHi.u32) + 32
+			};
+#endif // BX_COMPILER_*
+	}
+
+	/// Load from `_ptr`; forwards directly to simd64_ld.
+	BX_SIMD_FORCE_INLINE simd64_t simd64_ldu(const void* _ptr)
+	{
+		return simd64_ld(_ptr);
+	}
+
+	/// Store `_a` to `_ptr`; forwards directly to simd64_st.
+	BX_SIMD_FORCE_INLINE void simd64_stu(void* _ptr, simd64_t _a)
+	{
+		simd64_st(_ptr, _a);
+	}
+
+	/// Alias for simd64_test_any_xy.
+	inline BX_CONSTEXPR_FUNC bool simd64_test_any(simd64_t _test)
+	{
+		return simd64_test_any_xy(_test);
+	}
+
+	/// Alias for simd64_test_all_xy.
+	inline BX_CONSTEXPR_FUNC bool simd64_test_all(simd64_t _test)
+	{
+		return simd64_test_all_xy(_test);
+	}
+
+	/// True when `_a` and `_b` have no set bit in common (`_a & _b` is zero).
+	inline BX_CONSTEXPR_FUNC bool simd64_test_zero(simd64_t _a, simd64_t _b)
+	{
+		return 0 == (_a.u64 & _b.u64);
+	}
+
+	/// Left shift of both unsigned 32-bit lanes by the same `_count`.
+	/// `_count` is passed to the shift operator unmodified.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x32_sll(simd64_t _a, int _count)
+	{
+		// NOTE(review): bitCast appears here without an explicit destination
+		// type; presumably a template argument was lost in extraction - confirm
+		// against the upstream file.
+		const simd64_u32_ref_t a = bitCast(_a);
+		const simd64_u32_ref_t result = { { a.u32[0] << _count, a.u32[1] << _count } };
+		return bitCast(result);
+	}
+
+	/// Right shift of both unsigned 32-bit lanes by the same `_count`.
+	/// `_count` is passed to the shift operator unmodified.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x32_srl(simd64_t _a, int _count)
+	{
+		const simd64_u32_ref_t a = bitCast(_a);
+		const simd64_u32_ref_t result = { { a.u32[0] >> _count, a.u32[1] >> _count } };
+		return bitCast(result);
+	}
+
+	/// Right shift of both signed 32-bit lanes by the same `_count`.
+	/// `_count` is passed to the shift operator unmodified.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x32_sra(simd64_t _a, int _count)
+	{
+		const simd64_i32_ref_t a = bitCast(_a);
+		const simd64_i32_ref_t result = { { a.i32[0] >> _count, a.i32[1] >> _count } };
+		return bitCast(result);
+	}
+
+	/// Left shift of each unsigned 32-bit lane by the matching lane of `_count`;
+	/// only the low five bits of each count lane are used.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x32_sll(simd64_t _a, simd64_t _count)
+	{
+		const simd64_u32_ref_t a = bitCast(_a);
+		const simd64_u32_ref_t c = bitCast(_count);
+		const simd64_u32_ref_t result = { { a.u32[0] << (c.u32[0] & 31u), a.u32[1] << (c.u32[1] & 31u) } };
+		return bitCast(result);
+	}
+
+	/// Right shift of each unsigned 32-bit lane by the matching lane of
+	/// `_count`; only the low five bits of each count lane are used.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x32_srl(simd64_t _a, simd64_t _count)
+	{
+		const simd64_u32_ref_t a = bitCast(_a);
+		const simd64_u32_ref_t c = bitCast(_count);
+		const simd64_u32_ref_t result = { { a.u32[0] >> (c.u32[0] & 31u), a.u32[1] >> (c.u32[1] & 31u) } };
+		return bitCast(result);
+	}
+
+	/// Right shift of each signed 32-bit lane by the matching lane of `_count`;
+	/// only the low five bits of each count lane are used.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_x32_sra(simd64_t 
_a, simd64_t _count)
+	{
+		// NOTE(review): bitCast appears here without an explicit destination
+		// type; presumably a template argument was lost in extraction - confirm
+		// against the upstream file.
+		const simd64_i32_ref_t a = bitCast(_a);
+		const simd64_u32_ref_t c = bitCast(_count);
+		// Each count lane is masked to 0..31 before it is used as a shift amount.
+		const simd64_i32_ref_t result = { { a.i32[0] >> int32_t(c.u32[0] & 31u), a.i32[1] >> int32_t(c.u32[1] & 31u) } };
+		return bitCast(result);
+	}
+
+	/// Converts both float lanes to int32_t with a plain cast (fractional part
+	/// discarded).
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_ftoi_trunc(simd64_t _a)
+	{
+		const simd64_f32_ref_t a = bitCast(_a);
+		const simd64_i32_ref_t result = { { int32_t(a.f32[0]), int32_t(a.f32[1]) } };
+		return bitCast(result);
+	}
+
+	/// Converts both float lanes to int32_t after passing each through round().
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_f32_ftoi_round(simd64_t _a)
+	{
+		const simd64_f32_ref_t a = bitCast(_a);
+		const simd64_i32_ref_t result = { { int32_t(round(a.f32[0]) ), int32_t(round(a.f32[1]) ) } };
+		return bitCast(result);
+	}
+
+	/// Converts both int32_t lanes to float.
+	inline BX_CONSTEXPR_FUNC simd64_t simd64_i32_itof(simd64_t _a)
+	{
+		const simd64_i32_ref_t a = bitCast(_a);
+		const simd64_f32_ref_t result = { { float(a.i32[0]), float(a.i32[1]) } };
+		return bitCast(result);
+	}
+
+} // namespace bx
diff --git a/include/bx/inline/simd_impl.inl b/include/bx/inline/simd_impl.inl
new file mode 100644
index 0000000..9cc7f9b
--- /dev/null
+++ b/include/bx/inline/simd_impl.inl
@@ -0,0 +1,2637 @@
+/*
+ * Copyright 2010-2026 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bx/blob/master/LICENSE
+ */
+
+#ifndef BX_SIMD_T_H_HEADER_GUARD
+# error "Must be included from bx/simd_t.h!" 
+#endif // BX_SIMD_T_H_HEADER_GUARD + +namespace bx +{ + simd32_t simd32_ld(const void* _ptr); + BX_CONSTEXPR_FUNC simd32_t simd32_ld(float _x); + BX_CONSTEXPR_FUNC simd32_t simd32_ld(int32_t _x); + BX_CONSTEXPR_FUNC simd32_t simd32_ld(uint32_t _x); + void simd32_st(void* _ptr, simd32_t _a); + void simd32_x32_st1(void* _ptr, simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_zero(); + + BX_CONSTEXPR_FUNC simd32_t simd32_splat(float _a); + BX_CONSTEXPR_FUNC simd32_t simd32_splat(uint32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_splat(int32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_splat(uint16_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_splat(int16_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_splat(uint8_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_splat(int8_t _a); + + BX_CONSTEXPR_FUNC simd32_t simd32_f32_add(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_sub(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_mul(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_div(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_madd(simd32_t _a, simd32_t _b, simd32_t _c); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_msub(simd32_t _a, simd32_t _b, simd32_t _c); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_nmsub(simd32_t _a, simd32_t _b, simd32_t _c); + BX_CONSTEXPR_FUNC int simd32_x32_signbitsmask(simd32_t _a); + BX_CONSTEXPR_FUNC int simd32_x8_signbitsmask(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_x8_shuffle(simd32_t _a, simd32_t _indices); + BX_CONSTEXPR_FUNC simd32_t simd32_x8_shuffle(simd32_t _a, simd32_t _b, simd32_t _indices); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_neg(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_abs(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_min(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_max(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_clamp(simd32_t _a, simd32_t _min, simd32_t _max); + BX_CONSTEXPR_FUNC simd32_t 
simd32_f32_lerp(simd32_t _a, simd32_t _b, simd32_t _s); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_rcp(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_sqrt(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_rsqrt(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_round(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_ceil(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_floor(simd32_t _a); + + BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpeq(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpneq(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmplt(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmple(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpgt(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_f32_cmpge(simd32_t _a, simd32_t _b); + + BX_CONSTEXPR_FUNC simd32_t simd32_u32_add(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_sub(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_mul(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_div(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_mod(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_min(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_max(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_clamp(simd32_t _a, simd32_t _min, simd32_t _max); + + BX_CONSTEXPR_FUNC simd32_t simd32_i32_add(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_sub(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_mul(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_neg(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_abs(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_min(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_max(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_clamp(simd32_t _a, simd32_t _min, simd32_t 
_max); + + BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpeq(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpneq(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmplt(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmple(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpgt(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u32_cmpge(simd32_t _a, simd32_t _b); + + BX_CONSTEXPR_FUNC simd32_t simd32_i32_cmpeq(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_cmplt(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i32_cmpgt(simd32_t _a, simd32_t _b); + + BX_CONSTEXPR_FUNC simd32_t simd32_i16_add(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i16_sub(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i16_cmpeq(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_x16_sll(simd32_t _a, int _count); + BX_CONSTEXPR_FUNC simd32_t simd32_x16_srl(simd32_t _a, int _count); + BX_CONSTEXPR_FUNC simd32_t simd32_i8_add(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_i8_sub(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u8_satadd(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u8_satsub(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u16_satadd(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_u16_satsub(simd32_t _a, simd32_t _b); + + BX_CONSTEXPR_FUNC simd32_t simd32_and(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_andc(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_or(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_orc(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_xor(simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_not(simd32_t _a); + BX_CONSTEXPR_FUNC simd32_t simd32_x32_sll(simd32_t _a, int _count); + BX_CONSTEXPR_FUNC simd32_t simd32_x32_srl(simd32_t _a, int _count); + 
BX_CONSTEXPR_FUNC simd32_t simd32_x32_sra(simd32_t _a, int _count); + BX_CONSTEXPR_FUNC simd32_t simd32_x32_sll(simd32_t _a, simd32_t _count); + BX_CONSTEXPR_FUNC simd32_t simd32_x32_srl(simd32_t _a, simd32_t _count); + BX_CONSTEXPR_FUNC simd32_t simd32_x32_sra(simd32_t _a, simd32_t _count); + + BX_CONSTEXPR_FUNC simd32_t simd32_selb(simd32_t _mask, simd32_t _a, simd32_t _b); + BX_CONSTEXPR_FUNC simd32_t simd32_sels(simd32_t _test, simd32_t _a, simd32_t _b); + + BX_CONSTEXPR_FUNC bool simd32_test(simd32_t _test); + + simd64_t simd64_ld(const void* _ptr); + BX_CONSTEXPR_FUNC simd64_t simd64_ld(float _x, float _y); + BX_CONSTEXPR_FUNC simd64_t simd64_ld(int32_t _x, int32_t _y); + BX_CONSTEXPR_FUNC simd64_t simd64_ld(uint32_t _x, uint32_t _y); + void simd64_st(void* _ptr, simd64_t _a); + void simd64_x32_st1(void* _ptr, simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_zero(); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(float _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint32_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(int32_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(int64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(double _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint16_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(int16_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(uint8_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_splat(int8_t _a); + + BX_CONSTEXPR_FUNC simd64_t simd64_f64_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_mul(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_div(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_madd(simd64_t _a, simd64_t _b, simd64_t _c); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_nmsub(simd64_t _a, simd64_t _b, simd64_t _c); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_neg(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t 
simd64_f64_abs(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_min(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_max(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_clamp(simd64_t _a, simd64_t _min, simd64_t _max); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_lerp(simd64_t _a, simd64_t _b, simd64_t _s); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_rcp(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_sqrt(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_rsqrt(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_round(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_ceil(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_floor(simd64_t _a); + + BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpeq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpneq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmplt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmple(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpgt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f64_cmpge(simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC simd64_t simd64_f32_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_mul(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_div(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_madd(simd64_t _a, simd64_t _b, simd64_t _c); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_msub(simd64_t _a, simd64_t _b, simd64_t _c); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_nmsub(simd64_t _a, simd64_t _b, simd64_t _c); + BX_CONSTEXPR_FUNC int simd64_x32_signbitsmask(simd64_t _a); + BX_CONSTEXPR_FUNC int simd64_x8_signbitsmask(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_x8_shuffle(simd64_t _a, simd64_t _indices); + BX_CONSTEXPR_FUNC simd64_t simd64_x8_shuffle(simd64_t _a, simd64_t _b, simd64_t _indices); + 
BX_CONSTEXPR_FUNC simd64_t simd64_f32_neg(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_abs(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_min(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_max(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_clamp(simd64_t _a, simd64_t _min, simd64_t _max); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_lerp(simd64_t _a, simd64_t _b, simd64_t _s); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_rcp(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_sqrt(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_rsqrt(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_round(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_ceil(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_floor(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpeq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpneq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmplt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmple(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpgt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_f32_cmpge(simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC simd64_t simd64_i32_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_neg(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_abs(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_min(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_max(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_clamp(simd64_t _a, simd64_t _min, simd64_t _max); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_cmpeq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_cmplt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i32_cmpgt(simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC simd64_t simd64_u32_add(simd64_t _a, 
simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_mul(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_min(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_max(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_clamp(simd64_t _a, simd64_t _min, simd64_t _max); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpeq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpneq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmplt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmple(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpgt(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u32_cmpge(simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC simd64_t simd64_i16_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i16_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i16_mullo(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i16_cmpeq(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_x16_sll(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_x16_srl(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_i8_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i8_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u8_satadd(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u8_satsub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u16_satadd(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u16_satsub(simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC simd64_t simd64_i64_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_i64_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u64_add(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_u64_sub(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC 
simd64_t simd64_u64_mul(simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC simd64_t simd64_and(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_andc(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_or(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_orc(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_xor(simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_not(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_sll(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_srl(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_sra(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_rol(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_ror(simd64_t _a, int _count); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_cntbits(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_cntlz(simd64_t _a); + BX_CONSTEXPR_FUNC simd64_t simd64_x64_cnttz(simd64_t _a); + + BX_CONSTEXPR_FUNC simd64_t simd64_selb(simd64_t _mask, simd64_t _a, simd64_t _b); + BX_CONSTEXPR_FUNC simd64_t simd64_sels(simd64_t _test, simd64_t _a, simd64_t _b); + + BX_CONSTEXPR_FUNC bool simd64_test_any_x(simd64_t _test); + BX_CONSTEXPR_FUNC bool simd64_test_any_y(simd64_t _test); + BX_CONSTEXPR_FUNC bool simd64_test_any_xy(simd64_t _test); + BX_CONSTEXPR_FUNC bool simd64_test_all_x(simd64_t _test); + BX_CONSTEXPR_FUNC bool simd64_test_all_y(simd64_t _test); + BX_CONSTEXPR_FUNC bool simd64_test_all_xy(simd64_t _test); + + /// Load 128-bit register from aligned memory. + /// + template + Ty simd128_ld(const void* _ptr); + + /// Load 128-bit register from unaligned memory. + /// + template + Ty simd128_ldu(const void* _ptr); + + /// Store 128-bit register to aligned memory. + /// + template + void simd128_st(void* _ptr, Ty _a); + + /// Store 128-bit register to unaligned memory. + /// + template + void simd128_stu(void* _ptr, Ty _a); + + /// Store lowest 32-bit element to memory. 
+ /// + template + void simd128_x32_st1(void* _ptr, Ty _a); + + /// Non-temporal (streaming) store. + /// + template + void simd128_stream(void* _ptr, Ty _a); + + /// Splat 32-bit value to all lanes. + /// + template Ty simd128_splat(float _a); + + /// Splat 32-bit value to all lanes. + /// + template Ty simd128_splat(int32_t _a); + + /// Splat 32-bit value to all lanes. + /// + template Ty simd128_splat(uint32_t _a); + + /// Splat 64-bit double value to two lanes. + /// + template Ty simd128_splat(double _a); + + /// Construct from 4 floats. + /// + template + Ty simd128_ld(float _x, float _y, float _z, float _w); + + /// Construct from 4 int32s (as bit pattern). + /// + template + Ty simd128_ld(int32_t _x, int32_t _y, int32_t _z, int32_t _w); + + /// Construct from 4 uint32s (as bit pattern). + /// + template + Ty simd128_ld(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w); + + /// Return all-zero register. + /// + template + Ty simd128_zero(); + + /// Convert 4xi32 -> 4xf32. + /// + template + Ty simd128_i32_itof(Ty _a); + + /// Convert 4xf32 -> 4xi32 (truncation toward zero). + /// + template + Ty simd128_f32_ftoi_trunc(Ty _a); + + /// Convert 4xf32 -> 4xi32 (round-to-nearest, ties to even). + /// + template + Ty simd128_f32_ftoi_round(Ty _a); + + /// 4xf32 add. + /// + template + Ty simd128_f32_add(Ty _a, Ty _b); + + /// 4xf32 subtract. + /// + template + Ty simd128_f32_sub(Ty _a, Ty _b); + + /// 4xf32 multiply. + /// + template + Ty simd128_f32_mul(Ty _a, Ty _b); + + /// 4xf32 divide. + /// + template + Ty simd128_f32_div(Ty _a, Ty _b); + + /// 4xf32 fused multiply-add: _a * _b + _c + /// + template + Ty simd128_f32_madd(Ty _a, Ty _b, Ty _c); + + /// 4xf32 fused multiply-subtract: _a * _b - _c + /// + template + Ty simd128_f32_msub(Ty _a, Ty _b, Ty _c); + + /// 4xf32 negate-multiply-subtract: _c - _a * _b + /// + template + Ty simd128_f32_nmsub(Ty _a, Ty _b, Ty _c); + + /// 4x32 extract sign bits into int bitmask (bit N = sign of lane N). 
+ /// + template + int simd128_x32_signbitsmask(Ty _a); + + /// 16x8 extract sign bits into int bitmask (bit N = sign of byte N). + /// + template + int simd128_x8_signbitsmask(Ty _a); + + /// Per-byte shuffle within 16-byte lane (single-source). + /// + template + Ty simd128_x8_shuffle(Ty _a, Ty _indices); + + /// Per-byte shuffle within 16-byte lane (two-source). + /// + template + Ty simd128_x8_shuffle(Ty _a, Ty _b, Ty _indices); + + /// 4xf32 negate. + /// + template + Ty simd128_f32_neg(Ty _a); + + /// 4xf32 absolute value. + /// + template + Ty simd128_f32_abs(Ty _a); + + /// 4xf32 min. + /// + template + Ty simd128_f32_min(Ty _a, Ty _b); + + /// 4xf32 max. + /// + template + Ty simd128_f32_max(Ty _a, Ty _b); + + /// 4xf32 clamp: max(min(_a, _max), _min). + /// + template + Ty simd128_f32_clamp(Ty _a, Ty _min, Ty _max); + + /// 4xf32 linear interpolation: _a + (_b - _a) * _s. + /// + template + Ty simd128_f32_lerp(Ty _a, Ty _b, Ty _s); + + /// 4xf32 reciprocal estimate. + /// + template + Ty simd128_f32_rcp_est(Ty _a); + + /// 4xf32 reciprocal (refined). + /// + template + Ty simd128_f32_rcp(Ty _a); + + /// 4xf32 square root. + /// + template + Ty simd128_f32_sqrt(Ty _a); + + /// 4xf32 sqrt Newton-Raphson. + /// + template + Ty simd128_f32_sqrt_nr(Ty _a); + + /// 4xf32 reciprocal-sqrt estimate. + /// + template + Ty simd128_f32_rsqrt_est(Ty _a); + + /// 4xf32 reciprocal-sqrt (refined). + /// + template + Ty simd128_f32_rsqrt(Ty _a); + + /// 4xf32 reciprocal-sqrt Newton-Raphson. + /// + template + Ty simd128_f32_rsqrt_nr(Ty _a); + + /// 4xf32 reciprocal-sqrt Carmack (fast inverse square root). + /// + template + Ty simd128_f32_rsqrt_carmack(Ty _a); + + /// 4xf32 div Newton-Raphson. + /// + template + Ty simd128_f32_div_nr(Ty _a, Ty _b); + + /// 4xf32 round to nearest integer. + /// + template + Ty simd128_f32_round(Ty _a); + + /// 4xf32 ceil. + /// + template + Ty simd128_f32_ceil(Ty _a); + + /// 4xf32 floor. 
+ /// + template + Ty simd128_f32_floor(Ty _a); + + /// 4xf32 compare equal (returns mask). + /// + template + Ty simd128_f32_cmpeq(Ty _a, Ty _b); + + /// 4xf32 compare not-equal. + /// + template + Ty simd128_f32_cmpneq(Ty _a, Ty _b); + + /// 4xf32 compare less-than. + /// + template + Ty simd128_f32_cmplt(Ty _a, Ty _b); + + /// 4xf32 compare less-or-equal. + /// + template + Ty simd128_f32_cmple(Ty _a, Ty _b); + + /// 4xf32 compare greater-than. + /// + template + Ty simd128_f32_cmpgt(Ty _a, Ty _b); + + /// 4xf32 compare greater-or-equal. + /// + template + Ty simd128_f32_cmpge(Ty _a, Ty _b); + + /// Extract X (lane 0) as float. + /// + template + float simd128_f32_x(Ty _a); + + /// Extract Y (lane 1) as float. + /// + template + float simd128_f32_y(Ty _a); + + /// Extract Z (lane 2) as float. + /// + template + float simd128_f32_z(Ty _a); + + /// Extract W (lane 3) as float. + /// + template + float simd128_f32_w(Ty _a); + + /// 3-component dot product (result broadcast). + /// + template + Ty simd128_f32_dot3(Ty _a, Ty _b); + + /// 4-component dot product (result broadcast). + /// + template + Ty simd128_f32_dot(Ty _a, Ty _b); + + /// 3-component cross product. + /// + template + Ty simd128_f32_cross3(Ty _a, Ty _b); + + /// 3-component normalize. + /// + template + Ty simd128_f32_normalize3(Ty _a); + + /// 4xi32 add. + /// + template + Ty simd128_i32_add(Ty _a, Ty _b); + + /// 4xi32 subtract. + /// + template + Ty simd128_i32_sub(Ty _a, Ty _b); + + /// 4xi32 negate. + /// + template + Ty simd128_i32_neg(Ty _a); + + /// 4xi32 absolute value. + /// + template + Ty simd128_i32_abs(Ty _a); + + /// 4xi32 min. + /// + template + Ty simd128_i32_min(Ty _a, Ty _b); + + /// 4xi32 max. + /// + template + Ty simd128_i32_max(Ty _a, Ty _b); + + /// 4xi32 clamp. + /// + template + Ty simd128_i32_clamp(Ty _a, Ty _min, Ty _max); + + /// 4xu32 add. + /// + template + Ty simd128_u32_add(Ty _a, Ty _b); + + /// 4xu32 subtract. 
+ /// + template + Ty simd128_u32_sub(Ty _a, Ty _b); + + /// 4xu32 multiply. + /// + template + Ty simd128_u32_mul(Ty _a, Ty _b); + + /// 4xu32 min. + /// + template + Ty simd128_u32_min(Ty _a, Ty _b); + + /// 4xu32 max. + /// + template + Ty simd128_u32_max(Ty _a, Ty _b); + + /// 4xu32 clamp. + /// + template + Ty simd128_u32_clamp(Ty _a, Ty _min, Ty _max); + + /// 4xu32 compare equal. + /// + template + Ty simd128_u32_cmpeq(Ty _a, Ty _b); + + /// 4xu32 compare less-than. + /// + template + Ty simd128_u32_cmplt(Ty _a, Ty _b); + + /// 4xu32 compare greater-than. + /// + template + Ty simd128_u32_cmpgt(Ty _a, Ty _b); + + /// 4xi32 compare equal. + /// + template + Ty simd128_i32_cmpeq(Ty _a, Ty _b); + + /// 4xi32 compare less-than. + /// + template + Ty simd128_i32_cmplt(Ty _a, Ty _b); + + /// 4xi32 compare greater-than. + /// + template + Ty simd128_i32_cmpgt(Ty _a, Ty _b); + + /// 8xi16 add. + /// + template + Ty simd128_i16_add(Ty _a, Ty _b); + + /// 8xi16 subtract. + /// + template + Ty simd128_i16_sub(Ty _a, Ty _b); + + /// 8xi16 multiply low (keep low 16 bits of 32-bit result). + /// + template + Ty simd128_i16_mullo(Ty _a, Ty _b); + + /// 8xi16 compare equal. + /// + template + Ty simd128_i16_cmpeq(Ty _a, Ty _b); + + /// 128-bit shift left logical (per-16-bit lane). + /// + template + Ty simd128_x16_sll(Ty _a, int _count); + + /// 128-bit shift right logical (per-16-bit lane). + /// + template + Ty simd128_x16_srl(Ty _a, int _count); + + /// Splat signed 16-bit value to all lanes. + /// + template Ty simd128_splat(int16_t _a); + + /// Splat unsigned 16-bit value to all lanes. + /// + template Ty simd128_splat(uint16_t _a); + + /// 16xi8 add. + /// + template + Ty simd128_i8_add(Ty _a, Ty _b); + + /// 16xi8 subtract. + /// + template + Ty simd128_i8_sub(Ty _a, Ty _b); + + /// 16xu8 saturating add. + /// + template + Ty simd128_u8_satadd(Ty _a, Ty _b); + + /// 16xu8 saturating subtract. 
+ /// + template + Ty simd128_u8_satsub(Ty _a, Ty _b); + + /// 8xu16 saturating add. + /// + template + Ty simd128_u16_satadd(Ty _a, Ty _b); + + /// 8xu16 saturating subtract. + /// + template + Ty simd128_u16_satsub(Ty _a, Ty _b); + + /// 128-bit bitwise AND. + /// + template + Ty simd128_and(Ty _a, Ty _b); + + /// 128-bit AND-NOT: _a & ~_b. + /// + template + Ty simd128_andc(Ty _a, Ty _b); + + /// 128-bit bitwise OR. + /// + template + Ty simd128_or(Ty _a, Ty _b); + + /// 128-bit OR complement: ~_a | _b. + /// + template + Ty simd128_orc(Ty _a, Ty _b); + + /// 128-bit bitwise XOR. + /// + template + Ty simd128_xor(Ty _a, Ty _b); + + /// 128-bit bitwise NOT. + /// + template + Ty simd128_not(Ty _a); + + /// 128-bit shift left logical (per-32-bit lane, immediate count). + /// + template + Ty simd128_x32_sll(Ty _a, int _count); + + /// 128-bit shift left logical (per-32-bit lane, per-lane variable count). + /// + template + Ty simd128_x32_sll(Ty _a, Ty _count); + + /// 128-bit shift right logical (per-32-bit lane). + /// + template + Ty simd128_x32_srl(Ty _a, int _count); + + /// 128-bit shift right logical (per-32-bit lane, per-lane variable count). + /// + template + Ty simd128_x32_srl(Ty _a, Ty _count); + + /// 128-bit shift right arithmetic (per-32-bit lane). + /// + template + Ty simd128_x32_sra(Ty _a, int _count); + + /// 128-bit shift right arithmetic (per-32-bit lane, per-lane variable count). + /// + template + Ty simd128_x32_sra(Ty _a, Ty _count); + + /// Select by bitmask: for each bit, result = mask ? _a : _b. + /// + template + Ty simd128_selb(Ty _mask, Ty _a, Ty _b); + + /// Select by sign bit: for each lane, result = (test < 0) ? _a : _b. + /// + template + Ty simd128_sels(Ty _test, Ty _a, Ty _b); + + /// OR-reduce all lanes into lane 0. 
+ /// + template + Ty simd128_orx(Ty _a); + +#define ELEMx 0 +#define ELEMy 1 +#define ELEMz 2 +#define ELEMw 3 + +#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ + template \ + Ty simd128_x32_swiz_##_x##_y##_z##_w(Ty _a); + +#include "simd128_swizzle.inl" + +#undef BX_SIMD128_IMPLEMENT_SWIZZLE +#undef ELEMw +#undef ELEMz +#undef ELEMy +#undef ELEMx + + /// Shuffle: [a.x, a.y, b.x, b.y] + /// + template + Ty simd128_x32_shuf_xyAB(Ty _a, Ty _b); + + /// Shuffle: [b.x, b.y, a.x, a.y] + /// + template + Ty simd128_x32_shuf_ABxy(Ty _a, Ty _b); + + /// Shuffle: [b.z, b.w, a.z, a.w] + /// + template + Ty simd128_x32_shuf_CDzw(Ty _a, Ty _b); + + /// Shuffle: [a.z, a.w, b.z, b.w] + /// + template + Ty simd128_x32_shuf_zwCD(Ty _a, Ty _b); + + /// Shuffle: [a.x, b.x, a.y, b.y] + /// + template + Ty simd128_x32_shuf_xAyB(Ty _a, Ty _b); + + /// Shuffle: [b.x, a.x, b.y, a.y] + /// + template + Ty simd128_x32_shuf_AxBy(Ty _a, Ty _b); + + /// Shuffle: [a.z, b.z, a.w, b.w] + /// + template + Ty simd128_x32_shuf_zCwD(Ty _a, Ty _b); + + /// Shuffle: [b.z, a.z, b.w, a.w] + /// + template + Ty simd128_x32_shuf_CzDw(Ty _a, Ty _b); + + /// Shuffle: [a.x, b.x, a.z, b.z] + /// + template + Ty simd128_x32_shuf_xAzC(Ty _a, Ty _b); + + /// Shuffle: [a.y, b.y, a.w, b.w] + /// + template + Ty simd128_x32_shuf_yBwD(Ty _a, Ty _b); + + /// Shuffle: [a.x, a.z, b.x, b.z] (SHUFPS imm 0x88, even-lane deinterleave) + /// + template + Ty simd128_x32_shuf_xzAC(Ty _a, Ty _b); + + /// Shuffle: [a.y, a.w, b.y, b.w] (SHUFPS imm 0xDD, odd-lane deinterleave) + /// + template + Ty simd128_x32_shuf_ywBD(Ty _a, Ty _b); + + /// Shuffle: [a.x, a.x, b.x, b.x] (SHUFPS imm 0x00) + /// + template + Ty simd128_x32_shuf_xxAA(Ty _a, Ty _b); + + /// Shuffle: [a.y, a.y, b.y, b.y] (SHUFPS imm 0x55) + /// + template + Ty simd128_x32_shuf_yyBB(Ty _a, Ty _b); + + /// Shuffle: [a.z, a.z, b.z, b.z] (SHUFPS imm 0xAA) + /// + template + Ty simd128_x32_shuf_zzCC(Ty _a, Ty _b); + + /// Shuffle: [a.w, a.w, b.w, b.w]
(SHUFPS imm 0xFF) + /// + template + Ty simd128_x32_shuf_wwDD(Ty _a, Ty _b); + +#define BX_SIMD128_IMPLEMENT_TEST(_xyzw) \ + template \ + BX_SIMD_FORCE_INLINE bool simd128_test_any_##_xyzw(Ty _test); \ + \ + template \ + BX_SIMD_FORCE_INLINE bool simd128_test_all_##_xyzw(Ty _test); + + BX_SIMD128_IMPLEMENT_TEST(x ); + BX_SIMD128_IMPLEMENT_TEST(y ); + BX_SIMD128_IMPLEMENT_TEST(xy ); + BX_SIMD128_IMPLEMENT_TEST(z ); + BX_SIMD128_IMPLEMENT_TEST(xz ); + BX_SIMD128_IMPLEMENT_TEST(yz ); + BX_SIMD128_IMPLEMENT_TEST(xyz ); + BX_SIMD128_IMPLEMENT_TEST(w ); + BX_SIMD128_IMPLEMENT_TEST(xw ); + BX_SIMD128_IMPLEMENT_TEST(yw ); + BX_SIMD128_IMPLEMENT_TEST(xyw ); + BX_SIMD128_IMPLEMENT_TEST(zw ); + BX_SIMD128_IMPLEMENT_TEST(xzw ); + BX_SIMD128_IMPLEMENT_TEST(yzw ); + BX_SIMD128_IMPLEMENT_TEST(xyzw); + +#undef BX_SIMD128_IMPLEMENT_TEST + + template bool simd128_test_zero(Ty _a, Ty _b); + + template Ty simd128_f64_add(Ty _a, Ty _b); + template Ty simd128_f64_sub(Ty _a, Ty _b); + template Ty simd128_f64_mul(Ty _a, Ty _b); + template Ty simd128_f64_div(Ty _a, Ty _b); + template Ty simd128_f64_min(Ty _a, Ty _b); + template Ty simd128_f64_max(Ty _a, Ty _b); + template Ty simd128_f64_madd(Ty _a, Ty _b, Ty _c); + template Ty simd128_f64_nmsub(Ty _a, Ty _b, Ty _c); + template Ty simd128_f64_neg(Ty _a); + template Ty simd128_f64_abs(Ty _a); + template Ty simd128_f64_clamp(Ty _a, Ty _min, Ty _max); + template Ty simd128_f64_lerp(Ty _a, Ty _b, Ty _s); + template Ty simd128_f64_rcp(Ty _a); + template Ty simd128_f64_sqrt(Ty _a); + template Ty simd128_f64_rsqrt(Ty _a); + template Ty simd128_f64_round(Ty _a); + template Ty simd128_f64_ceil(Ty _a); + template Ty simd128_f64_floor(Ty _a); + template Ty simd128_f64_cmpeq(Ty _a, Ty _b); + template Ty simd128_f64_cmpneq(Ty _a, Ty _b); + template Ty simd128_f64_cmplt(Ty _a, Ty _b); + template Ty simd128_f64_cmple(Ty _a, Ty _b); + template Ty simd128_f64_cmpgt(Ty _a, Ty _b); + template Ty simd128_f64_cmpge(Ty _a, Ty _b); + + template Ty
simd128_i64_add(Ty _a, Ty _b); + template Ty simd128_i64_sub(Ty _a, Ty _b); + template Ty simd128_u64_add(Ty _a, Ty _b); + template Ty simd128_u64_sub(Ty _a, Ty _b); + + template bool simd256_test_any(Ty _test); + template bool simd256_test_all(Ty _test); + template bool simd256_test_zero(Ty _a, Ty _b); + + template Ty simd256_ld(const void* _ptr); + template Ty simd256_ldu(const void* _ptr); + template void simd256_st(void* _ptr, Ty _a); + template void simd256_stu(void* _ptr, Ty _a); + template void simd256_x32_st1(void* _ptr, Ty _a); + template void simd256_stream(void* _ptr, Ty _a); + + template Ty simd256_splat(float _a); + template Ty simd256_splat(int32_t _a); + template Ty simd256_splat(uint32_t _a); + template Ty simd256_splat(double _a); + template Ty simd256_splat(int16_t _a); + template Ty simd256_splat(uint16_t _a); + template Ty simd256_ld(float _x0, float _x1, float _x2, float _x3, float _x4, float _x5, float _x6, float _x7); + template Ty simd256_ld(int32_t _x0, int32_t _x1, int32_t _x2, int32_t _x3, int32_t _x4, int32_t _x5, int32_t _x6, int32_t _x7); + template Ty simd256_ld(uint32_t _x0, uint32_t _x1, uint32_t _x2, uint32_t _x3, uint32_t _x4, uint32_t _x5, uint32_t _x6, uint32_t _x7); + template Ty simd256_zero(); + + template Ty simd256_i32_itof(Ty _a); + template Ty simd256_f32_ftoi_trunc(Ty _a); + template Ty simd256_f32_ftoi_round(Ty _a); + + template Ty simd256_f32_add(Ty _a, Ty _b); + template Ty simd256_f32_sub(Ty _a, Ty _b); + template Ty simd256_f32_mul(Ty _a, Ty _b); + template Ty simd256_f32_div(Ty _a, Ty _b); + template Ty simd256_f32_madd(Ty _a, Ty _b, Ty _c); + template Ty simd256_f32_msub(Ty _a, Ty _b, Ty _c); + template Ty simd256_f32_nmsub(Ty _a, Ty _b, Ty _c); + template int simd256_x32_signbitsmask(Ty _a); + template int simd256_x8_signbitsmask(Ty _a); + template Ty simd256_x8_shuffle(Ty _a, Ty _indices); + template Ty simd256_x8_shuffle(Ty _a, Ty _b, Ty _indices); + template Ty simd256_f32_neg(Ty _a); + template Ty 
simd256_f32_abs(Ty _a); + template Ty simd256_f32_min(Ty _a, Ty _b); + template Ty simd256_f32_max(Ty _a, Ty _b); + template Ty simd256_f32_clamp(Ty _a, Ty _min, Ty _max); + template Ty simd256_f32_lerp(Ty _a, Ty _b, Ty _s); + template Ty simd256_f32_rcp_est(Ty _a); + template Ty simd256_f32_rcp(Ty _a); + template Ty simd256_f32_sqrt(Ty _a); + template Ty simd256_f32_rsqrt_est(Ty _a); + template Ty simd256_f32_rsqrt(Ty _a); + template Ty simd256_f32_round(Ty _a); + template Ty simd256_f32_ceil(Ty _a); + template Ty simd256_f32_floor(Ty _a); + template Ty simd256_f32_cmpeq(Ty _a, Ty _b); + template Ty simd256_f32_cmpneq(Ty _a, Ty _b); + template Ty simd256_f32_cmplt(Ty _a, Ty _b); + template Ty simd256_f32_cmple(Ty _a, Ty _b); + template Ty simd256_f32_cmpgt(Ty _a, Ty _b); + template Ty simd256_f32_cmpge(Ty _a, Ty _b); + + template Ty simd256_i32_add(Ty _a, Ty _b); + template Ty simd256_i32_sub(Ty _a, Ty _b); + template Ty simd256_i32_neg(Ty _a); + template Ty simd256_i32_abs(Ty _a); + template Ty simd256_i32_min(Ty _a, Ty _b); + template Ty simd256_i32_max(Ty _a, Ty _b); + template Ty simd256_i32_clamp(Ty _a, Ty _min, Ty _max); + + template Ty simd256_u32_add(Ty _a, Ty _b); + template Ty simd256_u32_sub(Ty _a, Ty _b); + template Ty simd256_u32_mul(Ty _a, Ty _b); + template Ty simd256_u32_min(Ty _a, Ty _b); + template Ty simd256_u32_max(Ty _a, Ty _b); + template Ty simd256_u32_clamp(Ty _a, Ty _min, Ty _max); + template Ty simd256_u32_cmpeq(Ty _a, Ty _b); + template Ty simd256_u32_cmplt(Ty _a, Ty _b); + template Ty simd256_u32_cmpgt(Ty _a, Ty _b); + + template Ty simd256_i32_cmpeq(Ty _a, Ty _b); + template Ty simd256_i32_cmplt(Ty _a, Ty _b); + template Ty simd256_i32_cmpgt(Ty _a, Ty _b); + + template Ty simd256_i16_add(Ty _a, Ty _b); + template Ty simd256_i16_sub(Ty _a, Ty _b); + template Ty simd256_i16_mullo(Ty _a, Ty _b); + template Ty simd256_i16_cmpeq(Ty _a, Ty _b); + template Ty simd256_x16_sll(Ty _a, int _count); + template Ty simd256_x16_srl(Ty _a, int 
_count); + + template Ty simd256_i8_add(Ty _a, Ty _b); + template Ty simd256_i8_sub(Ty _a, Ty _b); + + template Ty simd256_u8_satadd(Ty _a, Ty _b); + template Ty simd256_u8_satsub(Ty _a, Ty _b); + template Ty simd256_u16_satadd(Ty _a, Ty _b); + template Ty simd256_u16_satsub(Ty _a, Ty _b); + + template Ty simd256_and(Ty _a, Ty _b); + template Ty simd256_andc(Ty _a, Ty _b); + template Ty simd256_or(Ty _a, Ty _b); + template Ty simd256_orc(Ty _a, Ty _b); + template Ty simd256_xor(Ty _a, Ty _b); + template Ty simd256_not(Ty _a); + template Ty simd256_x32_sll(Ty _a, int _count); + template Ty simd256_x32_sll(Ty _a, Ty _count); + template Ty simd256_x32_srl(Ty _a, int _count); + template Ty simd256_x32_srl(Ty _a, Ty _count); + template Ty simd256_x32_sra(Ty _a, int _count); + template Ty simd256_x32_sra(Ty _a, Ty _count); + + template Ty simd256_selb(Ty _mask, Ty _a, Ty _b); + template Ty simd256_sels(Ty _test, Ty _a, Ty _b); + + template Ty simd256_f64_add(Ty _a, Ty _b); + template Ty simd256_f64_sub(Ty _a, Ty _b); + template Ty simd256_f64_mul(Ty _a, Ty _b); + template Ty simd256_f64_div(Ty _a, Ty _b); + template Ty simd256_f64_min(Ty _a, Ty _b); + template Ty simd256_f64_max(Ty _a, Ty _b); + template Ty simd256_f64_madd(Ty _a, Ty _b, Ty _c); + template Ty simd256_f64_nmsub(Ty _a, Ty _b, Ty _c); + template Ty simd256_f64_neg(Ty _a); + template Ty simd256_f64_abs(Ty _a); + template Ty simd256_f64_clamp(Ty _a, Ty _min, Ty _max); + template Ty simd256_f64_lerp(Ty _a, Ty _b, Ty _s); + template Ty simd256_f64_rcp(Ty _a); + template Ty simd256_f64_sqrt(Ty _a); + template Ty simd256_f64_rsqrt(Ty _a); + template Ty simd256_f64_round(Ty _a); + template Ty simd256_f64_ceil(Ty _a); + template Ty simd256_f64_floor(Ty _a); + template Ty simd256_f64_cmpeq(Ty _a, Ty _b); + template Ty simd256_f64_cmpneq(Ty _a, Ty _b); + template Ty simd256_f64_cmplt(Ty _a, Ty _b); + template Ty simd256_f64_cmple(Ty _a, Ty _b); + template Ty simd256_f64_cmpgt(Ty _a, Ty _b); + template Ty 
simd256_f64_cmpge(Ty _a, Ty _b); + + template Ty simd256_i64_add(Ty _a, Ty _b); + template Ty simd256_i64_sub(Ty _a, Ty _b); + template Ty simd256_u64_add(Ty _a, Ty _b); + template Ty simd256_u64_sub(Ty _a, Ty _b); + + // These are used when a platform lacks a dedicated instruction. + + template + Ty simd_f32_madd_ni(Ty _a, Ty _b, Ty _c); + + template + Ty simd_f32_msub_ni(Ty _a, Ty _b, Ty _c); + + template + Ty simd_f32_nmsub_ni(Ty _a, Ty _b, Ty _c); + + template + int simd_x32_signbitsmask_ni(Ty _a); + + template + int simd_x8_signbitsmask_ni(Ty _a); + + template + Ty simd_f32_neg_ni(Ty _a); + + template + Ty simd_f32_abs_ni(Ty _a); + + template + Ty simd_f64_abs_ni(Ty _a); + + template + Ty simd_andc_ni(Ty _a, Ty _b); + + template + Ty simd_u32_cmplt_ni(Ty _a, Ty _b); + + template + Ty simd_u32_cmpgt_ni(Ty _a, Ty _b); + + template + Ty simd_f64_madd_ni(Ty _a, Ty _b, Ty _c); + + template + Ty simd_f64_nmsub_ni(Ty _a, Ty _b, Ty _c); + + template + Ty simd_f64_neg_ni(Ty _a); + + template + Ty simd_f64_lerp_ni(Ty _a, Ty _b, Ty _s); + + template + Ty simd_f64_rcp_ni(Ty _a); + + template + Ty simd_f64_rsqrt_ni(Ty _a); + + template + Ty simd_f64_cmpneq_ni(Ty _a, Ty _b); + + template + Ty simd_f32_rcp_ni(Ty _a); + + template + Ty simd_f32_div_nr_ni(Ty _a, Ty _b); + + template + Ty simd_f32_sqrt_nr_ni(Ty _a); + + template + Ty simd_f32_sqrt_nr1_ni(Ty _a); + + template + Ty simd_f32_rsqrt_ni(Ty _a); + + template + Ty simd_f32_rsqrt_nr_ni(Ty _a); + + template + Ty simd_f32_rsqrt_carmack_ni(Ty _a); + + template + Ty simd_f32_min_ni(Ty _a, Ty _b); + + template + Ty simd_f32_max_ni(Ty _a, Ty _b); + + template + Ty simd_f32_clamp_ni(Ty _a, Ty _min, Ty _max); + + template + Ty simd_f32_lerp_ni(Ty _a, Ty _b, Ty _s); + + template + Ty simd128_f32_dot3_ni(Ty _a, Ty _b); + + template + Ty simd128_f32_dot_ni(Ty _a, Ty _b); + + template + Ty simd128_f32_cross3_ni(Ty _a, Ty _b); + + template + Ty simd128_f32_normalize3_ni(Ty _a); + + template + Ty simd_f32_ceil_ni(Ty _a); + + template 
+ Ty simd_f32_floor_ni(Ty _a); + + template + Ty simd_f32_round_ni(Ty _a); + + template + Ty simd_f32_log2_ni(Ty _a); + + template + Ty simd_f32_exp2_ni(Ty _a); + + template + Ty simd_f32_pow_ni(Ty _a, Ty _b); + + template + Ty simd_f32_ldexp_ni(Ty _a, Ty _b); + + template + Ty simd_f32_cos_ni(Ty _a); + + template + Ty simd_f32_sin_ni(Ty _a); + + template + Ty simd_f32_log_ni(Ty _a); + + template + Ty simd_f32_exp_ni(Ty _a); + + template + Ty simd_f32_cmpneq_ni(Ty _a, Ty _b); + + template + Ty simd_i32_min_ni(Ty _a, Ty _b); + + template + Ty simd_i32_max_ni(Ty _a, Ty _b); + + template + Ty simd_i32_neg_ni(Ty _a); + + template + Ty simd_i32_abs_ni(Ty _a); + + template + Ty simd_selb_ni(Ty _mask, Ty _a, Ty _b); + + template + Ty simd_sels_ni(Ty _test, Ty _a, Ty _b); + + template + Ty simd_not_ni(Ty _a); + + template + Ty simd_orc_ni(Ty _a, Ty _b); + + template + Ty simd128_orx_ni(Ty _a); + + template + Ty simd128_x32_shuf_xAzC_ni(Ty _a, Ty _b); + + template + Ty simd128_x32_shuf_yBwD_ni(Ty _a, Ty _b); + + template + bool simd128_test_any_ni(Ty _a); + + template + bool simd128_test_all_ni(Ty _a); + + template + bool simd128_test_zero_ni(Ty _a, Ty _b); + + template + Ty simd_f32_ftoi_round_ni(Ty _a); + + template + Ty simd_u32_cntlz_ni(Ty _a); + + template + Ty simd_x32_srl_ni(Ty _a, Ty _count); + + template + Ty simd_x32_sll_ni(Ty _a, Ty _count); + + template + Ty simd_x32_sra_ni(Ty _a, Ty _count); + + template + Ty simd_x8_shuffle_ni(Ty _a, Ty _indices); + + template + Ty simd_x8_shuffle_ni(Ty _a, Ty _b, Ty _indices); + + template + Ty simd_f16_fromf32_ni(Ty _a); + + template + Ty simd_f16_tof32_ni(Ty _a); + +} // namespace bx + +#if BX_SIMD_AVX +# include "simd256_avx.inl" +#endif // BX_SIMD_AVX + +#if BX_SIMD_SSE +# include "simd128_sse.inl" +#endif // BX_SIMD_SSE + +#if BX_SIMD_NEON +# include "simd128_neon.inl" +#endif // BX_SIMD_NEON + +#if BX_SIMD_WASM +# include "simd128_wasm.inl" +#endif // BX_SIMD_WASM + +namespace bx +{ + BX_CONSTEXPR_FUNC float floor(float 
_f); + BX_CONSTEXPR_FUNC float ceil(float _f); + BX_CONSTEXPR_FUNC float round(float _f); + BX_CONSTEXPR_FUNC float sqrt(float _a); +} // namespace bx + +#include "simd32_ref.inl" +#include "simd64_ref.inl" +#include "simd128_ref.inl" +#include "simd256_ref.inl" +#include "simd_ni.inl" + +namespace bx +{ + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_add(simd128_t _a, simd128_t _b) { return simd128_f32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_sub(simd128_t _a, simd128_t _b) { return simd128_f32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_mul(simd128_t _a, simd128_t _b) { return simd128_f32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_div(simd128_t _a, simd128_t _b) { return simd128_f32_div(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_madd(simd128_t _a, simd128_t _b, simd128_t _c) { return simd128_f32_madd(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_msub(simd128_t _a, simd128_t _b, simd128_t _c) { return simd128_f32_msub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE int simd_x32_signbitsmask(simd128_t _a) { return simd128_x32_signbitsmask(_a); } + + template<> + BX_SIMD_FORCE_INLINE int simd_x8_signbitsmask(simd128_t _a) { return simd128_x8_signbitsmask(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x8_shuffle(simd128_t _a, simd128_t _indices) { return simd128_x8_shuffle(_a, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x8_shuffle(simd128_t _a, simd128_t _b, simd128_t _indices) { return simd128_x8_shuffle(_a, _b, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_min(simd128_t _a, simd128_t _b) { return simd128_f32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_max(simd128_t _a, simd128_t _b) { return simd128_f32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_add(simd128_t _a, simd128_t _b) { return 
simd128_i32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_sub(simd128_t _a, simd128_t _b) { return simd128_i32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_neg(simd128_t _a) { return simd128_i32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_abs(simd128_t _a) { return simd128_i32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_and(simd128_t _a, simd128_t _b) { return simd128_and(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_or(simd128_t _a, simd128_t _b) { return simd128_or(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_xor(simd128_t _a, simd128_t _b) { return simd128_xor(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_add(simd256_t _a, simd256_t _b) { return simd256_f32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_sub(simd256_t _a, simd256_t _b) { return simd256_f32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_mul(simd256_t _a, simd256_t _b) { return simd256_f32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_div(simd256_t _a, simd256_t _b) { return simd256_f32_div(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_msub(simd256_t _a, simd256_t _b, simd256_t _c) { return simd256_f32_msub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_madd(simd256_t _a, simd256_t _b, simd256_t _c) { return simd256_f32_madd(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE int simd_x32_signbitsmask(simd256_t _a) { return simd256_x32_signbitsmask(_a); } + + template<> + BX_SIMD_FORCE_INLINE int simd_x8_signbitsmask(simd256_t _a) { return simd256_x8_signbitsmask(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x8_shuffle(simd256_t _a, simd256_t _indices) { return simd256_x8_shuffle(_a, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x8_shuffle(simd256_t _a, simd256_t _b, simd256_t _indices) { 
return simd256_x8_shuffle(_a, _b, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_min(simd256_t _a, simd256_t _b) { return simd256_f32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_max(simd256_t _a, simd256_t _b) { return simd256_f32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_add(simd256_t _a, simd256_t _b) { return simd256_i32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_sub(simd256_t _a, simd256_t _b) { return simd256_i32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_neg(simd256_t _a) { return simd256_i32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_abs(simd256_t _a) { return simd256_i32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_and(simd256_t _a, simd256_t _b) { return simd256_and(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_or(simd256_t _a, simd256_t _b) { return simd256_or(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_xor(simd256_t _a, simd256_t _b) { return simd256_xor(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_nmsub(simd128_t _a, simd128_t _b, simd128_t _c) { return simd128_f32_nmsub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_neg(simd128_t _a) { return simd128_f32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_abs(simd128_t _a) { return simd128_f32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_rcp_est(simd128_t _a) { return simd128_f32_rcp_est(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_rsqrt_est(simd128_t _a) { return simd128_f32_rsqrt_est(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_sqrt(simd128_t _a) { return simd128_f32_sqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_rsqrt(simd128_t _a) { return simd128_f32_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_div_nr(simd128_t 
_a, simd128_t _b) { return simd128_f32_div_nr(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_sqrt_nr(simd128_t _a) { return simd128_f32_sqrt_nr(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_sqrt_nr1(simd128_t _a) { return simd_f32_sqrt_nr1_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_rsqrt_nr(simd128_t _a) { return simd128_f32_rsqrt_nr(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_rsqrt_carmack(simd128_t _a) { return simd128_f32_rsqrt_carmack(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cmpeq(simd128_t _a, simd128_t _b) { return simd128_f32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cmplt(simd128_t _a, simd128_t _b) { return simd128_f32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cmpgt(simd128_t _a, simd128_t _b) { return simd128_f32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cmpneq(simd128_t _a, simd128_t _b) { return simd128_f32_cmpneq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cmple(simd128_t _a, simd128_t _b) { return simd128_f32_cmple(_a, _b); } /* _a <= _b: must also hold when _a == _b, which a swapped strict cmpgt(_b, _a) does not */ + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cmpge(simd128_t _a, simd128_t _b) { return simd128_f32_cmpge(_a, _b); } /* _a >= _b: must also hold when _a == _b, which a swapped strict cmplt(_b, _a) does not */ + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_clamp(simd128_t _a, simd128_t _min, simd128_t _max) { return simd128_f32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_lerp(simd128_t _a, simd128_t _b, simd128_t _s) { return simd128_f32_lerp(_a, _b, _s); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_rcp(simd128_t _a) { return simd128_f32_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_round(simd128_t _a) { return simd128_f32_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_ceil(simd128_t _a) { return simd128_f32_ceil(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t
simd_f32_floor(simd128_t _a) { return simd128_f32_floor(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_cos(simd128_t _a) { return simd_f32_cos_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_sin(simd128_t _a) { return simd_f32_sin_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_log(simd128_t _a) { return simd_f32_log_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_exp(simd128_t _a) { return simd_f32_exp_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_ldexp(simd128_t _a, simd128_t _b) { return simd_f32_ldexp_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_log2(simd128_t _a) { return simd_f32_log2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_exp2(simd128_t _a) { return simd_f32_exp2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_pow(simd128_t _a, simd128_t _b) { return simd_f32_pow_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_ftoi_trunc(simd128_t _a) { return simd128_f32_ftoi_trunc(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f32_ftoi_round(simd128_t _a) { return simd128_f32_ftoi_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_itof(simd128_t _a) { return simd128_i32_itof(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_add(simd128_t _a, simd128_t _b) { return simd128_f64_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_sub(simd128_t _a, simd128_t _b) { return simd128_f64_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_mul(simd128_t _a, simd128_t _b) { return simd128_f64_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_div(simd128_t _a, simd128_t _b) { return simd128_f64_div(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_madd(simd128_t _a, simd128_t _b, simd128_t _c) { return simd128_f64_madd(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE 
simd128_t simd_f64_nmsub(simd128_t _a, simd128_t _b, simd128_t _c) { return simd128_f64_nmsub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_neg(simd128_t _a) { return simd128_f64_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_abs(simd128_t _a) { return simd128_f64_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_min(simd128_t _a, simd128_t _b) { return simd128_f64_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_max(simd128_t _a, simd128_t _b) { return simd128_f64_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_clamp(simd128_t _a, simd128_t _min, simd128_t _max) { return simd128_f64_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_lerp(simd128_t _a, simd128_t _b, simd128_t _s) { return simd128_f64_lerp(_a, _b, _s); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_rcp(simd128_t _a) { return simd128_f64_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_sqrt(simd128_t _a) { return simd128_f64_sqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_rsqrt(simd128_t _a) { return simd128_f64_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_round(simd128_t _a) { return simd128_f64_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_ceil(simd128_t _a) { return simd128_f64_ceil(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_floor(simd128_t _a) { return simd128_f64_floor(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_cmpeq(simd128_t _a, simd128_t _b) { return simd128_f64_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_cmpneq(simd128_t _a, simd128_t _b) { return simd128_f64_cmpneq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_cmplt(simd128_t _a, simd128_t _b) { return simd128_f64_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_cmple(simd128_t _a, 
simd128_t _b) { return simd128_f64_cmple(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_cmpgt(simd128_t _a, simd128_t _b) { return simd128_f64_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_f64_cmpge(simd128_t _a, simd128_t _b) { return simd128_f64_cmpge(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_cmplt(simd128_t _a, simd128_t _b) { return simd128_i32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_cmpgt(simd128_t _a, simd128_t _b) { return simd128_i32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_min(simd128_t _a, simd128_t _b) { return simd128_i32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_max(simd128_t _a, simd128_t _b) { return simd128_i32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_clamp(simd128_t _a, simd128_t _min, simd128_t _max) { return simd128_i32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_cmpeq(simd128_t _a, simd128_t _b) { return simd128_i32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u32_add(simd128_t _a, simd128_t _b) { return simd128_u32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u32_sub(simd128_t _a, simd128_t _b) { return simd128_u32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u32_cmplt(simd128_t _a, simd128_t _b) { return simd128_u32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u32_cmpgt(simd128_t _a, simd128_t _b) { return simd128_u32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_orc(simd128_t _a, simd128_t _b) { return simd128_orc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_ld(const void* _ptr) { return simd128_ld(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_ldu(const void* _ptr) { return simd128_ldu(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, 
simd128_t _a) { simd128_st(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_stu(void* _ptr, simd128_t _a) { simd128_stu(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_x32_st1(void* _ptr, simd128_t _a) { simd128_x32_st1(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_selb(simd128_t _mask, simd128_t _a, simd128_t _b) { return simd128_selb(_mask, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_sels(simd128_t _test, simd128_t _a, simd128_t _b) { return simd128_sels(_test, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_not(simd128_t _a) { return simd128_not(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_andc(simd128_t _a, simd128_t _b) { return simd128_andc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x32_sra(simd128_t _a, int _count) { return simd128_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x32_srl(simd128_t _a, int _count) { return simd128_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x32_sll(simd128_t _a, int _count) { return simd128_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x32_sll(simd128_t _a, simd128_t _count) { return simd128_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x32_srl(simd128_t _a, simd128_t _count) { return simd128_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_x32_sra(simd128_t _a, simd128_t _count) { return simd128_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_i32_mul(simd128_t _a, simd128_t _b) { return simd128_u32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u8_satadd(simd128_t _a, simd128_t _b) { return simd128_u8_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u8_satsub(simd128_t _a, simd128_t _b) { return simd128_u8_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u16_satadd(simd128_t 
_a, simd128_t _b) { return simd128_u16_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_u16_satsub(simd128_t _a, simd128_t _b) { return simd128_u16_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_splat(float _a) { return simd128_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_splat(uint32_t _a) { return simd128_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_splat(double _a) { return simd128_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd128_t simd_zero() { return simd128_zero(); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_any(simd128_t _test) { return simd128_test_any_xyzw(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_all(simd128_t _test) { return simd128_test_all_xyzw(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_zero(simd128_t _a, simd128_t _b) { return simd128_test_zero(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_nmsub(simd256_t _a, simd256_t _b, simd256_t _c) { return simd256_f32_nmsub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_neg(simd256_t _a) { return simd256_f32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_abs(simd256_t _a) { return simd256_f32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_rcp_est(simd256_t _a) { return simd256_f32_rcp_est(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_rsqrt_est(simd256_t _a) { return simd256_f32_rsqrt_est(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_sqrt(simd256_t _a) { return simd256_f32_sqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_rsqrt(simd256_t _a) { return simd256_f32_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_div_nr(simd256_t _a, simd256_t _b) { return simd_f32_div_nr_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_sqrt_nr(simd256_t _a) { return simd_f32_sqrt_nr_ni(_a); } + + 
template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_sqrt_nr1(simd256_t _a) { return simd_f32_sqrt_nr1_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_rsqrt_nr(simd256_t _a) { return simd_f32_rsqrt_nr_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_rsqrt_carmack(simd256_t _a) { return simd_f32_rsqrt_carmack_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cmpeq(simd256_t _a, simd256_t _b) { return simd256_f32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cmplt(simd256_t _a, simd256_t _b) { return simd256_f32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cmpgt(simd256_t _a, simd256_t _b) { return simd256_f32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cmpneq(simd256_t _a, simd256_t _b) { return simd256_f32_cmpneq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cmple(simd256_t _a, simd256_t _b) { return simd256_f32_cmple(_a, _b); } /* _a <= _b: must also hold when _a == _b, which a swapped strict cmpgt(_b, _a) does not */ + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cmpge(simd256_t _a, simd256_t _b) { return simd256_f32_cmpge(_a, _b); } /* _a >= _b: must also hold when _a == _b, which a swapped strict cmplt(_b, _a) does not */ + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_clamp(simd256_t _a, simd256_t _min, simd256_t _max) { return simd256_f32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_lerp(simd256_t _a, simd256_t _b, simd256_t _s) { return simd256_f32_lerp(_a, _b, _s); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_rcp(simd256_t _a) { return simd256_f32_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_round(simd256_t _a) { return simd256_f32_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_ceil(simd256_t _a) { return simd256_f32_ceil(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_floor(simd256_t _a) { return simd256_f32_floor(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_cos(simd256_t _a) { return simd_f32_cos_ni(_a); } + + template<> +
BX_SIMD_FORCE_INLINE simd256_t simd_f32_sin(simd256_t _a) { return simd_f32_sin_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_log(simd256_t _a) { return simd_f32_log_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_exp(simd256_t _a) { return simd_f32_exp_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_ldexp(simd256_t _a, simd256_t _b) { return simd_f32_ldexp_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_log2(simd256_t _a) { return simd_f32_log2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_exp2(simd256_t _a) { return simd_f32_exp2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_pow(simd256_t _a, simd256_t _b) { return simd_f32_pow_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_ftoi_trunc(simd256_t _a) { return simd256_f32_ftoi_trunc(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f32_ftoi_round(simd256_t _a) { return simd256_f32_ftoi_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_itof(simd256_t _a) { return simd256_i32_itof(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_add(simd256_t _a, simd256_t _b) { return simd256_f64_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_sub(simd256_t _a, simd256_t _b) { return simd256_f64_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_mul(simd256_t _a, simd256_t _b) { return simd256_f64_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_div(simd256_t _a, simd256_t _b) { return simd256_f64_div(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_madd(simd256_t _a, simd256_t _b, simd256_t _c) { return simd256_f64_madd(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_nmsub(simd256_t _a, simd256_t _b, simd256_t _c) { return simd256_f64_nmsub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_neg(simd256_t _a) { 
return simd256_f64_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_abs(simd256_t _a) { return simd256_f64_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_min(simd256_t _a, simd256_t _b) { return simd256_f64_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_max(simd256_t _a, simd256_t _b) { return simd256_f64_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_clamp(simd256_t _a, simd256_t _min, simd256_t _max) { return simd256_f64_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_lerp(simd256_t _a, simd256_t _b, simd256_t _s) { return simd256_f64_lerp(_a, _b, _s); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_rcp(simd256_t _a) { return simd256_f64_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_sqrt(simd256_t _a) { return simd256_f64_sqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_rsqrt(simd256_t _a) { return simd256_f64_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_round(simd256_t _a) { return simd256_f64_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_ceil(simd256_t _a) { return simd256_f64_ceil(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_floor(simd256_t _a) { return simd256_f64_floor(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_cmpeq(simd256_t _a, simd256_t _b) { return simd256_f64_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_cmpneq(simd256_t _a, simd256_t _b) { return simd256_f64_cmpneq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_cmplt(simd256_t _a, simd256_t _b) { return simd256_f64_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_cmple(simd256_t _a, simd256_t _b) { return simd256_f64_cmple(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_cmpgt(simd256_t _a, simd256_t _b) { return simd256_f64_cmpgt(_a, _b); } + + 
template<> + BX_SIMD_FORCE_INLINE simd256_t simd_f64_cmpge(simd256_t _a, simd256_t _b) { return simd256_f64_cmpge(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_cmplt(simd256_t _a, simd256_t _b) { return simd256_i32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_cmpgt(simd256_t _a, simd256_t _b) { return simd256_i32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_min(simd256_t _a, simd256_t _b) { return simd256_i32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_max(simd256_t _a, simd256_t _b) { return simd256_i32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_clamp(simd256_t _a, simd256_t _min, simd256_t _max) { return simd256_i32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_cmpeq(simd256_t _a, simd256_t _b) { return simd256_i32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u32_add(simd256_t _a, simd256_t _b) { return simd256_u32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u32_sub(simd256_t _a, simd256_t _b) { return simd256_u32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u32_cmplt(simd256_t _a, simd256_t _b) { return simd256_u32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u32_cmpgt(simd256_t _a, simd256_t _b) { return simd256_u32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_orc(simd256_t _a, simd256_t _b) { return simd256_orc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_ld(const void* _ptr) { return simd256_ld(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_ldu(const void* _ptr) { return simd256_ldu(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd256_t _a) { simd256_st(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_stu(void* _ptr, simd256_t _a) { simd256_stu(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE 
void simd_x32_st1(void* _ptr, simd256_t _a) { simd256_x32_st1(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_selb(simd256_t _mask, simd256_t _a, simd256_t _b) { return simd256_selb(_mask, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_sels(simd256_t _test, simd256_t _a, simd256_t _b) { return simd256_sels(_test, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_not(simd256_t _a) { return simd256_not(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_andc(simd256_t _a, simd256_t _b) { return simd256_andc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x32_sra(simd256_t _a, int _count) { return simd256_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x32_srl(simd256_t _a, int _count) { return simd256_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x32_sll(simd256_t _a, int _count) { return simd256_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x32_sll(simd256_t _a, simd256_t _count) { return simd256_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x32_srl(simd256_t _a, simd256_t _count) { return simd256_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_x32_sra(simd256_t _a, simd256_t _count) { return simd256_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_i32_mul(simd256_t _a, simd256_t _b) { return simd256_u32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u8_satadd(simd256_t _a, simd256_t _b) { return simd256_u8_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u8_satsub(simd256_t _a, simd256_t _b) { return simd256_u8_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u16_satadd(simd256_t _a, simd256_t _b) { return simd256_u16_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_u16_satsub(simd256_t _a, simd256_t _b) { return simd256_u16_satsub(_a, 
_b); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_splat(float _a) { return simd256_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_splat(uint32_t _a) { return simd256_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_splat(double _a) { return simd256_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd256_t simd_zero() { return simd256_zero(); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_any(simd256_t _test) { return simd256_test_any(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_all(simd256_t _test) { return simd256_test_all(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_zero(simd256_t _a, simd256_t _b) { return simd256_test_zero(_a, _b); } + + // --- simd32_t dispatch --- + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_and(simd32_t _a, simd32_t _b) { return simd32_and(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_andc(simd32_t _a, simd32_t _b) { return simd32_andc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_or(simd32_t _a, simd32_t _b) { return simd32_or(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_orc(simd32_t _a, simd32_t _b) { return simd32_orc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_xor(simd32_t _a, simd32_t _b) { return simd32_xor(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_not(simd32_t _a) { return simd32_not(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x32_srl(simd32_t _a, int _count) { return simd32_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x32_sll(simd32_t _a, int _count) { return simd32_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x32_sra(simd32_t _a, int _count) { return simd32_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x32_sll(simd32_t _a, simd32_t _count) { return simd32_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t 
simd_x32_srl(simd32_t _a, simd32_t _count) { return simd32_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x32_sra(simd32_t _a, simd32_t _count) { return simd32_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x8_shuffle(simd32_t _a, simd32_t _indices) { return simd32_x8_shuffle(_a, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_x8_shuffle(simd32_t _a, simd32_t _b, simd32_t _indices) { return simd32_x8_shuffle(_a, _b, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_mul(simd32_t _a, simd32_t _b) { return simd32_u32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u8_satadd(simd32_t _a, simd32_t _b) { return simd32_u8_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u8_satsub(simd32_t _a, simd32_t _b) { return simd32_u8_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u16_satadd(simd32_t _a, simd32_t _b) { return simd32_u16_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u16_satsub(simd32_t _a, simd32_t _b) { return simd32_u16_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u32_add(simd32_t _a, simd32_t _b) { return simd32_u32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u32_sub(simd32_t _a, simd32_t _b) { return simd32_u32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u32_cmplt(simd32_t _a, simd32_t _b) { return simd32_u32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_u32_cmpgt(simd32_t _a, simd32_t _b) { return simd32_u32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_add(simd32_t _a, simd32_t _b) { return simd32_i32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_sub(simd32_t _a, simd32_t _b) { return simd32_i32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_neg(simd32_t _a) { return simd32_i32_neg(_a); } + + template<> + 
BX_SIMD_FORCE_INLINE simd32_t simd_i32_abs(simd32_t _a) { return simd32_i32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_min(simd32_t _a, simd32_t _b) { return simd32_i32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_max(simd32_t _a, simd32_t _b) { return simd32_i32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_clamp(simd32_t _a, simd32_t _min, simd32_t _max) { return simd32_i32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_cmpeq(simd32_t _a, simd32_t _b) { return simd32_i32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_cmplt(simd32_t _a, simd32_t _b) { return simd32_i32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_cmpgt(simd32_t _a, simd32_t _b) { return simd32_i32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_add(simd32_t _a, simd32_t _b) { return simd32_f32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_sub(simd32_t _a, simd32_t _b) { return simd32_f32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_mul(simd32_t _a, simd32_t _b) { return simd32_f32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_div(simd32_t _a, simd32_t _b) { return simd32_f32_div(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_min(simd32_t _a, simd32_t _b) { return simd32_f32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_max(simd32_t _a, simd32_t _b) { return simd32_f32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_madd(simd32_t _a, simd32_t _b, simd32_t _c) { return simd32_f32_madd(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_msub(simd32_t _a, simd32_t _b, simd32_t _c) { return simd32_f32_msub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_nmsub(simd32_t _a, simd32_t _b, simd32_t _c) { return simd32_f32_nmsub(_a, _b, _c); 
} + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_clamp(simd32_t _a, simd32_t _min, simd32_t _max) { return simd32_f32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_lerp(simd32_t _a, simd32_t _b, simd32_t _s) { return simd32_f32_lerp(_a, _b, _s); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_neg(simd32_t _a) { return simd32_f32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_abs(simd32_t _a) { return simd32_f32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_round(simd32_t _a) { return simd32_f32_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_ceil(simd32_t _a) { return simd32_f32_ceil(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_floor(simd32_t _a) { return simd32_f32_floor(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_rcp(simd32_t _a) { return simd32_f32_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_rcp_est(simd32_t _a) { return simd32_f32_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_sqrt(simd32_t _a) { return simd32_f32_sqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_rsqrt(simd32_t _a) { return simd32_f32_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_rsqrt_est(simd32_t _a) { return simd32_f32_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_div_nr(simd32_t _a, simd32_t _b) { return simd_f32_div_nr_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_sqrt_nr(simd32_t _a) { return simd_f32_sqrt_nr_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_sqrt_nr1(simd32_t _a) { return simd_f32_sqrt_nr1_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_rsqrt_nr(simd32_t _a) { return simd_f32_rsqrt_nr_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_rsqrt_carmack(simd32_t _a) { return simd_f32_rsqrt_carmack_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE 
simd32_t simd_f32_cos(simd32_t _a) { return simd_f32_cos_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_sin(simd32_t _a) { return simd_f32_sin_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_log(simd32_t _a) { return simd_f32_log_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_exp(simd32_t _a) { return simd_f32_exp_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_log2(simd32_t _a) { return simd_f32_log2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_exp2(simd32_t _a) { return simd_f32_exp2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_pow(simd32_t _a, simd32_t _b) { return simd_f32_pow_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_ldexp(simd32_t _a, simd32_t _b) { return simd_f32_ldexp_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_ftoi_trunc(simd32_t _a) { return simd32_f32_ftoi_trunc(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_ftoi_round(simd32_t _a) { return simd32_f32_ftoi_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_i32_itof(simd32_t _a) { return simd32_i32_itof(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_cmpeq(simd32_t _a, simd32_t _b) { return simd32_f32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_cmpneq(simd32_t _a, simd32_t _b) { return simd32_f32_cmpneq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_cmplt(simd32_t _a, simd32_t _b) { return simd32_f32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_cmple(simd32_t _a, simd32_t _b) { return simd32_f32_cmple(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_cmpgt(simd32_t _a, simd32_t _b) { return simd32_f32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_f32_cmpge(simd32_t _a, simd32_t _b) { return simd32_f32_cmpge(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE int 
simd_x32_signbitsmask(simd32_t _a) { return simd32_x32_signbitsmask(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_sels(simd32_t _test, simd32_t _a, simd32_t _b) { return simd32_sels(_test, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_selb(simd32_t _mask, simd32_t _a, simd32_t _b) { return simd32_selb(_mask, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_splat(float _a) { return simd32_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_splat(uint32_t _a) { return simd32_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_zero() { return simd32_zero(); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_ld(const void* _ptr) { return simd32_ld(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE simd32_t simd_ldu(const void* _ptr) { return simd32_ldu(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd32_t _a) { simd32_st(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_stu(void* _ptr, simd32_t _a) { simd32_stu(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_x32_st1(void* _ptr, simd32_t _a) { simd32_x32_st1(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_any(simd32_t _test) { return simd32_test_any(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_all(simd32_t _test) { return simd32_test_all(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_zero(simd32_t _a, simd32_t _b) { return simd32_test_zero(_a, _b); } + + // --- simd64_t dispatch --- + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_and(simd64_t _a, simd64_t _b) { return simd64_and(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_andc(simd64_t _a, simd64_t _b) { return simd64_andc(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_or(simd64_t _a, simd64_t _b) { return simd64_or(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_orc(simd64_t _a, simd64_t _b) { return simd64_orc(_a, _b); } + + template<> + 
BX_SIMD_FORCE_INLINE simd64_t simd_xor(simd64_t _a, simd64_t _b) { return simd64_xor(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_not(simd64_t _a) { return simd64_not(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x32_srl(simd64_t _a, int _count) { return simd64_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x32_sll(simd64_t _a, int _count) { return simd64_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x32_sra(simd64_t _a, int _count) { return simd64_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x32_sll(simd64_t _a, simd64_t _count) { return simd64_x32_sll(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x32_srl(simd64_t _a, simd64_t _count) { return simd64_x32_srl(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x32_sra(simd64_t _a, simd64_t _count) { return simd64_x32_sra(_a, _count); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x8_shuffle(simd64_t _a, simd64_t _indices) { return simd64_x8_shuffle(_a, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_x8_shuffle(simd64_t _a, simd64_t _b, simd64_t _indices) { return simd64_x8_shuffle(_a, _b, _indices); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_mul(simd64_t _a, simd64_t _b) { return simd64_u32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u8_satadd(simd64_t _a, simd64_t _b) { return simd64_u8_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u8_satsub(simd64_t _a, simd64_t _b) { return simd64_u8_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u16_satadd(simd64_t _a, simd64_t _b) { return simd64_u16_satadd(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u16_satsub(simd64_t _a, simd64_t _b) { return simd64_u16_satsub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u32_add(simd64_t _a, simd64_t _b) { return simd64_u32_add(_a, _b); } + + 
template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u32_sub(simd64_t _a, simd64_t _b) { return simd64_u32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u32_cmplt(simd64_t _a, simd64_t _b) { return simd64_u32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_u32_cmpgt(simd64_t _a, simd64_t _b) { return simd64_u32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_add(simd64_t _a, simd64_t _b) { return simd64_i32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_sub(simd64_t _a, simd64_t _b) { return simd64_i32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_neg(simd64_t _a) { return simd64_i32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_abs(simd64_t _a) { return simd64_i32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_min(simd64_t _a, simd64_t _b) { return simd64_i32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_max(simd64_t _a, simd64_t _b) { return simd64_i32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_clamp(simd64_t _a, simd64_t _min, simd64_t _max) { return simd64_i32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_cmpeq(simd64_t _a, simd64_t _b) { return simd64_i32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_cmplt(simd64_t _a, simd64_t _b) { return simd64_i32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_cmpgt(simd64_t _a, simd64_t _b) { return simd64_i32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_add(simd64_t _a, simd64_t _b) { return simd64_f32_add(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_sub(simd64_t _a, simd64_t _b) { return simd64_f32_sub(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_mul(simd64_t _a, simd64_t _b) { return simd64_f32_mul(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t 
simd_f32_div(simd64_t _a, simd64_t _b) { return simd64_f32_div(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_min(simd64_t _a, simd64_t _b) { return simd64_f32_min(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_max(simd64_t _a, simd64_t _b) { return simd64_f32_max(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_madd(simd64_t _a, simd64_t _b, simd64_t _c) { return simd64_f32_madd(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_msub(simd64_t _a, simd64_t _b, simd64_t _c) { return simd64_f32_msub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_nmsub(simd64_t _a, simd64_t _b, simd64_t _c) { return simd64_f32_nmsub(_a, _b, _c); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_clamp(simd64_t _a, simd64_t _min, simd64_t _max) { return simd64_f32_clamp(_a, _min, _max); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_lerp(simd64_t _a, simd64_t _b, simd64_t _s) { return simd64_f32_lerp(_a, _b, _s); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_neg(simd64_t _a) { return simd64_f32_neg(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_abs(simd64_t _a) { return simd64_f32_abs(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_round(simd64_t _a) { return simd64_f32_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_ceil(simd64_t _a) { return simd64_f32_ceil(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_floor(simd64_t _a) { return simd64_f32_floor(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_rcp(simd64_t _a) { return simd64_f32_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_rcp_est(simd64_t _a) { return simd64_f32_rcp(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_sqrt(simd64_t _a) { return simd64_f32_sqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_rsqrt(simd64_t _a) { return simd64_f32_rsqrt(_a); } + + template<> 
+ BX_SIMD_FORCE_INLINE simd64_t simd_f32_rsqrt_est(simd64_t _a) { return simd64_f32_rsqrt(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_div_nr(simd64_t _a, simd64_t _b) { return simd_f32_div_nr_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_sqrt_nr(simd64_t _a) { return simd_f32_sqrt_nr_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_sqrt_nr1(simd64_t _a) { return simd_f32_sqrt_nr1_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_rsqrt_nr(simd64_t _a) { return simd_f32_rsqrt_nr_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_rsqrt_carmack(simd64_t _a) { return simd_f32_rsqrt_carmack_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cos(simd64_t _a) { return simd_f32_cos_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_sin(simd64_t _a) { return simd_f32_sin_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_log(simd64_t _a) { return simd_f32_log_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_exp(simd64_t _a) { return simd_f32_exp_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_log2(simd64_t _a) { return simd_f32_log2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_exp2(simd64_t _a) { return simd_f32_exp2_ni(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_pow(simd64_t _a, simd64_t _b) { return simd_f32_pow_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_ldexp(simd64_t _a, simd64_t _b) { return simd_f32_ldexp_ni(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_ftoi_trunc(simd64_t _a) { return simd64_f32_ftoi_trunc(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_ftoi_round(simd64_t _a) { return simd64_f32_ftoi_round(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_i32_itof(simd64_t _a) { return simd64_i32_itof(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cmpeq(simd64_t _a, 
simd64_t _b) { return simd64_f32_cmpeq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cmpneq(simd64_t _a, simd64_t _b) { return simd64_f32_cmpneq(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cmplt(simd64_t _a, simd64_t _b) { return simd64_f32_cmplt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cmple(simd64_t _a, simd64_t _b) { return simd64_f32_cmple(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cmpgt(simd64_t _a, simd64_t _b) { return simd64_f32_cmpgt(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_f32_cmpge(simd64_t _a, simd64_t _b) { return simd64_f32_cmpge(_a, _b); } + + template<> + BX_SIMD_FORCE_INLINE int simd_x32_signbitsmask(simd64_t _a) { return simd64_x32_signbitsmask(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_sels(simd64_t _test, simd64_t _a, simd64_t _b) { return simd64_sels(_test, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_selb(simd64_t _mask, simd64_t _a, simd64_t _b) { return simd64_selb(_mask, _a, _b); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_splat(float _a) { return simd64_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_splat(uint32_t _a) { return simd64_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_splat(double _a) { return simd64_splat(_a); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_zero() { return simd64_zero(); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_ld(const void* _ptr) { return simd64_ld(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE simd64_t simd_ldu(const void* _ptr) { return simd64_ldu(_ptr); } + + template<> + BX_SIMD_FORCE_INLINE void simd_st(void* _ptr, simd64_t _a) { simd64_st(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_stu(void* _ptr, simd64_t _a) { simd64_stu(_ptr, _a); } + + template<> + BX_SIMD_FORCE_INLINE void simd_x32_st1(void* _ptr, simd64_t _a) { simd64_x32_st1(_ptr, _a); } + + template<> + 
BX_SIMD_FORCE_INLINE bool simd_test_any(simd64_t _test) { return simd64_test_any(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_all(simd64_t _test) { return simd64_test_all(_test); } + + template<> + BX_SIMD_FORCE_INLINE bool simd_test_zero(simd64_t _a, simd64_t _b) { return simd64_test_zero(_a, _b); } + + +} // namespace bx + +namespace bx +{ + /* Non-template simd128_* convenience overloads for load, splat and zero. NOTE(review): as rendered in this patch text every body below calls its own name with its own argument list, which would never terminate; presumably an explicit template argument list on the callee was lost when the patch text was extracted; confirm against the real header before relying on this text. */ + BX_SIMD_FORCE_INLINE simd128_t simd128_ld(const void* _ptr) + { + return simd128_ld(_ptr); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_ldu(const void* _ptr) + { + return simd128_ldu(_ptr); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_ld(float _x, float _y, float _z, float _w) + { + return simd128_ld(_x, _y, _z, _w); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_ld(int32_t _x, int32_t _y, int32_t _z, int32_t _w) + { + return simd128_ld(_x, _y, _z, _w); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_ld(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w) + { + return simd128_ld(_x, _y, _z, _w); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_splat(float _a) + { + return simd128_splat(_a); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_splat(int32_t _a) + { + return simd128_splat(_a); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_splat(uint32_t _a) + { + return simd128_splat(_a); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_splat(int16_t _a) + { + return simd128_splat(_a); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_splat(uint16_t _a) + { + return simd128_splat(_a); + } + + BX_SIMD_FORCE_INLINE simd128_t simd128_zero() + { + return simd128_zero(); + } + +} // namespace bx + +#include "../math.h" + diff --git a/include/bx/inline/simd_ni.inl b/include/bx/inline/simd_ni.inl index 4ea9f2b..8a111d9 100644 --- a/include/bx/inline/simd_ni.inl +++ b/include/bx/inline/simd_ni.inl @@ -3,96 +3,351 @@ * License: https://github.com/bkaradzic/bx/blob/master/LICENSE */ +#ifndef BX_SIMD_T_H_HEADER_GUARD +# error "Must be included from bx/simd_t.h!" 
+#endif // BX_SIMD_T_H_HEADER_GUARD + namespace bx { + // These are used when a platform doesn't have a dedicated instruction. + template - BX_SIMD_INLINE Ty simd_shuf_xAzC_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd128_x32_shuf_xAzC_ni(Ty _a, Ty _b) { - const Ty xAyB = simd_shuf_xAyB(_a, _b); - const Ty zCwD = simd_shuf_zCwD(_a, _b); - const Ty result = simd_shuf_xyAB(xAyB, zCwD); + const Ty xAyB = simd128_x32_shuf_xAyB(_a, _b); + const Ty zCwD = simd128_x32_shuf_zCwD(_a, _b); + const Ty result = simd128_x32_shuf_xyAB(xAyB, zCwD); + return result; + } + + template + BX_SIMD_INLINE Ty simd128_x32_shuf_yBwD_ni(Ty _a, Ty _b) + { + const Ty xAyB = simd128_x32_shuf_xAyB(_a, _b); + const Ty zCwD = simd128_x32_shuf_zCwD(_a, _b); + const Ty result = simd128_x32_shuf_zwCD(xAyB, zCwD); + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_madd_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_f32_mul(_a, _b); + const Ty result = simd_f32_add(mul, _c); + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_nmsub_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_f32_mul(_a, _b); + const Ty result = simd_f32_sub(_c, mul); + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_msub_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_f32_mul(_a, _b); + const Ty result = simd_f32_sub(mul, _c); return result; } template - BX_SIMD_INLINE Ty simd_shuf_yBwD_ni(Ty _a, Ty _b) + BX_SIMD_INLINE int simd_x32_signbitsmask_ni(Ty _a) { - const Ty xAyB = simd_shuf_xAyB(_a, _b); - const Ty zCwD = simd_shuf_zCwD(_a, _b); - const Ty result = simd_shuf_zwCD(xAyB, zCwD); + /* Packs the sign bit (bit 31) of each 32-bit lane of _a into the low bits of the returned int: lane 0 -> bit 0, lane 1 -> bit 1, and so on. */ + /* Logical shift right by 31 leaves integer 0 or 1 in every lane. Those lanes are stored directly; a detour through simd_f32_ftoi_trunc (by its name a float->int truncation) would read integer 1 as a denormal float and yield 0 for every lane, so the mask would always be empty. */ + const Ty tmp0 = simd_x32_srl(_a, 31); + /* Scratch storage is sized and aligned from Ty, so the aligned store neither overruns it for wider registers nor leaves lanes unwritten for narrower ones. */ + constexpr int kNumLanes = int(sizeof(Ty) / sizeof(int32_t) ); + alignas(Ty) alignas(int32_t) int32_t lane[kNumLanes]; + simd_st(lane, tmp0); + + int result = 0; + for (int ii = 0; ii < kNumLanes; ++ii) + { + result |= (lane[ii] & 1) << ii; + } + + return result; + } + + template + BX_SIMD_INLINE int simd_x8_signbitsmask_ni(Ty _a) + { + float tmp[4]; + simd_st(tmp, _a); + const uint8_t* bytes = reinterpret_cast(tmp); + int result = 0; + for (int ii = 
0; ii < 16; ++ii) + { + result |= ((bytes[ii] >> 7) << ii); + } return result; } template - BX_SIMD_INLINE Ty simd_madd_ni(Ty _a, Ty _b, Ty _c) + BX_SIMD_INLINE Ty simd_f32_div_nr_ni(Ty _a, Ty _b) { - const Ty mul = simd_mul(_a, _b); - const Ty result = simd_add(mul, _c); - + const Ty oneish = simd_splat(uint32_t(0x3f800001)); + const Ty est = simd_f32_rcp_est(_b); + const Ty iter0 = simd_f32_mul(_a, est); + const Ty tmp1 = simd_f32_nmsub(_b, est, oneish); + const Ty result = simd_f32_madd(tmp1, iter0, iter0); return result; } template - BX_SIMD_INLINE Ty simd_nmsub_ni(Ty _a, Ty _b, Ty _c) - { - const Ty mul = simd_mul(_a, _b); - const Ty result = simd_sub(_c, mul); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_div_nr_ni(Ty _a, Ty _b) - { - const Ty oneish = simd_isplat(0x3f800001); - const Ty est = simd_rcp_est(_b); - const Ty iter0 = simd_mul(_a, est); - const Ty tmp1 = simd_nmsub(_b, est, oneish); - const Ty result = simd_madd(tmp1, iter0, iter0); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_rcp_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_rcp_ni(Ty _a) { const Ty one = simd_splat(1.0f); - const Ty result = simd_div(one, _a); - + const Ty result = simd_f32_div(one, _a); return result; } template - BX_SIMD_INLINE Ty simd_orx_ni(Ty _a) - { - const Ty zwxy = simd_swiz_zwxy(_a); - const Ty tmp0 = simd_or(_a, zwxy); - const Ty tmp1 = simd_swiz_yyyy(_a); - const Ty tmp2 = simd_or(tmp0, tmp1); - const Ty mf000 = simd_ild(UINT32_MAX, 0, 0, 0); - const Ty result = simd_and(tmp2, mf000); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_orc_ni(Ty _a, Ty _b) - { - const Ty aorb = simd_or(_a, _b); - const Ty mffff = simd_isplat(UINT32_MAX); - const Ty result = simd_xor(aorb, mffff); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_neg_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_neg_ni(Ty _a) { const Ty zero = simd_zero(); - const Ty result = simd_sub(zero, _a); + const Ty result = simd_f32_sub(zero, _a); + return result; + } + 
/* Absolute value fallback: simd_f32_max of _a and simd_f32_neg(_a). */ template + BX_SIMD_INLINE Ty simd_f32_abs_ni(Ty _a) + { + const Ty a_neg = simd_f32_neg(_a); + const Ty result = simd_f32_max(a_neg, _a); + return result; + } + /* Double-precision counterpart: simd_f64_max of _a and simd_f64_neg(_a). */ + template + BX_SIMD_INLINE Ty simd_f64_abs_ni(Ty _a) + { + const Ty a_neg = simd_f64_neg(_a); + const Ty result = simd_f64_max(a_neg, _a); + return result; + } + /* AND-complement fallback: _a AND (NOT _b), built from simd_not and simd_and. */ + template + BX_SIMD_INLINE Ty simd_andc_ni(Ty _a, Ty _b) + { + const Ty nb = simd_not(_b); + const Ty result = simd_and(_a, nb); + return result; + } + /* Unsigned less-than built on the signed compare: both operands are XORed with a splat of uint32_t(INT32_MIN), i.e. 0x80000000, before simd_i32_cmplt. */ + template + BX_SIMD_INLINE Ty simd_u32_cmplt_ni(Ty _a, Ty _b) + { + const Ty flip = simd_splat(uint32_t(INT32_MIN)); + const Ty af = simd_xor(_a, flip); + const Ty bf = simd_xor(_b, flip); + const Ty result = simd_i32_cmplt(af, bf); + return result; + } + /* Unsigned greater-than built on the signed compare: same 0x80000000 XOR on both operands, then simd_i32_cmpgt. */ + template + BX_SIMD_INLINE Ty simd_u32_cmpgt_ni(Ty _a, Ty _b) + { + const Ty flip = simd_splat(uint32_t(INT32_MIN)); + const Ty af = simd_xor(_a, flip); + const Ty bf = simd_xor(_b, flip); + const Ty result = simd_i32_cmpgt(af, bf); + return result; + } + /* Multiply-add fallback: simd_f64_mul(_a, _b) followed by simd_f64_add with _c, as two separate operations. */ + template + BX_SIMD_INLINE Ty simd_f64_madd_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_f64_mul(_a, _b); + const Ty result = simd_f64_add(mul, _c); + return result; + } + /* Negated multiply-subtract fallback: _c minus the product of _a and _b. */ + template + BX_SIMD_INLINE Ty simd_f64_nmsub_ni(Ty _a, Ty _b, Ty _c) + { + const Ty mul = simd_f64_mul(_a, _b); + const Ty result = simd_f64_sub(_c, mul); + return result; + } + /* Negation fallback: simd_zero() minus _a. NOTE(review): if simd_f64_sub is plain IEEE-754 subtraction, an input of +0.0 stays +0.0 instead of becoming negative zero; confirm whether callers care. */ + template + BX_SIMD_INLINE Ty simd_f64_neg_ni(Ty _a) + { + const Ty zero = simd_zero(); + const Ty result = simd_f64_sub(zero, _a); + return result; + } + /* Linear interpolation: simd_f64_madd(_s, (_b minus _a), _a); presumably _a moved toward _b by factor _s, assuming simd_f64_madd(x, y, z) is x*y+z. */ + template + BX_SIMD_INLINE Ty simd_f64_lerp_ni(Ty _a, Ty _b, Ty _s) + { + const Ty ba = simd_f64_sub(_b, _a); + const Ty result = simd_f64_madd(_s, ba, _a); + return result; + } + /* Reciprocal fallback: a splat of 1.0 divided by _a with simd_f64_div. */ + template + BX_SIMD_INLINE Ty simd_f64_rcp_ni(Ty _a) + { + const Ty one = simd_splat(1.0); + const Ty result = simd_f64_div(one, _a); + return result; + } + /* Reciprocal square root fallback: simd_f64_rcp of simd_f64_sqrt(_a). */ + template + BX_SIMD_INLINE Ty simd_f64_rsqrt_ni(Ty _a) + { + const Ty sqr = simd_f64_sqrt(_a); + const Ty result = simd_f64_rcp(sqr); + return result; + } 
+ /* Not-equal fallback: simd_not of simd_f64_cmpeq(_a, _b). */ + template + BX_SIMD_INLINE Ty simd_f64_cmpneq_ni(Ty _a, Ty _b) + { + const Ty tmp0 = simd_f64_cmpeq(_a, _b); + const Ty result = simd_not(tmp0); + return result; + } + /* Minimum fallback: simd_selb between _a and _b driven by the simd_f32_cmplt(_a, _b) mask; presumably _a is taken where the mask is set; confirm against simd_selb. */ + template + BX_SIMD_INLINE Ty simd_f32_min_ni(Ty _a, Ty _b) + { + const Ty mask = simd_f32_cmplt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + return result; + } + /* Maximum fallback: same selection as the minimum fallback, driven by the simd_f32_cmpgt(_a, _b) mask. */ + template + BX_SIMD_INLINE Ty simd_f32_max_ni(Ty _a, Ty _b) + { + const Ty mask = simd_f32_cmpgt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + return result; + } + /* Clamp fallback: simd_f32_min against _max first, then simd_f32_max against _min. */ + template + BX_SIMD_INLINE Ty simd_f32_clamp_ni(Ty _a, Ty _min, Ty _max) + { + const Ty tmp = simd_f32_min(_a, _max); + const Ty result = simd_f32_max(tmp, _min); + return result; + } + /* Linear interpolation: simd_f32_madd(_s, (_b minus _a), _a); presumably _a moved toward _b by factor _s, assuming simd_f32_madd(x, y, z) is x*y+z. */ + template + BX_SIMD_INLINE Ty simd_f32_lerp_ni(Ty _a, Ty _b, Ty _s) + { + const Ty ba = simd_f32_sub(_b, _a); + const Ty result = simd_f32_madd(_s, ba, _a); + return result; + } + /* Square root from simd_f32_rsqrt_est followed by one correction step. With e = rsqrt_est(_a) and s = e*_a, the result is simd_f32_madd(simd_f32_nmsub(e, s, 1), s*0.5, s); presumably a single Newton-Raphson style refinement of s, assuming madd(x, y, z) is x*y+z and nmsub(x, y, z) is z minus x*y. */ + template + BX_SIMD_INLINE Ty simd_f32_sqrt_nr_ni(Ty _a) + { + const Ty half = simd_splat(0.5f); + const Ty one = simd_splat(1.0f); + const Ty tmp0 = simd_f32_rsqrt_est(_a); + const Ty tmp1 = simd_f32_mul(tmp0, _a); + const Ty tmp2 = simd_f32_mul(tmp1, half); + const Ty tmp3 = simd_f32_nmsub(tmp0, tmp1, one); + const Ty result = simd_f32_madd(tmp3, tmp2, tmp1); + + return result; + } + /* Square root by 11 fixed iterations of result = (_a / result plus result) * 0.5, starting from result = _a. */ + template + BX_SIMD_INLINE Ty simd_f32_sqrt_nr1_ni(Ty _a) + { + const Ty half = simd_splat(0.5f); + + Ty result = _a; + for (uint32_t ii = 0; ii < 11; ++ii) + { + const Ty tmp1 = simd_f32_div(_a, result); + const Ty tmp2 = simd_f32_add(tmp1, result); + result = simd_f32_mul(tmp2, half); + } + + return result; + } + /* Reciprocal square root fallback: a splat of 1.0f divided by simd_f32_sqrt(_a). */ + template + BX_SIMD_INLINE Ty simd_f32_rsqrt_ni(Ty _a) + { + const Ty one = simd_splat(1.0f); + const Ty sqr = simd_f32_sqrt(_a); + const Ty result = simd_f32_div(one, sqr); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_rsqrt_nr_ni(Ty _a) + { + const Ty rsqrt = simd_f32_rsqrt_est(_a); + const Ty iter0 = simd_f32_mul(_a, rsqrt); + const Ty iter1 = simd_f32_mul(iter0, rsqrt); + 
const Ty half = simd_splat(0.5f); + const Ty half_rsqrt = simd_f32_mul(half, rsqrt); + const Ty three = simd_splat(3.0f); + const Ty three_sub_iter1 = simd_f32_sub(three, iter1); + const Ty result = simd_f32_mul(half_rsqrt, three_sub_iter1); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_rsqrt_carmack_ni(Ty _a) + { + const Ty half = simd_splat(0.5f); + const Ty ah = simd_f32_mul(half, _a); + const Ty ashift = simd_x32_sra(_a, 1); + const Ty magic = simd_splat(uint32_t(0x5f3759df)); + const Ty msuba = simd_i32_sub(magic, ashift); + const Ty msubasq = simd_f32_mul(msuba, msuba); + const Ty tmp0 = simd_splat(1.5f); + const Ty tmp1 = simd_f32_mul(ah, msubasq); + const Ty tmp2 = simd_f32_sub(tmp0, tmp1); + const Ty result = simd_f32_mul(msuba, tmp2); + + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_cmpneq_ni(Ty _a, Ty _b) + { + const Ty tmp0 = simd_f32_cmpeq(_a, _b); + const Ty result = simd_not(tmp0); + return result; + } + + template + BX_SIMD_INLINE Ty simd_i32_min_ni(Ty _a, Ty _b) + { + const Ty mask = simd_i32_cmplt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + return result; + } + + template + BX_SIMD_INLINE Ty simd_i32_max_ni(Ty _a, Ty _b) + { + const Ty mask = simd_i32_cmpgt(_a, _b); + const Ty result = simd_selb(mask, _a, _b); + return result; + } + + template + BX_SIMD_INLINE Ty simd_i32_neg_ni(Ty _a) + { + const Ty zero = simd_zero(); + const Ty result = simd_i32_sub(zero, _a); + return result; + } + + template + BX_SIMD_INLINE Ty simd_i32_abs_ni(Ty _a) + { + const Ty neg = simd_i32_neg(_a); + const Ty result = simd_i32_max(_a, neg); return result; } @@ -109,7 +364,7 @@ namespace bx template BX_SIMD_INLINE Ty simd_sels_ni(Ty _test, Ty _a, Ty _b) { - const Ty mask = simd_sra(_test, 31); + const Ty mask = simd_x32_sra(_test, 31); const Ty result = simd_selb(mask, _a, _b); return result; @@ -118,445 +373,715 @@ namespace bx template BX_SIMD_INLINE Ty simd_not_ni(Ty _a) { - const Ty mffff = simd_isplat(UINT32_MAX); + 
const Ty mffff = simd_splat(UINT32_MAX); const Ty result = simd_xor(_a, mffff); return result; } template - BX_SIMD_INLINE Ty simd_cmpneq_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd_orc_ni(Ty _a, Ty _b) { - const Ty tmp0 = simd_cmpeq(_a, _b); - const Ty result = simd_not(tmp0); + const Ty nb = simd_not(_b); + const Ty result = simd_or(_a, nb); return result; } template - BX_SIMD_INLINE Ty simd_min_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd128_orx_ni(Ty _a) { - const Ty mask = simd_cmplt(_a, _b); - const Ty result = simd_selb(mask, _a, _b); + const Ty zwxy = simd128_x32_swiz_zwxy(_a); + const Ty tmp0 = simd_or(_a, zwxy); + const Ty tmp1 = simd128_x32_swiz_yyyy(tmp0); + const Ty tmp2 = simd_or(tmp0, tmp1); + const Ty mf000 = simd128_ld(UINT32_MAX, 0u, 0u, 0u); + const Ty result = simd_and(tmp2, mf000); return result; } template - BX_SIMD_INLINE Ty simd_max_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd128_f32_dot3_ni(Ty _a, Ty _b) { - const Ty mask = simd_cmpgt(_a, _b); - const Ty result = simd_selb(mask, _a, _b); + const Ty xyzw = simd_f32_mul(_a, _b); + const Ty xxxx = simd128_x32_swiz_xxxx(xyzw); + const Ty yyyy = simd128_x32_swiz_yyyy(xyzw); + const Ty zzzz = simd128_x32_swiz_zzzz(xyzw); + const Ty tmp1 = simd_f32_add(xxxx, yyyy); + const Ty result = simd_f32_add(tmp1, zzzz); return result; } template - BX_SIMD_INLINE Ty simd_abs_ni(Ty _a) + BX_SIMD_INLINE Ty simd128_f32_dot_ni(Ty _a, Ty _b) { - const Ty a_neg = simd_neg(_a); - const Ty result = simd_max(a_neg, _a); + const Ty xyzw = simd_f32_mul(_a, _b); + const Ty zwxy = simd128_x32_swiz_zwxy(xyzw); + const Ty tmp0 = simd_f32_add(xyzw, zwxy); + const Ty yyyy = simd128_x32_swiz_yyyy(tmp0); + const Ty result = simd_f32_add(tmp0, yyyy); return result; } template - BX_SIMD_INLINE Ty simd_imin_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd128_f32_cross3_ni(Ty _a, Ty _b) { - const Ty mask = simd_icmplt(_a, _b); - const Ty result = simd_selb(mask, _a, _b); - + // a.yzx * b.zxy - a.zxy * b.yzx + const Ty a_yzxw = 
simd128_x32_swiz_yzxw(_a); + const Ty b_zxyw = simd128_x32_swiz_zxyw(_b); + const Ty a_zxyw = simd128_x32_swiz_zxyw(_a); + const Ty b_yzxw = simd128_x32_swiz_yzxw(_b); + const Ty tmp0 = simd_f32_mul(a_yzxw, b_zxyw); + const Ty tmp1 = simd_f32_mul(a_zxyw, b_yzxw); + const Ty result = simd_f32_sub(tmp0, tmp1); return result; } template - BX_SIMD_INLINE Ty simd_imax_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd128_f32_normalize3_ni(Ty _a) { - const Ty mask = simd_icmpgt(_a, _b); - const Ty result = simd_selb(mask, _a, _b); - + const Ty dot = simd128_f32_dot3(_a, _a); + const Ty invLen = simd_f32_rsqrt(dot); + const Ty result = simd_f32_mul(_a, invLen); return result; } template - BX_SIMD_INLINE Ty simd_clamp_ni(Ty _a, Ty _min, Ty _max) + BX_SIMD_INLINE Ty simd_f32_ceil_ni(Ty _a) { - const Ty tmp = simd_min(_a, _max); - const Ty result = simd_max(tmp, _min); - + const Ty tmp = simd_f32_ftoi_trunc(_a); + const Ty tmp0 = simd_i32_itof(tmp); + const Ty mask = simd_f32_cmplt(tmp0, _a); + const Ty one = simd_splat(1.0f); + const Ty tmp1 = simd_and(one, mask); + const Ty result = simd_f32_add(tmp0, tmp1); return result; } template - BX_SIMD_INLINE Ty simd_lerp_ni(Ty _a, Ty _b, Ty _s) + BX_SIMD_INLINE Ty simd_f32_floor_ni(Ty _a) { - const Ty ba = simd_sub(_b, _a); - const Ty result = simd_madd(_s, ba, _a); - + const Ty tmp = simd_f32_ftoi_trunc(_a); + const Ty tmp0 = simd_i32_itof(tmp); + const Ty mask = simd_f32_cmpgt(tmp0, _a); + const Ty one = simd_splat(1.0f); + const Ty tmp1 = simd_and(one, mask); + const Ty result = simd_f32_sub(tmp0, tmp1); return result; } template - BX_SIMD_INLINE Ty simd_sqrt_nr_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_round_ni(Ty _a) { const Ty half = simd_splat(0.5f); - const Ty one = simd_splat(1.0f); - const Ty tmp0 = simd_rsqrt_est(_a); - const Ty tmp1 = simd_mul(tmp0, _a); - const Ty tmp2 = simd_mul(tmp1, half); - const Ty tmp3 = simd_nmsub(tmp0, tmp1, one); - const Ty result = simd_madd(tmp3, tmp2, tmp1); + const Ty tmp0 = simd_f32_add(_a, half); + 
const Ty result = simd_f32_floor(tmp0); + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_ldexp_ni(Ty _a, Ty _b) + { + const Ty signexpmask = simd_splat(uint32_t(kFloatSignMask | kFloatExponentMask) ); + const Ty mantmask = simd_splat(kFloatMantissaMask); + const Ty masked = simd_and(_a, signexpmask); + const Ty expsign0 = simd_x32_sra(masked, kFloatExponentBitShift); + const Ty tmp = simd_i32_add(expsign0, _b); + const Ty expsign1 = simd_x32_sll(tmp, kFloatExponentBitShift); + const Ty mantissa = simd_and(_a, mantmask); + const Ty result = simd_or(mantissa, expsign1); return result; } template - BX_SIMD_INLINE Ty simd_sqrt_nr1_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_cos_ni(Ty _a) { - const Ty half = simd_splat(0.5f); - - Ty result = _a; - for (uint32_t ii = 0; ii < 11; ++ii) - { - const Ty tmp1 = simd_div(_a, result); - const Ty tmp2 = simd_add(tmp1, result); - result = simd_mul(tmp2, half); - } + const Ty two_over_pi = simd_splat(2.0f * kInvPi); + const Ty pi_half = simd_splat(kPiHalf); + const Ty scaled = simd_f32_mul(_a, two_over_pi); + const Ty real = simd_f32_floor(scaled); + const Ty tmp_rp = simd_f32_mul(real, pi_half); + const Ty xx = simd_f32_sub(_a, tmp_rp); + const Ty ireal = simd_f32_ftoi_trunc(real); + const Ty three = simd_splat(uint32_t(3) ); + const Ty ibits = simd_and(ireal, three); + const Ty ione = simd_splat(uint32_t(1) ); + const Ty izero = simd_zero(); + const Ty bit0 = simd_and(ibits, ione); + const Ty evenMask = simd_i32_cmpeq(bit0, izero); + const Ty sinC2 = simd_splat(-0.16666667163372039794921875f); + const Ty sinC4 = simd_splat( 8.333347737789154052734375e-3f); + const Ty sinC6 = simd_splat(-1.9842604524455964565277099609375e-4f); + const Ty sinC8 = simd_splat( 2.760012648650445044040679931640625e-6f); + const Ty sinC10 = simd_splat(-2.50293279435709337121807038784027099609375e-8f); + const Ty cosC2 = simd_splat(-0.5f); + const Ty cosC4 = simd_splat( 4.166664183139801025390625e-2f); + const Ty cosC6 = 
simd_splat(-1.388833043165504932403564453125e-3f); + const Ty cosC8 = simd_splat( 2.47562347794882953166961669921875e-5f); + const Ty cosC10 = simd_splat(-2.59630184018533327616751194000244140625e-7f); + const Ty one = simd_splat(1.0f); + const Ty c0 = simd_selb(evenMask, one, xx); + const Ty c2 = simd_selb(evenMask, cosC2, sinC2); + const Ty c4 = simd_selb(evenMask, cosC4, sinC4); + const Ty c6 = simd_selb(evenMask, cosC6, sinC6); + const Ty c8 = simd_selb(evenMask, cosC8, sinC8); + const Ty c10 = simd_selb(evenMask, cosC10, sinC10); + const Ty xsq = simd_f32_mul(xx, xx); + const Ty tmp0 = simd_f32_madd(c10, xsq, c8); + const Ty tmp1 = simd_f32_madd(tmp0, xsq, c6); + const Ty tmp2 = simd_f32_madd(tmp1, xsq, c4); + const Ty tmp3 = simd_f32_madd(tmp2, xsq, c2); + const Ty tmp4 = simd_f32_madd(tmp3, xsq, one); + const Ty poly = simd_f32_mul(tmp4, c0); + const Ty ibits1 = simd_x32_srl(ibits, 1); + const Ty bit1 = simd_and(ibits1, ione); + const Ty negmask = simd_xor(bit0, bit1); + const Ty negbits = simd_x32_sll(negmask, 31); + const Ty result = simd_xor(poly, negbits); return result; } template - BX_SIMD_INLINE Ty simd_rsqrt_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_sin_ni(Ty _a) { - const Ty one = simd_splat(1.0f); - const Ty sqrt = simd_sqrt(_a); - const Ty result = simd_div(one, sqrt); + const Ty pi_half = simd_splat(kPiHalf); + const Ty shifted = simd_f32_sub(_a, pi_half); + const Ty result = simd_f32_cos(shifted); + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_log_ni(Ty _a) + { + const Ty zero = simd_zero(); + const Ty one = simd_splat(1.0f); + const Ty negmask = simd_f32_cmplt(_a, zero); + const Ty zeromask = simd_f32_cmpeq(_a, zero); + const Ty nan_val = simd_splat(uint32_t(kFloatSignMask | kFloatExponentMask | kFloatMantissaMask) ); + const Ty neg_inf = simd_splat(uint32_t(kFloatSignMask | kFloatExponentMask) ); + const Ty expmask_c = simd_splat(kFloatExponentMask); + const Ty mantmask_c = simd_splat(kFloatMantissaMask); + const Ty exp_half = 
simd_splat(uint32_t(0x3f000000) ); + const Ty bias_126 = simd_splat(uint32_t(kFloatExponentBias - 1) ); + const Ty aexp = simd_and(_a, expmask_c); + const Ty rawexp = simd_x32_srl(aexp, kFloatExponentBitShift); + const Ty exp_i0 = simd_i32_sub(rawexp, bias_126); + const Ty mant = simd_and(_a, mantmask_c); + const Ty ff0 = simd_or(mant, exp_half); + const Ty sqrt2_half = simd_splat(kSqrt2 * 0.5f); + const Ty adj_mask = simd_f32_cmplt(ff0, sqrt2_half); + const Ty two = simd_splat(2.0f); + const Ty ff_dbl = simd_f32_mul(ff0, two); + const Ty ff1 = simd_selb(adj_mask, ff_dbl, ff0); + const Ty ione = simd_splat(uint32_t(1) ); + const Ty adj_one = simd_and(adj_mask, ione); + const Ty exp_i = simd_i32_sub(exp_i0, adj_one); + const Ty kC0 = simd_splat(6.666666666666735130e-01f); + const Ty kC1 = simd_splat(3.999999999940941908e-01f); + const Ty kC2 = simd_splat(2.857142874366239149e-01f); + const Ty kC3 = simd_splat(2.222219843214978396e-01f); + const Ty kC4 = simd_splat(1.818357216161805012e-01f); + const Ty kC5 = simd_splat(1.531383769920937332e-01f); + const Ty kC6 = simd_splat(1.479819860511658591e-01f); + const Ty log2lo = simd_splat(1.90821492927058770002e-10f); + const Ty lognat2 = simd_splat(kLogNat2); + const Ty half = simd_splat(0.5f); + const Ty ff = simd_f32_sub(ff1, one); + const Ty kk = simd_i32_itof(exp_i); + const Ty hi = simd_f32_mul(kk, lognat2); + const Ty lo = simd_f32_mul(kk, log2lo); + const Ty tpf = simd_f32_add(two, ff); + const Ty ss = simd_f32_div(ff, tpf); + const Ty s2 = simd_f32_mul(ss, ss); + const Ty s4 = simd_f32_mul(s2, s2); + const Ty pe0 = simd_f32_madd(kC6, s4, kC4); + const Ty pe1 = simd_f32_madd(pe0, s4, kC2); + const Ty pe2 = simd_f32_madd(pe1, s4, kC0); + const Ty t1 = simd_f32_mul(s2, pe2); + const Ty po0 = simd_f32_madd(kC5, s4, kC3); + const Ty po1 = simd_f32_madd(po0, s4, kC1); + const Ty t2 = simd_f32_mul(s4, po1); + const Ty t12 = simd_f32_add(t1, t2); + const Ty ffsq = simd_f32_mul(ff, ff); + const Ty hfsq = simd_f32_mul(half, 
ffsq); + const Ty ht12 = simd_f32_add(hfsq, t12); + const Ty ss_ht = simd_f32_mul(ss, ht12); + const Ty shtlo = simd_f32_add(ss_ht, lo); + const Ty inner = simd_f32_sub(hfsq, shtlo); + const Ty inff = simd_f32_sub(inner, ff); + const Ty raw = simd_f32_sub(hi, inff); + const Ty rz = simd_selb(zeromask, neg_inf, raw); + const Ty result = simd_selb(negmask, nan_val, rz); return result; } template - BX_SIMD_INLINE Ty simd_rsqrt_nr_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_exp_ni(Ty _a) { - const Ty rsqrt = simd_rsqrt_est(_a); - const Ty iter0 = simd_mul(_a, rsqrt); - const Ty iter1 = simd_mul(iter0, rsqrt); - const Ty half = simd_splat(0.5f); - const Ty half_rsqrt = simd_mul(half, rsqrt); - const Ty three = simd_splat(3.0f); - const Ty three_sub_iter1 = simd_sub(three, iter1); - const Ty result = simd_mul(half_rsqrt, three_sub_iter1); + const Ty zero = simd_zero(); + const Ty one = simd_splat(1.0f); + const Ty near_zero = simd_splat(kNearZero); + const Ty absa = simd_f32_abs(_a); + const Ty near_mask = simd_f32_cmple(absa, near_zero); + const Ty near_result = simd_f32_add(_a, one); + const Ty exp_min = simd_splat(-87.33654475f); + const Ty clamp_mask = simd_f32_cmple(_a, exp_min); + const Ty kC0 = simd_splat( 1.66666666666666019037e-01f); + const Ty kC1 = simd_splat(-2.77777777770155933842e-03f); + const Ty kC2 = simd_splat( 6.61375632143793436117e-05f); + const Ty kC3 = simd_splat(-1.65339022054652515390e-06f); + const Ty kC4 = simd_splat( 4.13813679705723846039e-08f); + const Ty log2lo = simd_splat(1.90821492927058770002e-10f); + const Ty lognat2 = simd_splat(kLogNat2); + const Ty invlognat2 = simd_splat(kInvLogNat2); + const Ty two = simd_splat(2.0f); + const Ty amul = simd_f32_mul(_a, invlognat2); + const Ty kk = simd_f32_round(amul); + const Ty kkln = simd_f32_mul(kk, lognat2); + const Ty hi = simd_f32_sub(_a, kkln); + const Ty lo = simd_f32_mul(kk, log2lo); + const Ty hml = simd_f32_sub(hi, lo); + const Ty hmlsq = simd_f32_mul(hml, hml); + const Ty tmp0 = 
simd_f32_madd(kC4, hmlsq, kC3); + const Ty tmp1 = simd_f32_madd(tmp0, hmlsq, kC2); + const Ty tmp2 = simd_f32_madd(tmp1, hmlsq, kC1); + const Ty tmp3 = simd_f32_madd(tmp2, hmlsq, kC0); + const Ty tmp3m = simd_f32_mul(hmlsq, tmp3); + const Ty tmp4 = simd_f32_sub(hml, tmp3m); + const Ty tmp4a = simd_f32_mul(hml, tmp4); + const Ty tmp4b = simd_f32_sub(two, tmp4); + const Ty tmp5 = simd_f32_div(tmp4a, tmp4b); + const Ty tmp5a = simd_f32_sub(lo, tmp5); + const Ty tmp5b = simd_f32_sub(tmp5a, hi); + const Ty tmp6 = simd_f32_sub(one, tmp5b); + const Ty ikk = simd_f32_ftoi_trunc(kk); + const Ty raw = simd_f32_ldexp_ni(tmp6, ikk); + const Ty rc = simd_selb(clamp_mask, zero, raw); + const Ty result = simd_selb(near_mask, near_result, rc); return result; } template - BX_SIMD_INLINE Ty simd_rsqrt_carmack_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_log2_ni(Ty _a) { - const Ty half = simd_splat(0.5f); - const Ty ah = simd_mul(half, _a); - const Ty ashift = simd_sra(_a, 1); - const Ty magic = simd_isplat(0x5f3759df); - const Ty msuba = simd_isub(magic, ashift); - const Ty msubasq = simd_mul(msuba, msuba); - const Ty tmp0 = simd_splat(1.5f); - const Ty tmp1 = simd_mul(ah, msubasq); - const Ty tmp2 = simd_sub(tmp0, tmp1); - const Ty result = simd_mul(msuba, tmp2); - + const Ty inv_log2 = simd_splat(kInvLogNat2); + const Ty loga = simd_f32_log(_a); + const Ty result = simd_f32_mul(loga, inv_log2); return result; } - namespace simd_logexp_detail + template + BX_SIMD_INLINE Ty simd_f32_exp2_ni(Ty _a) { - template - BX_SIMD_INLINE Ty simd_poly1(Ty _a, float _b, float _c) - { - const Ty bbbb = simd_splat(_b); - const Ty cccc = simd_splat(_c); - const Ty result = simd_madd(cccc, _a, bbbb); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_poly2(Ty _a, float _b, float _c, float _d) - { - const Ty bbbb = simd_splat(_b); - const Ty poly = simd_poly1(_a, _c, _d); - const Ty result = simd_madd(poly, _a, bbbb); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_poly3(Ty _a, 
float _b, float _c, float _d, float _e) - { - const Ty bbbb = simd_splat(_b); - const Ty poly = simd_poly2(_a, _c, _d, _e); - const Ty result = simd_madd(poly, _a, bbbb); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_poly4(Ty _a, float _b, float _c, float _d, float _e, float _f) - { - const Ty bbbb = simd_splat(_b); - const Ty poly = simd_poly3(_a, _c, _d, _e, _f); - const Ty result = simd_madd(poly, _a, bbbb); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_poly5(Ty _a, float _b, float _c, float _d, float _e, float _f, float _g) - { - const Ty bbbb = simd_splat(_b); - const Ty poly = simd_poly4(_a, _c, _d, _e, _f, _g); - const Ty result = simd_madd(poly, _a, bbbb); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_logpoly(Ty _a) - { -#if 1 - const Ty result = simd_poly5(_a - , 3.11578814719469302614f, -3.32419399085241980044f - , 2.59883907202499966007f, -1.23152682416275988241f - , 0.318212422185251071475f, -0.0344359067839062357313f - ); -#elif 0 - const Ty result = simd_poly4(_a - , 2.8882704548164776201f, -2.52074962577807006663f - , 1.48116647521213171641f, -0.465725644288844778798f - , 0.0596515482674574969533f - ); -#elif 0 - const Ty result = simd_poly3(_a - , 2.61761038894603480148f, -1.75647175389045657003f - , 0.688243882994381274313f, -0.107254423828329604454f - ); -#else - const Ty result = simd_poly2(_a - , 2.28330284476918490682f, -1.04913055217340124191f - , 0.204446009836232697516f - ); -#endif - - return result; - } - - template - BX_SIMD_INLINE Ty simd_exppoly(Ty _a) - { -#if 1 - const Ty result = simd_poly5(_a - , 9.9999994e-1f, 6.9315308e-1f - , 2.4015361e-1f, 5.5826318e-2f - , 8.9893397e-3f, 1.8775767e-3f - ); -#elif 0 - const Ty result = simd_poly4(_a - , 1.0000026f, 6.9300383e-1f - , 2.4144275e-1f, 5.2011464e-2f - , 1.3534167e-2f - ); -#elif 0 - const Ty result = simd_poly3(_a - , 9.9992520e-1f, 6.9583356e-1f - , 2.2606716e-1f, 7.8024521e-2f - ); -#else - const Ty result = simd_poly2(_a - , 1.0017247f, 
6.5763628e-1f - , 3.3718944e-1f - ); -#endif // 0 - - return result; - } - } // namespace simd_internal + const Ty lognat2 = simd_splat(kLogNat2); + const Ty scaled = simd_f32_mul(_a, lognat2); + const Ty result = simd_f32_exp(scaled); + return result; + } template - BX_SIMD_INLINE Ty simd_log2_ni(Ty _a) + BX_SIMD_INLINE Ty simd_f32_pow_ni(Ty _a, Ty _b) { - const Ty expmask = simd_isplat(kFloatExponentMask); - const Ty mantmask = simd_isplat(kFloatMantissaMask); + const Ty zero = simd_zero(); const Ty one = simd_splat(1.0f); + const Ty smallest = simd_splat(kFloatSmallest); + const Ty signmask = simd_splat(kFloatSignMask); - const Ty expbias = simd_isplat(kFloatExponentBias); - const Ty aexp = simd_and(_a, expmask); - const Ty aexpsr = simd_srl(aexp, kFloatExponentBitShift); - const Ty tmp0 = simd_isub(aexpsr, expbias); - const Ty exp = simd_itof(tmp0); - - const Ty amask = simd_and(_a, mantmask); - const Ty mant = simd_or(amask, one); - - const Ty poly = simd_logexp_detail::simd_logpoly(mant); - - const Ty mandiff = simd_sub(mant, one); - const Ty result = simd_madd(poly, mandiff, exp); + const Ty absa = simd_f32_abs(_a); + const Ty absb = simd_f32_abs(_b); + const Ty loga = simd_f32_log(absa); + const Ty bloga = simd_f32_mul(_b, loga); + const Ty pw = simd_f32_exp(bloga); + const Ty asign = simd_and(_a, signmask); + const Ty pwabs = simd_f32_abs(pw); + const Ty result0 = simd_or(pwabs, asign); + const Ty bmask = simd_f32_cmplt(absb, smallest); + const Ty amask = simd_f32_cmplt(absa, smallest); + const Ty result1 = simd_selb(bmask, one, result0); + const Ty result = simd_selb(amask, zero, result1); return result; } template - BX_SIMD_INLINE Ty simd_exp2_ni(Ty _a) + BX_SIMD_INLINE bool simd128_test_any_ni(Ty _a) { - const Ty min = simd_splat( 129.0f); - const Ty max = simd_splat(-126.99999f); - const Ty tmp0 = simd_min(_a, min); - const Ty aaaa = simd_max(tmp0, max); + const Ty tmp0 = simd_x32_srl(_a, 31); + const Ty tmp1 = simd128_x32_swiz_yzwx(tmp0); + const Ty 
tmp2 = simd_or(tmp0, tmp1); + const Ty tmp3 = simd128_x32_swiz_zwxy(tmp0); + const Ty tmp4 = simd_or(tmp2, tmp3); + const Ty itmp = simd_f32_ftoi_trunc(tmp4); - const Ty half = simd_splat(0.5f); - const Ty tmp2 = simd_sub(aaaa, half); - const Ty ipart = simd_ftoi(tmp2); - const Ty iround = simd_itof(ipart); - const Ty fpart = simd_sub(aaaa, iround); + int32_t ii; + simd128_x32_st1(&ii, itmp); - const Ty expbias = simd_isplat(kFloatExponentBias); - const Ty tmp5 = simd_iadd(ipart, expbias); - const Ty expipart = simd_sll(tmp5, 23); + return 0 != ii; + } - const Ty expfpart = simd_logexp_detail::simd_exppoly(fpart); + template + BX_SIMD_INLINE bool simd128_test_all_ni(Ty _a) + { + const Ty tmp0 = simd_x32_srl(_a, 31); + const Ty tmp1 = simd128_x32_swiz_yzwx(tmp0); + const Ty tmp2 = simd_and(tmp0, tmp1); + const Ty tmp3 = simd128_x32_swiz_zwxy(tmp0); + const Ty tmp4 = simd_and(tmp2, tmp3); + const Ty itmp = simd_f32_ftoi_trunc(tmp4); - const Ty result = simd_mul(expipart, expfpart); + int32_t ii; + simd128_x32_st1(&ii, itmp); + + return 0 != ii; + } + + template + BX_SIMD_INLINE bool simd128_test_zero_ni(Ty _a, Ty _b) + { + const Ty masked = simd_and(_a, _b); + const Ty zero = simd_zero(); + const Ty cmp = simd_i32_cmpeq(masked, zero); + const bool result = simd_test_all(cmp); + return result; + } + + template + BX_SIMD_INLINE Ty simd_f32_ftoi_round_ni(Ty _a) + { + const Ty rounded = simd_f32_round(_a); + const Ty result = simd_f32_ftoi_trunc(rounded); + return result; + } + + template + BX_SIMD_INLINE Ty simd_u32_cntlz_ni(Ty _a) + { + // Smear the highest set bit down to all lower bits. 
+ const Ty shr1 = simd_x32_srl(_a, 1); + const Ty or1 = simd_or(_a, shr1); + const Ty shr2 = simd_x32_srl(or1, 2); + const Ty or2 = simd_or(or1, shr2); + const Ty shr4 = simd_x32_srl(or2, 4); + const Ty or4 = simd_or(or2, shr4); + const Ty shr8 = simd_x32_srl(or4, 8); + const Ty or8 = simd_or(or4, shr8); + const Ty shr16 = simd_x32_srl(or8, 16); + const Ty smear = simd_or(or8, shr16); + const Ty inv = simd_not(smear); + + const Ty c55 = simd_splat(uint32_t(0x55555555) ); + const Ty c33 = simd_splat(uint32_t(0x33333333) ); + const Ty c0f = simd_splat(uint32_t(0x0f0f0f0f) ); + const Ty c3f = simd_splat(uint32_t(0x3f) ); + + const Ty p1s = simd_x32_srl(inv, 1); + const Ty p1m = simd_and(p1s, c55); + const Ty p1 = simd_u32_sub(inv, p1m); + + const Ty p2a = simd_and(p1, c33); + const Ty p2s = simd_x32_srl(p1, 2); + const Ty p2m = simd_and(p2s, c33); + const Ty p2 = simd_u32_add(p2a, p2m); + + const Ty p4s = simd_x32_srl(p2, 4); + const Ty p4a = simd_u32_add(p2, p4s); + const Ty p4 = simd_and(p4a, c0f); + + const Ty p8s = simd_x32_srl(p4, 8); + const Ty p8 = simd_u32_add(p4, p8s); + + const Ty p16s = simd_x32_srl(p8, 16); + const Ty p16 = simd_u32_add(p8, p16s); + + const Ty result = simd_and(p16, c3f); + return result; + } + + template + BX_SIMD_INLINE Ty simd_x32_srl_ni(Ty _a, Ty _count) + { + const Ty bit0 = simd_splat(uint32_t(1) ); + const Ty bit1 = simd_splat(uint32_t(2) ); + const Ty bit2 = simd_splat(uint32_t(4) ); + const Ty bit3 = simd_splat(uint32_t(8) ); + const Ty bit4 = simd_splat(uint32_t(16) ); + + const Ty b0 = simd_and(_count, bit0); + const Ty mask0 = simd_i32_neg(b0); + const Ty shr0 = simd_x32_srl(_a, 1); + const Ty r0 = simd_selb(mask0, shr0, _a); + + const Ty b1 = simd_and(_count, bit1); + const Ty mask1 = simd_i32_neg(b1); + const Ty shr1 = simd_x32_srl(r0, 2); + const Ty r1 = simd_selb(mask1, shr1, r0); + + const Ty b2 = simd_and(_count, bit2); + const Ty mask2 = simd_i32_neg(b2); + const Ty shr2 = simd_x32_srl(r1, 4); + const Ty r2 = 
simd_selb(mask2, shr2, r1); + + const Ty b3 = simd_and(_count, bit3); + const Ty mask3 = simd_i32_neg(b3); + const Ty shr3 = simd_x32_srl(r2, 8); + const Ty r3 = simd_selb(mask3, shr3, r2); + + const Ty b4 = simd_and(_count, bit4); + const Ty mask4 = simd_i32_neg(b4); + const Ty shr4 = simd_x32_srl(r3, 16); + const Ty result = simd_selb(mask4, shr4, r3); return result; } template - BX_SIMD_INLINE Ty simd_pow_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd_x32_sll_ni(Ty _a, Ty _count) { - const Ty alog2 = simd_log2(_a); - const Ty alog2b = simd_mul(alog2, _b); - const Ty result = simd_exp2(alog2b); + const Ty bit0 = simd_splat(uint32_t(1) ); + const Ty bit1 = simd_splat(uint32_t(2) ); + const Ty bit2 = simd_splat(uint32_t(4) ); + const Ty bit3 = simd_splat(uint32_t(8) ); + const Ty bit4 = simd_splat(uint32_t(16) ); + + const Ty b0 = simd_and(_count, bit0); + const Ty mask0 = simd_i32_neg(b0); + const Ty sll0 = simd_x32_sll(_a, 1); + const Ty r0 = simd_selb(mask0, sll0, _a); + + const Ty b1 = simd_and(_count, bit1); + const Ty mask1 = simd_i32_neg(b1); + const Ty sll1 = simd_x32_sll(r0, 2); + const Ty r1 = simd_selb(mask1, sll1, r0); + + const Ty b2 = simd_and(_count, bit2); + const Ty mask2 = simd_i32_neg(b2); + const Ty sll2 = simd_x32_sll(r1, 4); + const Ty r2 = simd_selb(mask2, sll2, r1); + + const Ty b3 = simd_and(_count, bit3); + const Ty mask3 = simd_i32_neg(b3); + const Ty sll3 = simd_x32_sll(r2, 8); + const Ty r3 = simd_selb(mask3, sll3, r2); + + const Ty b4 = simd_and(_count, bit4); + const Ty mask4 = simd_i32_neg(b4); + const Ty sll4 = simd_x32_sll(r3, 16); + const Ty result = simd_selb(mask4, sll4, r3); return result; } template - BX_SIMD_INLINE Ty simd_dot3_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd_x32_sra_ni(Ty _a, Ty _count) { - const Ty xyzw = simd_mul(_a, _b); - const Ty xxxx = simd_swiz_xxxx(xyzw); - const Ty yyyy = simd_swiz_yyyy(xyzw); - const Ty zzzz = simd_swiz_zzzz(xyzw); - const Ty tmp1 = simd_add(xxxx, yyyy); - const Ty result = simd_add(zzzz, 
tmp1); - return result; - } + const Ty bit0 = simd_splat(uint32_t(1) ); + const Ty bit1 = simd_splat(uint32_t(2) ); + const Ty bit2 = simd_splat(uint32_t(4) ); + const Ty bit3 = simd_splat(uint32_t(8) ); + const Ty bit4 = simd_splat(uint32_t(16) ); - template - BX_SIMD_INLINE Ty simd_cross3_ni(Ty _a, Ty _b) - { - // a.yzx * b.zxy - a.zxy * b.yzx == (a * b.yzx - a.yzx * b).yzx -#if 0 - const Ty a_yzxw = simd_swiz_yzxw(_a); - const Ty a_zxyw = simd_swiz_zxyw(_a); - const Ty b_zxyw = simd_swiz_zxyw(_b); - const Ty b_yzxw = simd_swiz_yzxw(_b); - const Ty tmp = simd_mul(a_yzxw, b_zxyw); - const Ty result = simd_nmsub(a_zxyw, b_yzxw, tmp); -#else - const Ty a_yzxw = simd_swiz_yzxw(_a); - const Ty b_yzxw = simd_swiz_yzxw(_b); - const Ty tmp0 = simd_mul(_a, b_yzxw); - const Ty tmp1 = simd_nmsub(a_yzxw, _b, tmp0); - const Ty result = simd_swiz_yzxw(tmp1); -#endif + const Ty b0 = simd_and(_count, bit0); + const Ty mask0 = simd_i32_neg(b0); + const Ty sra0 = simd_x32_sra(_a, 1); + const Ty r0 = simd_selb(mask0, sra0, _a); + + const Ty b1 = simd_and(_count, bit1); + const Ty mask1 = simd_i32_neg(b1); + const Ty sra1 = simd_x32_sra(r0, 2); + const Ty r1 = simd_selb(mask1, sra1, r0); + + const Ty b2 = simd_and(_count, bit2); + const Ty mask2 = simd_i32_neg(b2); + const Ty sra2 = simd_x32_sra(r1, 4); + const Ty r2 = simd_selb(mask2, sra2, r1); + + const Ty b3 = simd_and(_count, bit3); + const Ty mask3 = simd_i32_neg(b3); + const Ty sra3 = simd_x32_sra(r2, 8); + const Ty r3 = simd_selb(mask3, sra3, r2); + + const Ty b4 = simd_and(_count, bit4); + const Ty mask4 = simd_i32_neg(b4); + const Ty sra4 = simd_x32_sra(r3, 16); + const Ty result = simd_selb(mask4, sra4, r3); return result; } template - BX_SIMD_INLINE Ty simd_normalize3_ni(Ty _a) + BX_SIMD_INLINE Ty simd_x8_shuffle_ni(Ty _a, Ty _indices) { - const Ty dot3 = simd_dot3(_a, _a); - const Ty invSqrt = simd_rsqrt(dot3); - const Ty result = simd_mul(_a, invSqrt); + alignas(32) uint8_t aBuf[sizeof(Ty)]; + alignas(32) uint8_t 
iBuf[sizeof(Ty)]; + alignas(32) uint8_t oBuf[sizeof(Ty)]; + simd_st(aBuf, _a); + simd_st(iBuf, _indices); + // Per-16-byte-lane shuffle: matches PSHUFB / vqtbl1q semantics. + // For widths < 16 bytes, the lane is the full register. + constexpr int kLaneBytes = sizeof(Ty) >= 16 ? 16 : int(sizeof(Ty)); + constexpr int kLaneMask = kLaneBytes - 1; + for (int lane = 0; lane < int(sizeof(Ty)); lane += kLaneBytes) + { + for (int ii = 0; ii < kLaneBytes; ++ii) + { + const uint8_t idx = iBuf[lane + ii]; + oBuf[lane + ii] = (idx & 0x80) ? uint8_t(0) : aBuf[lane + (idx & kLaneMask)]; + } + } + return simd_ld(oBuf); + } + template + BX_SIMD_INLINE Ty simd_x8_shuffle_ni(Ty _a, Ty _b, Ty _indices) + { + alignas(32) uint8_t aBuf[sizeof(Ty)]; + alignas(32) uint8_t bBuf[sizeof(Ty)]; + alignas(32) uint8_t iBuf[sizeof(Ty)]; + alignas(32) uint8_t oBuf[sizeof(Ty)]; + simd_st(aBuf, _a); + simd_st(bBuf, _b); + simd_st(iBuf, _indices); + // Two-source per-16-byte-lane shuffle: indices select from concatenated + // [a|b] within the matching 16-byte lane (or full register for < 16 bytes). + // Bit 7 of an index byte zeroes the output byte; bits 0..(log2(2*lane)-1) + // select within the 2*lane concatenation; remaining bits must be 0. + constexpr int kLaneBytes = sizeof(Ty) >= 16 ? 16 : int(sizeof(Ty)); + constexpr int kPairMask = (kLaneBytes * 2) - 1; + for (int lane = 0; lane < int(sizeof(Ty)); lane += kLaneBytes) + { + for (int ii = 0; ii < kLaneBytes; ++ii) + { + const uint8_t idx = iBuf[lane + ii]; + if (idx & 0x80) + { + oBuf[lane + ii] = 0; + } + else + { + const int sel = idx & kPairMask; + oBuf[lane + ii] = sel < kLaneBytes ? 
aBuf[lane + sel] : bBuf[lane + sel - kLaneBytes]; + } + } + } + return simd_ld(oBuf); + } + + template + BX_SIMD_INLINE Ty simd_f16_fromf32_ni(Ty _a) + { + const Ty f_s_mask = simd_splat(kFloatSignMask); + const Ty f_e_mask = simd_splat(kFloatExponentMask); + const Ty f_m_mask = simd_splat(kFloatMantissaMask); + const Ty f_m_hidden_bit = simd_splat(uint32_t(0x00800000) ); + const Ty f_m_round_bit = simd_splat(uint32_t(0x00001000) ); + const Ty f_snan_mask = simd_splat(uint32_t(0x7fc00000) ); + const Ty h_e_mask = simd_splat(uint32_t(kHalfExponentMask) ); + const Ty h_snan_mask = simd_splat(uint32_t(0x00007e00) ); + const Ty h_e_mask_value = simd_splat(uint32_t(0x0000001f) ); + const Ty f_h_bias = simd_splat(uint32_t(0x00000070) ); + const Ty h_nan_min = simd_splat(uint32_t(0x00007c01) ); + const Ty f_h_e_biased = simd_splat(uint32_t(0x0000008f) ); + const Ty one = simd_splat(uint32_t(1) ); + + const Ty f_s = simd_and(_a, f_s_mask); // + const Ty f_e = simd_and(_a, f_e_mask); + const Ty h_s = simd_x32_srl(f_s, 16); + const Ty f_m = simd_and(_a, f_m_mask); + const Ty f_e_amount = simd_x32_srl(f_e, kFloatExponentBitShift); + const Ty f_e_half_bias = simd_u32_sub(f_e_amount, f_h_bias); + const Ty f_snan = simd_and(_a, f_snan_mask); + const Ty f_m_round_mask = simd_and(f_m, f_m_round_bit); + const Ty f_m_round_offset = simd_x32_sll(f_m_round_mask, 1); + const Ty f_m_rounded = simd_u32_add(f_m, f_m_round_offset); + const Ty f_m_denorm_sa = simd_u32_sub(one, f_e_half_bias); + const Ty f_m_with_hidden = simd_or(f_m_rounded, f_m_hidden_bit); + const Ty f_m_denorm = simd_x32_srl_ni(f_m_with_hidden, f_m_denorm_sa); + const Ty h_m_denorm = simd_x32_srl(f_m_denorm, 13); + const Ty f_m_rounded_overflow = simd_and(f_m_rounded, f_m_hidden_bit); + const Ty m_nan = simd_x32_srl(f_m, 13); + const Ty h_em_nan = simd_or(h_e_mask, m_nan); + const Ty h_e_norm_overflow_offset = simd_u32_add(f_e_half_bias, one); + const Ty h_e_norm_overflow = simd_x32_sll(h_e_norm_overflow_offset, 
kHalfExponentBitShift); + const Ty h_e_norm = simd_x32_sll(f_e_half_bias, kHalfExponentBitShift); + const Ty h_m_norm = simd_x32_srl(f_m_rounded, 13); + const Ty h_em_norm = simd_or(h_e_norm, h_m_norm); + const Ty is_h_ndenorm_msb = simd_u32_sub(f_h_bias, f_e_amount); + const Ty is_f_e_flagged_msb = simd_u32_sub(f_h_e_biased, f_e_half_bias); + const Ty is_h_denorm_msb = simd_not(is_h_ndenorm_msb); + const Ty is_f_m_eqz_msb = simd_u32_sub(f_m, one); + const Ty is_h_nan_eqz_msb = simd_u32_sub(m_nan, one); + const Ty is_f_inf_msb = simd_and(is_f_e_flagged_msb, is_f_m_eqz_msb); + const Ty is_f_nan_underflow_msb = simd_and(is_f_e_flagged_msb, is_h_nan_eqz_msb); + const Ty is_e_overflow_msb = simd_u32_sub(h_e_mask_value, f_e_half_bias); + const Ty is_h_inf_msb = simd_or(is_e_overflow_msb, is_f_inf_msb); + const Ty is_f_nsnan_msb = simd_u32_sub(f_snan, f_snan_mask); + const Ty is_m_norm_overflow_msb = simd_i32_neg(f_m_rounded_overflow); + const Ty is_f_snan_msb = simd_not(is_f_nsnan_msb); + + const Ty h_em_overflow_result = simd_sels(is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm); // + const Ty h_em_nan_result = simd_sels(is_f_e_flagged_msb, h_em_nan, h_em_overflow_result); + const Ty h_em_nan_underflow_result = simd_sels(is_f_nan_underflow_msb, h_nan_min, h_em_nan_result); + const Ty h_em_inf_result = simd_sels(is_h_inf_msb, h_e_mask, h_em_nan_underflow_result); + const Ty h_em_denorm_result = simd_sels(is_h_denorm_msb, h_m_denorm, h_em_inf_result); + const Ty h_em_snan_result = simd_sels(is_f_snan_msb, h_snan_mask, h_em_denorm_result); + + const Ty result = simd_or(h_s, h_em_snan_result); return result; } template - BX_SIMD_INLINE Ty simd_dot_ni(Ty _a, Ty _b) + BX_SIMD_INLINE Ty simd_f16_tof32_ni(Ty _a) { - const Ty xyzw = simd_mul(_a, _b); - const Ty yzwx = simd_swiz_yzwx(xyzw); - const Ty tmp0 = simd_add(xyzw, yzwx); - const Ty zwxy = simd_swiz_zwxy(tmp0); - const Ty result = simd_add(tmp0, zwxy); + const Ty h_e_mask = simd_splat(uint32_t(kHalfExponentMask) ); 
+ const Ty h_m_mask = simd_splat(uint32_t(kHalfMantissaMask) ); + const Ty h_s_mask = simd_splat(uint32_t(kHalfSignMask) ); + const Ty h_f_bias_offset = simd_splat(uint32_t(0x0001c000) ); + const Ty f_e_mask = simd_splat(kFloatExponentMask); + const Ty f_m_mask = simd_splat(kFloatMantissaMask); + const Ty h_f_e_denorm_bias = simd_splat(uint32_t(0x0000007e) ); + const Ty h_f_m_denorm_sa_bias = simd_splat(uint32_t(0x00000008) ); + const Ty h_e_mask_minus_one = simd_splat(uint32_t(0x00007bff) ); + const Ty one = simd_splat(uint32_t(1) ); + const Ty h_e = simd_and(_a, h_e_mask); // + const Ty h_m = simd_and(_a, h_m_mask); + const Ty h_s = simd_and(_a, h_s_mask); + const Ty h_e_f_bias = simd_u32_add(h_e, h_f_bias_offset); + const Ty h_m_nlz = simd_u32_cntlz_ni(h_m); + const Ty f_s = simd_x32_sll(h_s, 16); + const Ty f_e = simd_x32_sll(h_e_f_bias, 13); + const Ty f_m = simd_x32_sll(h_m, 13); + const Ty f_em = simd_or(f_e, f_m); + const Ty h_f_m_sa = simd_u32_sub(h_m_nlz, h_f_m_denorm_sa_bias); + const Ty f_e_denorm_unpacked = simd_u32_sub(h_f_e_denorm_bias, h_f_m_sa); + const Ty h_f_m = simd_x32_sll_ni(h_m, h_f_m_sa); + const Ty f_m_denorm = simd_and(h_f_m, f_m_mask); + const Ty f_e_denorm = simd_x32_sll(f_e_denorm_unpacked, kFloatExponentBitShift); + const Ty f_em_denorm = simd_or(f_e_denorm, f_m_denorm); + const Ty f_em_nan = simd_or(f_e_mask, f_m); + const Ty is_e_eqz_msb = simd_u32_sub(h_e, one); + const Ty is_m_nez_msb = simd_i32_neg(h_m); + const Ty is_e_flagged_msb = simd_u32_sub(h_e_mask_minus_one, h_e); + const Ty is_zero_msb = simd_andc(is_e_eqz_msb, is_m_nez_msb); + const Ty is_inf_msb = simd_andc(is_e_flagged_msb, is_m_nez_msb); + const Ty is_denorm_msb = simd_and(is_m_nez_msb, is_e_eqz_msb); + const Ty is_nan_msb = simd_and(is_e_flagged_msb, is_m_nez_msb); + const Ty is_zero = simd_x32_sra(is_zero_msb, 31); + const Ty f_zero_result = simd_andc(f_em, is_zero); + + const Ty f_denorm_result = simd_sels(is_denorm_msb, f_em_denorm, f_zero_result); // + const Ty 
f_inf_result = simd_sels(is_inf_msb, f_e_mask, f_denorm_result); + const Ty f_nan_result = simd_sels(is_nan_msb, f_em_nan, f_inf_result); + + const Ty result = simd_or(f_s, f_nan_result); return result; } - template - BX_SIMD_INLINE Ty simd_ceil_ni(Ty _a) - { - const Ty tmp0 = simd_ftoi(_a); - const Ty tmp1 = simd_itof(tmp0); - const Ty mask = simd_cmplt(tmp1, _a); - const Ty one = simd_splat(1.0f); - const Ty tmp2 = simd_and(one, mask); - const Ty result = simd_add(tmp1, tmp2); - - return result; - } - - template - BX_SIMD_INLINE Ty simd_floor_ni(Ty _a) - { - const Ty tmp0 = simd_ftoi(_a); - const Ty tmp1 = simd_itof(tmp0); - const Ty mask = simd_cmpgt(tmp1, _a); - const Ty one = simd_splat(1.0f); - const Ty tmp2 = simd_and(one, mask); - const Ty result = simd_sub(tmp1, tmp2); - - return result; - } - - template - BX_SIMD_FORCE_INLINE Ty simd_round_ni(Ty _a) - { - const Ty tmp = simd_ftoi(_a); - const Ty result = simd_itof(tmp); - - return result; - } - - template - BX_SIMD_INLINE bool simd_test_any_ni(Ty _a) - { - const Ty mask = simd_sra(_a, 31); - const Ty zwxy = simd_swiz_zwxy(mask); - const Ty tmp0 = simd_or(mask, zwxy); - const Ty tmp1 = simd_swiz_yyyy(tmp0); - const Ty tmp2 = simd_or(tmp0, tmp1); - int res; - simd_stx(&res, tmp2); - return 0 != res; - } - - template - BX_SIMD_INLINE bool simd_test_all_ni(Ty _a) - { - const Ty bits = simd_sra(_a, 31); - const Ty m1248 = simd_ild(1, 2, 4, 8); - const Ty mask = simd_and(bits, m1248); - const Ty zwxy = simd_swiz_zwxy(mask); - const Ty tmp0 = simd_or(mask, zwxy); - const Ty tmp1 = simd_swiz_yyyy(tmp0); - const Ty tmp2 = simd_or(tmp0, tmp1); - int res; - simd_stx(&res, tmp2); - return 0xf == res; - } - } // namespace bx diff --git a/include/bx/inline/uint32_t.inl b/include/bx/inline/uint32_t.inl deleted file mode 100644 index 1509637..0000000 --- a/include/bx/inline/uint32_t.inl +++ /dev/null @@ -1,868 +0,0 @@ -/* - * Copyright 2010-2026 Branimir Karadzic. All rights reserved. 
- * License: https://github.com/bkaradzic/bx/blob/master/LICENSE - */ - -// Copyright 2006 Mike Acton -// -// Permission is hereby granted, free of charge, to any person obtaining a -// copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included -// in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE - -#ifndef BX_UINT32_T_H_HEADER_GUARD -# error "Must be included from bx/uint32_t.h" -#endif // BX_UINT32_T_H_HEADER_GUARD - -namespace bx -{ - inline BX_CONSTEXPR_FUNC uint32_t uint32_li(uint32_t _a) - { - return _a; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_dec(uint32_t _a) - { - return _a - 1; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_inc(uint32_t _a) - { - return _a + 1; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_not(uint32_t _a) - { - return ~_a; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_neg(uint32_t _a) - { - return -(int32_t)_a; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_ext(uint32_t _a) - { - return ( (int32_t)_a)>>31; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_and(uint32_t _a, uint32_t _b) - { - return _a & _b; - } - - inline BX_CONSTEXPR_FUNC 
uint32_t uint32_andc(uint32_t _a, uint32_t _b) - { - return _a & ~_b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_xor(uint32_t _a, uint32_t _b) - { - return _a ^ _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_xorl(uint32_t _a, uint32_t _b) - { - return !_a != !_b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_or(uint32_t _a, uint32_t _b) - { - return _a | _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_orc(uint32_t _a, uint32_t _b) - { - return _a | ~_b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_sll(uint32_t _a, int32_t _sa) - { - return _a << _sa; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_srl(uint32_t _a, int32_t _sa) - { - return _a >> _sa; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_sra(uint32_t _a, int32_t _sa) - { - return ( (int32_t)_a) >> _sa; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_rol(uint32_t _a, int32_t _sa) - { - return ( _a << _sa) | (_a >> (32-_sa) ); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_ror(uint32_t _a, int32_t _sa) - { - return ( _a >> _sa) | (_a << (32-_sa) ); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_add(uint32_t _a, uint32_t _b) - { - return _a + _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_iadd(uint32_t _a, uint32_t _b) - { - return int32_t(_a) + int32_t(_b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_sub(uint32_t _a, uint32_t _b) - { - return _a - _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_isub(uint32_t _a, uint32_t _b) - { - return int32_t(_a) - int32_t(_b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_mul(uint32_t _a, uint32_t _b) - { - return _a * _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_div(uint32_t _a, uint32_t _b) - { - return _a / _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_mod(uint32_t _a, uint32_t _b) - { - return _a % _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cmpeq(uint32_t _a, uint32_t _b) - { - return -(_a == _b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cmpneq(uint32_t _a, uint32_t _b) - { - return 
-(_a != _b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cmplt(uint32_t _a, uint32_t _b) - { - return -(_a < _b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cmple(uint32_t _a, uint32_t _b) - { - return -(_a <= _b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cmpgt(uint32_t _a, uint32_t _b) - { - return -(_a > _b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cmpge(uint32_t _a, uint32_t _b) - { - return -(_a >= _b); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_setnz(uint32_t _a) - { - return -!!_a; - } - - template<> - inline BX_CONSTEXPR_FUNC uint32_t uint32_splat(uint8_t _val) - { - const uint32_t tmp0 = uint32_sll(_val, 8); - const uint32_t tmp1 = uint32_or(tmp0, _val); - const uint32_t tmp2 = uint32_sll(tmp1, 16); - const uint32_t result = uint32_or(tmp2, tmp1); - - return result; - } - - template<> - inline BX_CONSTEXPR_FUNC uint32_t uint32_splat(uint16_t _val) - { - const uint32_t tmp = uint32_sll(_val, 16); - const uint32_t result = uint32_or(tmp, _val); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_satadd(uint32_t _a, uint32_t _b) - { - const uint32_t add = uint32_add(_a, _b); - const uint32_t lt = uint32_cmplt(add, _a); - const uint32_t result = uint32_or(add, lt); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_satsub(uint32_t _a, uint32_t _b) - { - const uint32_t sub = uint32_sub(_a, _b); - const uint32_t le = uint32_cmple(sub, _a); - const uint32_t result = uint32_and(sub, le); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_satmul(uint32_t _a, uint32_t _b) - { - const uint64_t mul = (uint64_t)_a * (uint64_t)_b; - const uint32_t hi = mul >> 32; - const uint32_t nz = uint32_setnz(hi); - const uint32_t result = uint32_or(uint32_t(mul), nz); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_sels(uint32_t test, uint32_t _a, uint32_t _b) - { - const uint32_t mask = uint32_ext(test); - const uint32_t sel_a = uint32_and(_a, mask); - const uint32_t sel_b = 
uint32_andc(_b, mask); - const uint32_t result = uint32_or(sel_a, sel_b); - - return (result); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_selb(uint32_t _mask, uint32_t _a, uint32_t _b) - { - const uint32_t sel_a = uint32_and(_a, _mask); - const uint32_t sel_b = uint32_andc(_b, _mask); - const uint32_t result = uint32_or(sel_a, sel_b); - - return (result); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_imin(uint32_t _a, uint32_t _b) - { - const uint32_t a_sub_b = uint32_sub(_a, _b); - const uint32_t result = uint32_sels(a_sub_b, _a, _b); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_imax(uint32_t _a, uint32_t _b) - { - const uint32_t b_sub_a = uint32_sub(_b, _a); - const uint32_t result = uint32_sels(b_sub_a, _a, _b); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_min(uint32_t _a, uint32_t _b) - { - return _a > _b ? _b : _a; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_min(uint32_t _a, uint32_t _b, uint32_t _c) - { - return uint32_min(_a, uint32_min(_b, _c) ); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_max(uint32_t _a, uint32_t _b) - { - return _a > _b ? 
_a : _b; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_max(uint32_t _a, uint32_t _b, uint32_t _c) - { - return uint32_max(_a, uint32_max(_b, _c) ); - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_clamp(uint32_t _a, uint32_t _min, uint32_t _max) - { - const uint32_t tmp = uint32_max(_a, _min); - const uint32_t result = uint32_min(tmp, _max); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_iclamp(uint32_t _a, uint32_t _min, uint32_t _max) - { - const uint32_t tmp = uint32_imax(_a, _min); - const uint32_t result = uint32_imin(tmp, _max); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_incwrap(uint32_t _val, uint32_t _min, uint32_t _max) - { - const uint32_t inc = uint32_inc(_val); - const uint32_t max_diff = uint32_sub(_max, _val); - const uint32_t neg_max_diff = uint32_neg(max_diff); - const uint32_t max_or = uint32_or(max_diff, neg_max_diff); - const uint32_t max_diff_nz = uint32_ext(max_or); - const uint32_t result = uint32_selb(max_diff_nz, inc, _min); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_decwrap(uint32_t _val, uint32_t _min, uint32_t _max) - { - const uint32_t dec = uint32_dec(_val); - const uint32_t min_diff = uint32_sub(_min, _val); - const uint32_t neg_min_diff = uint32_neg(min_diff); - const uint32_t min_or = uint32_or(min_diff, neg_min_diff); - const uint32_t min_diff_nz = uint32_ext(min_or); - const uint32_t result = uint32_selb(min_diff_nz, dec, _max); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cntbits(uint32_t _val) - { -#if BX_COMPILER_GCC || BX_COMPILER_CLANG - return __builtin_popcount(_val); -#else - const uint32_t tmp0 = uint32_srl(_val, 1); - const uint32_t tmp1 = uint32_and(tmp0, 0x55555555); - const uint32_t tmp2 = uint32_sub(_val, tmp1); - const uint32_t tmp3 = uint32_and(tmp2, 0xc30c30c3); - const uint32_t tmp4 = uint32_srl(tmp2, 2); - const uint32_t tmp5 = uint32_and(tmp4, 0xc30c30c3); - const uint32_t tmp6 = uint32_srl(tmp2, 4); - const uint32_t 
tmp7 = uint32_and(tmp6, 0xc30c30c3); - const uint32_t tmp8 = uint32_add(tmp3, tmp5); - const uint32_t tmp9 = uint32_add(tmp7, tmp8); - const uint32_t tmpA = uint32_srl(tmp9, 6); - const uint32_t tmpB = uint32_add(tmp9, tmpA); - const uint32_t tmpC = uint32_srl(tmpB, 12); - const uint32_t tmpD = uint32_srl(tmpB, 24); - const uint32_t tmpE = uint32_add(tmpB, tmpC); - const uint32_t tmpF = uint32_add(tmpD, tmpE); - const uint32_t result = uint32_and(tmpF, 0x3f); - - return result; -#endif // BX_COMPILER_* - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cntlz(uint32_t _val) - { -#if BX_COMPILER_GCC || BX_COMPILER_CLANG - return 0 == _val ? 32 : __builtin_clz(_val); -#else - const uint32_t tmp0 = uint32_srl(_val, 1); - const uint32_t tmp1 = uint32_or(tmp0, _val); - const uint32_t tmp2 = uint32_srl(tmp1, 2); - const uint32_t tmp3 = uint32_or(tmp2, tmp1); - const uint32_t tmp4 = uint32_srl(tmp3, 4); - const uint32_t tmp5 = uint32_or(tmp4, tmp3); - const uint32_t tmp6 = uint32_srl(tmp5, 8); - const uint32_t tmp7 = uint32_or(tmp6, tmp5); - const uint32_t tmp8 = uint32_srl(tmp7, 16); - const uint32_t tmp9 = uint32_or(tmp8, tmp7); - const uint32_t tmpA = uint32_not(tmp9); - const uint32_t result = uint32_cntbits(tmpA); - - return result; -#endif // BX_COMPILER_* - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_cnttz(uint32_t _val) - { -#if BX_COMPILER_GCC || BX_COMPILER_CLANG - return 0 == _val ? 32 : __builtin_ctz(_val); -#else - const uint32_t tmp0 = uint32_not(_val); - const uint32_t tmp1 = uint32_dec(_val); - const uint32_t tmp2 = uint32_and(tmp0, tmp1); - const uint32_t result = uint32_cntbits(tmp2); - - return result; -#endif // BX_COMPILER_* - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_ffs(uint32_t _x) - { - return 0 == _x ? 
0 : uint32_cnttz(_x) + 1; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_part1by1(uint32_t _a) - { - // shuffle: - // ---- ---- ---- ---- fedc ba98 7654 3210 - // to: - // -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0 - - const uint32_t val = uint32_and(_a, 0xffff); - - const uint32_t tmp0 = uint32_sll(val, 8); - const uint32_t tmp1 = uint32_xor(val, tmp0); - const uint32_t tmp2 = uint32_and(tmp1, 0x00ff00ff); - - const uint32_t tmp3 = uint32_sll(tmp2, 4); - const uint32_t tmp4 = uint32_xor(tmp2, tmp3); - const uint32_t tmp5 = uint32_and(tmp4, 0x0f0f0f0f); - - const uint32_t tmp6 = uint32_sll(tmp5, 2); - const uint32_t tmp7 = uint32_xor(tmp5, tmp6); - const uint32_t tmp8 = uint32_and(tmp7, 0x33333333); - - const uint32_t tmp9 = uint32_sll(tmp8, 1); - const uint32_t tmpA = uint32_xor(tmp8, tmp9); - const uint32_t result = uint32_and(tmpA, 0x55555555); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_part1by2(uint32_t _a) - { - // shuffle: - // ---- ---- ---- ---- ---- --98 7654 3210 - // to: - // ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0 - - const uint32_t val = uint32_and(_a, 0x3ff); - - const uint32_t tmp0 = uint32_sll(val, 16); - const uint32_t tmp1 = uint32_xor(val, tmp0); - const uint32_t tmp2 = uint32_and(tmp1, 0xff0000ff); - - const uint32_t tmp3 = uint32_sll(tmp2, 8); - const uint32_t tmp4 = uint32_xor(tmp2, tmp3); - const uint32_t tmp5 = uint32_and(tmp4, 0x0300f00f); - - const uint32_t tmp6 = uint32_sll(tmp5, 4); - const uint32_t tmp7 = uint32_xor(tmp5, tmp6); - const uint32_t tmp8 = uint32_and(tmp7, 0x030c30c3); - - const uint32_t tmp9 = uint32_sll(tmp8, 2); - const uint32_t tmpA = uint32_xor(tmp8, tmp9); - const uint32_t result = uint32_and(tmpA, 0x09249249); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_testpow2(uint32_t _a) - { - const uint32_t tmp0 = uint32_dec(_a); - const uint32_t tmp1 = uint32_xor(_a, tmp0); - const uint32_t tmp2 = uint32_srl(tmp1, 1); - const uint32_t result = uint32_cmpeq(tmp2, tmp0); - - return 
result; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_nextpow2(uint32_t _a) - { - const uint32_t tmp0 = uint32_dec(_a); - const uint32_t tmp1 = uint32_srl(tmp0, 1); - const uint32_t tmp2 = uint32_or(tmp0, tmp1); - const uint32_t tmp3 = uint32_srl(tmp2, 2); - const uint32_t tmp4 = uint32_or(tmp2, tmp3); - const uint32_t tmp5 = uint32_srl(tmp4, 4); - const uint32_t tmp6 = uint32_or(tmp4, tmp5); - const uint32_t tmp7 = uint32_srl(tmp6, 8); - const uint32_t tmp8 = uint32_or(tmp6, tmp7); - const uint32_t tmp9 = uint32_srl(tmp8, 16); - const uint32_t tmpA = uint32_or(tmp8, tmp9); - const uint32_t result = uint32_inc(tmpA); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_li(uint64_t _a) - { - return _a; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_dec(uint64_t _a) - { - return _a - 1; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_inc(uint64_t _a) - { - return _a + 1; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_not(uint64_t _a) - { - return ~_a; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_neg(uint64_t _a) - { - return -(int32_t)_a; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_ext(uint64_t _a) - { - return ( (int32_t)_a)>>31; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_and(uint64_t _a, uint64_t _b) - { - return _a & _b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_andc(uint64_t _a, uint64_t _b) - { - return _a & ~_b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_xor(uint64_t _a, uint64_t _b) - { - return _a ^ _b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_xorl(uint64_t _a, uint64_t _b) - { - return !_a != !_b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_or(uint64_t _a, uint64_t _b) - { - return _a | _b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_orc(uint64_t _a, uint64_t _b) - { - return _a | ~_b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_sll(uint64_t _a, int32_t _sa) - { - return _a << _sa; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_srl(uint64_t _a, int32_t _sa) - { - return _a >> _sa; - 
} - - inline BX_CONSTEXPR_FUNC uint64_t uint64_sra(uint64_t _a, int32_t _sa) - { - return ( (int64_t)_a) >> _sa; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_rol(uint64_t _a, int32_t _sa) - { - return ( _a << _sa) | (_a >> (64-_sa) ); - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_ror(uint64_t _a, int32_t _sa) - { - return ( _a >> _sa) | (_a << (64-_sa) ); - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_add(uint64_t _a, uint64_t _b) - { - return _a + _b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_sub(uint64_t _a, uint64_t _b) - { - return _a - _b; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_mul(uint64_t _a, uint64_t _b) - { - return _a * _b; - } - - template<> - inline BX_CONSTEXPR_FUNC uint64_t uint64_splat(uint8_t _val) - { - const uint64_t tmp0 = uint64_sll(_val, 8); - const uint64_t tmp1 = uint64_or(tmp0, _val); - const uint64_t tmp2 = uint64_sll(tmp1, 16); - const uint64_t tmp3 = uint64_or(tmp2, tmp1); - const uint64_t tmp4 = uint64_sll(tmp3, 32); - const uint64_t result = uint64_or(tmp4, tmp3); - - return result; - } - - template<> - inline BX_CONSTEXPR_FUNC uint64_t uint64_splat(uint16_t _val) - { - const uint64_t tmp0 = uint64_sll(_val, 16); - const uint64_t tmp1 = uint64_or(tmp0, _val); - const uint64_t tmp2 = uint64_sll(tmp1, 32); - const uint64_t result = uint64_or(tmp2, tmp1); - - return result; - } - - template<> - inline BX_CONSTEXPR_FUNC uint64_t uint64_splat(uint32_t _val) - { - const uint64_t tmp = uint64_sll(_val, 32); - const uint64_t result = uint64_or(tmp, _val); - - return result; - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_cntbits(uint64_t _val) - { -#if BX_COMPILER_GCC || BX_COMPILER_CLANG - return __builtin_popcountll(_val); -#else - const uint32_t lo = uint32_t(_val&UINT32_MAX); - const uint32_t hi = uint32_t(_val>>32); - - return uint32_cntbits(lo) - + uint32_cntbits(hi) - ; -#endif // BX_COMPILER_* - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_cntlz(uint64_t _val) - { -#if BX_COMPILER_GCC || BX_COMPILER_CLANG - 
return 0 == _val ? 64 : __builtin_clzll(_val); -#else - return _val & UINT64_C(0xffffffff00000000) - ? uint32_cntlz(uint32_t(_val>>32) ) - : uint32_cntlz(uint32_t(_val) ) + 32 - ; -#endif // BX_COMPILER_* - } - - inline BX_CONSTEXPR_FUNC uint64_t uint64_cnttz(uint64_t _val) - { -#if BX_COMPILER_GCC || BX_COMPILER_CLANG - return 0 == _val ? 64 : __builtin_ctzll(_val); -#else - return _val & UINT64_C(0xffffffff) - ? uint32_cnttz(uint32_t(_val) ) - : uint32_cnttz(uint32_t(_val>>32) ) + 32 - ; -#endif // BX_COMPILER_* - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_gcd(uint32_t _a, uint32_t _b) - { - do - { - const uint32_t tmp = uint32_mod(_a, _b); - _a = _b; - _b = tmp; - } - while (_b); - - return _a; - } - - inline BX_CONSTEXPR_FUNC uint32_t uint32_lcm(uint32_t _a, uint32_t _b) - { - return _a * (_b / uint32_gcd(_a, _b) ); - } - - inline BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride) - { - const uint32_t mod = uint32_mod(_offset, _stride); - const uint32_t add = uint32_sub(_stride, mod); - const uint32_t mask = uint32_cmpeq(mod, 0); - const uint32_t tmp = uint32_selb(mask, 0, add); - const uint32_t result = uint32_add(_offset, tmp); - - return result; - } - - template - inline BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride) - { - const uint32_t align = uint32_lcm(Min, _stride); - const uint32_t mod = uint32_mod(_offset, align); - const uint32_t mask = uint32_cmpeq(mod, 0); - const uint32_t tmp0 = uint32_selb(mask, 0, align); - const uint32_t tmp1 = uint32_add(_offset, tmp0); - const uint32_t result = uint32_sub(tmp1, mod); - - return result; - } - - template - inline BX_CONSTEXPR_FUNC bool isAligned(Ty _a, size_t _align) - { - const size_t mask = max(1, _align) - 1; - return 0 == (size_t(_a) & mask); - } - - template<> - inline BX_CONSTEXPR_FUNC bool isAligned(const void* _ptr, size_t _align) - { - const uintptr_t addr = bitCast(_ptr); - return isAligned(addr, _align); - } - - template - inline 
BX_CONSTEXPR_FUNC Ty alignDown(Ty _a, size_t _align) - { - const size_t mask = max(1, _align) - 1; - return Ty(size_t(_a) & ~mask); - } - - template - inline BX_CONSTEXPR_FUNC Ty* alignDown(Ty* _ptr, size_t _align) - { - uintptr_t addr = bitCast(_ptr); - addr = alignDown(addr, _align); - return bitCast(addr); - } - - template - inline BX_CONSTEXPR_FUNC const Ty* alignDown(const Ty* _ptr, size_t _align) - { - uintptr_t addr = bitCast(_ptr); - addr = alignDown(addr, _align); - return bitCast(addr); - } - - template - inline BX_CONSTEXPR_FUNC Ty alignUp(Ty _a, size_t _align) - { - const size_t mask = max(1, _align) - 1; - return Ty( (size_t(_a) + mask) & ~mask); - } - - template - inline BX_CONSTEXPR_FUNC Ty* alignUp(Ty* _ptr, size_t _align) - { - uintptr_t addr = bitCast(_ptr); - addr = alignUp(addr, _align); - return bitCast(addr); - } - - template - inline BX_CONSTEXPR_FUNC const Ty* alignUp(const Ty* _ptr, size_t _align) - { - uintptr_t addr = bitCast(_ptr); - addr = alignUp(addr, _align); - return bitCast(addr); - } - - inline BX_CONST_FUNC uint16_t halfFromFloat(float _a) - { - const uint32_t a_as_ui = bitCast(_a); - - const uint32_t one = uint32_li(0x00000001); - const uint32_t f_s_mask = uint32_li(kFloatSignMask); - const uint32_t f_e_mask = uint32_li(kFloatExponentMask); - const uint32_t f_m_mask = uint32_li(kFloatMantissaMask); - const uint32_t f_m_hidden_bit = uint32_li(0x00800000); - const uint32_t f_m_round_bit = uint32_li(0x00001000); - const uint32_t f_snan_mask = uint32_li(0x7fc00000); - const uint32_t f_e_pos = uint32_li(0x00000017); - const uint32_t h_e_pos = uint32_li(0x0000000a); - const uint32_t h_e_mask = uint32_li(kHalfExponentMask); - const uint32_t h_snan_mask = uint32_li(0x00007e00); - const uint32_t h_e_mask_value = uint32_li(0x0000001f); - const uint32_t f_h_s_pos_offset = uint32_li(0x00000010); - const uint32_t f_h_bias_offset = uint32_li(0x00000070); - const uint32_t f_h_m_pos_offset = uint32_li(0x0000000d); - const uint32_t h_nan_min = 
uint32_li(0x00007c01); - const uint32_t f_h_e_biased_flag = uint32_li(0x0000008f); - const uint32_t f_s = uint32_and(a_as_ui, f_s_mask); - const uint32_t f_e = uint32_and(a_as_ui, f_e_mask); - const uint16_t h_s = (uint16_t)uint32_srl(f_s, f_h_s_pos_offset); - const uint32_t f_m = uint32_and(a_as_ui, f_m_mask); - const uint16_t f_e_amount = (uint16_t)uint32_srl(f_e, f_e_pos); - const uint32_t f_e_half_bias = uint32_sub(f_e_amount, f_h_bias_offset); - const uint32_t f_snan = uint32_and(a_as_ui, f_snan_mask); - const uint32_t f_m_round_mask = uint32_and(f_m, f_m_round_bit); - const uint32_t f_m_round_offset = uint32_sll(f_m_round_mask, one); - const uint32_t f_m_rounded = uint32_add(f_m, f_m_round_offset); - const uint32_t f_m_denorm_sa = uint32_sub(one, f_e_half_bias); - const uint32_t f_m_with_hidden = uint32_or(f_m_rounded, f_m_hidden_bit); - const uint32_t f_m_denorm = uint32_srl(f_m_with_hidden, f_m_denorm_sa); - const uint32_t h_m_denorm = uint32_srl(f_m_denorm, f_h_m_pos_offset); - const uint32_t f_m_rounded_overflow = uint32_and(f_m_rounded, f_m_hidden_bit); - const uint32_t m_nan = uint32_srl(f_m, f_h_m_pos_offset); - const uint32_t h_em_nan = uint32_or(h_e_mask, m_nan); - const uint32_t h_e_norm_overflow_offset = uint32_inc(f_e_half_bias); - const uint32_t h_e_norm_overflow = uint32_sll(h_e_norm_overflow_offset, h_e_pos); - const uint32_t h_e_norm = uint32_sll(f_e_half_bias, h_e_pos); - const uint32_t h_m_norm = uint32_srl(f_m_rounded, f_h_m_pos_offset); - const uint32_t h_em_norm = uint32_or(h_e_norm, h_m_norm); - const uint32_t is_h_ndenorm_msb = uint32_sub(f_h_bias_offset, f_e_amount); - const uint32_t is_f_e_flagged_msb = uint32_sub(f_h_e_biased_flag, f_e_half_bias); - const uint32_t is_h_denorm_msb = uint32_not(is_h_ndenorm_msb); - const uint32_t is_f_m_eqz_msb = uint32_dec(f_m); - const uint32_t is_h_nan_eqz_msb = uint32_dec(m_nan); - const uint32_t is_f_inf_msb = uint32_and(is_f_e_flagged_msb, is_f_m_eqz_msb); - const uint32_t is_f_nan_underflow_msb 
= uint32_and(is_f_e_flagged_msb, is_h_nan_eqz_msb); - const uint32_t is_e_overflow_msb = uint32_sub(h_e_mask_value, f_e_half_bias); - const uint32_t is_h_inf_msb = uint32_or(is_e_overflow_msb, is_f_inf_msb); - const uint32_t is_f_nsnan_msb = uint32_sub(f_snan, f_snan_mask); - const uint32_t is_m_norm_overflow_msb = uint32_neg(f_m_rounded_overflow); - const uint32_t is_f_snan_msb = uint32_not(is_f_nsnan_msb); - const uint32_t h_em_overflow_result = uint32_sels(is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm); - const uint32_t h_em_nan_result = uint32_sels(is_f_e_flagged_msb, h_em_nan, h_em_overflow_result); - const uint32_t h_em_nan_underflow_result = uint32_sels(is_f_nan_underflow_msb, h_nan_min, h_em_nan_result); - const uint32_t h_em_inf_result = uint32_sels(is_h_inf_msb, h_e_mask, h_em_nan_underflow_result); - const uint32_t h_em_denorm_result = uint32_sels(is_h_denorm_msb, h_m_denorm, h_em_inf_result); - const uint32_t h_em_snan_result = uint32_sels(is_f_snan_msb, h_snan_mask, h_em_denorm_result); - const uint32_t h_result = uint32_or(h_s, h_em_snan_result); - - return uint16_t(h_result); - } - - inline BX_CONST_FUNC float halfToFloat(uint16_t _a) - { - const uint32_t h_e_mask = uint32_li(kHalfExponentMask); - const uint32_t h_m_mask = uint32_li(kHalfMantissaMask); - const uint32_t h_s_mask = uint32_li(kHalfSignMask); - const uint32_t h_f_s_pos_offset = uint32_li(0x00000010); - const uint32_t h_f_e_pos_offset = uint32_li(0x0000000d); - const uint32_t h_f_bias_offset = uint32_li(0x0001c000); - const uint32_t f_e_mask = uint32_li(kFloatExponentMask); - const uint32_t f_m_mask = uint32_li(kFloatMantissaMask); - const uint32_t h_f_e_denorm_bias = uint32_li(0x0000007e); - const uint32_t h_f_m_denorm_sa_bias = uint32_li(0x00000008); - const uint32_t f_e_pos = uint32_li(0x00000017); - const uint32_t h_e_mask_minus_one = uint32_li(0x00007bff); - const uint32_t h_e = uint32_and(_a, h_e_mask); - const uint32_t h_m = uint32_and(_a, h_m_mask); - const uint32_t h_s = 
uint32_and(_a, h_s_mask); - const uint32_t h_e_f_bias = uint32_add(h_e, h_f_bias_offset); - const uint32_t h_m_nlz = uint32_cntlz(h_m); - const uint32_t f_s = uint32_sll(h_s, h_f_s_pos_offset); - const uint32_t f_e = uint32_sll(h_e_f_bias, h_f_e_pos_offset); - const uint32_t f_m = uint32_sll(h_m, h_f_e_pos_offset); - const uint32_t f_em = uint32_or(f_e, f_m); - const uint32_t h_f_m_sa = uint32_sub(h_m_nlz, h_f_m_denorm_sa_bias); - const uint32_t f_e_denorm_unpacked = uint32_sub(h_f_e_denorm_bias, h_f_m_sa); - const uint32_t h_f_m = uint32_sll(h_m, h_f_m_sa); - const uint32_t f_m_denorm = uint32_and(h_f_m, f_m_mask); - const uint32_t f_e_denorm = uint32_sll(f_e_denorm_unpacked, f_e_pos); - const uint32_t f_em_denorm = uint32_or(f_e_denorm, f_m_denorm); - const uint32_t f_em_nan = uint32_or(f_e_mask, f_m); - const uint32_t is_e_eqz_msb = uint32_dec(h_e); - const uint32_t is_m_nez_msb = uint32_neg(h_m); - const uint32_t is_e_flagged_msb = uint32_sub(h_e_mask_minus_one, h_e); - const uint32_t is_zero_msb = uint32_andc(is_e_eqz_msb, is_m_nez_msb); - const uint32_t is_inf_msb = uint32_andc(is_e_flagged_msb, is_m_nez_msb); - const uint32_t is_denorm_msb = uint32_and(is_m_nez_msb, is_e_eqz_msb); - const uint32_t is_nan_msb = uint32_and(is_e_flagged_msb, is_m_nez_msb); - const uint32_t is_zero = uint32_ext(is_zero_msb); - const uint32_t f_zero_result = uint32_andc(f_em, is_zero); - const uint32_t f_denorm_result = uint32_sels(is_denorm_msb, f_em_denorm, f_zero_result); - const uint32_t f_inf_result = uint32_sels(is_inf_msb, f_e_mask, f_denorm_result); - const uint32_t f_nan_result = uint32_sels(is_nan_msb, f_em_nan, f_inf_result); - const uint32_t f_result = uint32_or(f_s, f_nan_result); - - return bitCast(f_result); - } - -} // namespace bx diff --git a/include/bx/math.h b/include/bx/math.h index 9d5eab6..bee94e5 100644 --- a/include/bx/math.h +++ b/include/bx/math.h @@ -7,7 +7,6 @@ #define BX_MATH_H_HEADER_GUARD #include "bx.h" -#include "uint32_t.h" namespace bx { @@ 
-595,6 +594,30 @@ namespace bx /// BX_CONSTEXPR_FUNC float sub(float _a, float _b); + /// Saturating integer add. Clamps the result to the representable range + /// of `Ty` instead of wrapping around. Supports signed and unsigned + /// integer types. + /// + /// @param[in] _a Left operand. + /// @param[in] _b Right operand. + /// + /// @returns Sum clamped to `[LimitsT::min, LimitsT::max]`. + /// + template + BX_CONSTEXPR_FUNC Ty satAdd(Ty _a, Ty _b); + + /// Saturating integer subtract. Clamps the result to the representable + /// range of `Ty` instead of wrapping around. Supports signed and + /// unsigned integer types. + /// + /// @param[in] _a Left operand. + /// @param[in] _b Right operand. + /// + /// @returns Difference clamped to `[LimitsT::min, LimitsT::max]`. + /// + template + BX_CONSTEXPR_FUNC Ty satSub(Ty _a, Ty _b); + /// Returns result of multiply (_a * _b). /// /// @param[in] _a First value. @@ -1783,6 +1806,22 @@ namespace bx /// BX_CONSTEXPR_FUNC float toGamma(float _a); + /// Convert float to half-float. + /// + /// @param[in] _a Float value. + /// + /// @returns Half-float value. + /// + BX_CONST_FUNC uint16_t halfFromFloat(float _a); + + /// Convert half-float to float. + /// + /// @param[in] _a Half-float value. + /// + /// @returns Float value. 
+ /// + BX_CONST_FUNC float halfToFloat(uint16_t _a); + } // namespace bx #include "inline/math.inl" diff --git a/include/bx/pixelformat.h b/include/bx/pixelformat.h index 634233c..649c710 100644 --- a/include/bx/pixelformat.h +++ b/include/bx/pixelformat.h @@ -7,7 +7,6 @@ #define BX_PIXEL_FORMAT_H_HEADER_GUARD #include "math.h" -#include "uint32_t.h" namespace bx { diff --git a/include/bx/platform.h b/include/bx/platform.h index 72a3622..5e6acf5 100644 --- a/include/bx/platform.h +++ b/include/bx/platform.h @@ -202,7 +202,7 @@ #elif defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) # undef BX_PLATFORM_OSX # define BX_PLATFORM_OSX __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ -#elif defined(__EMSCRIPTEN__) +#elif defined(__wasm__) # include # undef BX_PLATFORM_EMSCRIPTEN # define BX_PLATFORM_EMSCRIPTEN (__EMSCRIPTEN_MAJOR__ * 10000 + __EMSCRIPTEN_MINOR__ * 100 + __EMSCRIPTEN_TINY__) diff --git a/include/bx/readerwriter.h b/include/bx/readerwriter.h index 55d504f..61db2c9 100644 --- a/include/bx/readerwriter.h +++ b/include/bx/readerwriter.h @@ -12,7 +12,6 @@ #include "filepath.h" #include "math.h" #include "string.h" -#include "uint32_t.h" namespace bx { diff --git a/include/bx/ringbuffer.h b/include/bx/ringbuffer.h index 10d868e..8099876 100644 --- a/include/bx/ringbuffer.h +++ b/include/bx/ringbuffer.h @@ -8,7 +8,7 @@ #include "bx.h" #include "cpu.h" -#include "uint32_t.h" +#include "simd_t.h" namespace bx { diff --git a/include/bx/rng.h b/include/bx/rng.h index bd994e8..9d88540 100644 --- a/include/bx/rng.h +++ b/include/bx/rng.h @@ -8,7 +8,6 @@ #include "bx.h" #include "math.h" -#include "uint32_t.h" namespace bx { diff --git a/include/bx/simd_t.h b/include/bx/simd_t.h index ef032d2..769b416 100644 --- a/include/bx/simd_t.h +++ b/include/bx/simd_t.h @@ -8,559 +8,1525 @@ #include "bx.h" -#define BX_SIMD_FORCE_INLINE BX_FORCE_INLINE -#define BX_SIMD_INLINE inline - -#define BX_SIMD_AVX 0 -#define BX_SIMD_LANGEXT 0 -#define BX_SIMD_NEON 0 -#define BX_SIMD_SSE 
0 +// Naming convention: +// +// simd[register-width][_<lane-type><lane-type-width>]_<operation>[_<suffix>] +// +// <> - not optional +// [] - optional +// +// register-width: 32, 64, 128, 256 +// (omitted for width-generic templates to operate on any available register width) +// +// lane-type: +// f - floating point +// i - signed integer +// u - unsigned integer +// x - typeless bitwise +// +// lane-type-width: 8, 16, 32, 64 +// +// +----+----+----+----+----+----+----+----+- ~ -+----+ +// | 00 | 01 | 02 | 03 | 04 | 05 | 06 | 07 | ~ | NN | bytes +// +----+----+----+----+----+----+----+----+- ~ -+----+ +// | register width 32, 64, 128, 256 | +// +----+----+----+----+----+----+----+----+- ~ -----+ +// | u8 | u8 | u8 | u8 | u8 | u8 | u8 | u8 | ~ ... | +// +----+----+----+----+----+----+----+----+- ~ -----+ +// | u16 | u16 | u16 | u16 | ~ ... | +// +---------+---------+---------+---------+- ~ -----+ +// | u32 | u32 | ~ ... | +// +-------------------+-------------------+- ~ -----+ +// | u64 | ~ ... | +// +---------------------------------------+- ~ -----+ +// +// suffix: +// est - Fast estimate (lower precision) +// ni - Not Intrinsic (software fallback) +// nr - Newton-Raphson refined +// nr1 - One-iteration Newton-Raphson +// xyz1 - Operates only on xyz, sets w=1 +#define BX_SIMD_AVX 0 +#define BX_SIMD_AVX2 0 +#define BX_SIMD_LANGEXT 0 +#define BX_SIMD_NEON 0 +#define BX_SIMD_SSE 0 +#define BX_SIMD_WASM 0 #define BX_SIMD_SUPPORTED 0 -#if defined(__AVX__) || defined(__AVX2__) +#if BX_COMPILER_GCC || BX_COMPILER_CLANG +# undef BX_SIMD_LANGEXT +# define BX_SIMD_LANGEXT 1 +#endif // BX_COMPILER_GCC || BX_COMPILER_CLANG + +#if defined(__AVX2__) +# include +# undef BX_SIMD_AVX2 +# define BX_SIMD_AVX2 1 +# undef BX_SIMD_AVX +# define BX_SIMD_AVX 1 +#elif defined(__AVX__) # include # undef BX_SIMD_AVX -# define BX_SIMD_AVX 1 +# define BX_SIMD_AVX 1 #endif // #if defined(__SSE2__) || (BX_COMPILER_MSVC && (BX_ARCH_64BIT || _M_IX86_FP >= 2) ) -# include // __m128i -# if defined(__SSE4_1__) -# include -# endif //
defined(__SSE4_1__) -# include // __m128 +# include +# include // SSE4.1 minspec is SSE4.2 so always available +# if defined(__SSE4_2__) || BX_COMPILER_MSVC +# include +# endif +# include # undef BX_SIMD_SSE # define BX_SIMD_SSE 1 -#elif defined(__ARM_NEON__) && (!BX_COMPILER_CLANG || BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) ) +#elif defined(__ARM_NEON__) || defined(__ARM_NEON) # include # undef BX_SIMD_NEON # define BX_SIMD_NEON 1 -#elif BX_COMPILER_CLANG \ - && !BX_PLATFORM_EMSCRIPTEN \ - && !BX_PLATFORM_IOS \ - && !BX_PLATFORM_VISIONOS \ - && BX_CLANG_HAS_EXTENSION(attribute_ext_vector_type) -# undef BX_SIMD_LANGEXT -# define BX_SIMD_LANGEXT 1 +#elif defined(__wasm_simd128__) +# include +# undef BX_SIMD_WASM +# define BX_SIMD_WASM 1 #endif // -#if ( BX_SIMD_AVX \ - || BX_SIMD_LANGEXT \ - || BX_SIMD_NEON \ - || BX_SIMD_SSE \ +#if ( BX_SIMD_AVX \ + || BX_SIMD_NEON \ + || BX_SIMD_SSE \ + || BX_SIMD_WASM \ ) # undef BX_SIMD_SUPPORTED # define BX_SIMD_SUPPORTED 1 #endif // BX_SIMD_* +#define BX_SIMD_FORCE_INLINE BX_FORCE_INLINE +#define BX_SIMD_INLINE inline + namespace bx { -#define ELEMx 0 -#define ELEMy 1 -#define ELEMz 2 -#define ELEMw 3 -#define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - template \ - Ty simd_swiz_##_x##_y##_z##_w(Ty _a); -#include "inline/simd128_swizzle.inl" -#undef BX_SIMD128_IMPLEMENT_SWIZZLE -#undef ELEMw -#undef ELEMz -#undef ELEMy -#undef ELEMx - -#define BX_SIMD128_IMPLEMENT_TEST(_xyzw) \ - template \ - BX_SIMD_FORCE_INLINE bool simd_test_any_##_xyzw(Ty _test); \ - \ - template \ - BX_SIMD_FORCE_INLINE bool simd_test_all_##_xyzw(Ty _test) - -BX_SIMD128_IMPLEMENT_TEST(x ); -BX_SIMD128_IMPLEMENT_TEST(y ); -BX_SIMD128_IMPLEMENT_TEST(xy ); -BX_SIMD128_IMPLEMENT_TEST(z ); -BX_SIMD128_IMPLEMENT_TEST(xz ); -BX_SIMD128_IMPLEMENT_TEST(yz ); -BX_SIMD128_IMPLEMENT_TEST(xyz ); -BX_SIMD128_IMPLEMENT_TEST(w ); -BX_SIMD128_IMPLEMENT_TEST(xw ); -BX_SIMD128_IMPLEMENT_TEST(yw ); -BX_SIMD128_IMPLEMENT_TEST(xyw ); -BX_SIMD128_IMPLEMENT_TEST(zw 
); -BX_SIMD128_IMPLEMENT_TEST(xzw ); -BX_SIMD128_IMPLEMENT_TEST(yzw ); -BX_SIMD128_IMPLEMENT_TEST(xyzw); -#undef BX_SIMD128_IMPLEMENT_TEST - - template - Ty simd_shuf_xyAB(Ty _a, Ty _b); - - template - Ty simd_shuf_ABxy(Ty _a, Ty _b); - - template - Ty simd_shuf_CDzw(Ty _a, Ty _b); - - template - Ty simd_shuf_zwCD(Ty _a, Ty _b); - - template - Ty simd_shuf_xAyB(Ty _a, Ty _b); - - template - Ty simd_shuf_AxBy(Ty _a, Ty _b); - - template - Ty simd_shuf_zCwD(Ty _a, Ty _b); - - template - Ty simd_shuf_CzDw(Ty _a, Ty _b); - - template - float simd_x(Ty _a); - - template - float simd_y(Ty _a); - - template - float simd_z(Ty _a); - - template - float simd_w(Ty _a); - - template - Ty simd_ld(const void* _ptr); - - template - void simd_st(void* _ptr, Ty _a); - - template - void simd_stx(void* _ptr, Ty _a); - - template - void simd_stream(void* _ptr, Ty _a); - - template - Ty simd_ld(float _x, float _y, float _z, float _w); - - template - Ty simd_ld(float _x, float _y, float _z, float _w, float _a, float _b, float _c, float _d); - - template - Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w); - - template - Ty simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w, uint32_t _a, uint32_t _b, uint32_t _c, uint32_t _d); - - template - Ty simd_splat(const void* _ptr); - - template - Ty simd_splat(float _a); - - template - Ty simd_isplat(uint32_t _a); - - template - Ty simd_zero(); - - template - Ty simd_itof(Ty _a); - - template - Ty simd_ftoi(Ty _a); - - template - Ty simd_round(Ty _a); - - template - Ty simd_add(Ty _a, Ty _b); - - template - Ty simd_sub(Ty _a, Ty _b); - - template - Ty simd_mul(Ty _a, Ty _b); - - template - Ty simd_div(Ty _a, Ty _b); - - template - Ty simd_rcp_est(Ty _a); - - template - Ty simd_sqrt(Ty _a); - - template - Ty simd_rsqrt_est(Ty _a); - - template - Ty simd_dot3(Ty _a, Ty _b); - - template - Ty simd_dot(Ty _a, Ty _b); - - template - Ty simd_cmpeq(Ty _a, Ty _b); - - template - Ty simd_cmpneq(Ty _a, Ty _b); - - template - Ty 
simd_cmplt(Ty _a, Ty _b); - - template - Ty simd_cmple(Ty _a, Ty _b); - - template - Ty simd_cmpgt(Ty _a, Ty _b); - - template - Ty simd_cmpge(Ty _a, Ty _b); - - template - Ty simd_min(Ty _a, Ty _b); - - template - Ty simd_max(Ty _a, Ty _b); - - template - Ty simd_and(Ty _a, Ty _b); - - template - Ty simd_andc(Ty _a, Ty _b); - - template - Ty simd_or(Ty _a, Ty _b); - - template - Ty simd_xor(Ty _a, Ty _b); - - template - Ty simd_sll(Ty _a, int _count); - - template - Ty simd_srl(Ty _a, int _count); - - template - Ty simd_sra(Ty _a, int _count); - - template - Ty simd_icmpeq(Ty _a, Ty _b); - - template - Ty simd_icmplt(Ty _a, Ty _b); - - template - Ty simd_icmpgt(Ty _a, Ty _b); - - template - Ty simd_imin(Ty _a, Ty _b); - - template - Ty simd_imax(Ty _a, Ty _b); - - template - Ty simd_iadd(Ty _a, Ty _b); - - template - Ty simd_isub(Ty _a, Ty _b); - - template - Ty simd_shuf_xAzC(Ty _a, Ty _b); - - template - Ty simd_shuf_yBwD(Ty _a, Ty _b); - - template - Ty simd_rcp(Ty _a); - - template - Ty simd_orx(Ty _a); - - template - Ty simd_orc(Ty _a, Ty _b); - - template - Ty simd_neg(Ty _a); - - template - Ty simd_madd(Ty _a, Ty _b, Ty _c); - - template - Ty simd_nmsub(Ty _a, Ty _b, Ty _c); - - template - Ty simd_div_nr(Ty _a, Ty _b); - - template - Ty simd_selb(Ty _mask, Ty _a, Ty _b); - - template - Ty simd_sels(Ty _test, Ty _a, Ty _b); - - template - Ty simd_not(Ty _a); - - template - Ty simd_abs(Ty _a); - - template - Ty simd_clamp(Ty _a, Ty _min, Ty _max); - - template - Ty simd_lerp(Ty _a, Ty _b, Ty _s); - - template - Ty simd_rsqrt(Ty _a); - - template - Ty simd_rsqrt_nr(Ty _a); - - template - Ty simd_rsqrt_carmack(Ty _a); - - template - Ty simd_sqrt_nr(Ty _a); - - template - Ty simd_log2(Ty _a); - - template - Ty simd_exp2(Ty _a); - - template - Ty simd_pow(Ty _a, Ty _b); - - template - Ty simd_cross3(Ty _a, Ty _b); - - template - Ty simd_normalize3(Ty _a); - - template - Ty simd_ceil(Ty _a); - - template - Ty simd_floor(Ty _a); - - template - Ty 
simd_shuf_xAzC_ni(Ty _a, Ty _b); - - template - Ty simd_shuf_yBwD_ni(Ty _a, Ty _b); - - template - Ty simd_madd_ni(Ty _a, Ty _b, Ty _c); - - template - Ty simd_nmsub_ni(Ty _a, Ty _b, Ty _c); - - template - Ty simd_div_nr_ni(Ty _a, Ty _b); - - template - Ty simd_rcp_ni(Ty _a); - - template - Ty simd_orx_ni(Ty _a); - - template - Ty simd_orc_ni(Ty _a, Ty _b); - - template - Ty simd_neg_ni(Ty _a); - - template - Ty simd_selb_ni(Ty _mask, Ty _a, Ty _b); - - template - Ty simd_sels_ni(Ty _test, Ty _a, Ty _b); - - template - Ty simd_not_ni(Ty _a); - - template - Ty simd_cmpneq_ni(Ty _a, Ty _b); - - template - Ty simd_min_ni(Ty _a, Ty _b); - - template - Ty simd_max_ni(Ty _a, Ty _b); - - template - Ty simd_abs_ni(Ty _a); - - template - Ty simd_imin_ni(Ty _a, Ty _b); - - template - Ty simd_imax_ni(Ty _a, Ty _b); - - template - Ty simd_clamp_ni(Ty _a, Ty _min, Ty _max); - - template - Ty simd_lerp_ni(Ty _a, Ty _b, Ty _s); - - template - Ty simd_sqrt_nr_ni(Ty _a); - - template - Ty simd_sqrt_nr1_ni(Ty _a); - - template - Ty simd_rsqrt_ni(Ty _a); - - template - Ty simd_rsqrt_nr_ni(Ty _a); - - template - Ty simd_rsqrt_carmack_ni(Ty _a); - - template - Ty simd_log2_ni(Ty _a); - - template - Ty simd_exp2_ni(Ty _a); - - template - Ty simd_pow_ni(Ty _a, Ty _b); - - template - Ty simd_dot3_ni(Ty _a, Ty _b); - - template - Ty simd_cross3_ni(Ty _a, Ty _b); - - template - Ty simd_normalize3_ni(Ty _a); - - template - Ty simd_dot_ni(Ty _a, Ty _b); - - template - Ty simd_ceil_ni(Ty _a); - - template - Ty simd_floor_ni(Ty _a); - - template - Ty simd_round_ni(Ty _a); - - template - bool simd_test_any_ni(Ty _a); - - template - bool simd_test_all_ni(Ty _a); - -#if BX_SIMD_AVX - typedef __m256 simd256_avx_t; +#if BX_SIMD_SSE + typedef __m128 simd128_sse_t; #endif // BX_SIMD_SSE -#if BX_SIMD_LANGEXT - union simd128_langext_t - { - float __attribute__((vector_size(16))) vf; - int32_t __attribute__((vector_size(16))) vi; - uint32_t __attribute__((vector_size(16))) vu; - float fxyzw[4]; - int32_t 
ixyzw[4]; - uint32_t uxyzw[4]; - - }; -#endif // BX_SIMD_LANGEXT - #if BX_SIMD_NEON typedef float32x4_t simd128_neon_t; #endif // BX_SIMD_NEON -#if BX_SIMD_SSE - typedef __m128 simd128_sse_t; -#endif // BX_SIMD_SSE - -} // namespace bx - #if BX_SIMD_AVX -# include "inline/simd256_avx.inl" + typedef __m256 simd256_avx_t; #endif // BX_SIMD_AVX -#if BX_SIMD_LANGEXT -# include "inline/simd128_langext.inl" -#endif // BX_SIMD_LANGEXT +#if BX_SIMD_WASM + typedef v128_t simd128_wasm_t; +#endif // BX_SIMD_WASM -#if BX_SIMD_NEON -# include "inline/simd128_neon.inl" -#endif // BX_SIMD_NEON + BX_ALIGN_DECL(4, struct) simd32_ref_t { uint32_t u32; }; + BX_ALIGN_DECL(8, struct) simd64_ref_t { uint64_t u64; }; + BX_ALIGN_DECL(16, struct) simd128_ref_t { uint32_t u32[4]; }; + BX_ALIGN_DECL(32, struct) simd256_ref_t { simd128_ref_t lo; simd128_ref_t hi; }; #if BX_SIMD_SSE -# include "inline/simd128_sse.inl" -#endif // BX_SIMD_SSE - -namespace bx -{ - union simd128_ref_t - { - float fxyzw[4]; - int32_t ixyzw[4]; - uint32_t uxyzw[4]; - }; - -#ifndef BX_SIMD_WARN_REFERENCE_IMPL -# define BX_SIMD_WARN_REFERENCE_IMPL 0 -#endif // BX_SIMD_WARN_REFERENCE_IMPL - -#if !BX_SIMD_SUPPORTED -# if BX_SIMD_WARN_REFERENCE_IMPL -# pragma message("*** Using SIMD128 reference implementation! 
***") -# endif // BX_SIMD_WARN_REFERENCE_IMPL - - typedef simd128_ref_t simd128_t; -#endif // BX_SIMD_REFERENCE - - struct simd256_ref_t - { -#if BX_COMPILER_MSVC - typedef simd128_ref_t type; + typedef simd128_sse_t simd128_t; +#elif BX_SIMD_NEON + typedef simd128_neon_t simd128_t; +#elif BX_SIMD_WASM + typedef simd128_wasm_t simd128_t; #else - typedef simd128_t type; -#endif // BX_COMPILER_MSVC + typedef simd128_ref_t simd128_t; +#endif // BX_SIMD_* - type simd128_0; - type simd128_1; - }; +#if BX_SIMD_AVX + typedef simd256_avx_t simd256_t; +#else + typedef simd256_ref_t simd256_t; +#endif // BX_SIMD_AVX -#if !BX_SIMD_AVX -# if BX_SIMD_WARN_REFERENCE_IMPL -# pragma message("*** Using SIMD256 reference implementation! ***") -# endif // BX_SIMD_WARN_REFERENCE_IMPL + typedef simd32_ref_t simd32_t; + typedef simd64_ref_t simd64_t; - typedef simd256_ref_t simd256_t; -#endif // !BX_SIMD_AVX + // These deduce register width from the type parameter. Call with + // simd128_t for 128-bit, simd256_t for 256-bit, etc. - simd128_t simd_zero(); + /// Per-lane f32 add: `_a + _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane sum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_add(Ty _a, Ty _b); - simd128_t simd_ld(const void* _ptr); + /// Per-lane f32 subtract: `_a - _b`. + /// + /// @param[in] _a Minuend. + /// @param[in] _b Subtrahend. + /// + /// @returns Per-lane difference. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_sub(Ty _a, Ty _b); - simd128_t simd_ld(float _x, float _y, float _z, float _w); + /// Per-lane f32 multiply: `_a * _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane product. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. 
+ /// + template + Ty simd_f32_mul(Ty _a, Ty _b); - simd128_t simd_ild(uint32_t _x, uint32_t _y, uint32_t _z, uint32_t _w); + /// Per-lane f32 divide: `_a / _b`. + /// + /// @param[in] _a Dividend. + /// @param[in] _b Divisor. + /// + /// @returns Per-lane quotient. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_div(Ty _a, Ty _b); - simd128_t simd_splat(const void* _ptr); + /// Per-lane fused multiply-add: `_a * _b + _c`. + /// + /// @param[in] _a Multiplicand. + /// @param[in] _b Multiplier. + /// @param[in] _c Addend. + /// + /// @returns Per-lane result of `_a * _b + _c`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_madd(Ty _a, Ty _b, Ty _c); - simd128_t simd_splat(float _a); + /// Per-lane fused multiply-subtract: `_a * _b - _c`. + /// + /// @param[in] _a Multiplicand. + /// @param[in] _b Multiplier. + /// @param[in] _c Subtrahend. + /// + /// @returns Per-lane result of `_a * _b - _c`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_msub(Ty _a, Ty _b, Ty _c); - simd128_t simd_isplat(uint32_t _a); + /// Extract sign bits from each 32-bit lane into an int bitmask. + /// + /// @param[in] _a Input register. + /// + /// @returns Bitmask with one bit per 32-bit lane. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + int simd_x32_signbitsmask(Ty _a); + + /// Extract sign bits from each 8-bit lane into an int bitmask. + /// + /// @param[in] _a Input register. + /// + /// @returns Bitmask with one bit per 8-bit lane. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + int simd_x8_signbitsmask(Ty _a); + + /// Per-byte shuffle within each 16-byte lane (single-source). + /// + /// For each output byte, an index byte selects which input byte is copied: + /// - If bit 7 of the index byte is set, the output byte is zero. 
+ /// - Otherwise, bits 0..3 select one of the 16 bytes within the same + /// 16-byte lane of `_a`. Bits 4..6 must be zero (reserved). + /// + /// For widths < 16 bytes (simd32, simd64), the lane is the full register + /// and the active selector bits are correspondingly fewer (2 bits for + /// simd32, 3 bits for simd64). + /// + /// @param[in] _a Source register. + /// @param[in] _indices Per-byte selector indices. + /// + /// @returns Byte-shuffled result. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x8_shuffle(Ty _a, Ty _indices); + + /// Per-byte shuffle within each 16-byte lane (two-source). + /// + /// For each output byte, an index byte selects which input byte is copied + /// from the concatenation of `_a` and `_b`: + /// - If bit 7 of the index byte is set, the output byte is zero. + /// - Otherwise, bits 0..4 select one of the 32 bytes within the matching + /// 16-byte lanes of `_a` (low 16) and `_b` (high 16). Bits 5..6 must be + /// zero (reserved). + /// + /// For widths < 16 bytes (simd32, simd64), the lane is the full register + /// and the active selector bits are correspondingly fewer. + /// + /// @param[in] _a First source register (low half of concatenation). + /// @param[in] _b Second source register (high half of concatenation). + /// @param[in] _indices Per-byte selector indices. + /// + /// @returns Byte-shuffled result. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x8_shuffle(Ty _a, Ty _b, Ty _indices); + + /// Per-lane f32 minimum: `min(_a, _b)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane minimum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_min(Ty _a, Ty _b); + + /// Per-lane f32 maximum: `max(_a, _b)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane maximum. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_max(Ty _a, Ty _b); + + /// Per-lane i32 add: `_a + _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane sum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_add(Ty _a, Ty _b); + + /// Per-lane i32 subtract: `_a - _b`. + /// + /// @param[in] _a Minuend. + /// @param[in] _b Subtrahend. + /// + /// @returns Per-lane difference. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_sub(Ty _a, Ty _b); + + /// Per-lane i32 negate: `-_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane negation. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_neg(Ty _a); + + /// Per-lane i32 absolute value: `abs(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane absolute value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_abs(Ty _a); + + /// Per-lane i32 multiply (low 32 bits of 32x32 product). + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane low 32 bits of `_a * _b`. The result is bit-identical + /// for signed and unsigned operands. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_mul(Ty _a, Ty _b); + + /// Per-lane unsigned 8-bit saturating add: `min(_a + _b, 0xff)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane saturated sum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u8_satadd(Ty _a, Ty _b); + + /// Per-lane unsigned 8-bit saturating subtract: `max(_a - _b, 0)`. + /// + /// @param[in] _a Minuend. + /// @param[in] _b Subtrahend. + /// + /// @returns Per-lane saturated difference. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u8_satsub(Ty _a, Ty _b); + + /// Per-lane unsigned 16-bit saturating add: `min(_a + _b, 0xffff)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane saturated sum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u16_satadd(Ty _a, Ty _b); + + /// Per-lane unsigned 16-bit saturating subtract: `max(_a - _b, 0)`. + /// + /// @param[in] _a Minuend. + /// @param[in] _b Subtrahend. + /// + /// @returns Per-lane saturated difference. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u16_satsub(Ty _a, Ty _b); + + /// Bitwise AND: `_a & _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Bitwise AND of all bits. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_and(Ty _a, Ty _b); + + /// Bitwise OR: `_a | _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Bitwise OR of all bits. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_or(Ty _a, Ty _b); + + /// Bitwise XOR: `_a ^ _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Bitwise XOR of all bits. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_xor(Ty _a, Ty _b); + + /// Per-lane negative multiply-subtract: `_c - _a * _b`. + /// + /// @param[in] _a Multiplicand. + /// @param[in] _b Multiplier. + /// @param[in] _c Value to subtract from. + /// + /// @returns Per-lane result of `_c - _a * _b`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_nmsub(Ty _a, Ty _b, Ty _c); + + /// Per-lane f32 negate: `-_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane negation. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_neg(Ty _a); + + /// Per-lane f32 absolute value: `abs(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane absolute value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_abs(Ty _a); + + /// Per-lane f32 reciprocal estimate: `~1/_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane approximate reciprocal. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_rcp_est(Ty _a); + + /// Per-lane f32 reciprocal square root estimate: `~1/sqrt(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane approximate reciprocal square root. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_rsqrt_est(Ty _a); + + /// Per-lane f32 square root: `sqrt(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane square root. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_sqrt(Ty _a); + + /// Per-lane f32 reciprocal square root: `1/sqrt(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane reciprocal square root. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_rsqrt(Ty _a); + + /// Per-lane f32 divide via Newton-Raphson: `_a / _b`. + /// + /// @param[in] _a Dividend. + /// @param[in] _b Divisor. + /// + /// @returns Per-lane quotient (NR-refined). + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_div_nr(Ty _a, Ty _b); + + /// Per-lane f32 square root via Newton-Raphson: `sqrt(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane square root (NR-refined). + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_sqrt_nr(Ty _a); + + /// Per-lane f32 square root via Newton-Raphson (1 iteration). + /// + /// @param[in] _a Input register. 
+ /// + /// @returns Per-lane square root (1 NR iteration). + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_sqrt_nr1(Ty _a); + + /// Per-lane f32 reciprocal square root via Newton-Raphson. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane reciprocal square root (NR-refined). + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_rsqrt_nr(Ty _a); + + /// Per-lane f32 reciprocal square root via Carmack's method. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane reciprocal square root (Carmack). + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_rsqrt_carmack(Ty _a); + + /// Per-lane f32 compare equal. Lanes set to all-ones if equal, zero otherwise. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_cmpeq(Ty _a, Ty _b); + + /// Per-lane f32 compare less-than. Lanes set to all-ones if `_a < _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_cmplt(Ty _a, Ty _b); + + /// Per-lane f32 compare greater-than. Lanes set to all-ones if `_a > _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_cmpgt(Ty _a, Ty _b); + + /// Per-lane f32 compare not-equal. Lanes set to all-ones if `_a != _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. 
+ /// + template + Ty simd_f32_cmpneq(Ty _a, Ty _b); + + /// Per-lane f32 compare less-or-equal. Lanes set to all-ones if `_a <= _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_cmple(Ty _a, Ty _b); + + /// Per-lane f32 compare greater-or-equal. Lanes set to all-ones if `_a >= _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_cmpge(Ty _a, Ty _b); + + /// Per-lane f32 clamp: `min(max(_a, _min), _max)`. + /// + /// @param[in] _a Value to clamp. + /// @param[in] _min Minimum bound. + /// @param[in] _max Maximum bound. + /// + /// @returns Per-lane clamped value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_clamp(Ty _a, Ty _min, Ty _max); + + /// Per-lane f32 linear interpolation: `_a + (_b - _a) * _s`. + /// + /// @param[in] _a Start value. + /// @param[in] _b End value. + /// @param[in] _s Interpolation factor. + /// + /// @returns Per-lane interpolated value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_lerp(Ty _a, Ty _b, Ty _s); + + /// Per-lane f32 reciprocal: `1/_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane reciprocal. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_rcp(Ty _a); + + /// Per-lane f32 round to nearest integer. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane rounded value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_round(Ty _a); + + /// Per-lane f32 ceiling. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane ceiling value. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_ceil(Ty _a); + + /// Per-lane f32 floor. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane floor value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_floor(Ty _a); + + /// Per-lane f32 base-2 logarithm: `log2(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane log2 value. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_log2(Ty _a); + + /// Per-lane f32 base-2 exponential: `2^_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane 2^_a. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_exp2(Ty _a); + + /// Per-lane f32 power: `_a^_b`. + /// + /// @param[in] _a Base. + /// @param[in] _b Exponent. + /// + /// @returns Per-lane power. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_pow(Ty _a, Ty _b); + + /// Per-lane f32 cosine. + /// + /// @param[in] _a Angle in radians. + /// + /// @returns Per-lane cosine. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_cos(Ty _a); + + /// Per-lane f32 sine. + /// + /// @param[in] _a Angle in radians. + /// + /// @returns Per-lane sine. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_sin(Ty _a); + + /// Per-lane f32 natural logarithm: `ln(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane natural logarithm. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_log(Ty _a); + + /// Per-lane f32 natural exponential: `e^_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane e^_a. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_exp(Ty _a); + + /// Per-lane f32 ldexp: `_a * 2^_b`. + /// + /// @param[in] _a Significand. + /// @param[in] _b Exponent (as float). 
+ /// + /// @returns Per-lane result of `_a * 2^_b`. + /// + /// @remark Widths: simd128, simd256. + /// + template + Ty simd_f32_ldexp(Ty _a, Ty _b); + + /// Per-lane f32 to i32 conversion with truncation toward zero. + /// + /// @param[in] _a Input f32 register. + /// + /// @returns Per-lane integer (as bit pattern). + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_ftoi_trunc(Ty _a); + + /// Per-lane f32 to i32 conversion with round-to-nearest (ties to even). + /// + /// @param[in] _a Input f32 register. + /// + /// @returns Per-lane integer (as bit pattern). + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_f32_ftoi_round(Ty _a); + + /// Per-lane i32 to f32 conversion. + /// + /// @param[in] _a Input i32 register. + /// + /// @returns Per-lane float value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_itof(Ty _a); + + /// Per-lane f64 add: `_a + _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane sum. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_add(Ty _a, Ty _b); + + /// Per-lane f64 subtract: `_a - _b`. + /// + /// @param[in] _a Minuend. + /// @param[in] _b Subtrahend. + /// + /// @returns Per-lane difference. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_sub(Ty _a, Ty _b); + + /// Per-lane f64 multiply: `_a * _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane product. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_mul(Ty _a, Ty _b); + + /// Per-lane f64 divide: `_a / _b`. + /// + /// @param[in] _a Dividend. + /// @param[in] _b Divisor. + /// + /// @returns Per-lane quotient. + /// + /// @remark Widths: simd64, simd128, simd256. 
+ /// + template + Ty simd_f64_div(Ty _a, Ty _b); + + /// Per-lane f64 fused multiply-add: `_a * _b + _c`. + /// + /// @param[in] _a Multiplicand. + /// @param[in] _b Multiplier. + /// @param[in] _c Addend. + /// + /// @returns Per-lane result of `_a * _b + _c`. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_madd(Ty _a, Ty _b, Ty _c); + + /// Per-lane f64 negative multiply-subtract: `_c - _a * _b`. + /// + /// @param[in] _a Multiplicand. + /// @param[in] _b Multiplier. + /// @param[in] _c Value to subtract from. + /// + /// @returns Per-lane result of `_c - _a * _b`. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_nmsub(Ty _a, Ty _b, Ty _c); + + /// Per-lane f64 negate: `-_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane negation. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_neg(Ty _a); + + /// Per-lane f64 absolute value: `abs(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane absolute value. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_abs(Ty _a); + + /// Per-lane f64 minimum: `min(_a, _b)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane minimum. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_min(Ty _a, Ty _b); + + /// Per-lane f64 maximum: `max(_a, _b)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane maximum. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_max(Ty _a, Ty _b); + + /// Per-lane f64 clamp: `min(max(_a, _min), _max)`. + /// + /// @param[in] _a Value to clamp. + /// @param[in] _min Minimum bound. + /// @param[in] _max Maximum bound. + /// + /// @returns Per-lane clamped value. + /// + /// @remark Widths: simd64, simd128, simd256. 
+ /// + template + Ty simd_f64_clamp(Ty _a, Ty _min, Ty _max); + + /// Per-lane f64 linear interpolation: `_a + (_b - _a) * _s`. + /// + /// @param[in] _a Start value. + /// @param[in] _b End value. + /// @param[in] _s Interpolation factor. + /// + /// @returns Per-lane interpolated value. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_lerp(Ty _a, Ty _b, Ty _s); + + /// Per-lane f64 reciprocal: `1/_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane reciprocal. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_rcp(Ty _a); + + /// Per-lane f64 square root: `sqrt(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane square root. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_sqrt(Ty _a); + + /// Per-lane f64 reciprocal square root: `1/sqrt(_a)`. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane reciprocal square root. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_rsqrt(Ty _a); + + /// Per-lane f64 round to nearest integer. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane rounded value. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_round(Ty _a); + + /// Per-lane f64 ceiling. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane ceiling value. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_ceil(Ty _a); + + /// Per-lane f64 floor. + /// + /// @param[in] _a Input register. + /// + /// @returns Per-lane floor value. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_floor(Ty _a); + + /// Per-lane f64 compare equal. Lanes set to all-ones if equal. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. 
+ /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_cmpeq(Ty _a, Ty _b); + + /// Per-lane f64 compare not-equal. Lanes set to all-ones if `_a != _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_cmpneq(Ty _a, Ty _b); + + /// Per-lane f64 compare less-than. Lanes set to all-ones if `_a < _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_cmplt(Ty _a, Ty _b); + + /// Per-lane f64 compare less-or-equal. Lanes set to all-ones if `_a <= _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_cmple(Ty _a, Ty _b); + + /// Per-lane f64 compare greater-than. Lanes set to all-ones if `_a > _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_cmpgt(Ty _a, Ty _b); + + /// Per-lane f64 compare greater-or-equal. Lanes set to all-ones if `_a >= _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_f64_cmpge(Ty _a, Ty _b); + + /// Per-lane i32 compare less-than. Lanes set to all-ones if `_a < _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. 
+ /// + template + Ty simd_i32_cmplt(Ty _a, Ty _b); + + /// Per-lane i32 compare greater-than. Lanes set to all-ones if `_a > _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_cmpgt(Ty _a, Ty _b); + + /// Per-lane i32 minimum: `min(_a, _b)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane minimum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_min(Ty _a, Ty _b); + + /// Per-lane i32 maximum: `max(_a, _b)`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane maximum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_max(Ty _a, Ty _b); + + /// Per-lane i32 clamp: `min(max(_a, _min), _max)`. + /// + /// @param[in] _a Value to clamp. + /// @param[in] _min Minimum bound. + /// @param[in] _max Maximum bound. + /// + /// @returns Per-lane clamped value. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_clamp(Ty _a, Ty _min, Ty _max); + + /// Per-lane i32 compare equal. Lanes set to all-ones if equal. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_i32_cmpeq(Ty _a, Ty _b); + + /// Per-lane u32 add: `_a + _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane sum. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u32_add(Ty _a, Ty _b); + + /// Per-lane u32 subtract: `_a - _b`. + /// + /// @param[in] _a Minuend. + /// @param[in] _b Subtrahend. + /// + /// @returns Per-lane difference. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u32_sub(Ty _a, Ty _b); + + /// Per-lane u32 compare less-than. Lanes set to all-ones if `_a < _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u32_cmplt(Ty _a, Ty _b); + + /// Per-lane u32 compare greater-than. Lanes set to all-ones if `_a > _b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand. + /// + /// @returns Per-lane comparison mask. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_u32_cmpgt(Ty _a, Ty _b); + + /// Broadcast float value to all lanes. + /// + /// @param[in] _a Value to broadcast. + /// + /// @returns Register with all lanes set to `_a`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_splat(float _a); + + /// Broadcast uint32 value to all 32-bit lanes. + /// + /// @param[in] _a Value to broadcast. + /// + /// @returns Register with all lanes set to `_a`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_splat(uint32_t _a); + + /// Broadcast double value to all 64-bit lanes. + /// + /// @param[in] _a Value to broadcast. + /// + /// @returns Register with all 64-bit lanes set to `_a`. + /// + /// @remark Widths: simd64, simd128, simd256. + /// + template + Ty simd_splat(double _a); + + /// Zero all lanes. + /// + /// @returns Register with all bits zero. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_zero(); + + /// Select by bit mask: per-bit `_mask ? _a : _b`. + /// + /// @param[in] _mask Bit mask. + /// @param[in] _a Value selected where mask bits are 1. + /// @param[in] _b Value selected where mask bits are 0. + /// + /// @returns Per-bit blend of `_a` and `_b`. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_selb(Ty _mask, Ty _a, Ty _b); + + /// Select by sign: per-lane `_test < 0 ? _a : _b`. + /// + /// @param[in] _test Sign test value. + /// @param[in] _a Value selected where sign bit is 1. + /// @param[in] _b Value selected where sign bit is 0. + /// + /// @returns Per-lane selection based on sign. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_sels(Ty _test, Ty _a, Ty _b); + + /// Test if any lane sign bit is set. + /// + /// @param[in] _test Input register. + /// + /// @returns True if any lane has sign bit set. + /// + /// @remark Widths: simd128, simd256. + /// + template + bool simd_test_any(Ty _test); + + /// Test if all lane sign bits are set. + /// + /// @param[in] _test Input register. + /// + /// @returns True if all lanes have sign bit set. + /// + /// @remark Widths: simd128, simd256. + /// + template + bool simd_test_all(Ty _test); + + /// Test if `(_a & _b)` is all-zero bits. + /// + /// @param[in] _a First input register. + /// @param[in] _b Second input register. + /// + /// @returns True if every bit of `(_a & _b)` is zero. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + bool simd_test_zero(Ty _a, Ty _b); + + /// Bitwise NOT: `~_a`. + /// + /// @param[in] _a Input register. + /// + /// @returns Bitwise complement. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_not(Ty _a); + + /// Bitwise AND-complement: `_a & ~_b`. + /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand (complemented). + /// + /// @returns Bitwise `_a & ~_b`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_andc(Ty _a, Ty _b); + + /// Per-lane 32-bit shift right arithmetic. + /// + /// @param[in] _a Input register. + /// @param[in] _count Number of bits to shift. + /// + /// @returns Per-lane arithmetic right shift. 
+ /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x32_sra(Ty _a, int _count); + + /// Per-lane 32-bit shift right arithmetic with per-lane variable count. + /// + /// @param[in] _a Input register. + /// @param[in] _count Per-lane shift counts. + /// + /// @returns Per-lane arithmetic right shift, where each lane is shifted by the + /// matching lane in `_count`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x32_sra(Ty _a, Ty _count); + + /// Per-lane 32-bit shift right logical. + /// + /// @param[in] _a Input register. + /// @param[in] _count Number of bits to shift. + /// + /// @returns Per-lane logical right shift. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x32_srl(Ty _a, int _count); + + /// Per-lane 32-bit shift right logical with per-lane variable count. + /// + /// @param[in] _a Input register. + /// @param[in] _count Per-lane shift counts. + /// + /// @returns Per-lane logical right shift, where each lane is shifted by the + /// matching lane in `_count`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x32_srl(Ty _a, Ty _count); + + /// Per-lane 32-bit shift left logical. + /// + /// @param[in] _a Input register. + /// @param[in] _count Number of bits to shift. + /// + /// @returns Per-lane logical left shift. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x32_sll(Ty _a, int _count); + + /// Per-lane 32-bit shift left logical with per-lane variable count. + /// + /// @param[in] _a Input register. + /// @param[in] _count Per-lane shift counts. + /// + /// @returns Per-lane logical left shift, where each lane is shifted by the + /// matching lane in `_count`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_x32_sll(Ty _a, Ty _count); + + /// Bitwise OR-complement: `_a | ~_b`. 
+ /// + /// @param[in] _a First operand. + /// @param[in] _b Second operand (complemented). + /// + /// @returns Bitwise `_a | ~_b`. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_orc(Ty _a, Ty _b); + + /// Load from aligned memory. + /// + /// @param[in] _ptr Pointer to aligned source data. + /// + /// @returns Register loaded from memory. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_ld(const void* _ptr); + + /// Load from unaligned memory. + /// + /// @param[in] _ptr Pointer to source data (any alignment). + /// + /// @returns Register loaded from memory. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + Ty simd_ldu(const void* _ptr); + + /// Store to aligned memory. + /// + /// @param[out] _ptr Pointer to aligned destination. + /// @param[in] _a Register to store. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + void simd_st(void* _ptr, Ty _a); + + /// Store to unaligned memory. + /// + /// @param[out] _ptr Pointer to destination (any alignment). + /// @param[in] _a Register to store. + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + void simd_stu(void* _ptr, Ty _a); + + /// Store lowest 32-bit element to memory. + /// + /// @param[out] _ptr Pointer to destination. + /// @param[in] _a Register (lowest lane stored). + /// + /// @remark Widths: simd32, simd64, simd128, simd256. + /// + template + void simd_x32_st1(void* _ptr, Ty _a); } // namespace bx -#include "inline/simd128_ref.inl" -#include "inline/simd256_ref.inl" - -#include "inline/simd_ni.inl" +#include "inline/simd_impl.inl" #endif // BX_SIMD_T_H_HEADER_GUARD diff --git a/include/bx/uint32_t.h b/include/bx/uint32_t.h deleted file mode 100644 index c6380bd..0000000 --- a/include/bx/uint32_t.h +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright 2010-2026 Branimir Karadzic. All rights reserved. 
- * License: https://github.com/bkaradzic/bx/blob/master/LICENSE - */ - -#ifndef BX_UINT32_T_H_HEADER_GUARD -#define BX_UINT32_T_H_HEADER_GUARD - -#include "bx.h" - -namespace bx -{ - constexpr uint16_t kHalfFloatZero = UINT16_C(0); - constexpr uint16_t kHalfFloatHalf = UINT16_C(0x3800); - constexpr uint16_t kHalfFloatOne = UINT16_C(0x3c00); - constexpr uint16_t kHalfFloatTwo = UINT16_C(0x4000); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_li(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_dec(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_inc(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_not(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_neg(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_ext(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_and(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_andc(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_xor(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_xorl(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_or(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_orc(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_sll(uint32_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_srl(uint32_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_sra(uint32_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_rol(uint32_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_ror(uint32_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_add(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_sub(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_mul(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_div(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_mod(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC 
uint32_t uint32_cmpeq(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cmpneq(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cmplt(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cmple(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cmpgt(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cmpge(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_setnz(uint32_t _a); - - /// - template - BX_CONSTEXPR_FUNC uint32_t uint32_splat(Ty _val); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_satadd(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_satsub(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_satmul(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_sels(uint32_t test, uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_selb(uint32_t _mask, uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_imin(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_imax(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_min(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_min(uint32_t _a, uint32_t _b, uint32_t _c); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_max(uint32_t _a, uint32_t _b); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_max(uint32_t _a, uint32_t _b, uint32_t _c); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_clamp(uint32_t _a, uint32_t _min, uint32_t _max); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_iclamp(uint32_t _a, uint32_t _min, uint32_t _max); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_incwrap(uint32_t _val, uint32_t _min, uint32_t _max); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_decwrap(uint32_t _val, uint32_t _min, uint32_t _max); - - /// Count number of bits set. - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cntbits(uint32_t _val); - - /// Count number of leading zeros. 
- /// - BX_CONSTEXPR_FUNC uint32_t uint32_cntlz(uint32_t _val); - - /// Count number of trailing zeros. - /// - BX_CONSTEXPR_FUNC uint32_t uint32_cnttz(uint32_t _val); - - /// Find first set. - /// - BX_CONSTEXPR_FUNC uint32_t uint32_ffs(uint32_t _val); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_part1by1(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_part1by2(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_testpow2(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint32_t uint32_nextpow2(uint32_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_li(uint64_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_dec(uint64_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_inc(uint64_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_not(uint64_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_neg(uint64_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_ext(uint64_t _a); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_and(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_andc(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_xor(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_xorl(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_or(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_orc(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_sll(uint64_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_srl(uint64_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_sra(uint64_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_rol(uint64_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_ror(uint64_t _a, int32_t _sa); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_add(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_sub(uint64_t _a, uint64_t _b); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_mul(uint64_t _a, uint64_t _b); - - /// - template - BX_CONSTEXPR_FUNC uint64_t 
uint64_splat(Ty _val); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_cntbits(uint64_t _val); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_cntlz(uint64_t _val); - - /// - BX_CONSTEXPR_FUNC uint64_t uint64_cnttz(uint64_t _val); - - /// Greatest common divisor. - /// - BX_CONSTEXPR_FUNC uint32_t uint32_gcd(uint32_t _a, uint32_t _b); - - /// Least common multiple. - /// - BX_CONSTEXPR_FUNC uint32_t uint32_lcm(uint32_t _a, uint32_t _b); - - /// Align to arbitrary stride. - /// - BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride); - - /// Align to arbitrary stride and Min bytes. - /// - template - BX_CONSTEXPR_FUNC uint32_t strideAlign(uint32_t _offset, uint32_t _stride); - - /// - template - BX_CONSTEXPR_FUNC bool isAligned(Ty _a, size_t _align); - - /// - template<> - BX_CONSTEXPR_FUNC bool isAligned(const void* _ptr, size_t _align); - - /// - template - BX_CONSTEXPR_FUNC Ty alignDown(Ty _a, size_t _align); - - /// - template - BX_CONSTEXPR_FUNC Ty* alignDown(Ty* _ptr, size_t _align); - - /// - template - BX_CONSTEXPR_FUNC const Ty* alignDown(const Ty* _ptr, size_t _align); - - /// - template - BX_CONSTEXPR_FUNC Ty alignUp(Ty _a, size_t _align); - - /// - template - BX_CONSTEXPR_FUNC Ty* alignUp(Ty* _ptr, size_t _align); - - /// - template - BX_CONSTEXPR_FUNC const Ty* alignUp(const Ty* _ptr, size_t _align); - - /// Convert float to half-float. - /// - BX_CONST_FUNC uint16_t halfFromFloat(float _a); - - /// Convert half-float to float. 
- /// - BX_CONST_FUNC float halfToFloat(uint16_t _a); - -} // namespace bx - -#include "inline/uint32_t.inl" - -#endif // BX_UINT32_T_H_HEADER_GUARD diff --git a/scripts/toolchain.lua b/scripts/toolchain.lua index f9b5411..e88aa76 100644 --- a/scripts/toolchain.lua +++ b/scripts/toolchain.lua @@ -878,6 +878,10 @@ function toolchain(_buildDir, _libDir) "-Wundef", } + buildoptions { + "-msimd128", + } + linkoptions { "-s MAX_WEBGL_VERSION=2", } diff --git a/src/debug.cpp b/src/debug.cpp index 37299aa..4c9e08a 100644 --- a/src/debug.cpp +++ b/src/debug.cpp @@ -156,7 +156,7 @@ namespace bx char temp[4096]; while (0 != size) { - uint32_t len = uint32_min(sizeof(temp)-1, size); + uint32_t len = uint32_t(min(sizeof(temp)-1, size) ); memCopy(temp, data, len); temp[len] = '\0'; data += len; diff --git a/src/dtoa.cpp b/src/dtoa.cpp index a9099cb..1d66d8c 100644 --- a/src/dtoa.cpp +++ b/src/dtoa.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace bx { diff --git a/src/hash.cpp b/src/hash.cpp index 901ad29..bd43e47 100644 --- a/src/hash.cpp +++ b/src/hash.cpp @@ -4,6 +4,7 @@ */ #include +#include namespace bx { @@ -333,7 +334,7 @@ struct HashMurmur3Pod BX_FORCE_INLINE void mix1(uint32_t _k) { _k *= kMurmur3Mul1; - _k = uint32_rol(_k, 15); + _k = simd32_x32_rol(simd32_splat(_k), 15).u32; _k *= kMurmur3Mul2; m_hash ^= _k; @@ -343,7 +344,7 @@ struct HashMurmur3Pod { mix1(_k); - m_hash = uint32_rol(m_hash, 13); + m_hash = simd32_x32_rol(simd32_splat(m_hash), 13).u32; m_hash = m_hash*5 + kMurmur3Add; } @@ -407,7 +408,7 @@ struct HashMurmur3_64Pod BX_FORCE_INLINE void mix1(uint64_t _k) { _k *= kMurmur3Mul1; - _k = uint64_rol(_k, 31); + _k = simd64_x64_rol(simd64_splat(_k), 31).u64; _k *= kMurmur3Mul2; m_hash[0] ^= _k; @@ -416,7 +417,7 @@ struct HashMurmur3_64Pod BX_FORCE_INLINE void mix2(uint64_t _k) { _k *= kMurmur3Mul2; - _k = uint64_rol(_k, 33); + _k = simd64_x64_rol(simd64_splat(_k), 33).u64; _k *= kMurmur3Mul1; m_hash[1] ^= _k; @@ -426,13 +427,13 @@ struct 
HashMurmur3_64Pod { mix1(_k1); - m_hash[0] = uint64_rol(m_hash[0], 27); + m_hash[0] = simd64_x64_rol(simd64_splat(m_hash[0]), 27).u64; m_hash[0] += m_hash[1]; m_hash[0] = m_hash[0]*5 + kMurmur3Add1; mix2(_k2); - m_hash[1] = uint64_rol(m_hash[1], 31); + m_hash[1] = simd64_x64_rol(simd64_splat(m_hash[1]), 31).u64; m_hash[1] += m_hash[0]; m_hash[1] = m_hash[1]*5 + kMurmur3Add2; } diff --git a/src/math.cpp b/src/math.cpp index bc9416d..4c7e742 100644 --- a/src/math.cpp +++ b/src/math.cpp @@ -4,23 +4,24 @@ */ #include -#include - #include namespace bx { float frexp(float _a, int32_t* _outExp) { - const uint32_t ftob = floatToBits(_a); - const uint32_t masked0 = uint32_and(ftob, kFloatExponentMask); - const uint32_t exp0 = uint32_srl(masked0, kFloatExponentBitShift); + const simd32_t expMask = simd32_splat(kFloatExponentMask); + const simd32_t ftob = simd32_splat(_a); + const simd32_t masked0 = simd_and(ftob, expMask); + const simd32_t exp0 = simd_x32_srl(masked0, kFloatExponentBitShift); - const uint32_t masked1 = uint32_and(ftob, kFloatSignMask | kFloatMantissaMask); - const uint32_t bits = uint32_or(masked1, UINT32_C(0x3f000000) ); - const float result = bitsToFloat(bits); + const simd32_t sMantMask = simd32_splat(kFloatSignMask | kFloatMantissaMask); + const simd32_t masked1 = simd_and(ftob, sMantMask); + const simd32_t half = simd32_splat(0x3f000000u); + const simd32_t bits = simd_or(masked1, half); + const float result = bitsToFloat(bits.u32); - *_outExp = int32_t(exp0 - 0x7e); + *_outExp = int32_t(exp0.u32 - 0x7e); return result; } diff --git a/src/os.cpp b/src/os.cpp index 9ccaf92..23c3e7c 100644 --- a/src/os.cpp +++ b/src/os.cpp @@ -5,7 +5,6 @@ #include #include -#include #if BX_CRT_MSVC # include @@ -332,7 +331,7 @@ namespace bx int32_t len = 0; for(uint32_t ii = 0; NULL != _argv[ii]; ++ii) { - len += snprintf(&temp[len], uint32_imax(0, total-len) + len += snprintf(&temp[len], bx::max(0, total-len) , "%s " , _argv[ii] ); diff --git a/tests/bx_test.cpp 
b/tests/bx_test.cpp new file mode 100644 index 0000000..9781707 --- /dev/null +++ b/tests/bx_test.cpp @@ -0,0 +1,84 @@ +/* + * Copyright 2010-2026 Branimir Karadzic. All rights reserved. + * License: https://github.com/bkaradzic/bx/blob/master/LICENSE + */ + +#include "test.h" +#include + +TEST_CASE("StrideAlign", "[bx]") +{ + REQUIRE(0 == bx::strideAlign(0, 12) ); + for (uint32_t ii = 0; ii < 12; ++ii) + { + REQUIRE(12 == bx::strideAlign(ii+1, 12) ); + } + + REQUIRE(0 == bx::strideAlign<16>(0, 12) ); + for (uint32_t ii = 0; ii < 12; ++ii) + { + REQUIRE(48 == bx::strideAlign<16>(ii+1, 12) ); + } + + uint32_t offset = 11; + offset = bx::strideAlign(offset, 32); + REQUIRE(offset == 32); + + offset = bx::strideAlign(offset, 24); + REQUIRE(offset == 48); +} + +TEST_CASE("gcd", "[bx]") +{ + REQUIRE(1 == bx::gcd(13u, 89u) ); + REQUIRE(3 == bx::gcd( 3u, 9u) ); + REQUIRE(8 == bx::gcd( 8u, 64u) ); + REQUIRE(9 == bx::gcd(18u, 81u) ); +} + +TEST_CASE("lcm", "[bx]") +{ + REQUIRE(1157 == bx::lcm(13u, 89u) ); + REQUIRE( 9 == bx::lcm( 3u, 9u) ); + REQUIRE( 48 == bx::lcm( 6u, 16u) ); + REQUIRE( 80 == bx::lcm(16u, 20u) ); +} + +TEST_CASE("align", "[bx]") +{ + REQUIRE( bx::isAligned(0, 8) ); + REQUIRE(!bx::isAligned(7, 8) ); + REQUIRE( bx::isAligned(64, 8) ); + REQUIRE(!bx::isAligned(63, 8) ); + + for (int32_t ii = 0; ii < 1024; ++ii) + { + REQUIRE(bx::isAligned(ii, 0) ); + REQUIRE(ii == bx::alignUp(ii, 0) ); + REQUIRE(ii == bx::alignDown(ii, 0) ); + } + + REQUIRE( 0 == bx::alignUp( 0, 16) ); + REQUIRE( 16 == bx::alignUp( 1, 16) ); + REQUIRE( 16 == bx::alignUp( 15, 16) ); + REQUIRE( 16 == bx::alignUp( 16, 16) ); + REQUIRE(256 == bx::alignUp(255, 16) ); + REQUIRE( 0 == bx::alignUp(-1, 16) ); + REQUIRE(-16 == bx::alignUp(-31, 16) ); + + REQUIRE( 0 == bx::alignUp( 0, 256) ); + REQUIRE(256 == bx::alignUp( 1, 256) ); + REQUIRE(256 == bx::alignUp( 15, 256) ); + REQUIRE(256 == bx::alignUp(255, 256) ); + REQUIRE(256 == bx::alignUp(256, 256) ); + REQUIRE(256 == bx::alignUp(256, 256) ); + 
REQUIRE(512 == bx::alignUp(511, 256) ); + + REQUIRE( 0 == bx::alignDown( 0, 16) ); + REQUIRE( 0 == bx::alignDown( 1, 16) ); + REQUIRE( 0 == bx::alignDown( 15, 16) ); + REQUIRE( 16 == bx::alignDown( 16, 16) ); + REQUIRE(240 == bx::alignDown(255, 16) ); + REQUIRE(-16 == bx::alignDown(-1, 16) ); + REQUIRE(-32 == bx::alignDown(-31, 16) ); +} diff --git a/tests/math_test.cpp b/tests/math_test.cpp index 1e0ff2c..da00d3b 100644 --- a/tests/math_test.cpp +++ b/tests/math_test.cpp @@ -111,7 +111,7 @@ TEST_CASE("ceilLog2", "[math]") for (uint32_t ii = 1; ii < INT32_MAX; ii += rand()%(1<<13)+1) { - REQUIRE(bx::nextPow2(ii) == bx::uint32_nextpow2(ii) ); + REQUIRE(bx::nextPow2(ii) == bx::simd32_u32_nextpow2(bx::simd32_splat(ii)).u32 ); } } @@ -226,6 +226,79 @@ TEST_CASE("countBits", "[math]") STATIC_REQUIRE(64 == bx::countBits(UINT64_MAX) ); } +TEST_CASE("satAdd", "[math]") +{ + STATIC_REQUIRE( 0 == bx::satAdd(0, 0) ); + STATIC_REQUIRE(200 == bx::satAdd(100, 100) ); + STATIC_REQUIRE(255 == bx::satAdd(UINT8_MAX, 0) ); + STATIC_REQUIRE(255 == bx::satAdd(UINT8_MAX, 1) ); + STATIC_REQUIRE(255 == bx::satAdd(UINT8_MAX, 10) ); + STATIC_REQUIRE(255 == bx::satAdd(254, 254) ); + STATIC_REQUIRE(255 == bx::satAdd(200, 100) ); + + STATIC_REQUIRE(UINT16_MAX == bx::satAdd(UINT16_MAX, 0) ); + STATIC_REQUIRE(UINT16_MAX == bx::satAdd(UINT16_MAX, 1) ); + STATIC_REQUIRE(UINT16_MAX == bx::satAdd(65530, 10) ); + STATIC_REQUIRE(uint16_t(65534)== bx::satAdd(65530, 4) ); + + STATIC_REQUIRE(UINT32_MAX == bx::satAdd(UINT32_MAX, 0) ); + STATIC_REQUIRE(UINT32_MAX == bx::satAdd(UINT32_MAX, 1) ); + STATIC_REQUIRE(UINT32_MAX == bx::satAdd(UINT32_MAX-1, 10) ); + + STATIC_REQUIRE(UINT64_MAX == bx::satAdd(UINT64_MAX, 1) ); + STATIC_REQUIRE(UINT64_MAX == bx::satAdd(UINT64_MAX-1, 10) ); + + // signed + STATIC_REQUIRE( 127 == bx::satAdd( 127, 1) ); + STATIC_REQUIRE( 127 == bx::satAdd( 100, 100) ); + STATIC_REQUIRE(-128 == bx::satAdd(-128, -1) ); + STATIC_REQUIRE(-128 == bx::satAdd(-100,-100) ); + STATIC_REQUIRE( -1 
== bx::satAdd( 127,-128) ); + + STATIC_REQUIRE(INT32_MAX == bx::satAdd(INT32_MAX, 1) ); + STATIC_REQUIRE(INT32_MAX == bx::satAdd(INT32_MAX, INT32_MAX) ); + STATIC_REQUIRE(INT32_MIN == bx::satAdd(INT32_MIN, -1) ); + STATIC_REQUIRE(INT32_MIN == bx::satAdd(INT32_MIN, INT32_MIN) ); + STATIC_REQUIRE( -1 == bx::satAdd(INT32_MAX, INT32_MIN) ); + + STATIC_REQUIRE(INT64_MAX == bx::satAdd(INT64_MAX, 1) ); + STATIC_REQUIRE(INT64_MIN == bx::satAdd(INT64_MIN, -1) ); +} + +TEST_CASE("satSub", "[math]") +{ + STATIC_REQUIRE( 0 == bx::satSub(0, 0) ); + STATIC_REQUIRE( 0 == bx::satSub(10, 20) ); + STATIC_REQUIRE( 0 == bx::satSub(0, UINT8_MAX) ); + STATIC_REQUIRE( 10 == bx::satSub(20, 10) ); + STATIC_REQUIRE(UINT8_MAX == bx::satSub(UINT8_MAX, 0) ); + + STATIC_REQUIRE( 0 == bx::satSub(10, 20) ); + STATIC_REQUIRE( 0 == bx::satSub(0, UINT16_MAX) ); + STATIC_REQUIRE(UINT16_MAX == bx::satSub(UINT16_MAX, 0) ); + + STATIC_REQUIRE( 0 == bx::satSub(10, 20) ); + STATIC_REQUIRE( 0 == bx::satSub(0, UINT32_MAX) ); + STATIC_REQUIRE(UINT32_MAX == bx::satSub(UINT32_MAX, 0) ); + + STATIC_REQUIRE( 0 == bx::satSub(10, 20) ); + STATIC_REQUIRE(UINT64_MAX == bx::satSub(UINT64_MAX, 0) ); + + // signed + STATIC_REQUIRE(-128 == bx::satSub(-128, 1) ); + STATIC_REQUIRE( 127 == bx::satSub( 127, -1) ); + STATIC_REQUIRE( 127 == bx::satSub( 0,-128) ); + + STATIC_REQUIRE(INT32_MIN == bx::satSub(INT32_MIN, 1) ); + STATIC_REQUIRE(INT32_MIN == bx::satSub(INT32_MIN, INT32_MAX) ); + STATIC_REQUIRE(INT32_MAX == bx::satSub( 0, INT32_MIN) ); + STATIC_REQUIRE(INT32_MAX == bx::satSub(INT32_MAX, -1) ); + STATIC_REQUIRE(INT32_MAX == bx::satSub(INT32_MAX, INT32_MIN) ); + + STATIC_REQUIRE(INT64_MIN == bx::satSub(INT64_MIN, 1) ); + STATIC_REQUIRE(INT64_MAX == bx::satSub( 0, INT64_MIN) ); +} + template static void testFindFirstSet() { diff --git a/tests/simd32_test.cpp b/tests/simd32_test.cpp new file mode 100644 index 0000000..cab4751 --- /dev/null +++ b/tests/simd32_test.cpp @@ -0,0 +1,95 @@ +/* + * Copyright 2010-2026 Branimir 
Karadzic. All rights reserved. + * License: https://github.com/bkaradzic/bx/blob/master/LICENSE + */ + +#include "test.h" +#include + +TEST_CASE("uint32_part", "[simd_t]") +{ + REQUIRE(UINT32_C(0x55555555) == bx::simd32_x32_part1by1(bx::simd32_splat(uint32_t(UINT16_MAX))).u32 ); + REQUIRE(UINT32_C(0x09249249) == bx::simd32_x32_part1by2(bx::simd32_splat(uint32_t(0x3ff))).u32 ); +} + +TEST_CASE("uint32_splat", "[simd_t]") +{ + REQUIRE(UINT32_C(0x01010101) == bx::simd32_splat(uint8_t(0x01)).u32 ); + REQUIRE(UINT32_C(0x55555555) == bx::simd32_splat(uint8_t(0x55)).u32 ); + REQUIRE(UINT32_C(0x13891389) == bx::simd32_splat(uint16_t(0x1389)).u32 ); +} + +TEST_CASE("uint64_splat", "[simd_t]") /* 64-bit expectations must be spelled with UINT64_C; UINT32_C is only specified for values that fit uint_least32_t. */ +{ + REQUIRE(UINT64_C(0x0101010101010101) == bx::simd64_splat(uint8_t(0x01)).u64 ); + REQUIRE(UINT64_C(0x5555555555555555) == bx::simd64_splat(uint8_t(0x55)).u64 ); + REQUIRE(UINT64_C(0x1389138913891389) == bx::simd64_splat(uint16_t(0x1389)).u64 ); + REQUIRE(UINT64_C(0x1506138915061389) == bx::simd64_splat(uint32_t(0x15061389)).u64 ); +} + +TEST_CASE("uint32_gcd", "[simd_t]") +{ + REQUIRE(1 == bx::simd32_u32_gcd(bx::simd32_splat(uint32_t(13)), bx::simd32_splat(uint32_t(89))).u32 ); + REQUIRE(3 == bx::simd32_u32_gcd(bx::simd32_splat(uint32_t( 3)), bx::simd32_splat(uint32_t( 9))).u32 ); + REQUIRE(8 == bx::simd32_u32_gcd(bx::simd32_splat(uint32_t( 8)), bx::simd32_splat(uint32_t(64))).u32 ); + REQUIRE(9 == bx::simd32_u32_gcd(bx::simd32_splat(uint32_t(18)), bx::simd32_splat(uint32_t(81))).u32 ); +} + +TEST_CASE("uint32_lcm", "[simd_t]") +{ + REQUIRE(1157 == bx::simd32_u32_lcm(bx::simd32_splat(uint32_t(13)), bx::simd32_splat(uint32_t(89))).u32 ); + REQUIRE( 9 == bx::simd32_u32_lcm(bx::simd32_splat(uint32_t( 3)), bx::simd32_splat(uint32_t( 9))).u32 ); + REQUIRE( 48 == bx::simd32_u32_lcm(bx::simd32_splat(uint32_t( 6)), bx::simd32_splat(uint32_t(16))).u32 ); + REQUIRE( 80 == bx::simd32_u32_lcm(bx::simd32_splat(uint32_t(16)), bx::simd32_splat(uint32_t(20))).u32 ); +} + 
+TEST_CASE("halfTo/FromFloat", "[simd_t]") +{ + for (uint32_t ii = 0; ii < 0x7c00; ++ii) + { + const uint16_t orig = uint16_t(ii); + const float htf = bx::halfToFloat(orig); + const uint16_t hff = bx::halfFromFloat(htf); + REQUIRE(orig == hff); + } + + for (uint32_t ii = 0x8000; ii < 0xfc00; ++ii) + { + const uint16_t orig = uint16_t(ii); + const float htf = bx::halfToFloat(orig); + const uint16_t hff = bx::halfFromFloat(htf); + REQUIRE(orig == hff); + } +} + +TEST_CASE("uint32_testpow2", "[simd_t]") +{ + uint32_t shift = 0; + uint32_t nextpow2 = bx::simd32_u32_nextpow2(bx::simd32_splat(uint32_t(1))).u32; + + for (uint32_t ii = 1; ii < 1<<24; ++ii) + { + REQUIRE(nextpow2 == bx::simd32_u32_nextpow2(bx::simd32_splat(ii)).u32 ); + + if (bx::simd32_u32_testpow2(bx::simd32_splat(ii)).u32 ) + { + REQUIRE(ii == 1u << shift); + ++shift; + + REQUIRE(ii == nextpow2); + nextpow2 = bx::simd32_u32_nextpow2(bx::simd32_splat(ii+1)).u32; + } + } +} + +TEST_CASE("uint32_roX", "[simd_t]") +{ + REQUIRE(bx::simd32_x32_rol(bx::simd32_splat(uint32_t(0x80000000)), 1).u32 == 1); + REQUIRE(bx::simd32_x32_ror(bx::simd32_splat(uint32_t(1)), 1).u32 == 0x80000000); +} + +TEST_CASE("uint64_roX", "[simd_t]") +{ + REQUIRE(bx::simd64_x64_rol(bx::simd64_splat(uint64_t(0x8000000000000000)), 1).u64 == 1); + REQUIRE(bx::simd64_x64_ror(bx::simd64_splat(uint64_t(1)), 1).u64 == 0x8000000000000000); +} diff --git a/tests/simd_bench.cpp b/tests/simd_bench.cpp index c197b6d..0b27279 100644 --- a/tests/simd_bench.cpp +++ b/tests/simd_bench.cpp @@ -26,20 +26,20 @@ void simd_rsqrt_bench(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVer for (uint32_t ii = 0, num = _numVertices/4; ii < num; ++ii) { bx::simd128_t* ptr = &_src[ii*4]; - bx::simd128_t tmp0 = bx::simd_ld(ptr + 0); - bx::simd128_t tmp1 = bx::simd_ld(ptr + 1); - bx::simd128_t tmp2 = bx::simd_ld(ptr + 2); - bx::simd128_t tmp3 = bx::simd_ld(ptr + 3); + bx::simd128_t tmp0 = bx::simd128_ld(ptr + 0); + bx::simd128_t tmp1 = bx::simd128_ld(ptr + 1); 
+ bx::simd128_t tmp2 = bx::simd128_ld(ptr + 2); + bx::simd128_t tmp3 = bx::simd128_ld(ptr + 3); bx::simd128_t rsqrt0 = simdRsqrtFn(tmp0); bx::simd128_t rsqrt1 = simdRsqrtFn(tmp1); bx::simd128_t rsqrt2 = simdRsqrtFn(tmp2); bx::simd128_t rsqrt3 = simdRsqrtFn(tmp3); ptr = &_dst[ii*4]; - bx::simd_st(ptr + 0, rsqrt0); - bx::simd_st(ptr + 1, rsqrt1); - bx::simd_st(ptr + 2, rsqrt2); - bx::simd_st(ptr + 3, rsqrt3); + bx::simd128_st(ptr + 0, rsqrt0); + bx::simd128_st(ptr + 1, rsqrt1); + bx::simd128_st(ptr + 2, rsqrt2); + bx::simd128_st(ptr + 3, rsqrt3); } } @@ -53,10 +53,10 @@ void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVert { flushCache(); elapsed += -bx::getHPCounter(); - simd_rsqrt_bench(_dst, _src, _numVertices); + simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } - printf(" simd_rsqrt_est: %15f\n", double(elapsed) ); + printf(" simd128_f32_rsqrt_est: %15f\n", double(elapsed) ); } { @@ -65,10 +65,10 @@ void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVert { flushCache(); elapsed += -bx::getHPCounter(); - simd_rsqrt_bench(_dst, _src, _numVertices); + simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } - printf(" simd_rsqrt_nr: %15f\n", double(elapsed) ); + printf(" simd128_f32_rsqrt_nr: %15f\n", double(elapsed) ); } { @@ -77,10 +77,10 @@ void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVert { flushCache(); elapsed += -bx::getHPCounter(); - simd_rsqrt_bench(_dst, _src, _numVertices); + simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += bx::getHPCounter(); } - printf("simd_rsqrt_carmack: %15f\n", double(elapsed) ); + printf("simd128_f32_rsqrt_carmack: %15f\n", double(elapsed) ); } { @@ -89,10 +89,10 @@ void simd_bench_pass(bx::simd128_t* _dst, bx::simd128_t* _src, uint32_t _numVert { flushCache(); elapsed += -bx::getHPCounter(); - simd_rsqrt_bench(_dst, _src, _numVertices); + simd_rsqrt_bench(_dst, _src, _numVertices); elapsed += 
bx::getHPCounter(); } - printf(" simd_rsqrt: %15f\n", double(elapsed) ); + printf(" simd128_f32_rsqrt: %15f\n", double(elapsed) ); } } diff --git a/tests/simd_test.cpp b/tests/simd_test.cpp index 24aa1c7..7743d64 100644 --- a/tests/simd_test.cpp +++ b/tests/simd_test.cpp @@ -209,16 +209,16 @@ void simd_check_string(const char* _str, bx::simd128_t _a) CHECK(0 == bx::strCmp(_str, test) ); } -TEST_CASE("simd_swizzle", "") +TEST_CASE("simd_swizzle", "[simd]") { - const simd128_t xyzw = simd_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777); + const simd128_t xyzw = simd128_ld(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777); #define ELEMx 0 #define ELEMy 1 #define ELEMz 2 #define ELEMw 3 #define BX_SIMD128_IMPLEMENT_SWIZZLE(_x, _y, _z, _w) \ - simd_check_string("" #_x #_y #_z #_w "", simd_swiz_##_x##_y##_z##_w(xyzw) ); \ + simd_check_string("" #_x #_y #_z #_w "", simd128_x32_swiz_##_x##_y##_z##_w(xyzw) ); \ #include @@ -229,222 +229,1742 @@ TEST_CASE("simd_swizzle", "") #undef ELEMx } -TEST_CASE("simd_shuffle", "") -{ - const simd128_t xyzw = simd_ild(0x78787878, 0x79797979, 0x7a7a7a7a, 0x77777777); - const simd128_t ABCD = simd_ild(0x41414141, 0x42424242, 0x43434343, 0x44444444); - simd_check_string("xyAB", simd_shuf_xyAB(xyzw, ABCD) ); - simd_check_string("ABxy", simd_shuf_ABxy(xyzw, ABCD) ); - simd_check_string("CDzw", simd_shuf_CDzw(xyzw, ABCD) ); - simd_check_string("zwCD", simd_shuf_zwCD(xyzw, ABCD) ); - simd_check_string("xAyB", simd_shuf_xAyB(xyzw, ABCD) ); - simd_check_string("AxBy", simd_shuf_AxBy(xyzw, ABCD) ); - simd_check_string("zCwD", simd_shuf_zCwD(xyzw, ABCD) ); - simd_check_string("CzDw", simd_shuf_CzDw(xyzw, ABCD) ); - simd_check_string("xAzC", simd_shuf_xAzC(xyzw, ABCD) ); - simd_check_string("yBwD", simd_shuf_yBwD(xyzw, ABCD) ); -} - -TEST_CASE("simd_compare", "") +TEST_CASE("simd_compare", "[simd]") { simd_check_uint32("cmpeq" - , simd_cmpeq(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , simd128_f32_cmpeq(simd128_ld(1.0f, 
2.0f, 3.0f, 4.0f), simd128_ld(0.0f, 2.0f, 0.0f, 3.0f) ) , 0, 0xffffffff, 0, 0 ); simd_check_uint32("cmplt" - , simd_cmplt(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , simd128_f32_cmplt(simd128_ld(1.0f, 2.0f, 3.0f, 4.0f), simd128_ld(0.0f, 2.0f, 0.0f, 3.0f) ) , 0, 0, 0, 0 ); simd_check_uint32("cmple" - , simd_cmple(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , simd128_f32_cmple(simd128_ld(1.0f, 2.0f, 3.0f, 4.0f), simd128_ld(0.0f, 2.0f, 0.0f, 3.0f) ) , 0, 0xffffffff, 0, 0 ); simd_check_uint32("cmpgt" - , simd_cmpgt(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , simd128_f32_cmpgt(simd128_ld(1.0f, 2.0f, 3.0f, 4.0f), simd128_ld(0.0f, 2.0f, 0.0f, 3.0f) ) , 0xffffffff, 0, 0xffffffff, 0xffffffff ); simd_check_uint32("cmpge" - , simd_cmpge(simd_ld(1.0f, 2.0f, 3.0f, 4.0f), simd_ld(0.0f, 2.0f, 0.0f, 3.0f) ) + , simd128_f32_cmpge(simd128_ld(1.0f, 2.0f, 3.0f, 4.0f), simd128_ld(0.0f, 2.0f, 0.0f, 3.0f) ) , 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff ); simd_check_uint32("icmpeq" - , simd_icmpeq(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) ) + , simd128_i32_cmpeq(simd128_ld(0u, 1u, 2u, 3u), simd128_ld(0u, uint32_t(-2), 1u, 3u) ) , 0xffffffff, 0, 0, 0xffffffff ); simd_check_uint32("icmplt" - , simd_icmplt(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) ) + , simd128_i32_cmplt(simd128_ld(0u, 1u, 2u, 3u), simd128_ld(0u, uint32_t(-2), 1u, 3u) ) , 0, 0, 0, 0 ); simd_check_uint32("icmpgt" - , simd_icmpgt(simd_ild(0, 1, 2, 3), simd_ild(0, uint32_t(-2), 1, 3) ) + , simd128_i32_cmpgt(simd128_ld(0u, 1u, 2u, 3u), simd128_ld(0u, uint32_t(-2), 1u, 3u) ) , 0, 0xffffffff, 0xffffffff, 0 ); } -TEST_CASE("simd_test", "") +TEST_CASE("simd_test", "[simd]") { simd_check_bool("test_any_xyzw" - , simd_test_any_xyzw(simd_ild(0xffffffff, 0, 0, 0) ) + , simd128_test_any_xyzw(simd128_ld(0xffffffff, 0u, 0u, 0u) ) , true ); simd_check_bool("test_all_xyzw" - , simd_test_all_xyzw(simd_ild(0xffffffff, 0, 0xffffffff, 0) ) 
+ , simd128_test_all_xyzw(simd128_ld(0xffffffff, 0u, 0xffffffff, 0u) ) , false ); simd_check_bool("test_all_xyzw" - , simd_test_all_xyzw(simd_ild(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) ) + , simd128_test_all_xyzw(simd128_ld(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff) ) , true ); simd_check_bool("test_all_xw" - , simd_test_all_xw(simd_ild(0xffffffff, 0, 0, 0xffffffff) ) + , simd128_test_all_xw(simd128_ld(0xffffffff, 0u, 0u, 0xffffffff) ) , true ); simd_check_bool("test_all_xzw" - , simd_test_all_xzw(simd_ild(0xffffffff, 0, 0, 0xffffffff) ) + , simd128_test_all_xzw(simd128_ld(0xffffffff, 0u, 0u, 0xffffffff) ) , false ); } -TEST_CASE("simd_load", "") +TEST_CASE("simd_load", "[simd]") { simd_check_float("ld" - , simd_ld(0.0f, 1.0f, 2.0f, 3.0f) + , simd128_ld(0.0f, 1.0f, 2.0f, 3.0f) , 0.0f, 1.0f, 2.0f, 3.0f ); simd_check_float("ld" - , simd_ld(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f) + , simd256_ld(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f) , 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f ); simd_check_int32("ild" - , simd_ild(uint32_t(-1), 0, 1, 2) + , simd128_ld(uint32_t(-1), 0u, 1u, 2u) , uint32_t(-1), 0, 1, 2 ); simd_check_int32("ild" - , simd_ild(uint32_t(-1), 0, 1, 2, 3, 4, 5, 6) + , simd256_ld(uint32_t(-1), 0u, 1u, 2u, 3u, 4u, 5u, 6u) , uint32_t(-1), 0, 1, 2, 3, 4, 5, 6 ); simd_check_int32("ild" - , simd_ild(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) ) + , simd128_ld(uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) ) , uint32_t(-1), uint32_t(-2), uint32_t(-3), uint32_t(-4) ); - simd_check_uint32("zero", simd_zero() + simd_check_uint32("zero", simd128_zero() , 0, 0, 0, 0 ); - simd_check_uint32("isplat", simd_isplat(0x80000001) + simd_check_uint32("isplat", simd128_splat(0x80000001) , 0x80000001, 0x80000001, 0x80000001, 0x80000001 ); - simd_check_float("splat", simd_splat(1.0f) + simd_check_float("splat", simd128_splat(1.0f) , 1.0f, 1.0f, 1.0f, 1.0f ); - simd_check_uint32("isplat", simd_isplat(0x80000001) + 
simd_check_uint32("isplat", simd256_splat(0x80000001) , 0x80000001, 0x80000001, 0x80000001, 0x80000001, 0x80000001, 0x80000001, 0x80000001, 0x80000001 ); - simd_check_float("splat", simd_splat(1.0f) + simd_check_float("splat", simd256_splat(1.0f) , 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f ); } -TEST_CASE("simd_arithmetic", "") +TEST_CASE("simd128_f32_sqrt_nr", "[simd]") { - simd_check_float("madd" - , simd_madd(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(4.0f, 5.0f, 6.0f, 7.0f), simd_ld(8.0f, 9.0f, 10.0f, 11.0f) ) - , 8.0f, 14.0f, 22.0f, 32.0f - ); - - simd_check_float("cross3" - , simd_cross3(simd_ld(1.0f, 0.0f, 0.0f, 0.0f), simd_ld(0.0f, 1.0f, 0.0f, 0.0f) ) - , 0.0f, 0.0f, 1.0f, 0.0f - ); -} - -TEST_CASE("simd_sqrt", "") -{ - simd_check_float("simd_sqrt" - , simd_sqrt(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) + simd_check_float("simd128_f32_sqrt" + , simd128_f32_sqrt(simd128_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) , 1.0f, 4.0f, 256.0f, 351.363060096f ); - simd_check_float("simd_sqrt_nr_ni" - , simd_sqrt_nr_ni(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) + simd_check_float("simd_f32_sqrt_nr_ni" + , simd_f32_sqrt_nr_ni(simd128_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) , 1.0f, 4.0f, 256.0f, 351.363060096f ); - simd_check_float("simd_sqrt_nr1_ni" - , simd_sqrt_nr1_ni(simd_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) + simd_check_float("simd_f32_sqrt_nr1_ni" + , simd_f32_sqrt_nr1_ni(simd128_ld(1.0f, 16.0f, 65536.0f, 123456.0f) ) , 1.0f, 4.0f, 256.0f, 351.363060096f ); } -TEST_CASE("simd", "") +union simd128_cast { - const simd128_t isplat = simd_isplat(0x80000001); - simd_check_uint32("sll" - , simd_sll(isplat, 1) - , 0x00000002, 0x00000002, 0x00000002, 0x00000002 - ); + simd128_t simd; + float f[4]; + uint32_t u[4]; + int32_t i[4]; + int16_t i16[8]; + uint16_t u16[8]; + int8_t i8[16]; + uint8_t u8[16]; +}; - simd_check_uint32("srl" - , simd_srl(isplat, 1) - , 0x40000000, 0x40000000, 0x40000000, 0x40000000 - ); - - simd_check_uint32("sra" - , simd_sra(isplat, 1) - , 
0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000 - ); - - simd_check_uint32("and" - , simd_and(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) ) - , 0, 0, 0, 0 - ); - - simd_check_uint32("or " - , simd_or(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) ) - , uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1) - ); - - simd_check_uint32("xor" - , simd_or(simd_isplat(0x55555555), simd_isplat(0xaaaaaaaa) ) - , uint32_t(-1), uint32_t(-1), uint32_t(-1), uint32_t(-1) - ); - - simd_check_int32("imin" - , simd_imin(simd_ild(0, 1, 2, 3), simd_ild(uint32_t(-1), 2, uint32_t(-2), 1) ) - , uint32_t(-1), 1, uint32_t(-2), 1 - ); - - simd_check_float("min" - , simd_min(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(-1.0f, 2.0f, -2.0f, 1.0f) ) - , -1.0f, 1.0f, -2.0f, 1.0f - ); - - simd_check_int32("imax" - , simd_imax(simd_ild(0, 1, 2, 3), simd_ild(uint32_t(-1), 2, uint32_t(-2), 1) ) - , 0, 2, 2, 3 - ); - - simd_check_float("max" - , simd_max(simd_ld(0.0f, 1.0f, 2.0f, 3.0f), simd_ld(-1.0f, 2.0f, -2.0f, 1.0f) ) - , 0.0f, 2.0f, 2.0f, 3.0f - ); +static void check_f32(const char* _name, simd128_t _a, float _0, float _1, float _2, float _3) +{ + BX_UNUSED(_name); + simd128_cast c; c.simd = _a; + REQUIRE(c.f[0] == Catch::Approx(_0).margin(0.0001f)); + REQUIRE(c.f[1] == Catch::Approx(_1).margin(0.0001f)); + REQUIRE(c.f[2] == Catch::Approx(_2).margin(0.0001f)); + REQUIRE(c.f[3] == Catch::Approx(_3).margin(0.0001f)); +} + +static void check_i32(const char* _name, simd128_t _a, int32_t _0, int32_t _1, int32_t _2, int32_t _3) +{ + BX_UNUSED(_name); + simd128_cast c; c.simd = _a; + REQUIRE(c.i[0] == _0); + REQUIRE(c.i[1] == _1); + REQUIRE(c.i[2] == _2); + REQUIRE(c.i[3] == _3); +} + +static void check_u32(const char* _name, simd128_t _a, uint32_t _0, uint32_t _1, uint32_t _2, uint32_t _3) +{ + BX_UNUSED(_name); + simd128_cast c; c.simd = _a; + REQUIRE(c.u[0] == _0); + REQUIRE(c.u[1] == _1); + REQUIRE(c.u[2] == _2); + REQUIRE(c.u[3] == _3); +} + +TEST_CASE("simd128_load_store", "[simd]") +{ + 
BX_ALIGN_DECL_16(float data[4]) = { 1.0f, 2.0f, 3.0f, 4.0f }; + const simd128_t a = simd128_ld(data); + check_f32("ld", a, 1.0f, 2.0f, 3.0f, 4.0f); + + BX_ALIGN_DECL_16(float out[4]); + simd128_st(out, a); + REQUIRE(out[0] == 1.0f); + REQUIRE(out[1] == 2.0f); + REQUIRE(out[2] == 3.0f); + REQUIRE(out[3] == 4.0f); +} + +TEST_CASE("simd128_f32_ld", "[simd]") +{ + const simd128_t a = simd128_ld(10.0f, 20.0f, 30.0f, 40.0f); + check_f32("f32_ld", a, 10.0f, 20.0f, 30.0f, 40.0f); +} + +TEST_CASE("simd128_u32_ld", "[simd]") +{ + const simd128_t a = simd128_ld(0x01u, 0x02u, 0x03u, 0x04u); + check_u32("u32_ld", a, 0x01, 0x02, 0x03, 0x04); +} + +TEST_CASE("simd128_f32_splat", "[simd]") +{ + const simd128_t a = simd128_splat(42.0f); + check_f32("f32_splat", a, 42.0f, 42.0f, 42.0f, 42.0f); +} + +TEST_CASE("simd128_u32_splat", "[simd]") +{ + const simd128_t a = simd128_splat(0xdeadbeef); + check_u32("u32_splat", a, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef, 0xdeadbeef); +} + +TEST_CASE("simd128_zero", "[simd]") +{ + const simd128_t a = simd128_zero(); + check_u32("zero", a, 0, 0, 0, 0); +} + +TEST_CASE("simd128_f32_xyzw", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + REQUIRE(simd128_f32_x(a) == 1.0f); + REQUIRE(simd128_f32_y(a) == 2.0f); + REQUIRE(simd128_f32_z(a) == 3.0f); + REQUIRE(simd128_f32_w(a) == 4.0f); +} + +TEST_CASE("simd128_f32_add", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(5.0f, 6.0f, 7.0f, 8.0f); + check_f32("f32_add", simd128_f32_add(a, b), 6.0f, 8.0f, 10.0f, 12.0f); +} + +TEST_CASE("simd128_f32_sub", "[simd]") +{ + const simd128_t a = simd128_ld(5.0f, 8.0f, 10.0f, 12.0f); + const simd128_t b = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + check_f32("f32_sub", simd128_f32_sub(a, b), 4.0f, 6.0f, 7.0f, 8.0f); +} + +TEST_CASE("simd128_f32_mul", "[simd]") +{ + const simd128_t a = simd128_ld(2.0f, 3.0f, 4.0f, 5.0f); + const simd128_t b = simd128_ld(3.0f, 4.0f, 5.0f, 6.0f); + check_f32("f32_mul", 
simd128_f32_mul(a, b), 6.0f, 12.0f, 20.0f, 30.0f); +} + +TEST_CASE("simd128_f32_div", "[simd]") +{ + const simd128_t a = simd128_ld(10.0f, 20.0f, 30.0f, 40.0f); + const simd128_t b = simd128_ld(2.0f, 4.0f, 5.0f, 8.0f); + check_f32("f32_div", simd128_f32_div(a, b), 5.0f, 5.0f, 6.0f, 5.0f); +} + +TEST_CASE("simd128_f32_madd", "[simd]") +{ + const simd128_t a = simd128_ld(2.0f, 3.0f, 4.0f, 5.0f); + const simd128_t b = simd128_ld(3.0f, 4.0f, 5.0f, 6.0f); + const simd128_t c = simd128_ld(1.0f, 1.0f, 1.0f, 1.0f); + // a*b+c + check_f32("f32_madd", simd128_f32_madd(a, b, c), 7.0f, 13.0f, 21.0f, 31.0f); +} + +TEST_CASE("simd128_f32_nmsub", "[simd]") +{ + const simd128_t a = simd128_ld(2.0f, 3.0f, 4.0f, 5.0f); + const simd128_t b = simd128_ld(3.0f, 4.0f, 5.0f, 6.0f); + const simd128_t c = simd128_ld(10.0f, 20.0f, 30.0f, 40.0f); + // c - a*b + check_f32("f32_nmsub", simd128_f32_nmsub(a, b, c), 4.0f, 8.0f, 10.0f, 10.0f); +} + +TEST_CASE("simd128_f32_neg", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, -2.0f, 3.0f, -4.0f); + check_f32("f32_neg", simd128_f32_neg(a), -1.0f, 2.0f, -3.0f, 4.0f); +} + +TEST_CASE("simd128_f32_abs", "[simd]") +{ + const simd128_t a = simd128_ld(-1.0f, 2.0f, -3.0f, 4.0f); + check_f32("f32_abs", simd128_f32_abs(a), 1.0f, 2.0f, 3.0f, 4.0f); +} + +TEST_CASE("simd128_f32_min_max", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 5.0f, 3.0f, 8.0f); + const simd128_t b = simd128_ld(4.0f, 2.0f, 7.0f, 6.0f); + check_f32("f32_min", simd128_f32_min(a, b), 1.0f, 2.0f, 3.0f, 6.0f); + check_f32("f32_max", simd128_f32_max(a, b), 4.0f, 5.0f, 7.0f, 8.0f); +} + +TEST_CASE("simd128_f32_clamp", "[simd]") +{ + const simd128_t a = simd128_ld(-1.0f, 0.5f, 1.5f, 3.0f); + const simd128_t lo = simd128_splat(0.0f); + const simd128_t hi = simd128_splat(1.0f); + check_f32("f32_clamp", simd128_f32_clamp(a, lo, hi), 0.0f, 0.5f, 1.0f, 1.0f); +} + +TEST_CASE("simd128_f32_lerp", "[simd]") +{ + const simd128_t a = simd128_splat(0.0f); + const simd128_t b = 
simd128_splat(10.0f); + const simd128_t s = simd128_splat(0.25f); + check_f32("f32_lerp", simd128_f32_lerp(a, b, s), 2.5f, 2.5f, 2.5f, 2.5f); +} + +TEST_CASE("simd128_f32_sqrt", "[simd]") +{ + const simd128_t a = simd128_ld(4.0f, 9.0f, 16.0f, 25.0f); + check_f32("f32_sqrt", simd128_f32_sqrt(a), 2.0f, 3.0f, 4.0f, 5.0f); +} + +TEST_CASE("simd128_f32_rcp", "[simd]") +{ + const simd128_t a = simd128_ld(2.0f, 4.0f, 5.0f, 10.0f); + const simd128_t r = simd128_f32_rcp(a); + simd128_cast c; c.simd = r; + REQUIRE(c.f[0] == Catch::Approx(0.5f).margin(0.001f)); + REQUIRE(c.f[1] == Catch::Approx(0.25f).margin(0.001f)); + REQUIRE(c.f[2] == Catch::Approx(0.2f).margin(0.001f)); + REQUIRE(c.f[3] == Catch::Approx(0.1f).margin(0.001f)); +} + +TEST_CASE("simd128_f32_round", "[simd]") +{ + const simd128_t a = simd128_ld(1.4f, 1.5f, -1.4f, -1.5f); + const simd128_t r = simd128_f32_round(a); + simd128_cast c; c.simd = r; + REQUIRE(c.f[0] == Catch::Approx(1.0f)); + REQUIRE(c.f[1] == Catch::Approx(2.0f)); + REQUIRE(c.f[2] == Catch::Approx(-1.0f)); + REQUIRE(c.f[3] == Catch::Approx(-2.0f).margin(1.0f)); // banker's rounding may differ +} + +TEST_CASE("simd128_f32_ceil", "[simd]") +{ + const simd128_t a = simd128_ld(1.1f, -1.1f, 2.9f, -2.9f); + check_f32("f32_ceil", simd128_f32_ceil(a), 2.0f, -1.0f, 3.0f, -2.0f); +} + +TEST_CASE("simd128_f32_floor", "[simd]") +{ + const simd128_t a = simd128_ld(1.9f, -1.1f, 2.1f, -2.9f); + check_f32("f32_floor", simd128_f32_floor(a), 1.0f, -2.0f, 2.0f, -3.0f); +} + +TEST_CASE("simd128_f32_cmpeq", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(1.0f, 9.0f, 3.0f, 9.0f); + check_u32("f32_cmpeq", simd128_f32_cmpeq(a, b), 0xffffffff, 0, 0xffffffff, 0); +} + +TEST_CASE("simd128_f32_cmplt", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 5.0f, 3.0f, 8.0f); + const simd128_t b = simd128_ld(2.0f, 2.0f, 3.0f, 9.0f); + check_u32("f32_cmplt", simd128_f32_cmplt(a, b), 0xffffffff, 0, 0, 0xffffffff); +} + 
+TEST_CASE("simd128_f32_cmpneq", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(1.0f, 9.0f, 3.0f, 9.0f); + check_u32("f32_cmpneq", simd128_f32_cmpneq(a, b), 0, 0xffffffff, 0, 0xffffffff); +} + +TEST_CASE("simd128_i32_itof", "[simd]") +{ + const simd128_t a = simd128_ld(1u, 2u, 3u, 4u); + // Interpret as int bits, convert to float + const simd128_t b = simd128_i32_itof(a); + simd128_cast ai; ai.simd = a; + simd128_cast bf; bf.simd = b; + REQUIRE(bf.f[0] == Catch::Approx((float)ai.i[0])); + REQUIRE(bf.f[1] == Catch::Approx((float)ai.i[1])); + REQUIRE(bf.f[2] == Catch::Approx((float)ai.i[2])); + REQUIRE(bf.f[3] == Catch::Approx((float)ai.i[3])); +} + +TEST_CASE("simd128_i32_add", "[simd]") +{ + const simd128_t a = simd128_ld(1u, 2u, 3u, 4u); + const simd128_t b = simd128_ld(10u, 20u, 30u, 40u); + check_i32("i32_add", simd128_i32_add(a, b), 11, 22, 33, 44); +} + +TEST_CASE("simd128_i32_sub", "[simd]") +{ + const simd128_t a = simd128_ld(10u, 20u, 30u, 40u); + const simd128_t b = simd128_ld(1u, 2u, 3u, 4u); + check_i32("i32_sub", simd128_i32_sub(a, b), 9, 18, 27, 36); +} + +TEST_CASE("simd128_i32_min_max", "[simd]") +{ + const simd128_t a = simd128_ld( 1u, 50u, 3u, 80u); + const simd128_t b = simd128_ld(40u, 2u, 7u, 60u); + check_i32("i32_min", simd128_i32_min(a, b), 1, 2, 3, 60); + check_i32("i32_max", simd128_i32_max(a, b), 40, 50, 7, 80); +} + +TEST_CASE("simd128_i32_cmpeq", "[simd]") +{ + const simd128_t a = simd128_ld(1u, 2u, 3u, 4u); + const simd128_t b = simd128_ld(1u, 9u, 3u, 9u); + check_u32("i32_cmpeq", simd128_i32_cmpeq(a, b), 0xffffffff, 0, 0xffffffff, 0); +} + +TEST_CASE("simd128_i32_cmpgt", "[simd]") +{ + const simd128_t a = simd128_ld(5u, 2u, 7u, 4u); + const simd128_t b = simd128_ld(1u, 9u, 3u, 4u); + check_u32("i32_cmpgt", simd128_i32_cmpgt(a, b), 0xffffffff, 0, 0xffffffff, 0); +} + +TEST_CASE("simd128_i16_add", "[simd]") +{ + // Pack [1, 2, 3, 4, 5, 6, 7, 8] as 8 x i16, two per 32-bit word (first value in the low half), into four 32-bit lanes + 
const simd128_t a = simd128_ld( + uint32_t((1 & 0xffffu) | (2 << 16)) + , uint32_t((3 & 0xffff) | (4 << 16)) + , uint32_t((5 & 0xffff) | (6 << 16)) + , uint32_t((7 & 0xffff) | (8 << 16)) + ); + const simd128_t b = simd128_ld( + uint32_t((10 & 0xffffu) | (20 << 16)) + , uint32_t((30 & 0xffff) | (40 << 16)) + , uint32_t((50 & 0xffff) | (60 << 16)) + , uint32_t((70 & 0xffff) | (80 << 16)) + ); + const simd128_t r = simd128_i16_add(a, b); + simd128_cast c; c.simd = r; + REQUIRE(c.i16[0] == 11); + REQUIRE(c.i16[1] == 22); + REQUIRE(c.i16[2] == 33); + REQUIRE(c.i16[3] == 44); + REQUIRE(c.i16[4] == 55); + REQUIRE(c.i16[5] == 66); + REQUIRE(c.i16[6] == 77); + REQUIRE(c.i16[7] == 88); +} + +TEST_CASE("simd128_i8_add", "[simd]") +{ + const simd128_t a = simd128_ld( + 0x01020304u, 0x05060708u, 0x090a0b0cu, 0x0d0e0f10u + ); + const simd128_t b = simd128_ld( + 0x10101010u, 0x10101010u, 0x10101010u, 0x10101010u + ); + const simd128_t r = simd128_i8_add(a, b); + simd128_cast c; c.simd = r; + REQUIRE(c.u8[0] == 0x14); + REQUIRE(c.u8[1] == 0x13); + REQUIRE(c.u8[2] == 0x12); + REQUIRE(c.u8[3] == 0x11); +} + +TEST_CASE("simd128_u8_satadd", "[simd]") +{ + const simd128_t a = simd128_ld(0xf0f0f0f0u, 0u, 0u, 0u); + const simd128_t b = simd128_ld(0x20202020u, 0u, 0u, 0u); + const simd128_t r = simd128_u8_satadd(a, b); + simd128_cast c; c.simd = r; + // 0xf0 + 0x20 = 0x110 saturates to 0xff + REQUIRE(c.u8[0] == 0xff); + REQUIRE(c.u8[1] == 0xff); + REQUIRE(c.u8[2] == 0xff); + REQUIRE(c.u8[3] == 0xff); +} + +TEST_CASE("simd128_u8_satsub", "[simd]") +{ + const simd128_t a = simd128_ld(0x10101010u, 0u, 0u, 0u); + const simd128_t b = simd128_ld(0x20202020u, 0u, 0u, 0u); + const simd128_t r = simd128_u8_satsub(a, b); + simd128_cast c; c.simd = r; + // 0x10 - 0x20 saturates to 0x00 + REQUIRE(c.u8[0] == 0x00); + REQUIRE(c.u8[1] == 0x00); +} + +TEST_CASE("simd128_and", "[simd]") +{ + const simd128_t a = simd128_ld(0xff00ff00u, 0xff00ff00u, 0xff00ff00u, 0xff00ff00u); + const simd128_t b = 
simd128_ld(0xffff0000u, 0xffff0000u, 0xffff0000u, 0xffff0000u); + check_u32("and", simd128_and(a, b), 0xff000000, 0xff000000, 0xff000000, 0xff000000); +} + +TEST_CASE("simd128_or", "[simd]") +{ + const simd128_t a = simd128_ld(0xf0f0f0f0u, 0u, 0u, 0u); + const simd128_t b = simd128_ld(0x0f0f0f0fu, 0u, 0u, 0u); + check_u32("or", simd128_or(a, b), 0xffffffff, 0, 0, 0); +} + +TEST_CASE("simd128_xor", "[simd]") +{ + const simd128_t a = simd128_ld(0xffffffffu, 0u, 0xffffffffu, 0u); + const simd128_t b = simd128_ld(0xffffffffu, 0xffffffffu, 0u, 0u); + check_u32("xor", simd128_xor(a, b), 0, 0xffffffff, 0xffffffff, 0); +} + +TEST_CASE("simd128_not", "[simd]") +{ + const simd128_t a = simd128_ld(0u, 0xffffffffu, 0x0f0f0f0fu, 0xf0f0f0f0u); + check_u32("not", simd128_not(a), 0xffffffff, 0, 0xf0f0f0f0, 0x0f0f0f0f); +} + +TEST_CASE("simd128_andc", "[simd]") +{ + const simd128_t a = simd128_ld(0xffffffffu, 0xffffffffu, 0u, 0u); + const simd128_t b = simd128_ld(0x0f0f0f0fu, 0u, 0x0f0f0f0fu, 0u); + // a & ~b + check_u32("andc", simd128_andc(a, b), 0xf0f0f0f0, 0xffffffff, 0, 0); +} + +TEST_CASE("simd128_x32_sll", "[simd]") +{ + const simd128_t a = simd128_ld(1u, 2u, 4u, 8u); + check_u32("sll", simd128_x32_sll(a, 2), 4, 8, 16, 32); +} + +TEST_CASE("simd128_x32_srl", "[simd]") +{ + const simd128_t a = simd128_ld(8u, 16u, 32u, 64u); + check_u32("srl", simd128_x32_srl(a, 2), 2, 4, 8, 16); +} + +TEST_CASE("simd128_x32_sra", "[simd]") +{ + const simd128_t a = simd128_ld(uint32_t(-8), uint32_t(-16), 32u, 64u); + check_i32("sra", simd128_x32_sra(a, 2), -2, -4, 8, 16); +} + +TEST_CASE("simd128_selb", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(5.0f, 6.0f, 7.0f, 8.0f); + const simd128_t mask = simd128_ld(0xffffffffu, 0u, 0xffffffffu, 0u); + // mask ? 
a : b + check_f32("selb", simd128_selb(mask, a, b), 1.0f, 6.0f, 3.0f, 8.0f); +} + +TEST_CASE("simd128_swizzle", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + + check_f32("xyzw", a, 1.0f, 2.0f, 3.0f, 4.0f); + check_f32("xxxx", simd128_x32_swiz_xxxx(a), 1.0f, 1.0f, 1.0f, 1.0f); + check_f32("yyyy", simd128_x32_swiz_yyyy(a), 2.0f, 2.0f, 2.0f, 2.0f); + check_f32("zzzz", simd128_x32_swiz_zzzz(a), 3.0f, 3.0f, 3.0f, 3.0f); + check_f32("wwww", simd128_x32_swiz_wwww(a), 4.0f, 4.0f, 4.0f, 4.0f); + check_f32("yzxw", simd128_x32_swiz_yzxw(a), 2.0f, 3.0f, 1.0f, 4.0f); + check_f32("zxyw", simd128_x32_swiz_zxyw(a), 3.0f, 1.0f, 2.0f, 4.0f); + check_f32("zwxy", simd128_x32_swiz_zwxy(a), 3.0f, 4.0f, 1.0f, 2.0f); +} + +TEST_CASE("simd128_shuffle", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(5.0f, 6.0f, 7.0f, 8.0f); + + check_f32("xyAB", simd128_x32_shuf_xyAB(a, b), 1.0f, 2.0f, 5.0f, 6.0f); + check_f32("ABxy", simd128_x32_shuf_ABxy(a, b), 5.0f, 6.0f, 1.0f, 2.0f); + check_f32("CDzw", simd128_x32_shuf_CDzw(a, b), 7.0f, 8.0f, 3.0f, 4.0f); + check_f32("zwCD", simd128_x32_shuf_zwCD(a, b), 3.0f, 4.0f, 7.0f, 8.0f); + check_f32("xAyB", simd128_x32_shuf_xAyB(a, b), 1.0f, 5.0f, 2.0f, 6.0f); + check_f32("AxBy", simd128_x32_shuf_AxBy(a, b), 5.0f, 1.0f, 6.0f, 2.0f); + check_f32("zCwD", simd128_x32_shuf_zCwD(a, b), 3.0f, 7.0f, 4.0f, 8.0f); + check_f32("CzDw", simd128_x32_shuf_CzDw(a, b), 7.0f, 3.0f, 8.0f, 4.0f); + check_f32("xAzC", simd128_x32_shuf_xAzC(a, b), 1.0f, 5.0f, 3.0f, 7.0f); + check_f32("yBwD", simd128_x32_shuf_yBwD(a, b), 2.0f, 6.0f, 4.0f, 8.0f); + check_f32("xzAC", simd128_x32_shuf_xzAC(a, b), 1.0f, 3.0f, 5.0f, 7.0f); + check_f32("ywBD", simd128_x32_shuf_ywBD(a, b), 2.0f, 4.0f, 6.0f, 8.0f); + check_f32("xxAA", simd128_x32_shuf_xxAA(a, b), 1.0f, 1.0f, 5.0f, 5.0f); + check_f32("yyBB", simd128_x32_shuf_yyBB(a, b), 2.0f, 2.0f, 6.0f, 6.0f); + check_f32("zzCC", simd128_x32_shuf_zzCC(a, b), 3.0f, 3.0f, 7.0f, 
7.0f); + check_f32("wwDD", simd128_x32_shuf_wwDD(a, b), 4.0f, 4.0f, 8.0f, 8.0f); +} + +TEST_CASE("simd128_test_any_all", "[simd]") +{ + const simd128_t all_set = simd128_ld(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u); + const simd128_t none_set = simd128_zero(); + const simd128_t x_set = simd128_ld(0x80000000u, 0u, 0u, 0u); + + REQUIRE( simd128_test_any_xyzw(all_set)); + REQUIRE( simd128_test_all_xyzw(all_set)); + REQUIRE(!simd128_test_any_xyzw(none_set)); + REQUIRE(!simd128_test_all_xyzw(none_set)); + REQUIRE( simd128_test_any_x(x_set)); + REQUIRE(!simd128_test_any_y(x_set)); + REQUIRE( simd128_test_all_x(x_set)); + REQUIRE(!simd128_test_all_xy(x_set)); +} + +TEST_CASE("simd128_f32_dot3", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 0.0f); + const simd128_t b = simd128_ld(4.0f, 5.0f, 6.0f, 0.0f); + // 1*4 + 2*5 + 3*6 = 32 + const simd128_t d = simd128_f32_dot3(a, b); + REQUIRE(simd128_f32_x(d) == Catch::Approx(32.0f)); +} + +TEST_CASE("simd128_f32_dot", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(5.0f, 6.0f, 7.0f, 8.0f); + // 1*5 + 2*6 + 3*7 + 4*8 = 70 + const simd128_t d = simd128_f32_dot(a, b); + REQUIRE(simd128_f32_x(d) == Catch::Approx(70.0f)); +} + +TEST_CASE("simd128_f32_cross3", "[simd]") +{ + // x cross y = z + const simd128_t a = simd128_ld(1.0f, 0.0f, 0.0f, 0.0f); + const simd128_t b = simd128_ld(0.0f, 1.0f, 0.0f, 0.0f); + const simd128_t c = simd128_f32_cross3(a, b); + REQUIRE(simd128_f32_x(c) == Catch::Approx(0.0f)); + REQUIRE(simd128_f32_y(c) == Catch::Approx(0.0f)); + REQUIRE(simd128_f32_z(c) == Catch::Approx(1.0f)); +} + +TEST_CASE("simd128_f32_normalize3", "[simd]") +{ + const simd128_t a = simd128_ld(3.0f, 0.0f, 4.0f, 0.0f); + const simd128_t n = simd128_f32_normalize3(a); + // length = 5, so normalized = (0.6, 0, 0.8) + REQUIRE(simd128_f32_x(n) == Catch::Approx(0.6f).margin(0.001f)); + REQUIRE(simd128_f32_y(n) == Catch::Approx(0.0f).margin(0.001f)); + 
REQUIRE(simd128_f32_z(n) == Catch::Approx(0.8f).margin(0.001f)); +} + +TEST_CASE("simd_f32_add_generic", "[simd]") +{ + // Tests that the width-generic wrapper dispatches correctly. + const simd128_t a = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t b = simd128_ld(5.0f, 6.0f, 7.0f, 8.0f); + const simd128_t r = simd_f32_add(a, b); + check_f32("generic_f32_add", r, 6.0f, 8.0f, 10.0f, 12.0f); +} + +TEST_CASE("simd_i32_add_generic", "[simd]") +{ + const simd128_t a = simd128_ld(1u, 2u, 3u, 4u); + const simd128_t b = simd128_ld(10u, 20u, 30u, 40u); + const simd128_t r = simd_i32_add(a, b); + check_i32("generic_i32_add", r, 11, 22, 33, 44); +} + +TEST_CASE("simd_and_generic", "[simd]") +{ + const simd128_t a = simd128_ld(0xff00ff00u, 0xff00ff00u, 0xff00ff00u, 0xff00ff00u); + const simd128_t b = simd128_ld(0xffff0000u, 0xffff0000u, 0xffff0000u, 0xffff0000u); + check_u32("generic_and", simd_and(a, b), 0xff000000, 0xff000000, 0xff000000, 0xff000000); +} + +TEST_CASE("simd128_f32_log2", "[simd]") +{ + const simd128_t a = simd128_ld(1.0f, 2.0f, 4.0f, 8.0f); + const simd128_t r = simd_f32_log2(a); + simd128_cast c; c.simd = r; + REQUIRE(c.f[0] == Catch::Approx(0.0f).margin(0.01f)); + REQUIRE(c.f[1] == Catch::Approx(1.0f).margin(0.01f)); + REQUIRE(c.f[2] == Catch::Approx(2.0f).margin(0.01f)); + REQUIRE(c.f[3] == Catch::Approx(3.0f).margin(0.01f)); +} + +TEST_CASE("simd128_f32_exp2", "[simd]") +{ + const simd128_t a = simd128_ld(0.0f, 1.0f, 2.0f, 3.0f); + const simd128_t r = simd_f32_exp2(a); + simd128_cast c; c.simd = r; + REQUIRE(c.f[0] == Catch::Approx(1.0f).margin(0.01f)); + REQUIRE(c.f[1] == Catch::Approx(2.0f).margin(0.01f)); + REQUIRE(c.f[2] == Catch::Approx(4.0f).margin(0.01f)); + REQUIRE(c.f[3] == Catch::Approx(8.0f).margin(0.01f)); +} + +TEST_CASE("simd128_f32_pow", "[simd]") +{ + const simd128_t a = simd128_splat(2.0f); + const simd128_t b = simd128_ld(1.0f, 2.0f, 3.0f, 4.0f); + const simd128_t r = simd_f32_pow(a, b); + simd128_cast c; c.simd = r; + 
REQUIRE(c.f[0] == Catch::Approx(2.0f).margin(0.01f)); + REQUIRE(c.f[1] == Catch::Approx(4.0f).margin(0.01f)); + REQUIRE(c.f[2] == Catch::Approx(8.0f).margin(0.01f)); + REQUIRE(c.f[3] == Catch::Approx(16.0f).margin(0.01f)); +} + +static simd32_t make_f32(float _v) +{ + simd32_t r{ .u32 = bitCast<uint32_t>(_v) }; return r; // stores the float's bit pattern, no numeric conversion +} + +static simd32_t make_u32(uint32_t _v) +{ + simd32_t r{ .u32 = _v }; return r; +} + +static simd32_t make_i32(int32_t _v) +{ + simd32_t r{ .u32 = bitCast<uint32_t>(_v) }; return r; // stores the signed value's bit pattern unchanged +} + +TEST_CASE("simd32_f32_arithmetic", "[simd]") +{ + const simd32_t a = make_f32(10.0f); + const simd32_t b = make_f32(3.0f); + + REQUIRE(bitCast<float>(simd32_f32_add(a, b).u32) == Catch::Approx(13.0f)); + REQUIRE(bitCast<float>(simd32_f32_sub(a, b).u32) == Catch::Approx(7.0f)); + REQUIRE(bitCast<float>(simd32_f32_mul(a, b).u32) == Catch::Approx(30.0f)); + REQUIRE(bitCast<float>(simd32_f32_div(a, b).u32) == Catch::Approx(10.0f / 3.0f)); + + const simd32_t c = make_f32(1.0f); + REQUIRE(bitCast<float>(simd32_f32_madd(a, b, c).u32) == Catch::Approx(31.0f)); + REQUIRE(bitCast<float>(simd32_f32_nmsub(a, b, c).u32) == Catch::Approx(-29.0f)); + + REQUIRE(bitCast<float>(simd32_f32_neg(a).u32) == Catch::Approx(-10.0f)); + REQUIRE(bitCast<float>(simd32_f32_neg(make_f32(-5.0f)).u32) == Catch::Approx(5.0f)); + REQUIRE(bitCast<float>(simd32_f32_abs(make_f32(-7.0f)).u32) == Catch::Approx(7.0f)); + REQUIRE(bitCast<float>(simd32_f32_abs(make_f32(7.0f)).u32) == Catch::Approx(7.0f)); +} + +TEST_CASE("simd32_f32_min_max_clamp_lerp", "[simd]") +{ + REQUIRE(bitCast<float>(simd32_f32_min(make_f32(3.0f), make_f32(5.0f)).u32) == Catch::Approx(3.0f)); + REQUIRE(bitCast<float>(simd32_f32_max(make_f32(3.0f), make_f32(5.0f)).u32) == Catch::Approx(5.0f)); + REQUIRE(bitCast<float>(simd32_f32_clamp(make_f32(-1.0f), make_f32(0.0f), make_f32(1.0f)).u32) == Catch::Approx(0.0f)); + REQUIRE(bitCast<float>(simd32_f32_clamp(make_f32(0.5f), make_f32(0.0f), make_f32(1.0f)).u32) == Catch::Approx(0.5f)); + REQUIRE(bitCast<float>(simd32_f32_clamp(make_f32(2.0f), make_f32(0.0f), make_f32(1.0f)).u32) == Catch::Approx(1.0f)); + 
REQUIRE(bitCast<float>(simd32_f32_lerp(make_f32(0.0f), make_f32(10.0f), make_f32(0.25f)).u32) == Catch::Approx(2.5f)); +} + +TEST_CASE("simd32_f32_rcp_sqrt_rsqrt", "[simd]") +{ + REQUIRE(bitCast<float>(simd32_f32_rcp(make_f32(4.0f)).u32) == Catch::Approx(0.25f)); + REQUIRE(bitCast<float>(simd32_f32_sqrt(make_f32(25.0f)).u32) == Catch::Approx(5.0f)); + REQUIRE(bitCast<float>(simd32_f32_rsqrt(make_f32(4.0f)).u32) == Catch::Approx(0.5f)); +} + +TEST_CASE("simd32_f32_rounding", "[simd]") +{ + REQUIRE(bitCast<float>(simd32_f32_round(make_f32(1.4f)).u32) == Catch::Approx(1.0f)); + REQUIRE(bitCast<float>(simd32_f32_round(make_f32(1.6f)).u32) == Catch::Approx(2.0f)); + REQUIRE(bitCast<float>(simd32_f32_ceil(make_f32(1.1f)).u32) == Catch::Approx(2.0f)); + REQUIRE(bitCast<float>(simd32_f32_ceil(make_f32(-1.1f)).u32) == Catch::Approx(-1.0f)); + REQUIRE(bitCast<float>(simd32_f32_floor(make_f32(1.9f)).u32) == Catch::Approx(1.0f)); + REQUIRE(bitCast<float>(simd32_f32_floor(make_f32(-1.1f)).u32) == Catch::Approx(-2.0f)); +} + +TEST_CASE("simd32_f32_comparison", "[simd]") +{ + const simd32_t a = make_f32(5.0f); + const simd32_t b = make_f32(3.0f); + const simd32_t c = make_f32(5.0f); + + REQUIRE(simd32_f32_cmpeq(a, c).u32 == 0xffffffff); + REQUIRE(simd32_f32_cmpeq(a, b).u32 == 0); + REQUIRE(simd32_f32_cmpneq(a, b).u32 == 0xffffffff); + REQUIRE(simd32_f32_cmplt(b, a).u32 == 0xffffffff); + REQUIRE(simd32_f32_cmplt(a, b).u32 == 0); + REQUIRE(simd32_f32_cmple(b, a).u32 == 0xffffffff); + REQUIRE(simd32_f32_cmple(a, c).u32 == 0xffffffff); + REQUIRE(simd32_f32_cmpgt(a, b).u32 == 0xffffffff); + REQUIRE(simd32_f32_cmpge(a, c).u32 == 0xffffffff); +} + +TEST_CASE("simd32_u32_arithmetic", "[simd]") +{ + const simd32_t a = make_u32(10); + const simd32_t b = make_u32(3); + + REQUIRE(simd32_u32_add(a, b).u32 == 13); + REQUIRE(simd32_u32_sub(a, b).u32 == 7); + REQUIRE(simd32_u32_mul(a, b).u32 == 30); + REQUIRE(simd32_u32_div(a, b).u32 == 3); + REQUIRE(simd32_u32_mod(a, b).u32 == 1); + REQUIRE(simd32_u32_min(a, b).u32 == 3); + REQUIRE(simd32_u32_max(a, b).u32 == 
10); + REQUIRE(simd32_u32_clamp(make_u32(0), make_u32(2), make_u32(8)).u32 == 2); + REQUIRE(simd32_u32_clamp(make_u32(5), make_u32(2), make_u32(8)).u32 == 5); + REQUIRE(simd32_u32_clamp(make_u32(20), make_u32(2), make_u32(8)).u32 == 8); +} + +TEST_CASE("simd32_i32_arithmetic", "[simd]") +{ + const simd32_t a = make_i32(10); + const simd32_t b = make_i32(-3); + + REQUIRE(bitCast<int32_t>(simd32_i32_add(a, b).u32) == 7); + REQUIRE(bitCast<int32_t>(simd32_i32_sub(a, b).u32) == 13); + REQUIRE(bitCast<int32_t>(simd32_i32_min(a, b).u32) == -3); + REQUIRE(bitCast<int32_t>(simd32_i32_max(a, b).u32) == 10); + REQUIRE(bitCast<int32_t>(simd32_i32_clamp(make_i32(-5), make_i32(0), make_i32(10)).u32) == 0); + REQUIRE(bitCast<int32_t>(simd32_i32_clamp(make_i32(5), make_i32(0), make_i32(10)).u32) == 5); + REQUIRE(bitCast<int32_t>(simd32_i32_clamp(make_i32(15), make_i32(0), make_i32(10)).u32) == 10); +} + +TEST_CASE("simd32_u32_comparison", "[simd]") +{ + REQUIRE(simd32_u32_cmpeq(make_u32(5), make_u32(5)).u32 == 0xffffffff); + REQUIRE(simd32_u32_cmpeq(make_u32(5), make_u32(3)).u32 == 0); + REQUIRE(simd32_u32_cmpneq(make_u32(5), make_u32(3)).u32 == 0xffffffff); + REQUIRE(simd32_u32_cmplt(make_u32(3), make_u32(5)).u32 == 0xffffffff); + REQUIRE(simd32_u32_cmplt(make_u32(5), make_u32(3)).u32 == 0); + REQUIRE(simd32_u32_cmple(make_u32(5), make_u32(5)).u32 == 0xffffffff); + REQUIRE(simd32_u32_cmpgt(make_u32(5), make_u32(3)).u32 == 0xffffffff); + REQUIRE(simd32_u32_cmpge(make_u32(5), make_u32(5)).u32 == 0xffffffff); +} + +TEST_CASE("simd32_i32_comparison", "[simd]") +{ + REQUIRE(simd32_i32_cmpeq(make_i32(5), make_i32(5)).u32 == 0xffffffff); + REQUIRE(simd32_i32_cmpeq(make_i32(5), make_i32(-3)).u32 == 0); + REQUIRE(simd32_i32_cmplt(make_i32(-3), make_i32(5)).u32 == 0xffffffff); + REQUIRE(simd32_i32_cmpgt(make_i32(5), make_i32(-3)).u32 == 0xffffffff); +} + +TEST_CASE("simd32_bitwise", "[simd]") +{ + REQUIRE(simd32_and(make_u32(0xff00), make_u32(0x0ff0)).u32 == 0x0f00); + REQUIRE(simd32_andc(make_u32(0xffff), make_u32(0x0f0f)).u32 == 0xf0f0); + 
REQUIRE(simd32_or(make_u32(0xf000), make_u32(0x000f)).u32 == 0xf00f); + REQUIRE(simd32_orc(make_u32(0xf0f0), make_u32(0x0f0f)).u32 == 0xfffff0f0); + REQUIRE(simd32_xor(make_u32(0xffff), make_u32(0x0f0f)).u32 == 0xf0f0); + REQUIRE(simd32_not(make_u32(0)).u32 == 0xffffffff); +} + +TEST_CASE("simd32_shifts", "[simd]") +{ + REQUIRE(simd32_x32_sll(make_u32(1), 4).u32 == 16); + REQUIRE(simd32_x32_srl(make_u32(32), 3).u32 == 4); + REQUIRE(bitCast<int32_t>(simd32_x32_sra(make_i32(-8), 2).u32) == -2); +} + +TEST_CASE("simd32_selection", "[simd]") +{ + const simd32_t a = make_f32(1.0f); + const simd32_t b = make_f32(2.0f); + REQUIRE(bitCast<float>(simd32_selb(make_u32(0xffffffff), a, b).u32) == Catch::Approx(1.0f)); + REQUIRE(bitCast<float>(simd32_selb(make_u32(0), a, b).u32) == Catch::Approx(2.0f)); + REQUIRE(bitCast<float>(simd32_sels(make_i32(-1), a, b).u32) == Catch::Approx(1.0f)); + REQUIRE(bitCast<float>(simd32_sels(make_i32(0), a, b).u32) == Catch::Approx(2.0f)); +} + +TEST_CASE("simd256_f32_add", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + const simd256_t r = simd256_f32_add(a, b); + BX_ALIGN_DECL(32, float out[8]); + simd256_st(out, r); + REQUIRE(out[0] == Catch::Approx(11.0f)); + REQUIRE(out[1] == Catch::Approx(22.0f)); + REQUIRE(out[2] == Catch::Approx(33.0f)); + REQUIRE(out[3] == Catch::Approx(44.0f)); + REQUIRE(out[4] == Catch::Approx(55.0f)); + REQUIRE(out[5] == Catch::Approx(66.0f)); + REQUIRE(out[6] == Catch::Approx(77.0f)); + REQUIRE(out[7] == Catch::Approx(88.0f)); +} + +TEST_CASE("simd256_f32_mul", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + const 
simd256_t r = simd256_f32_mul(a, b); + BX_ALIGN_DECL(32, float out[8]); + simd256_st(out, r); + REQUIRE(out[0] == Catch::Approx(6.0f)); + REQUIRE(out[1] == Catch::Approx(12.0f)); + REQUIRE(out[2] == Catch::Approx(20.0f)); + REQUIRE(out[3] == Catch::Approx(30.0f)); + REQUIRE(out[4] == Catch::Approx(42.0f)); + REQUIRE(out[5] == Catch::Approx(56.0f)); + REQUIRE(out[6] == Catch::Approx(72.0f)); + REQUIRE(out[7] == Catch::Approx(90.0f)); +} + +TEST_CASE("simd256_f32_min_max", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 1.0f, 5.0f, 3.0f, 8.0f, 2.0f, 7.0f, 4.0f, 9.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 4.0f, 2.0f, 7.0f, 6.0f, 8.0f, 1.0f, 5.0f, 3.0f }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, float mn[8]); + simd256_st(mn, simd256_f32_min(a, b)); + REQUIRE(mn[0] == Catch::Approx(1.0f)); + REQUIRE(mn[1] == Catch::Approx(2.0f)); + REQUIRE(mn[2] == Catch::Approx(3.0f)); + REQUIRE(mn[3] == Catch::Approx(6.0f)); + REQUIRE(mn[4] == Catch::Approx(2.0f)); + REQUIRE(mn[5] == Catch::Approx(1.0f)); + REQUIRE(mn[6] == Catch::Approx(4.0f)); + REQUIRE(mn[7] == Catch::Approx(3.0f)); + + BX_ALIGN_DECL(32, float mx[8]); + simd256_st(mx, simd256_f32_max(a, b)); + REQUIRE(mx[0] == Catch::Approx(4.0f)); + REQUIRE(mx[1] == Catch::Approx(5.0f)); + REQUIRE(mx[2] == Catch::Approx(7.0f)); + REQUIRE(mx[3] == Catch::Approx(8.0f)); + REQUIRE(mx[4] == Catch::Approx(8.0f)); + REQUIRE(mx[5] == Catch::Approx(7.0f)); + REQUIRE(mx[6] == Catch::Approx(5.0f)); + REQUIRE(mx[7] == Catch::Approx(9.0f)); +} + +TEST_CASE("simd256_bitwise", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00, 0, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t bd[8]) = { 0xffff0000, 0xffff0000, 0xffff0000, 0xffff0000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + const simd256_t r = simd256_and(a, b); + BX_ALIGN_DECL(32, uint32_t out[8]); + 
simd256_st(out, r); + REQUIRE(out[0] == 0xff000000); + REQUIRE(out[1] == 0xff000000); + REQUIRE(out[4] == 0); +} + +TEST_CASE("simd256_i32_add", "[simd]") +{ + BX_ALIGN_DECL(32, int32_t ad[8]) = { 1, 2, 3, 4, 5, 6, 7, 8 }; + BX_ALIGN_DECL(32, int32_t bd[8]) = { 10, 20, 30, 40, 50, 60, 70, 80 }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + const simd256_t r = simd256_i32_add(a, b); + BX_ALIGN_DECL(32, int32_t out[8]); + simd256_st(out, r); + REQUIRE(out[0] == 11); + REQUIRE(out[1] == 22); + REQUIRE(out[2] == 33); + REQUIRE(out[3] == 44); + REQUIRE(out[4] == 55); + REQUIRE(out[5] == 66); + REQUIRE(out[6] == 77); + REQUIRE(out[7] == 88); +} + +// --- simd32 missing coverage --- + +TEST_CASE("simd32_ld_st_zero_splat", "[simd]") +{ + const simd32_t z = simd32_zero(); + REQUIRE(z.u32 == 0); + + const simd32_t sf = simd32_splat(42.0f); + REQUIRE(bitCast<float>(sf.u32) == Catch::Approx(42.0f)); + + const simd32_t su = simd32_splat(0xdeadbeefu); + REQUIRE(su.u32 == 0xdeadbeef); + + const simd32_t si = simd32_splat(int32_t(-7)); + REQUIRE(bitCast<int32_t>(si.u32) == -7); + + const simd32_t su16 = simd32_splat(uint16_t(0xabcd)); + REQUIRE(su16.u32 == 0xabcdabcd); + + const simd32_t su8 = simd32_splat(uint8_t(0x42)); + REQUIRE(su8.u32 == 0x42424242); + + float fval = 3.14f; + const simd32_t lf = simd32_ld(&fval); + REQUIRE(bitCast<float>(lf.u32) == Catch::Approx(3.14f)); + + float fout; + simd32_st(&fout, lf); + REQUIRE(fout == Catch::Approx(3.14f)); + + float fout1; + simd32_x32_st1(&fout1, lf); + REQUIRE(fout1 == Catch::Approx(3.14f)); +} + +TEST_CASE("simd32_f32_msub", "[simd]") +{ + const simd32_t a = make_f32(10.0f); + const simd32_t b = make_f32(3.0f); + const simd32_t c = make_f32(1.0f); + // msub: a*b - c = 30 - 1 = 29 + REQUIRE(bitCast<float>(simd32_f32_msub(a, b, c).u32) == Catch::Approx(29.0f)); +} + +TEST_CASE("simd32_i32_neg_abs", "[simd]") +{ + REQUIRE(bitCast<int32_t>(simd32_i32_neg(make_i32(5)).u32) == -5); + REQUIRE(bitCast<int32_t>(simd32_i32_neg(make_i32(-3)).u32) == 3); + 
REQUIRE(bitCast<int32_t>(simd32_i32_abs(make_i32(-7)).u32) == 7); + REQUIRE(bitCast<int32_t>(simd32_i32_abs(make_i32(7)).u32) == 7); +} + +TEST_CASE("simd32_i16_add_sub", "[simd]") +{ + // 0x00030005: hi16=3, lo16=5; 0x00010002: hi16=1, lo16=2 + const simd32_t a = make_u32(0x00030005); + const simd32_t b = make_u32(0x00010002); + // add: hi16=4, lo16=7 -> 0x00040007 + REQUIRE(simd32_i16_add(a, b).u32 == 0x00040007); + // sub: hi16=2, lo16=3 -> 0x00020003 + REQUIRE(simd32_i16_sub(a, b).u32 == 0x00020003); +} + +TEST_CASE("simd32_i8_add_sub", "[simd]") +{ + const simd32_t a = make_u32(0x01020304); + const simd32_t b = make_u32(0x10101010); + REQUIRE(simd32_i8_add(a, b).u32 == 0x11121314); + REQUIRE(simd32_i8_sub(a, b).u32 == 0xf1f2f3f4); +} + +TEST_CASE("simd32_u8_sat", "[simd]") +{ + // satadd: 0xf0 + 0x20 = saturates to 0xff per byte + REQUIRE(simd32_u8_satadd(make_u32(0xf0f0f0f0), make_u32(0x20202020)).u32 == 0xffffffff); + // satadd: no saturation + REQUIRE(simd32_u8_satadd(make_u32(0x01010101), make_u32(0x02020202)).u32 == 0x03030303); + // satsub: 0x10 - 0x20 = saturates to 0x00 + REQUIRE(simd32_u8_satsub(make_u32(0x10101010), make_u32(0x20202020)).u32 == 0x00000000); + // satsub: no saturation + REQUIRE(simd32_u8_satsub(make_u32(0x30303030), make_u32(0x10101010)).u32 == 0x20202020); +} + +TEST_CASE("simd32_u16_sat", "[simd]") +{ + // satadd: 0xfff0 + 0x0020 per 16-bit lane saturates to 0xffff + REQUIRE(simd32_u16_satadd(make_u32(0xfff0fff0), make_u32(0x00200020)).u32 == 0xffffffff); + // satsub: 0x0010 - 0x0020 saturates to 0 + REQUIRE(simd32_u16_satsub(make_u32(0x00100010), make_u32(0x00200020)).u32 == 0x00000000); +} + +TEST_CASE("simd32_u32_satadd_satsub_satmul", "[simd]") +{ + REQUIRE(simd32_u32_satadd(make_u32(0xfffffff0), make_u32(0x20)).u32 == 0xffffffff); + REQUIRE(simd32_u32_satadd(make_u32(10), make_u32(20)).u32 == 30); + REQUIRE(simd32_u32_satsub(make_u32(5), make_u32(10)).u32 == 0); + REQUIRE(simd32_u32_satsub(make_u32(10), make_u32(5)).u32 == 5); + 
REQUIRE(simd32_u32_satmul(make_u32(0x80000000), make_u32(3)).u32 == 0xffffffff); + REQUIRE(simd32_u32_satmul(make_u32(10), make_u32(3)).u32 == 30); +} + +TEST_CASE("simd32_u32_misc", "[simd]") +{ + REQUIRE(simd32_u32_setnz(make_u32(0)).u32 == 0); + REQUIRE(simd32_u32_setnz(make_u32(42)).u32 == 0xffffffff); + REQUIRE(simd32_u32_xorl(make_u32(0xff00), make_u32(0x0ff0)).u32 == 0); + REQUIRE(simd32_u32_xorl(make_u32(0xff00), make_u32(0xff00)).u32 == 0); + REQUIRE(simd32_u32_xorl(make_u32(0xff00), make_u32(0)).u32 == 1); + REQUIRE(simd32_u32_xorl(make_u32(0), make_u32(0)).u32 == 0); + REQUIRE(simd32_u32_nextpow2(make_u32(5)).u32 == 8); + REQUIRE(simd32_u32_nextpow2(make_u32(8)).u32 == 8); + REQUIRE(simd32_u32_testpow2(make_u32(8)).u32 == 0xffffffff); + REQUIRE(simd32_u32_testpow2(make_u32(7)).u32 == 0); + REQUIRE(simd32_u32_incwrap(make_u32(3), make_u32(0), make_u32(5)).u32 == 4); + REQUIRE(simd32_u32_incwrap(make_u32(5), make_u32(0), make_u32(5)).u32 == 0); + REQUIRE(simd32_u32_decwrap(make_u32(1), make_u32(0), make_u32(5)).u32 == 0); + REQUIRE(simd32_u32_decwrap(make_u32(0), make_u32(0), make_u32(5)).u32 == 5); + REQUIRE(simd32_u32_gcd(make_u32(12), make_u32(8)).u32 == 4); + REQUIRE(simd32_u32_lcm(make_u32(4), make_u32(6)).u32 == 12); +} + +TEST_CASE("simd32_bit_ops", "[simd]") +{ + REQUIRE(simd32_x32_cntbits(make_u32(0xff)).u32 == 8); + REQUIRE(simd32_x32_cntbits(make_u32(0)).u32 == 0); + REQUIRE(simd32_x32_cntlz(make_u32(1)).u32 == 31); + REQUIRE(simd32_x32_cntlz(make_u32(0x80000000)).u32 == 0); + REQUIRE(simd32_x32_cnttz(make_u32(0x80000000)).u32 == 31); + REQUIRE(simd32_x32_cnttz(make_u32(1)).u32 == 0); + REQUIRE(simd32_x32_ffs(make_u32(0x80)).u32 == 8); + REQUIRE(simd32_x32_rol(make_u32(1), 1).u32 == 2); + REQUIRE(simd32_x32_ror(make_u32(2), 1).u32 == 1); +} + +TEST_CASE("simd32_signbits_test", "[simd]") +{ + REQUIRE(simd32_x32_signbitsmask(make_u32(0x80000000)) == 1); + REQUIRE(simd32_x32_signbitsmask(make_u32(0x7fffffff)) == 0); + 
REQUIRE(simd32_x8_signbitsmask(make_u32(0x80808080)) == 0xf); + REQUIRE(simd32_x8_signbitsmask(make_u32(0x00000000)) == 0); + REQUIRE(simd32_test(make_u32(0xffffffff)) == true); + REQUIRE(simd32_test(make_u32(0)) == false); +} + +TEST_CASE("simd32_part1by", "[simd]") +{ + REQUIRE(simd32_x32_part1by1(make_u32(0x3)).u32 == 0x5); + REQUIRE(simd32_x32_part1by2(make_u32(0x3)).u32 == 0x9); +} + +// --- simd128 missing coverage --- + +TEST_CASE("simd128_f32_msub", "[simd]") +{ + const simd128_t a = simd128_ld(10.0f, 20.0f, 30.0f, 40.0f); + const simd128_t b = simd128_ld(2.0f, 3.0f, 4.0f, 5.0f); + const simd128_t c = simd128_ld(1.0f, 1.0f, 1.0f, 1.0f); + // msub: a*b - c + check_f32("f32_msub", simd128_f32_msub(a, b, c), 19.0f, 59.0f, 119.0f, 199.0f); +} + +TEST_CASE("simd128_f32_cos_sin", "[simd]") +{ + const simd128_t a = simd128_ld(0.0f, kPiHalf, kPi, kPi * 1.5f); + simd128_cast rc; rc.simd = simd_f32_cos(a); + REQUIRE(rc.f[0] == Catch::Approx( 1.0f).margin(0.001f)); + REQUIRE(rc.f[1] == Catch::Approx( 0.0f).margin(0.001f)); + REQUIRE(rc.f[2] == Catch::Approx(-1.0f).margin(0.001f)); + REQUIRE(rc.f[3] == Catch::Approx( 0.0f).margin(0.001f)); + + simd128_cast rs; rs.simd = simd_f32_sin(a); + REQUIRE(rs.f[0] == Catch::Approx( 0.0f).margin(0.001f)); + REQUIRE(rs.f[1] == Catch::Approx( 1.0f).margin(0.001f)); + REQUIRE(rs.f[2] == Catch::Approx( 0.0f).margin(0.001f)); + REQUIRE(rs.f[3] == Catch::Approx(-1.0f).margin(0.001f)); +} + +TEST_CASE("simd128_f32_ftoi_trunc", "[simd]") +{ + check_i32("ftoi_trunc", simd128_f32_ftoi_trunc(simd128_ld(1.0f, -2.0f, 3.5f, -4.9f)), 1, -2, 3, -4); +} + +TEST_CASE("simd128_f32_rcp_rsqrt_variants", "[simd]") +{ + const simd128_t a = simd128_ld(4.0f, 16.0f, 25.0f, 100.0f); + simd128_cast cr; cr.simd = simd128_f32_rcp_est(a); + REQUIRE(cr.f[0] == Catch::Approx(0.25f).margin(0.01f)); + + simd128_cast crsqrt; crsqrt.simd = simd128_f32_rsqrt_est(a); + REQUIRE(crsqrt.f[0] == Catch::Approx(0.5f).margin(0.01f)); + + simd128_cast crsqrt2; crsqrt2.simd = 
simd128_f32_rsqrt(a); + REQUIRE(crsqrt2.f[0] == Catch::Approx(0.5f).margin(0.01f)); + + simd128_cast crsqrtnr; crsqrtnr.simd = simd128_f32_rsqrt_nr(a); + REQUIRE(crsqrtnr.f[0] == Catch::Approx(0.5f).margin(0.001f)); + + simd128_cast cdivnr; cdivnr.simd = simd128_f32_div_nr(simd128_splat(1.0f), a); + REQUIRE(cdivnr.f[0] == Catch::Approx(0.25f).margin(0.01f)); +} + +TEST_CASE("simd128_i32_neg_abs_clamp", "[simd]") +{ + check_i32("i32_neg", simd128_i32_neg(simd128_ld(1, -2, 0, 5)), -1, 2, 0, -5); + check_i32("i32_abs", simd128_i32_abs(simd128_ld(-7, 3, -1, 0)), 7, 3, 1, 0); + check_i32("i32_clamp", simd128_i32_clamp( + simd128_ld(-5, 5, 15, 0), + simd128_ld(0, 0, 0, 0), + simd128_ld(10, 10, 10, 10) + ), 0, 5, 10, 0); +} + +TEST_CASE("simd128_i16_sub_mullo", "[simd]") +{ + // a = 0x00030005 00070009 000b000d 000f0011 -> i16: 3,5,7,9,11,13,15,17 + // b = 0x00010002 00010002 00010002 00010002 -> i16: 1,2,1,2,1,2,1,2 + const simd128_t a = simd128_ld(0x00030005u, 0x00070009u, 0x000b000du, 0x000f0011u); + const simd128_t b = simd128_ld(0x00010002u, 0x00010002u, 0x00010002u, 0x00010002u); + + // sub: 3-1=2, 5-2=3, 7-1=6, 9-2=7, 11-1=10, 13-2=11, 15-1=14, 17-2=15 + check_u32("i16_sub", simd128_i16_sub(a, b), 0x00020003, 0x00060007, 0x000a000b, 0x000e000f); + + // mullo: 3*1=3, 5*2=10, 7*1=7, 9*2=18, 11*1=11, 13*2=26, 15*1=15, 17*2=34 + check_u32("i16_mullo", simd128_i16_mullo(a, b), 0x0003000a, 0x00070012, 0x000b001a, 0x000f0022); +} + +TEST_CASE("simd128_i8_sub", "[simd]") +{ + const simd128_t a = simd128_ld(0x11121314u, 0u, 0u, 0u); + const simd128_t b = simd128_ld(0x10101010u, 0u, 0u, 0u); + const simd128_t r = simd128_i8_sub(a, b); + simd128_cast c; c.simd = r; + REQUIRE(c.u8[0] == 0x04); + REQUIRE(c.u8[1] == 0x03); + REQUIRE(c.u8[2] == 0x02); + REQUIRE(c.u8[3] == 0x01); +} + +TEST_CASE("simd128_u16_sat", "[simd]") +{ + // satadd: 0xfff0+0x0020 per lane saturates to 0xffff + const simd128_t a = simd128_ld(0xfff0fff0u, 0u, 0u, 0u); + const simd128_t b = 
simd128_ld(0x00200020u, 0u, 0u, 0u); + simd128_cast cr; cr.simd = simd128_u16_satadd(a, b); + REQUIRE(cr.u16[0] == 0xffff); + REQUIRE(cr.u16[1] == 0xffff); + + // satsub: 0x0010 - 0x0020 saturates to 0 + const simd128_t c = simd128_ld(0x00100010u, 0u, 0u, 0u); + const simd128_t d = simd128_ld(0x00200020u, 0u, 0u, 0u); + simd128_cast cs; cs.simd = simd128_u16_satsub(c, d); + REQUIRE(cs.u16[0] == 0x0000); + REQUIRE(cs.u16[1] == 0x0000); +} + +TEST_CASE("simd128_u32_ops", "[simd]") +{ + check_u32("u32_add", simd128_u32_add(simd128_ld(10u, 20u, 30u, 40u), simd128_ld(1u, 2u, 3u, 4u)), 11, 22, 33, 44); + check_u32("u32_sub", simd128_u32_sub(simd128_ld(10u, 20u, 30u, 40u), simd128_ld(1u, 2u, 3u, 4u)), 9, 18, 27, 36); + check_u32("u32_mul", simd128_u32_mul(simd128_ld(2u, 3u, 4u, 5u), simd128_ld(3u, 4u, 5u, 6u)), 6, 12, 20, 30); + check_u32("u32_min", simd128_u32_min(simd128_ld(1u, 5u, 3u, 8u), simd128_ld(4u, 2u, 7u, 6u)), 1, 2, 3, 6); + check_u32("u32_max", simd128_u32_max(simd128_ld(1u, 5u, 3u, 8u), simd128_ld(4u, 2u, 7u, 6u)), 4, 5, 7, 8); + check_u32("u32_clamp", simd128_u32_clamp(simd128_ld(0u, 5u, 15u, 10u), simd128_ld(2u, 2u, 2u, 2u), simd128_ld(10u, 10u, 10u, 10u)), 2, 5, 10, 10); +} + +TEST_CASE("simd128_u32_cmp", "[simd]") +{ + check_u32("u32_cmpeq", simd128_u32_cmpeq(simd128_ld(1u, 2u, 3u, 4u), simd128_ld(1u, 0u, 3u, 0u)), 0xffffffff, 0, 0xffffffff, 0); + check_u32("u32_cmplt", simd128_u32_cmplt(simd128_ld(1u, 5u, 3u, 4u), simd128_ld(2u, 2u, 3u, 5u)), 0xffffffff, 0, 0, 0xffffffff); + check_u32("u32_cmpgt", simd128_u32_cmpgt(simd128_ld(5u, 1u, 3u, 4u), simd128_ld(2u, 2u, 3u, 5u)), 0xffffffff, 0, 0, 0); +} + +TEST_CASE("simd128_orc_orx_sels", "[simd]") +{ + check_u32("orc", simd128_orc(simd128_ld(0xf0f0f0f0u, 0u, 0u, 0u), simd128_ld(0x0f0f0f0fu, 0u, 0u, 0u)), 0xf0f0f0f0, 0xffffffff, 0xffffffff, 0xffffffff); + + // orx: OR all 4 lanes together into lane 0 (implementation-specific, just test non-zero) + const simd128_t ox = simd128_orx(simd128_ld(0x01u, 0x02u, 0x04u, 
0x08u)); + simd128_cast co; co.simd = ox; + REQUIRE((co.u[0] & 0x0f) == 0x0f); + + // sels: select based on sign bit + const simd128_t neg = simd128_ld(0x80000000u, 0u, 0x80000000u, 0u); + const simd128_t sa = simd128_ld(1u, 1u, 1u, 1u); + const simd128_t sb = simd128_ld(2u, 2u, 2u, 2u); + check_u32("sels", simd128_sels(neg, sa, sb), 1, 2, 1, 2); +} + +TEST_CASE("simd128_signbits", "[simd]") +{ + REQUIRE(simd128_x32_signbitsmask(simd128_ld(0x80000000u, 0u, 0x80000000u, 0u)) == 0x5); + REQUIRE(simd128_x8_signbitsmask(simd128_zero()) == 0); + REQUIRE(simd128_x8_signbitsmask(simd128_ld(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u)) == 0xffff); +} + +// --- simd256 missing coverage --- + +TEST_CASE("simd256_zero_splat", "[simd]") +{ + BX_ALIGN_DECL(32, float out[8]); + simd256_st(out, simd256_zero()); + for (int ii = 0; ii < 8; ++ii) REQUIRE(out[ii] == 0.0f); + + simd256_st(out, simd256_splat(7.0f)); + for (int ii = 0; ii < 8; ++ii) REQUIRE(out[ii] == 7.0f); + + BX_ALIGN_DECL(32, uint32_t uout[8]); + simd256_st(uout, simd256_splat(0xdeadbeefu)); + for (int ii = 0; ii < 8; ++ii) REQUIRE(uout[ii] == 0xdeadbeef); +} + +TEST_CASE("simd256_f32_sub_div", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, float osub[8]); + simd256_st(osub, simd256_f32_sub(a, b)); + REQUIRE(osub[0] == Catch::Approx(9.0f)); + REQUIRE(osub[7] == Catch::Approx(72.0f)); + + BX_ALIGN_DECL(32, float odiv[8]); + simd256_st(odiv, simd256_f32_div(a, b)); + REQUIRE(odiv[0] == Catch::Approx(10.0f)); + REQUIRE(odiv[7] == Catch::Approx(10.0f)); +} + +TEST_CASE("simd256_f32_madd_msub_nmsub", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 
3.0f, 3.0f }; + BX_ALIGN_DECL(32, float cd[8]) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + const simd256_t c = simd256_ld(cd); + + BX_ALIGN_DECL(32, float out[8]); + // madd: a*b + c = 7 + simd256_st(out, simd256_f32_madd(a, b, c)); + REQUIRE(out[0] == Catch::Approx(7.0f)); + REQUIRE(out[7] == Catch::Approx(7.0f)); + + // msub: a*b - c = 5 + simd256_st(out, simd256_f32_msub(a, b, c)); + REQUIRE(out[0] == Catch::Approx(5.0f)); + + // nmsub: c - a*b = -5 + simd256_st(out, simd256_f32_nmsub(a, b, c)); + REQUIRE(out[0] == Catch::Approx(-5.0f)); +} + +TEST_CASE("simd256_f32_neg_abs", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { -1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f, -7.0f, 8.0f }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, float on[8]); + simd256_st(on, simd256_f32_neg(a)); + REQUIRE(on[0] == Catch::Approx(1.0f)); + REQUIRE(on[1] == Catch::Approx(-2.0f)); + + BX_ALIGN_DECL(32, float oa[8]); + simd256_st(oa, simd256_f32_abs(a)); + REQUIRE(oa[0] == Catch::Approx(1.0f)); + REQUIRE(oa[2] == Catch::Approx(3.0f)); +} + +TEST_CASE("simd256_f32_clamp_lerp", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { -1.0f, 0.5f, 2.0f, 0.5f, -1.0f, 0.5f, 2.0f, 0.5f }; + BX_ALIGN_DECL(32, float mn[8]) = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + BX_ALIGN_DECL(32, float mx[8]) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, float oc[8]); + simd256_st(oc, simd256_f32_clamp(a, simd256_ld(mn), simd256_ld(mx))); + REQUIRE(oc[0] == Catch::Approx(0.0f)); + REQUIRE(oc[1] == Catch::Approx(0.5f)); + REQUIRE(oc[2] == Catch::Approx(1.0f)); + + BX_ALIGN_DECL(32, float aa[8]) = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f }; + BX_ALIGN_DECL(32, float bb[8]) = { 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f, 10.0f }; + BX_ALIGN_DECL(32, float ss[8]) = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + 
BX_ALIGN_DECL(32, float ol[8]); + simd256_st(ol, simd256_f32_lerp(simd256_ld(aa), simd256_ld(bb), simd256_ld(ss))); + REQUIRE(ol[0] == Catch::Approx(5.0f)); + REQUIRE(ol[7] == Catch::Approx(5.0f)); +} + +TEST_CASE("simd256_f32_rcp_sqrt_rsqrt", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 4.0f, 16.0f, 25.0f, 100.0f, 4.0f, 16.0f, 25.0f, 100.0f }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, float orcp[8]); + simd256_st(orcp, simd256_f32_rcp(a)); + REQUIRE(orcp[0] == Catch::Approx(0.25f).margin(0.01f)); + + BX_ALIGN_DECL(32, float osqrt[8]); + simd256_st(osqrt, simd256_f32_sqrt(a)); + REQUIRE(osqrt[0] == Catch::Approx(2.0f).margin(0.001f)); + REQUIRE(osqrt[1] == Catch::Approx(4.0f).margin(0.001f)); + + BX_ALIGN_DECL(32, float orsqrt[8]); + simd256_st(orsqrt, simd256_f32_rsqrt(a)); + REQUIRE(orsqrt[0] == Catch::Approx(0.5f).margin(0.01f)); +} + +TEST_CASE("simd256_f32_rounding", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 1.3f, 1.5f, 1.7f, -1.3f, -1.5f, -1.7f, 2.0f, 0.0f }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, float oc[8]); + simd256_st(oc, simd256_f32_ceil(a)); + REQUIRE(oc[0] == Catch::Approx(2.0f)); + REQUIRE(oc[3] == Catch::Approx(-1.0f)); + + BX_ALIGN_DECL(32, float of[8]); + simd256_st(of, simd256_f32_floor(a)); + REQUIRE(of[0] == Catch::Approx(1.0f)); + REQUIRE(of[3] == Catch::Approx(-2.0f)); + + BX_ALIGN_DECL(32, float orn[8]); + simd256_st(orn, simd256_f32_round(a)); + REQUIRE(orn[0] == Catch::Approx(1.0f)); + REQUIRE(orn[2] == Catch::Approx(2.0f)); +} + +TEST_CASE("simd256_f32_cmp", "[simd]") +{ + BX_ALIGN_DECL(32, float ad[8]) = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 1.0f, 3.0f, 2.0f, 4.0f, 6.0f, 5.0f, 7.0f, 9.0f }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, uint32_t oeq[8]); + simd256_st(oeq, simd256_f32_cmpeq(a, b)); + REQUIRE(oeq[0] == 0xffffffff); + REQUIRE(oeq[1] == 0); + + BX_ALIGN_DECL(32, uint32_t 
olt[8]); + simd256_st(olt, simd256_f32_cmplt(a, b)); + REQUIRE(olt[0] == 0); + REQUIRE(olt[1] == 0xffffffff); + + BX_ALIGN_DECL(32, uint32_t ogt[8]); + simd256_st(ogt, simd256_f32_cmpgt(a, b)); + REQUIRE(ogt[2] == 0xffffffff); + + BX_ALIGN_DECL(32, uint32_t one[8]); + simd256_st(one, simd256_f32_cmpneq(a, b)); + REQUIRE(one[0] == 0); + REQUIRE(one[1] == 0xffffffff); +} + +TEST_CASE("simd256_f32_itof_ftoi", "[simd]") +{ + BX_ALIGN_DECL(32, int32_t id[8]) = { 1, -2, 3, -4, 5, -6, 7, -8 }; + const simd256_t a = simd256_ld(id); + BX_ALIGN_DECL(32, float of[8]); + simd256_st(of, simd256_i32_itof(a)); + REQUIRE(of[0] == Catch::Approx(1.0f)); + REQUIRE(of[1] == Catch::Approx(-2.0f)); + + BX_ALIGN_DECL(32, int32_t oi[8]); + simd256_st(oi, simd256_f32_ftoi_trunc(simd256_ld(of))); + REQUIRE(oi[0] == 1); + REQUIRE(oi[1] == -2); +} + +TEST_CASE("simd256_i32_sub_neg_abs", "[simd]") +{ + BX_ALIGN_DECL(32, int32_t ad[8]) = { 10, -3, 5, -7, 1, -1, 0, 100 }; + BX_ALIGN_DECL(32, int32_t bd[8]) = { 3, 2, 5, -7, -1, 1, 0, 50 }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, int32_t osub[8]); + simd256_st(osub, simd256_i32_sub(a, b)); + REQUIRE(osub[0] == 7); + REQUIRE(osub[1] == -5); + + BX_ALIGN_DECL(32, int32_t oneg[8]); + simd256_st(oneg, simd256_i32_neg(a)); + REQUIRE(oneg[0] == -10); + REQUIRE(oneg[1] == 3); + + BX_ALIGN_DECL(32, int32_t oabs[8]); + simd256_st(oabs, simd256_i32_abs(a)); + REQUIRE(oabs[0] == 10); + REQUIRE(oabs[1] == 3); +} + +TEST_CASE("simd256_i32_min_max_clamp", "[simd]") +{ + BX_ALIGN_DECL(32, int32_t ad[8]) = { -5, 5, 15, 3, -5, 5, 15, 3 }; + BX_ALIGN_DECL(32, int32_t mn[8]) = { 0, 0, 0, 0, 0, 0, 0, 0 }; + BX_ALIGN_DECL(32, int32_t mx[8]) = { 10, 10, 10, 10, 10, 10, 10, 10 }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, int32_t omin[8]); + simd256_st(omin, simd256_i32_min(a, simd256_ld(mx))); + REQUIRE(omin[0] == -5); + REQUIRE(omin[2] == 10); + + BX_ALIGN_DECL(32, int32_t omax[8]); + 
simd256_st(omax, simd256_i32_max(a, simd256_ld(mn))); + REQUIRE(omax[0] == 0); + REQUIRE(omax[1] == 5); + + BX_ALIGN_DECL(32, int32_t ocl[8]); + simd256_st(ocl, simd256_i32_clamp(a, simd256_ld(mn), simd256_ld(mx))); + REQUIRE(ocl[0] == 0); + REQUIRE(ocl[1] == 5); + REQUIRE(ocl[2] == 10); +} + +TEST_CASE("simd256_i32_cmp", "[simd]") +{ + BX_ALIGN_DECL(32, int32_t ad[8]) = { 1, 2, 3, 4, 5, 6, 7, 8 }; + BX_ALIGN_DECL(32, int32_t bd[8]) = { 1, 3, 2, 4, 6, 5, 7, 9 }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, uint32_t oeq[8]); + simd256_st(oeq, simd256_i32_cmpeq(a, b)); + REQUIRE(oeq[0] == 0xffffffff); + REQUIRE(oeq[1] == 0); + + BX_ALIGN_DECL(32, uint32_t ogt[8]); + simd256_st(ogt, simd256_i32_cmpgt(a, b)); + REQUIRE(ogt[2] == 0xffffffff); + REQUIRE(ogt[1] == 0); + + BX_ALIGN_DECL(32, uint32_t olt[8]); + simd256_st(olt, simd256_i32_cmplt(a, b)); + REQUIRE(olt[1] == 0xffffffff); + REQUIRE(olt[0] == 0); +} + +TEST_CASE("simd256_u32_ops", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 10, 20, 30, 40, 50, 60, 70, 80 }; + BX_ALIGN_DECL(32, uint32_t bd[8]) = { 3, 5, 7, 9, 11, 13, 15, 17 }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, uint32_t oadd[8]); + simd256_st(oadd, simd256_u32_add(a, b)); + REQUIRE(oadd[0] == 13); + + BX_ALIGN_DECL(32, uint32_t osub[8]); + simd256_st(osub, simd256_u32_sub(a, b)); + REQUIRE(osub[0] == 7); + + BX_ALIGN_DECL(32, uint32_t omul[8]); + simd256_st(omul, simd256_u32_mul(a, b)); + REQUIRE(omul[0] == 30); + + BX_ALIGN_DECL(32, uint32_t omin[8]); + simd256_st(omin, simd256_u32_min(a, b)); + REQUIRE(omin[0] == 3); + + BX_ALIGN_DECL(32, uint32_t omax[8]); + simd256_st(omax, simd256_u32_max(a, b)); + REQUIRE(omax[0] == 10); +} + +TEST_CASE("simd256_u32_cmp", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 1, 5, 3, 4, 1, 5, 3, 4 }; + BX_ALIGN_DECL(32, uint32_t bd[8]) = { 1, 3, 5, 4, 1, 3, 5, 4 }; + const simd256_t a = simd256_ld(ad); 
+ const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, uint32_t oeq[8]); + simd256_st(oeq, simd256_u32_cmpeq(a, b)); + REQUIRE(oeq[0] == 0xffffffff); + REQUIRE(oeq[1] == 0); + + BX_ALIGN_DECL(32, uint32_t ogt[8]); + simd256_st(ogt, simd256_u32_cmpgt(a, b)); + REQUIRE(ogt[1] == 0xffffffff); + + BX_ALIGN_DECL(32, uint32_t olt[8]); + simd256_st(olt, simd256_u32_cmplt(a, b)); + REQUIRE(olt[2] == 0xffffffff); +} + +TEST_CASE("simd256_i16_i8_ops", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 0x00030005, 0x00070009, 0x00030005, 0x00070009, 0x00030005, 0x00070009, 0x00030005, 0x00070009 }; + BX_ALIGN_DECL(32, uint32_t bd[8]) = { 0x00010002, 0x00010002, 0x00010002, 0x00010002, 0x00010002, 0x00010002, 0x00010002, 0x00010002 }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, uint32_t oadd[8]); + simd256_st(oadd, simd256_i16_add(a, b)); + REQUIRE(oadd[0] == 0x00040007); + + BX_ALIGN_DECL(32, uint32_t osub[8]); + simd256_st(osub, simd256_i16_sub(a, b)); + REQUIRE(osub[0] == 0x00020003); + + BX_ALIGN_DECL(32, uint32_t omul[8]); + simd256_st(omul, simd256_i16_mullo(a, b)); + REQUIRE(omul[0] == 0x0003000a); + + BX_ALIGN_DECL(32, uint32_t i8a[8]) = { 0x01020304, 0, 0, 0, 0x01020304, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t i8b[8]) = { 0x10101010, 0, 0, 0, 0x10101010, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t oi8a[8]); + simd256_st(oi8a, simd256_i8_add(simd256_ld(i8a), simd256_ld(i8b))); + REQUIRE(oi8a[0] == 0x11121314); + + BX_ALIGN_DECL(32, uint32_t oi8s[8]); + simd256_st(oi8s, simd256_i8_sub(simd256_ld(i8a), simd256_ld(i8b))); + REQUIRE(oi8s[0] == 0xf1f2f3f4); +} + +TEST_CASE("simd256_u8_u16_sat", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t u8a[8]) = { 0xf0f0f0f0, 0, 0, 0, 0xf0f0f0f0, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t u8b[8]) = { 0x20202020, 0, 0, 0, 0x20202020, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t osat[8]); + simd256_st(osat, simd256_u8_satadd(simd256_ld(u8a), simd256_ld(u8b))); + REQUIRE(osat[0] == 0xffffffff); 
+ + BX_ALIGN_DECL(32, uint32_t u8c[8]) = { 0x10101010, 0, 0, 0, 0x10101010, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t osats[8]); + simd256_st(osats, simd256_u8_satsub(simd256_ld(u8c), simd256_ld(u8b))); + REQUIRE(osats[0] == 0); + + BX_ALIGN_DECL(32, uint32_t u16a[8]) = { 0xfff0fff0, 0, 0, 0, 0xfff0fff0, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t u16b[8]) = { 0x00200020, 0, 0, 0, 0x00200020, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t ou16a[8]); + simd256_st(ou16a, simd256_u16_satadd(simd256_ld(u16a), simd256_ld(u16b))); + REQUIRE(ou16a[0] == 0xffffffff); + + BX_ALIGN_DECL(32, uint32_t u16c[8]) = { 0x00100010, 0, 0, 0, 0x00100010, 0, 0, 0 }; + BX_ALIGN_DECL(32, uint32_t ou16s[8]); + simd256_st(ou16s, simd256_u16_satsub(simd256_ld(u16c), simd256_ld(u16b))); + REQUIRE(ou16s[0] == 0); +} + +TEST_CASE("simd256_bitwise_full", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 }; + BX_ALIGN_DECL(32, uint32_t bd[8]) = { 0x0ff00ff0, 0x0ff00ff0, 0x0ff00ff0, 0x0ff00ff0, 0x0ff00ff0, 0x0ff00ff0, 0x0ff00ff0, 0x0ff00ff0 }; + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, uint32_t out[8]); + + simd256_st(out, simd256_andc(a, b)); + REQUIRE(out[0] == 0xf000f000); + + simd256_st(out, simd256_or(a, b)); + REQUIRE(out[0] == 0xfff0fff0); + + simd256_st(out, simd256_orc(a, b)); + REQUIRE(out[0] == 0xff0fff0f); + + simd256_st(out, simd256_xor(a, b)); + REQUIRE(out[0] == 0xf0f0f0f0); + + simd256_st(out, simd256_not(a)); + REQUIRE(out[0] == 0x00ff00ff); +} + +TEST_CASE("simd256_shifts", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 1, 2, 4, 8, 16, 32, 64, 128 }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, uint32_t osll[8]); + simd256_st(osll, simd256_x32_sll(a, 1)); + REQUIRE(osll[0] == 2); + REQUIRE(osll[1] == 4); + + BX_ALIGN_DECL(32, uint32_t osrl[8]); + simd256_st(osrl, simd256_x32_srl(a, 1)); + REQUIRE(osrl[1] == 1); + + 
BX_ALIGN_DECL(32, int32_t srad[8]) = { -8, -4, -2, -1, 8, 4, 2, 1 }; + BX_ALIGN_DECL(32, int32_t osra[8]); + simd256_st(osra, simd256_x32_sra(simd256_ld(srad), 1)); + REQUIRE(osra[0] == -4); + REQUIRE(osra[4] == 4); +} + +TEST_CASE("simd256_selb_sels", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t md[8]) = { 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0, 0xffffffff, 0 }; + BX_ALIGN_DECL(32, float ad[8]) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + BX_ALIGN_DECL(32, float bd[8]) = { 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f }; + const simd256_t mask = simd256_ld(md); + const simd256_t a = simd256_ld(ad); + const simd256_t b = simd256_ld(bd); + + BX_ALIGN_DECL(32, float oselb[8]); + simd256_st(oselb, simd256_selb(mask, a, b)); + REQUIRE(oselb[0] == Catch::Approx(1.0f)); + REQUIRE(oselb[1] == Catch::Approx(2.0f)); + + BX_ALIGN_DECL(32, uint32_t sd[8]) = { 0x80000000, 0, 0x80000000, 0, 0x80000000, 0, 0x80000000, 0 }; + BX_ALIGN_DECL(32, float osels[8]); + simd256_st(osels, simd256_sels(simd256_ld(sd), a, b)); + REQUIRE(osels[0] == Catch::Approx(1.0f)); + REQUIRE(osels[1] == Catch::Approx(2.0f)); +} + +TEST_CASE("simd256_test_any_all", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 0xffffffff, 0, 0, 0, 0, 0, 0, 0 }; + REQUIRE(simd256_test_any(simd256_ld(ad)) == true); + + BX_ALIGN_DECL(32, uint32_t bd[8]) = { 0, 0, 0, 0, 0, 0, 0, 0 }; + REQUIRE(simd256_test_any(simd256_ld(bd)) == false); + + BX_ALIGN_DECL(32, uint32_t cd[8]) = { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }; + REQUIRE(simd256_test_all(simd256_ld(cd)) == true); + REQUIRE(simd256_test_all(simd256_ld(ad)) == false); +} + +TEST_CASE("simd256_signbits", "[simd]") +{ + BX_ALIGN_DECL(32, uint32_t ad[8]) = { 0x80000000, 0, 0x80000000, 0, 0, 0x80000000, 0, 0x80000000 }; + REQUIRE(simd256_x32_signbitsmask(simd256_ld(ad)) == 0xa5); + REQUIRE(simd256_x8_signbitsmask(simd256_zero()) == 0); +} + +TEST_CASE("simd256_f32_transcendental", "[simd]") +{ + 
BX_ALIGN_DECL(32, float ad[8]) = { 1.0f, 4.0f, 8.0f, 16.0f, 1.0f, 4.0f, 8.0f, 16.0f }; + const simd256_t a = simd256_ld(ad); + + BX_ALIGN_DECL(32, float olog2[8]); + simd256_st(olog2, simd_f32_log2(a)); + REQUIRE(olog2[0] == Catch::Approx(0.0f).margin(0.01f)); + REQUIRE(olog2[1] == Catch::Approx(2.0f).margin(0.01f)); + + BX_ALIGN_DECL(32, float oexp2[8]); + simd256_st(oexp2, simd_f32_exp2(simd256_ld(olog2))); + REQUIRE(oexp2[0] == Catch::Approx(1.0f).margin(0.01f)); + REQUIRE(oexp2[1] == Catch::Approx(4.0f).margin(0.01f)); + + BX_ALIGN_DECL(32, float base[8]) = { 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f }; + BX_ALIGN_DECL(32, float expo[8]) = { 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f, 3.0f }; + BX_ALIGN_DECL(32, float opow[8]); + simd256_st(opow, simd_f32_pow(simd256_ld(base), simd256_ld(expo))); + REQUIRE(opow[0] == Catch::Approx(8.0f).margin(0.01f)); + + BX_ALIGN_DECL(32, float cosv[8]) = { 0.0f, kPiHalf, 0.0f, kPiHalf, 0.0f, kPiHalf, 0.0f, kPiHalf }; + BX_ALIGN_DECL(32, float ocos[8]); + simd256_st(ocos, simd_f32_cos(simd256_ld(cosv))); + REQUIRE(ocos[0] == Catch::Approx(1.0f).margin(0.001f)); + REQUIRE(ocos[1] == Catch::Approx(0.0f).margin(0.001f)); + + BX_ALIGN_DECL(32, float osin[8]); + simd256_st(osin, simd_f32_sin(simd256_ld(cosv))); + REQUIRE(osin[0] == Catch::Approx(0.0f).margin(0.001f)); + REQUIRE(osin[1] == Catch::Approx(1.0f).margin(0.001f)); + + BX_ALIGN_DECL(32, float logv[8]) = { 1.0f, kE, 1.0f, kE, 1.0f, kE, 1.0f, kE }; + BX_ALIGN_DECL(32, float olog[8]); + simd256_st(olog, simd_f32_log(simd256_ld(logv))); + REQUIRE(olog[0] == Catch::Approx(0.0f).margin(0.01f)); + REQUIRE(olog[1] == Catch::Approx(1.0f).margin(0.01f)); + + BX_ALIGN_DECL(32, float expv[8]) = { 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f }; + BX_ALIGN_DECL(32, float oexp[8]); + simd256_st(oexp, simd_f32_exp(simd256_ld(expv))); + REQUIRE(oexp[0] == Catch::Approx(1.0f).margin(0.01f)); + REQUIRE(oexp[1] == Catch::Approx(kE).margin(0.01f)); } diff --git a/tests/test.h 
b/tests/test.h index 1ab7cd4..628f329 100644 --- a/tests/test.h +++ b/tests/test.h @@ -14,8 +14,6 @@ BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4312); // warning C4312 : 'reinterpret_cast' : #include BX_PRAGMA_DIAGNOSTIC_POP(); -#define TEST(_x) TEST_CASE(#_x, "") - #if BX_CONFIG_DEBUG # define REQUIRE_ASSERTS(_x) REQUIRE_THROWS(_x) #else diff --git a/tests/uint32_test.cpp b/tests/uint32_test.cpp deleted file mode 100644 index 50e8197..0000000 --- a/tests/uint32_test.cpp +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2010-2026 Branimir Karadzic. All rights reserved. - * License: https://github.com/bkaradzic/bx/blob/master/LICENSE - */ - -#include "test.h" -#include - -TEST_CASE("StrideAlign", "[uint32_t]") -{ - REQUIRE(0 == bx::strideAlign(0, 12) ); - for (uint32_t ii = 0; ii < 12; ++ii) - { - REQUIRE(12 == bx::strideAlign(ii+1, 12) ); - } - - REQUIRE(0 == bx::strideAlign<16>(0, 12) ); - for (uint32_t ii = 0; ii < 12; ++ii) - { - REQUIRE(48 == bx::strideAlign<16>(ii+1, 12) ); - } - - uint32_t offset = 11; - offset = bx::strideAlign(offset, 32); - REQUIRE(offset == 32); - - offset = bx::strideAlign(offset, 24); - REQUIRE(offset == 48); -} - -TEST_CASE("uint32_part", "[uint32_t]") -{ - REQUIRE(UINT32_C(0x55555555) == bx::uint32_part1by1(UINT16_MAX) ); - REQUIRE(UINT32_C(0x09249249) == bx::uint32_part1by2(0x3ff) ); -} - -TEST_CASE("uint32_splat", "[uint32_t]") -{ - REQUIRE(UINT32_C(0x01010101) == bx::uint32_splat(0x01) ); - REQUIRE(UINT32_C(0x55555555) == bx::uint32_splat(0x55) ); - REQUIRE(UINT32_C(0x13891389) == bx::uint32_splat(0x1389) ); -} - -TEST_CASE("uint64_splat", "[uint32_t]") -{ - REQUIRE(UINT64_C(0x0101010101010101) == bx::uint64_splat(0x01) ); - REQUIRE(UINT64_C(0x5555555555555555) == bx::uint64_splat(0x55) ); - REQUIRE(UINT32_C(0x1389138913891389) == bx::uint64_splat(0x1389) ); - REQUIRE(UINT32_C(0x1506138915061389) == bx::uint64_splat(0x15061389) ); -} - -TEST_CASE("uint32_gcd", "[uint32_t]") -{ - REQUIRE(1 == bx::uint32_gcd(13, 89) ); - REQUIRE(3 == 
bx::uint32_gcd( 3, 9) ); - REQUIRE(8 == bx::uint32_gcd( 8, 64) ); - REQUIRE(9 == bx::uint32_gcd(18, 81) ); -} - -TEST_CASE("uint32_lcm", "[uint32_t]") -{ - REQUIRE(1157 == bx::uint32_lcm(13, 89) ); - REQUIRE( 9 == bx::uint32_lcm( 3, 9) ); - REQUIRE( 48 == bx::uint32_lcm( 6, 16) ); - REQUIRE( 80 == bx::uint32_lcm(16, 20) ); -} - -TEST_CASE("halfTo/FromFloat", "[uint32_t]") -{ - for (uint32_t ii = 0; ii < 0x7c00; ++ii) - { - const uint16_t orig = uint16_t(ii); - const float htf = bx::halfToFloat(orig); - const uint16_t hff = bx::halfFromFloat(htf); - REQUIRE(orig == hff); - } - - for (uint32_t ii = 0x8000; ii < 0xfc00; ++ii) - { - const uint16_t orig = uint16_t(ii); - const float htf = bx::halfToFloat(orig); - const uint16_t hff = bx::halfFromFloat(htf); - REQUIRE(orig == hff); - } -} - -TEST_CASE("uint32_testpow2", "[uint32_t]") -{ - uint32_t shift = 0; - uint32_t nextpow2 = bx::uint32_nextpow2(1); - - for (uint32_t ii = 1; ii < 1<<24; ++ii) - { - REQUIRE(nextpow2 == bx::uint32_nextpow2(ii) ); - - if (bx::uint32_testpow2(ii) ) - { - REQUIRE(ii == 1u << shift); - ++shift; - - REQUIRE(ii == nextpow2); - nextpow2 = bx::uint32_nextpow2(ii+1); - } - } -} - -TEST_CASE("uint32_roX", "[uint32_t]") -{ - REQUIRE(bx::uint32_rol(0x80000000, 1) == 1); - REQUIRE(bx::uint32_ror(1, 1) == 0x80000000); -} - -TEST_CASE("uint64_roX", "[uint32_t]") -{ - REQUIRE(bx::uint64_rol(0x8000000000000000, 1) == 1); - REQUIRE(bx::uint64_ror(1, 1) == 0x8000000000000000); -} - -TEST_CASE("align", "[uint32_t]") -{ - REQUIRE( bx::isAligned(0, 8) ); - REQUIRE(!bx::isAligned(7, 8) ); - REQUIRE( bx::isAligned(64, 8) ); - REQUIRE(!bx::isAligned(63, 8) ); - - for (int32_t ii = 0; ii < 1024; ++ii) - { - REQUIRE(bx::isAligned(ii, 0) ); - REQUIRE(ii == bx::alignUp(ii, 0) ); - REQUIRE(ii == bx::alignDown(ii, 0) ); - } - - REQUIRE( 0 == bx::alignUp( 0, 16) ); - REQUIRE( 16 == bx::alignUp( 1, 16) ); - REQUIRE( 16 == bx::alignUp( 15, 16) ); - REQUIRE( 16 == bx::alignUp( 16, 16) ); - REQUIRE(256 == 
bx::alignUp(255, 16) ); - REQUIRE( 0 == bx::alignUp(-1, 16) ); - REQUIRE(-16 == bx::alignUp(-31, 16) ); - - REQUIRE( 0 == bx::alignUp( 0, 256) ); - REQUIRE(256 == bx::alignUp( 1, 256) ); - REQUIRE(256 == bx::alignUp( 15, 256) ); - REQUIRE(256 == bx::alignUp(255, 256) ); - REQUIRE(256 == bx::alignUp(256, 256) ); - REQUIRE(256 == bx::alignUp(256, 256) ); - REQUIRE(512 == bx::alignUp(511, 256) ); - - REQUIRE( 0 == bx::alignDown( 0, 16) ); - REQUIRE( 0 == bx::alignDown( 1, 16) ); - REQUIRE( 0 == bx::alignDown( 15, 16) ); - REQUIRE( 16 == bx::alignDown( 16, 16) ); - REQUIRE(240 == bx::alignDown(255, 16) ); - REQUIRE(-16 == bx::alignDown(-1, 16) ); - REQUIRE(-32 == bx::alignDown(-31, 16) ); -}