/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_x86_h
#define cglm_simd_x86_h

#include "intrin.h"

#ifdef CGLM_SIMD_x86

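/* CGLM_ALL_UNALIGNED switches to the unaligned load/store intrinsics so
   callers need not guarantee 16-byte alignment; the aligned variants fault
   on misaligned pointers but can be faster on older CPUs. */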
#ifdef CGLM_ALL_UNALIGNED
# define glmm_load(p)     _mm_loadu_ps(p)
# define glmm_store(p, a) _mm_storeu_ps(p, a)
#else
# define glmm_load(p)     _mm_load_ps(p)
# define glmm_store(p, a) _mm_store_ps(p, a)
#endif

#define glmm_set1(x) _mm_set1_ps(x)
#define glmm_128     __m128

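/* glmm_shuff1(v, z, y, x, w) permutes v's lanes using _MM_SHUFFLE ordering:
   result lane 0 gets v[w], lane 1 gets v[x], lane 2 gets v[y], lane 3 gets
   v[z]. The CGLM_USE_INT_DOMAIN variant routes the shuffle through the
   integer domain (pshufd), which may be cheaper on some microarchitectures
   at the cost of a possible int<->float bypass delay. */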
#ifdef CGLM_USE_INT_DOMAIN
# define glmm_shuff1(xmm, z, y, x, w)                                        \
    _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xmm),                \
                                       _MM_SHUFFLE(z, y, x, w)))
#else
# define glmm_shuff1(xmm, z, y, x, w)                                        \
    _mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(z, y, x, w))
#endif

#define glmm_splat(x, lane) glmm_shuff1(x, lane, lane, lane, lane)

#define glmm_splat_x(x) glmm_splat(x, 0)
#define glmm_splat_y(x) glmm_splat(x, 1)
#define glmm_splat_z(x) glmm_splat(x, 2)
#define glmm_splat_w(x) glmm_splat(x, 3)

/* glmm_shuff1x() is DEPRECATED; use glmm_splat() instead */
#define glmm_shuff1x(xmm, x) glmm_shuff1(xmm, x, x, x, x)

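/* shuffle across two registers: _mm_shuffle_ps picks two lanes from a and
   two from b, then glmm_shuff1 permutes that intermediate result */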
#define glmm_shuff2(a, b, z0, y0, x0, w0, z1, y1, x1, w1)                    \
  glmm_shuff1(_mm_shuffle_ps(a, b, _MM_SHUFFLE(z0, y0, x0, w0)),             \
              z1, y1, x1, w1)

#ifdef __AVX__
# ifdef CGLM_ALL_UNALIGNED
#  define glmm_load256(p)     _mm256_loadu_ps(p)
#  define glmm_store256(p, a) _mm256_storeu_ps(p, a)
# else
#  define glmm_load256(p)     _mm256_load_ps(p)
#  define glmm_store256(p, a) _mm256_store_ps(p, a)
# endif
#endif

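/* absolute value per lane: andnot with -0.0f clears the sign bit */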
static inline
__m128
glmm_abs(__m128 x) {
  return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
}

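/* horizontal add: returns the sum of all four lanes, broadcast to every
   lane of the result */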
static inline
__m128
glmm_vhadd(__m128 v) {
  __m128 x0;
  x0 = _mm_add_ps(v,  glmm_shuff1(v,  0, 1, 2, 3));
  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 0, 1));
  return x0;
}

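/* horizontal add: sum of all four lanes; only lane 0 of the result is
   meaningful (use glmm_hadd for the scalar) */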
static inline
__m128
glmm_vhadds(__m128 v) {
  __m128 shuf, sums;
#if defined(__SSE3__)
  shuf = _mm_movehdup_ps(v);
#else
  shuf = glmm_shuff1(v, 2, 3, 0, 1);
#endif
  sums = _mm_add_ps(v, shuf);
  shuf = _mm_movehl_ps(shuf, sums);
  sums = _mm_add_ss(sums, shuf);
  return sums;
}

static inline
float
glmm_hadd(__m128 v) {
  return _mm_cvtss_f32(glmm_vhadds(v));
}

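/* horizontal min: minimum of the four lanes, valid in lane 0 only */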
static inline
__m128
glmm_vhmin(__m128 v) {
  __m128 x0, x1, x2;
  x0 = _mm_movehl_ps(v, v);  /* [2, 3, 2, 3]          */
  x1 = _mm_min_ps(x0, v);    /* [0|2, 1|3, 2|2, 3|3]  */
  x2 = glmm_splat(x1, 1);    /* [1|3, 1|3, 1|3, 1|3]  */
  return _mm_min_ss(x1, x2);
}

static inline
float
glmm_hmin(__m128 v) {
  return _mm_cvtss_f32(glmm_vhmin(v));
}

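/* horizontal max: maximum of the four lanes, valid in lane 0 only */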
static inline
__m128
glmm_vhmax(__m128 v) {
  __m128 x0, x1, x2;
  x0 = _mm_movehl_ps(v, v);  /* [2, 3, 2, 3]          */
  x1 = _mm_max_ps(x0, v);    /* [0|2, 1|3, 2|2, 3|3]  */
  x2 = glmm_splat(x1, 1);    /* [1|3, 1|3, 1|3, 1|3]  */
  return _mm_max_ss(x1, x2);
}

static inline
float
glmm_hmax(__m128 v) {
  return _mm_cvtss_f32(glmm_vhmax(v));
}

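/* 4-lane dot product with the result in lane 0; glmm_vdot below broadcasts
   the result to all lanes instead. _mm_dp_ps can be slow on some CPUs, so
   the SSE4 path is opt-in via CGLM_SSE4_DOT. A minimal usage sketch
   (hypothetical caller code, not part of this header):

     float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};  /+ must be 16-byte aligned   +/
     float b[4] = {5.0f, 6.0f, 7.0f, 8.0f};  /+ unless CGLM_ALL_UNALIGNED +/
     float d = glmm_dot(glmm_load(a), glmm_load(b));  /+ 70.0f +/
*/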
static inline
__m128
glmm_vdots(__m128 a, __m128 b) {
#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
  return _mm_dp_ps(a, b, 0xFF);
#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
  __m128 x0, x1;
  x0 = _mm_mul_ps(a, b);
  x1 = _mm_hadd_ps(x0, x0);
  return _mm_hadd_ps(x1, x1);
#else
  return glmm_vhadds(_mm_mul_ps(a, b));
#endif
}

static inline
__m128
glmm_vdot(__m128 a, __m128 b) {
#if (defined(__SSE4_1__) || defined(__SSE4_2__)) && defined(CGLM_SSE4_DOT)
  return _mm_dp_ps(a, b, 0xFF);
#elif defined(__SSE3__) && defined(CGLM_SSE3_DOT)
  __m128 x0, x1;
  x0 = _mm_mul_ps(a, b);
  x1 = _mm_hadd_ps(x0, x0);
  return _mm_hadd_ps(x1, x1);
#else
  __m128 x0;
  x0 = _mm_mul_ps(a, b);
  x0 = _mm_add_ps(x0, glmm_shuff1(x0, 1, 0, 3, 2));
  return _mm_add_ps(x0, glmm_shuff1(x0, 0, 1, 0, 1));
#endif
}

static inline
float
glmm_dot(__m128 a, __m128 b) {
  return _mm_cvtss_f32(glmm_vdots(a, b));
}

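/* norms of a 4-lane vector, returned as scalars:
     glmm_norm     - euclidean norm (L2)
     glmm_norm2    - squared euclidean norm (avoids the sqrt)
     glmm_norm_one - L1 norm, sum of absolute values
     glmm_norm_inf - infinity norm, maximum absolute value */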
static inline
float
glmm_norm(__m128 a) {
  return _mm_cvtss_f32(_mm_sqrt_ss(glmm_vhadds(_mm_mul_ps(a, a))));
}

static inline
float
glmm_norm2(__m128 a) {
  return _mm_cvtss_f32(glmm_vhadds(_mm_mul_ps(a, a)));
}

static inline
float
glmm_norm_one(__m128 a) {
  return _mm_cvtss_f32(glmm_vhadds(glmm_abs(a)));
}

static inline
float
glmm_norm_inf(__m128 a) {
  return _mm_cvtss_f32(glmm_vhmax(glmm_abs(a)));
}

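/* load three consecutive floats into a __m128 as [x, y, z, 0] */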
static inline
__m128
glmm_load3(float v[3]) {
  __m128i xy;
  __m128  z;

  xy = _mm_loadl_epi64(CGLM_CASTPTR_ASSUME_ALIGNED(v, const __m128i));
  z  = _mm_load_ss(&v[2]);

  return _mm_movelh_ps(_mm_castsi128_ps(xy), z);
}

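/* store the x, y, z lanes of vx into three consecutive floats */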
static inline
void
glmm_store3(float v[3], __m128 vx) {
  _mm_storel_pi(CGLM_CASTPTR_ASSUME_ALIGNED(v, __m64), vx);
  _mm_store_ss(&v[2], glmm_shuff1(vx, 2, 2, 2, 2));
}

static inline
__m128
glmm_div(__m128 a, __m128 b) {
  return _mm_div_ps(a, b);
}

/* MSVC does not define __FMA__; in practice AVX2-capable CPUs also support
   FMA3, so enable it when /arch:AVX2 is used */
#if defined(_MSC_VER) && !defined(__FMA__) && defined(__AVX2__)
# define __FMA__ 1
#endif

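/* fused multiply-add helpers; without FMA the fallbacks compute the same
   value with a separate multiply and add (one extra rounding step):
     glmm_fmadd  :  (a * b) + c
     glmm_fnmadd : -(a * b) + c
     glmm_fmsub  :  (a * b) - c
     glmm_fnmsub : -(a * b) - c */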
static inline
__m128
glmm_fmadd(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
  return _mm_fmadd_ps(a, b, c);
#else
  return _mm_add_ps(c, _mm_mul_ps(a, b));
#endif
}

static inline
__m128
glmm_fnmadd(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
  return _mm_fnmadd_ps(a, b, c);
#else
  return _mm_sub_ps(c, _mm_mul_ps(a, b));
#endif
}

static inline
__m128
glmm_fmsub(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
  return _mm_fmsub_ps(a, b, c);
#else
  return _mm_sub_ps(_mm_mul_ps(a, b), c);
#endif
}

static inline
__m128
glmm_fnmsub(__m128 a, __m128 b, __m128 c) {
#ifdef __FMA__
  return _mm_fnmsub_ps(a, b, c);
#else
  return _mm_xor_ps(_mm_add_ps(_mm_mul_ps(a, b), c), _mm_set1_ps(-0.0f));
#endif
}

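/* 256-bit (AVX) counterparts of the FMA helpers above */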
#if defined(__AVX__)
static inline
__m256
glmm256_fmadd(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
  return _mm256_fmadd_ps(a, b, c);
#else
  return _mm256_add_ps(c, _mm256_mul_ps(a, b));
#endif
}

static inline
__m256
glmm256_fnmadd(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
  return _mm256_fnmadd_ps(a, b, c);
#else
  return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
#endif
}

static inline
__m256
glmm256_fmsub(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
  return _mm256_fmsub_ps(a, b, c);
#else
  return _mm256_sub_ps(_mm256_mul_ps(a, b), c);
#endif
}

static inline
__m256
glmm256_fnmsub(__m256 a, __m256 b, __m256 c) {
#ifdef __FMA__
  /* fixed: was _mm256_fmsub_ps, which computes (a * b) - c */
  return _mm256_fnmsub_ps(a, b, c);
#else
  /* fixed: was xor of (a * b) - c, i.e. -(a * b) + c, which duplicates
     glmm256_fnmadd; negate (a * b) + c to match the 128-bit glmm_fnmsub */
  return _mm256_xor_ps(_mm256_add_ps(_mm256_mul_ps(a, b), c),
                       _mm256_set1_ps(-0.0f));
#endif
}
#endif

#endif
#endif /* cglm_simd_x86_h */