/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_simd_arm_h
#define cglm_simd_arm_h
#include "intrin.h"
#ifdef CGLM_SIMD_ARM

#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || defined(__aarch64__)
#  define CGLM_ARM64 1
#endif

#define glmm_load(p)      vld1q_f32(p)
#define glmm_store(p, a)  vst1q_f32(p, a)

#define glmm_set1(x) vdupq_n_f32(x)
#define glmm_128     float32x4_t

#define glmm_splat_x(x) vdupq_lane_f32(vget_low_f32(x),  0)
#define glmm_splat_y(x) vdupq_lane_f32(vget_low_f32(x),  1)
#define glmm_splat_z(x) vdupq_lane_f32(vget_high_f32(x), 0)
#define glmm_splat_w(x) vdupq_lane_f32(vget_high_f32(x), 1)

#define glmm_xor(a, b)                                                        \
  vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(a),                   \
                                  vreinterpretq_s32_f32(b)))

#define glmm_swplane(v) vextq_f32(v, v, 2)
#define glmm_low(x)     vget_low_f32(x)
#define glmm_high(x)    vget_high_f32(x)

#define glmm_combine_ll(x, y) vcombine_f32(vget_low_f32(x),  vget_low_f32(y))
#define glmm_combine_hl(x, y) vcombine_f32(vget_high_f32(x), vget_low_f32(y))
#define glmm_combine_lh(x, y) vcombine_f32(vget_low_f32(x),  vget_high_f32(y))
#define glmm_combine_hh(x, y) vcombine_f32(vget_high_f32(x), vget_high_f32(y))

static inline
float32x4_t
glmm_abs(float32x4_t v) {
  return vabsq_f32(v);
}

static inline
float32x4_t
glmm_vhadd(float32x4_t v) {
  return vaddq_f32(vaddq_f32(glmm_splat_x(v), glmm_splat_y(v)),
                   vaddq_f32(glmm_splat_z(v), glmm_splat_w(v)));
  /*
   this seems slower:
   v = vaddq_f32(v, vrev64q_f32(v));
   return vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
   */
}

static inline
float
glmm_hadd(float32x4_t v) {
#if CGLM_ARM64
  return vaddvq_f32(v);
#else
  v = vaddq_f32(v, vrev64q_f32(v));
  v = vaddq_f32(v, vcombine_f32(vget_high_f32(v), vget_low_f32(v)));
  return vgetq_lane_f32(v, 0);
#endif
}

static inline
float
glmm_hmin(float32x4_t v) {
  float32x2_t t;
  t = vpmin_f32(vget_low_f32(v), vget_high_f32(v));
  t = vpmin_f32(t, t);
  return vget_lane_f32(t, 0);
}

static inline
float
glmm_hmax(float32x4_t v) {
  float32x2_t t;
  t = vpmax_f32(vget_low_f32(v), vget_high_f32(v));
  t = vpmax_f32(t, t);
  return vget_lane_f32(t, 0);
}

static inline
float
glmm_dot(float32x4_t a, float32x4_t b) {
  return glmm_hadd(vmulq_f32(a, b));
}

static inline
float
glmm_norm(float32x4_t a) {
  return sqrtf(glmm_dot(a, a));
}

static inline
float
glmm_norm2(float32x4_t a) {
  return glmm_dot(a, a);
}

static inline
float
glmm_norm_one(float32x4_t a) {
  return glmm_hadd(glmm_abs(a));
}

static inline
float
glmm_norm_inf(float32x4_t a) {
  return glmm_hmax(glmm_abs(a));
}

static inline
float32x4_t
glmm_div(float32x4_t a, float32x4_t b) {
#if CGLM_ARM64
  return vdivq_f32(a, b);
#else
  /* 2 iterations of Newton-Raphson refinement of reciprocal */
  float32x4_t r0, r1;
  r0 = vrecpeq_f32(b);
  r1 = vrecpsq_f32(r0, b);
  r0 = vmulq_f32(r1, r0);
  r1 = vrecpsq_f32(r0, b);
  r0 = vmulq_f32(r1, r0);
  return vmulq_f32(a, r0);
#endif
}

static inline
float32x4_t
glmm_fmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmaq_f32(c, a, b); /* why is vfmaq_f32 slower than vmlaq_f32? */
#else
  return vmlaq_f32(c, a, b);
#endif
}

static inline
float32x4_t
glmm_fnmadd(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}

static inline
float32x4_t
glmm_fmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
#if CGLM_ARM64
  return vfmsq_f32(c, a, b);
#else
  return vmlsq_f32(c, a, b);
#endif
}

static inline
float32x4_t
glmm_fnmsub(float32x4_t a, float32x4_t b, float32x4_t c) {
  return vsubq_f32(vdupq_n_f32(0.0f), glmm_fmadd(a, b, c));
}

#endif
#endif /* cglm_simd_arm_h */
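
/*
 * Illustrative usage sketch (not part of cglm itself): a minimal example of
 * how the splat, load/store, and fmadd helpers above compose into a
 * column-major mat4 * vec4 product. The function and parameter names below
 * are hypothetical, and the block is disabled with #if 0 so it has no effect
 * on the header.
 */
#if 0
/* dest = m * v, where m holds a 4x4 matrix as 16 column-major floats */
static inline
void
example_mat4_mulv(const float m[16], const float v[4], float dest[4]) {
  glmm_128 x, r;

  x = glmm_load(v);

  r = vmulq_f32(glmm_load(m + 0),   glmm_splat_x(x));     /*   col0 * v.x */
  r = glmm_fmadd(glmm_load(m + 4),  glmm_splat_y(x), r);  /* + col1 * v.y */
  r = glmm_fmadd(glmm_load(m + 8),  glmm_splat_z(x), r);  /* + col2 * v.z */
  r = glmm_fmadd(glmm_load(m + 12), glmm_splat_w(x), r);  /* + col3 * v.w */

  glmm_store(dest, r);
}
#endif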