/*
 * Copyright (c), Recep Aslantas.
 *
 * MIT License (MIT), http://opensource.org/licenses/MIT
 * Full license can be found in the LICENSE file
 */

#ifndef cglm_quat_neon_h
#define cglm_quat_neon_h
#if defined(__ARM_NEON_FP)

#include "../../common.h"
#include "../intrin.h"

CGLM_INLINE
void
glm_quat_mul_neon(versor p, versor q, versor dest) {
  /*
   + (a1 b2 + b1 a2 + c1 d2 − d1 c2)i
   + (a1 c2 − b1 d2 + c1 a2 + d1 b2)j
   + (a1 d2 + b1 c2 − c1 b2 + d1 a2)k
     a1 a2 − b1 b2 − c1 c2 − d1 d2
   */

  glmm_128 xp, xq, xqr, r, x, y, z, s2, s3;
  glmm_128 s1 = {-0.f, 0.f, 0.f, -0.f};
  float32x2_t   qh, ql;
  
  xp  = glmm_load(p); /* 3 2 1 0 */
  xq  = glmm_load(q);

  r   = vmulq_f32(glmm_splat_w(xp), xq);
  x   = glmm_splat_x(xp);
  y   = glmm_splat_y(xp);
  z   = glmm_splat_z(xp);

  ql  = vget_high_f32(s1);
  s3  = vcombine_f32(ql, ql);
  s2  = vzipq_f32(s3, s3).val[0];

  xqr = vrev64q_f32(xq);
  qh  = vget_high_f32(xqr);
  ql  = vget_low_f32(xqr);

  r = glmm_fmadd(glmm_xor(x, s3), vcombine_f32(qh, ql), r);
  
  r = glmm_fmadd(glmm_xor(y, s2), vcombine_f32(vget_high_f32(xq),
                                               vget_low_f32(xq)), r);
  
  r = glmm_fmadd(glmm_xor(z, s1), vcombine_f32(ql, qh), r);

  glmm_store(dest, r);
}

#endif
#endif /* cglm_quat_neon_h */