You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

373 lines
9.9KB

  1. /******************************************************************************
  2. * @file arm_vec_math.h
  3. * @brief Public header file for CMSIS DSP Library
  4. * @version V1.7.0
  5. * @date 15. October 2019
  6. ******************************************************************************/
  7. /*
  8. * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
  9. *
  10. * SPDX-License-Identifier: Apache-2.0
  11. *
  12. * Licensed under the Apache License, Version 2.0 (the License); you may
  13. * not use this file except in compliance with the License.
  14. * You may obtain a copy of the License at
  15. *
  16. * www.apache.org/licenses/LICENSE-2.0
  17. *
  18. * Unless required by applicable law or agreed to in writing, software
  19. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  20. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  21. * See the License for the specific language governing permissions and
  22. * limitations under the License.
  23. */
  24. #ifndef _ARM_VEC_MATH_H
  25. #define _ARM_VEC_MATH_H
  26. #include "arm_math.h"
  27. #include "arm_common_tables.h"
  28. #include "arm_helium_utils.h"
  29. #ifdef __cplusplus
  30. extern "C"
  31. {
  32. #endif
  33. #if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
  34. #define INV_NEWTON_INIT_F32 0x7EF127EA
  35. static const float32_t __logf_rng_f32=0.693147180f;
  36. /* fast inverse approximation (3x newton) */
  37. __STATIC_INLINE f32x4_t vrecip_medprec_f32(
  38. f32x4_t x)
  39. {
  40. q31x4_t m;
  41. f32x4_t b;
  42. any32x4_t xinv;
  43. f32x4_t ax = vabsq(x);
  44. xinv.f = ax;
  45. m = 0x3F800000 - (xinv.i & 0x7F800000);
  46. xinv.i = xinv.i + m;
  47. xinv.f = 1.41176471f - 0.47058824f * xinv.f;
  48. xinv.i = xinv.i + m;
  49. b = 2.0f - xinv.f * ax;
  50. xinv.f = xinv.f * b;
  51. b = 2.0f - xinv.f * ax;
  52. xinv.f = xinv.f * b;
  53. b = 2.0f - xinv.f * ax;
  54. xinv.f = xinv.f * b;
  55. xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
  56. /*
  57. * restore sign
  58. */
  59. xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
  60. return xinv.f;
  61. }
  62. /* fast inverse approximation (4x newton) */
  63. __STATIC_INLINE f32x4_t vrecip_hiprec_f32(
  64. f32x4_t x)
  65. {
  66. q31x4_t m;
  67. f32x4_t b;
  68. any32x4_t xinv;
  69. f32x4_t ax = vabsq(x);
  70. xinv.f = ax;
  71. m = 0x3F800000 - (xinv.i & 0x7F800000);
  72. xinv.i = xinv.i + m;
  73. xinv.f = 1.41176471f - 0.47058824f * xinv.f;
  74. xinv.i = xinv.i + m;
  75. b = 2.0f - xinv.f * ax;
  76. xinv.f = xinv.f * b;
  77. b = 2.0f - xinv.f * ax;
  78. xinv.f = xinv.f * b;
  79. b = 2.0f - xinv.f * ax;
  80. xinv.f = xinv.f * b;
  81. b = 2.0f - xinv.f * ax;
  82. xinv.f = xinv.f * b;
  83. xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
  84. /*
  85. * restore sign
  86. */
  87. xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));
  88. return xinv.f;
  89. }
  90. __STATIC_INLINE f32x4_t vdiv_f32(
  91. f32x4_t num, f32x4_t den)
  92. {
  93. return vmulq(num, vrecip_hiprec_f32(den));
  94. }
  95. /**
  96. @brief Single-precision taylor dev.
  97. @param[in] x f32 quad vector input
  98. @param[in] coeffs f32 quad vector coeffs
  99. @return destination f32 quad vector
  100. */
  101. __STATIC_INLINE f32x4_t vtaylor_polyq_f32(
  102. f32x4_t x,
  103. const float32_t * coeffs)
  104. {
  105. f32x4_t A = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
  106. f32x4_t B = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
  107. f32x4_t C = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
  108. f32x4_t D = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
  109. f32x4_t x2 = vmulq(x, x);
  110. f32x4_t x4 = vmulq(x2, x2);
  111. f32x4_t res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);
  112. return res;
  113. }
  114. __STATIC_INLINE f32x4_t vmant_exp_f32(
  115. f32x4_t x,
  116. int32x4_t * e)
  117. {
  118. any32x4_t r;
  119. int32x4_t n;
  120. r.f = x;
  121. n = r.i >> 23;
  122. n = n - 127;
  123. r.i = r.i - (n << 23);
  124. *e = n;
  125. return r.f;
  126. }
  127. __STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
  128. {
  129. q31x4_t vecExpUnBiased;
  130. f32x4_t vecTmpFlt0, vecTmpFlt1;
  131. f32x4_t vecAcc0, vecAcc1, vecAcc2, vecAcc3;
  132. f32x4_t vecExpUnBiasedFlt;
  133. /*
  134. * extract exponent
  135. */
  136. vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);
  137. vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
  138. /*
  139. * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
  140. */
  141. vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
  142. vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
  143. /*
  144. * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
  145. */
  146. vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
  147. vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
  148. /*
  149. * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
  150. */
  151. vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
  152. vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
  153. /*
  154. * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
  155. */
  156. vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
  157. vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
  158. /*
  159. * a = a + b * xx;
  160. */
  161. vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
  162. /*
  163. * c = c + d * xx;
  164. */
  165. vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
  166. /*
  167. * xx = xx * xx;
  168. */
  169. vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
  170. vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
  171. /*
  172. * r.f = a + c * xx;
  173. */
  174. vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
  175. /*
  176. * add exponent
  177. * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
  178. */
  179. vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);
  180. // set log0 down to -inf
  181. vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
  182. return vecAcc0;
  183. }
  184. __STATIC_INLINE f32x4_t vexpq_f32(
  185. f32x4_t x)
  186. {
  187. // Perform range reduction [-log(2),log(2)]
  188. int32x4_t m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
  189. f32x4_t val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));
  190. // Polynomial Approximation
  191. f32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
  192. // Reconstruct
  193. poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));
  194. poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));
  195. return poly;
  196. }
  197. __STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
  198. {
  199. f32x4_t r = x;
  200. nb--;
  201. while (nb > 0) {
  202. r = vmulq(r, x);
  203. nb--;
  204. }
  205. return (r);
  206. }
  207. __STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
  208. {
  209. f32x4_t vecSx, vecW, vecTmp;
  210. any32x4_t v;
  211. vecSx = vabsq(vecIn);
  212. v.f = vecIn;
  213. v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);
  214. vecW = vmulq(vecSx, v.f);
  215. // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w *(56 + w * (-28 + w * (8 - w)))))));
  216. vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
  217. vecTmp = vfmasq(vecW, vecTmp, -28.0f);
  218. vecTmp = vfmasq(vecW, vecTmp, 56.0f);
  219. vecTmp = vfmasq(vecW, vecTmp, -70.0f);
  220. vecTmp = vfmasq(vecW, vecTmp, 56.0f);
  221. vecTmp = vfmasq(vecW, vecTmp, -28.0f);
  222. vecTmp = vfmasq(vecW, vecTmp, 8.0f);
  223. v.f = vmulq(v.f, vecTmp);
  224. v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
  225. /*
  226. * restore sign
  227. */
  228. v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
  229. return v.f;
  230. }
  231. __STATIC_INLINE f32x4_t vtanhq_f32(
  232. f32x4_t val)
  233. {
  234. f32x4_t x =
  235. vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
  236. f32x4_t exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
  237. f32x4_t num = vsubq_n_f32(exp2x, 1.f);
  238. f32x4_t den = vaddq_n_f32(exp2x, 1.f);
  239. f32x4_t tanh = vmulq_f32(num, vrecip_f32(den));
  240. return tanh;
  241. }
  242. __STATIC_INLINE f32x4_t vpowq_f32(
  243. f32x4_t val,
  244. f32x4_t n)
  245. {
  246. return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
  247. }
  248. #endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)*/
  249. #if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))
  250. #endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */
  251. #if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
  252. #include "NEMath.h"
  253. /**
  254. * @brief Vectorized integer exponentiation
  255. * @param[in] x value
  256. * @param[in] nb integer exponent >= 1
  257. * @return x^nb
  258. *
  259. */
  260. __STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
  261. {
  262. float32x4_t r = x;
  263. nb --;
  264. while(nb > 0)
  265. {
  266. r = vmulq_f32(r , x);
  267. nb--;
  268. }
  269. return(r);
  270. }
  271. __STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
  272. {
  273. float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
  274. float32x4_t e = vrsqrteq_f32(x1);
  275. e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
  276. e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
  277. return vmulq_f32(x, e);
  278. }
  279. __STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
  280. {
  281. float32x4_t tempF;
  282. int32x4_t tempHI,tempLO;
  283. tempLO = vmovl_s16(vget_low_s16(vec));
  284. tempF = vcvtq_n_f32_s32(tempLO,15);
  285. tempF = __arm_vec_sqrt_f32_neon(tempF);
  286. tempLO = vcvtq_n_s32_f32(tempF,15);
  287. tempHI = vmovl_s16(vget_high_s16(vec));
  288. tempF = vcvtq_n_f32_s32(tempHI,15);
  289. tempF = __arm_vec_sqrt_f32_neon(tempF);
  290. tempHI = vcvtq_n_s32_f32(tempF,15);
  291. return(vcombine_s16(vqmovn_s32(tempLO),vqmovn_s32(tempHI)));
  292. }
  293. __STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
  294. {
  295. float32x4_t temp;
  296. temp = vcvtq_n_f32_s32(vec,31);
  297. temp = __arm_vec_sqrt_f32_neon(temp);
  298. return(vcvtq_n_s32_f32(temp,31));
  299. }
  300. #endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
  301. #ifdef __cplusplus
  302. }
  303. #endif
  304. #endif /* _ARM_VEC_MATH_H */
  305. /**
  306. *
  307. * End of file.
  308. */