#pragma once

#include "ggml-cpu-impl.h"

//
// simd mappings
//
// we define a common set of C macros which map to specific intrinsics based on the current architecture
// we then implement the fundamental computation operations below using only these macros
// adding support for new architectures requires defining the corresponding SIMD macros
//
// GGML_F32_STEP / GGML_F16_STEP
//   number of elements to process in a single step
//
// GGML_F32_EPR / GGML_F16_EPR
//   number of elements that fit in a single register
//
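// For illustration only (not one of the mappings): a kernel built on top of these macros
// typically processes GGML_F32_STEP elements per iteration using GGML_F32_ARR accumulator
// registers and reduces them at the end. The function name and loop below are a hypothetical
// sketch of that pattern:
//
//   void example_vec_dot_f32(int n, float * s, const float * x, const float * y) {
//       ggml_float sumf = 0.0;
//       const int np = (n & ~(GGML_F32_STEP - 1));   // bulk part, a multiple of GGML_F32_STEP
//
//       GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
//       GGML_F32_VEC ax[GGML_F32_ARR];
//       GGML_F32_VEC ay[GGML_F32_ARR];
//
//       for (int i = 0; i < np; i += GGML_F32_STEP) {
//           for (int j = 0; j < GGML_F32_ARR; j++) {
//               ax[j]  = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
//               ay[j]  = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
//               sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
//           }
//       }
//
//       GGML_F32_VEC_REDUCE(sumf, sum);              // horizontal sum of all accumulators
//
//       for (int i = np; i < n; ++i) {               // leftover elements, handled scalar
//           sumf += (ggml_float)(x[i]*y[i]);
//       }
//
//       *s = sumf;
//   }
//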
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)

#define GGML_SIMD

// F32 NEON

#define GGML_F32_STEP 16
#define GGML_F32_EPR 4

#define GGML_F32x4 float32x4_t
#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
#define GGML_F32x4_LOAD vld1q_f32
#define GGML_F32x4_STORE vst1q_f32
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
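// GGML_F32x4_REDUCE sums the GGML_F32_ARR accumulator registers pairwise (x[i] += x[offset+i]),
// halving the number of live registers in each unrolled pass, and finishes with a horizontal
// add of the lanes of x[0]. With GGML_F32_STEP 16 and GGML_F32_EPR 4 there are 4 accumulators,
// so only the first two passes do any work and the third runs zero iterations.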
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
    } \
    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 NEON

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)

#define GGML_F16_STEP 32
#define GGML_F16_EPR 8

#define GGML_F16x8 float16x8_t
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
#define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
#define GGML_F16x8_STORE vst1q_f16
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
#define GGML_F16x8_ADD vaddq_f16
#define GGML_F16x8_MUL vmulq_f16
#define GGML_F16x8_REDUCE(res, x) \
do { \
    int offset = GGML_F16_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
    } \
    const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
    const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
    (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
} while (0)

#define GGML_F16_VEC GGML_F16x8
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE

#else

// if FP16 vector arithmetic is not supported, we use FP32 instead
// and take advantage of the vcvt_ functions to convert to/from FP16

#define GGML_F16_STEP 16
#define GGML_F16_EPR 4

#define GGML_F32Cx4 float32x4_t
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32Cx4_ADD vaddq_f32
#define GGML_F32Cx4_MUL vmulq_f32
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE

#endif
#elif defined(__AVX512F__)

#define GGML_SIMD

// F32 AVX512

#define GGML_F32_STEP 64
#define GGML_F32_EPR 16

#define GGML_F32x16 __m512
#define GGML_F32x16_ZERO _mm512_setzero_ps()
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
#define GGML_F32x16_LOAD _mm512_loadu_ps
#define GGML_F32x16_STORE _mm512_storeu_ps
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32x16_ADD _mm512_add_ps
#define GGML_F32x16_MUL _mm512_mul_ps
#define GGML_F32x16_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x16
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
// F16 AVX512

#define GGML_F16_STEP 64
#define GGML_F16_EPR 16

// AVX512 has an FP16 extension (AVX512_FP16), but I don't have it on my machine, so I use FP32 instead
#define GGML_F32Cx16 __m512
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)

// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
// so F16C guard isn't required
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))

#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
#define GGML_F32Cx16_ADD _mm512_add_ps
#define GGML_F32Cx16_MUL _mm512_mul_ps
#define GGML_F32Cx16_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm512_add_ps(x[i], x[offset+i]); \
    } \
    res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
} while (0)

#define GGML_F16_VEC GGML_F32Cx16
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
#elif defined(__AVX__)

#define GGML_SIMD

// F32 AVX

#define GGML_F32_STEP 32
#define GGML_F32_EPR 8

#define GGML_F32x8 __m256
#define GGML_F32x8_ZERO _mm256_setzero_ps()
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
#define GGML_F32x8_LOAD _mm256_loadu_ps
#define GGML_F32x8_STORE _mm256_storeu_ps
#if defined(__FMA__)
#define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
#else
#define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
#endif
#define GGML_F32x8_ADD _mm256_add_ps
#define GGML_F32x8_MUL _mm256_mul_ps
#define GGML_F32x8_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm256_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
                                 _mm256_extractf128_ps(x[0], 1)); \
    const __m128 t1 = _mm_hadd_ps(t0, t0); \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x8
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 AVX

#define GGML_F16_STEP 32
#define GGML_F16_EPR 8

// F16 arithmetic is not supported by AVX, so we use F32 instead

#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)

#if defined(__F16C__)
// the _mm256_cvt intrinsics require F16C
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
#else
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];
    for (int i = 0; i < 8; i++) {
        tmp[i] = GGML_FP16_TO_FP32(x[i]);
    }
    return _mm256_loadu_ps(tmp);
}
static inline void __avx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    float arr[8];
    _mm256_storeu_ps(arr, y);
    for (int i = 0; i < 8; i++)
        x[i] = GGML_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
#endif

#define GGML_F32Cx8_FMA GGML_F32x8_FMA
#define GGML_F32Cx8_ADD _mm256_add_ps
#define GGML_F32Cx8_MUL _mm256_mul_ps
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE

#define GGML_F16_VEC GGML_F32Cx8
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
#elif defined(__POWER9_VECTOR__)

#define GGML_SIMD

// F32 POWER9

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 vector float
#define GGML_F32x4_ZERO {0.0f}
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD vec_add
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset+i]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
          vec_extract(x[0], 2) + \
          vec_extract(x[0], 3); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 POWER9

#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR
#define GGML_F16_VEC GGML_F32x4
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
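// The F16 load/store mappings take the accumulator index i because each 16-byte vector load
// here covers 8 FP16 values, i.e. two GGML_F16_EPR-wide registers worth of data: an even i
// extracts the low half of that block and an odd i the high half (see GGML_F16_VEC_LOAD below),
// while stores only happen on an odd i, packing registers i-1 and i back into one block of
// 8 values in an endian-aware order via GGML_ENDIAN_BYTE.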
// Use vec_xl, not vec_ld, in case the load address is not aligned.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
    vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
    vec_extract_fp32_from_shortl(vec_xl(0, p))

static inline unsigned char ggml_endian_byte(int i) {
    uint16_t tmp_val = 1;
    return ((unsigned char *)&tmp_val)[i];
}
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)

#define GGML_F16_VEC_STORE(p, r, i) \
    if (i & 0x1) \
        vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
                                       r[i - GGML_ENDIAN_BYTE(0)]), \
                0, p - GGML_F16_EPR)
#elif defined(__wasm_simd128__)

#define GGML_SIMD

// F32 WASM

#define GGML_F32_STEP 16
#define GGML_F32_EPR 4

#define GGML_F32x4 v128_t
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_F32x4_LOAD wasm_v128_load
#define GGML_F32x4_STORE wasm_v128_store
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
#define GGML_F32x4_ADD wasm_f32x4_add
#define GGML_F32x4_MUL wasm_f32x4_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = wasm_f32x4_extract_lane(x[0], 0) + \
          wasm_f32x4_extract_lane(x[0], 1) + \
          wasm_f32x4_extract_lane(x[0], 2) + \
          wasm_f32x4_extract_lane(x[0], 3); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 WASM

#define GGML_F16_STEP 16
#define GGML_F16_EPR 4

inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
    float tmp[4];
    tmp[0] = GGML_FP16_TO_FP32(p[0]);
    tmp[1] = GGML_FP16_TO_FP32(p[1]);
    tmp[2] = GGML_FP16_TO_FP32(p[2]);
    tmp[3] = GGML_FP16_TO_FP32(p[3]);
    return wasm_v128_load(tmp);
}

inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
    float tmp[4];
    wasm_v128_store(tmp, x);
    p[0] = GGML_FP32_TO_FP16(tmp[0]);
    p[1] = GGML_FP32_TO_FP16(tmp[1]);
    p[2] = GGML_FP32_TO_FP16(tmp[2]);
    p[3] = GGML_FP32_TO_FP16(tmp[3]);
}

#define GGML_F16x4 v128_t
#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA GGML_F32x4_FMA
#define GGML_F16x4_ADD wasm_f32x4_add
#define GGML_F16x4_MUL wasm_f32x4_mul
#define GGML_F16x4_REDUCE(res, x) \
{ \
    int offset = GGML_F16_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
    } \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) + \
                        wasm_f32x4_extract_lane(x[0], 1) + \
                        wasm_f32x4_extract_lane(x[0], 2) + \
                        wasm_f32x4_extract_lane(x[0], 3)); \
}

#define GGML_F16_VEC GGML_F16x4
#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F16x4_FMA
#define GGML_F16_VEC_ADD GGML_F16x4_ADD
#define GGML_F16_VEC_MUL GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE
#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD _mm_loadu_ps
#define GGML_F32x4_STORE _mm_storeu_ps
#if defined(__FMA__)
// TODO: Does this work?
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = _mm_add_ps(x[i], x[offset+i]); \
    } \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];
    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);
    return _mm_loadu_ps(tmp);
}

static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];
    _mm_storeu_ps(arr, y);
    x[0] = GGML_FP32_TO_FP16(arr[0]);
    x[1] = GGML_FP32_TO_FP16(arr[1]);
    x[2] = GGML_FP32_TO_FP16(arr[2]);
    x[3] = GGML_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD _mm_add_ps
#define GGML_F32Cx4_MUL _mm_mul_ps
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX

#define GGML_F32_STEP 32
#define GGML_F32_EPR 8

#define GGML_F32x8 __m256
#define GGML_F32x8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x, y) __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD __lasx_xvfadd_s
#define GGML_F32x8_MUL __lasx_xvfmul_s
#define GGML_F32x8_REDUCE(res, x) \
do { \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]); \
    } \
    float *tmp_p = (float *)&x[0]; \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC GGML_F32x8
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR 8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}

static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}

#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
#define GGML_F32Cx8_ADD __lasx_xvfadd_s
#define GGML_F32Cx8_MUL __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE

#define GGML_F16_VEC GGML_F32Cx8
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO __lsx_vldi(0)
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0), (x), 0)
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst((y), (x), 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
    } \
    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];
    tmp[0] = GGML_FP16_TO_FP32(x[0]);
    tmp[1] = GGML_FP16_TO_FP32(x[1]);
    tmp[2] = GGML_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_FP16_TO_FP32(x[3]);
    return __lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];
    __lsx_vst(y, arr, 0);
    x[0] = GGML_FP32_TO_FP16(arr[0]);
    x[1] = GGML_FP32_TO_FP16(arr[1]);
    x[2] = GGML_FP32_TO_FP16(arr[2]);
    x[3] = GGML_FP32_TO_FP16(arr[3]);
}

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0), (x), 0)
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD __lsx_vfadd_s
#define GGML_F32Cx4_MUL __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE

#define GGML_F16_VEC GGML_F32Cx4
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

#define GGML_F32_STEP 32
#define GGML_F32_EPR 4

#define GGML_F32x4 __vector float
#define GGML_F32x4_ZERO vec_splats(0.0f)
#define GGML_F32x4_SET1 vec_splats
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD vec_add
#define GGML_F32x4_MUL vec_mul
#define GGML_F32x4_REDUCE(res, x) \
{ \
    int offset = GGML_F32_ARR >> 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    offset >>= 1; \
    for (int i = 0; i < offset; ++i) { \
        x[i] = vec_add(x[i], x[offset + i]); \
    } \
    res = vec_extract(x[0], 0) + \
          vec_extract(x[0], 1) + \
          vec_extract(x[0], 2) + \
          vec_extract(x[0], 3); \
}

#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 s390x

#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR GGML_F32_EPR

static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
    float tmp[4];
    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_FP16_TO_FP32(x[i]);
    }
    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
}

static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
    float arr[4];
    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    vec_xst(y, 0, (float *)(arr));
    for (int i = 0; i < 4; i++) {
        x[i] = GGML_FP32_TO_FP16(arr[i]);
    }
}

#define GGML_F16_VEC GGML_F32x4
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE

#endif
// GGML_F32_ARR / GGML_F16_ARR
//   number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
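
// Illustrative note (not part of the mappings): on the NEON path above, GGML_F32_STEP is 16
// and GGML_F32_EPR is 4, so GGML_F32_ARR evaluates to 4, meaning each unrolled step keeps
// 4 accumulator registers live and processes 16 floats. A kernel built on these macros could
// sanity-check the invariant at compile time, for example:
//
//   #ifdef GGML_SIMD
//   _Static_assert(GGML_F32_STEP == GGML_F32_ARR*GGML_F32_EPR, "inconsistent F32 step/unroll");
//   _Static_assert(GGML_F16_STEP == GGML_F16_ARR*GGML_F16_EPR, "inconsistent F16 step/unroll");
//   #endif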