// simd-mappings.h — per-architecture SIMD intrinsic mappings for the ggml CPU backend.
  1. #pragma once
  2. #include "ggml-cpu-impl.h"
  3. #ifdef __ARM_FEATURE_SVE
  4. #include <arm_sve.h>
  5. #endif // __ARM_FEATURE_SVE
  6. #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
  7. // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
  8. //
  9. // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
  10. //
  11. #include <arm_neon.h>
  12. #endif
  13. #if defined(__F16C__)
  14. #include <immintrin.h>
  15. #endif
  16. #ifdef __cplusplus
  17. extern "C" {
  18. #endif
  19. //
  20. // simd mappings
  21. //
  22. // FP16 to FP32 conversion
  23. // 16-bit float
  24. // on Arm, we use __fp16
  25. // on x86, we use uint16_t
  26. //
  27. // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
  28. // for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
  29. //
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
// ARM NEON: convert through the native __fp16 type (see the two helpers below).
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
// NOTE(review): only the FP16->FP32 direction is promoted to GGML_CPU_FP16_TO_FP32
// here; GGML_CPU_FP32_TO_FP16 is left to the generic fallback further down,
// even though the comment near the lookup table says both are defined for
// NEON -- confirm this is intentional.
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
  34. static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
  35. __fp16 tmp;
  36. memcpy(&tmp, &h, sizeof(ggml_fp16_t));
  37. return (float)tmp;
  38. }
  39. static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
  40. ggml_fp16_t res;
  41. __fp16 tmp = f;
  42. memcpy(&res, &tmp, sizeof(ggml_fp16_t));
  43. return res;
  44. }
#elif defined(__F16C__)
// x86 F16C: use the hardware vcvtph2ps/vcvtps2ph conversions.
#ifdef _MSC_VER
// MSVC does not provide the scalar _cvtsh_ss/_cvtss_sh helpers,
// so go through the 128-bit vector forms and extract lane 0.
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
// POWER9 VSX: both directions use inline asm (helpers below).
// Unlike the NEON branch above, this branch overrides BOTH
// GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16.
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
// FP16 -> FP32 on POWER9 VSX:
//   mtfprd   moves the raw halfword from a GPR into a VSR,
//   xscvhpdp widens half -> double precision,
//   frsp     rounds double -> single precision.
// The double temporary 'd' is only an asm scratch register.
static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
    float f;
    double d;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */ "=f"(f):
        /* in */ "r"(h));
    return f;
}
// FP32 -> FP16 on POWER9 VSX:
//   xscvdphp narrows to half precision (accepts single or double input),
//   mffprd   moves the result bits from the VSR back to a GPR.
// The double temporary 'd' is only an asm scratch register.
static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
    double d;
    ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */ "=r"(r):
        /* in */ "f"(f));
    return r;
}
  82. #elif defined(__riscv) && defined(__riscv_zfhmin)
// FP16 -> FP32 using the RISC-V Zfhmin scalar instructions:
//   fmv.h.x  moves the raw 16-bit pattern into an FP register,
//   fcvt.s.h widens half -> single precision.
// "=&f" marks f as early-clobber since it is written before h is consumed.
static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
    float f;
    __asm__(
        "fmv.h.x %[f], %[h]\n\t"
        "fcvt.s.h %[f], %[f]"
        : [f] "=&f" (f)
        : [h] "r" (h)
    );
    return f;
}
// FP32 -> FP16 using the RISC-V Zfhmin scalar instructions:
//   fcvt.h.s narrows single -> half precision (in place in the FP register),
//   fmv.x.h  moves the raw 16-bit pattern back to an integer register.
// NOTE(review): [f] is an input operand that the asm also overwrites; this
// relies on f being dead afterwards -- matches the pattern used upstream.
static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
    ggml_fp16_t res;
    __asm__(
        "fcvt.h.s %[f], %[f]\n\t"
        "fmv.x.h %[h], %[f]"
        : [h] "=&r" (res)
        : [f] "f" (f)
    );
    return res;
}
  103. #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
  104. #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
  105. #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
  106. #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
  107. #elif defined(__NNPA__)
  108. #define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
  109. #define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
  110. #define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
  111. #define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
// FP16 -> FP32 using the s390x NNPA vector facility:
// splat h into all lanes, convert from the NNPA fp16 format, then extend
// and extract lane 0.
// NOTE(review): per-lane semantics of vec_extend_to_fp32_hi taken on trust
// from the upstream intrinsic docs -- not re-verified here.
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
    uint16x8_t v_h = vec_splats(h);
    uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
    return vec_extend_to_fp32_hi(v_hd, 0)[0];
}
// FP32 -> FP16 using the s390x NNPA vector facility:
// round the (f, 0) lane pair down to the internal format, convert to fp16,
// and extract lane 0.  v_zero only pads the unused half of the input.
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
    float32x4_t v_f = vec_splats(f);
    float32x4_t v_zero = vec_splats(0.0f);
    uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
    uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
    return vec_extract(v_h, 0);
}
  124. #endif
// precomputed f32 table for f16 (256 KB)
// defined in ggml-cpu.c, initialized in ggml_cpu_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_CPU_FP16_TO_FP32 and GGML_CPU_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
// NOTE(review): the NEON branch above only defines GGML_CPU_FP16_TO_FP32,
// so for NEON the FP32->FP16 direction still takes the generic fallback below
// -- the comment overstates slightly; confirm against upstream.
#if !defined(GGML_CPU_FP16_TO_FP32)
  132. inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  133. uint16_t s;
  134. memcpy(&s, &f, sizeof(uint16_t));
  135. return ggml_table_f32_f16[s];
  136. }
#define GGML_CPU_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_CPU_FP32_TO_FP16)
// NOTE(review): this expands to GGML_COMPUTE_FP32_TO_FP16 (no _CPU_), i.e. the
// generic converter from ggml-impl.h rather than a macro defined in this file.
// There is no lookup table for the f32->f16 direction, so this looks
// intentional, but confirm it is not a typo for GGML_CPU_COMPUTE_FP32_TO_FP16.
#define GGML_CPU_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
  142. // we define a common set of C macros which map to specific intrinsics based on the current architecture
  143. // we then implement the fundamental computation operations below using only these macros
  144. // adding support for new architectures requires to define the corresponding SIMD macros
  145. //
  146. // GGML_F32_STEP / GGML_F16_STEP
  147. // number of elements to process in a single step
  148. //
  149. // GGML_F32_EPR / GGML_F16_EPR
  150. // number of elements to fit in a single register
  151. //
  152. #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_FMA)
  153. #define GGML_SIMD
  154. // F32 SVE
  155. #define GGML_F32_EPR 8
  156. #define DEFAULT_PG svptrue_b32()
  157. #define GGML_F32xt svfloat32_t
  158. #define GGML_F32xt_ZERO svdup_n_f32(0.0f)
  159. #define GGML_F32xt_SET1(x) svdup_n_f32(x)
  160. #define GGML_F32xt_LOAD_IMPL(pg, a, ...) svld1_f32(pg, a)
  161. #define GGML_F32xt_LOAD(...) GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
  162. #define GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
  163. #define GGML_F32xt_STORE(...) GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
  164. #define GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
  165. #define GGML_F32xt_FMA(...) GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
  166. #define GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
  167. #define GGML_F32xt_ADD(...) GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
  168. #define GGML_F32xt_MUL_IMPL(pg, a, b) svmul_f32_m(pg, a, b)
  169. #define GGML_F32xt_MUL(...) GGML_F32xt_MUL_IMPL(DEFAULT_PG, __VA_ARGS__)
  170. #define GGML_F32xt_REDUCE_ONE_IMPL(pg, a) svaddv(pg, a)
  171. #define GGML_F32xt_REDUCE_ONE(...) GGML_F32xt_REDUCE_ONE_IMPL(DEFAULT_PG, __VA_ARGS__)
  172. #define GGML_F32xt_REDUCE_IMPL(pg, res, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8) \
  173. { \
  174. sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum2); \
  175. sum3 = svadd_f32_m(DEFAULT_PG, sum3, sum4); \
  176. sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum6); \
  177. sum7 = svadd_f32_m(DEFAULT_PG, sum7, sum8); \
  178. sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum3); \
  179. sum5 = svadd_f32_m(DEFAULT_PG, sum5, sum7); \
  180. sum1 = svadd_f32_m(DEFAULT_PG, sum1, sum5); \
  181. (res) = (ggml_float) GGML_F32xt_REDUCE_ONE(sum1); \
  182. }
  183. #define GGML_F32xt_REDUCE(...) GGML_F32xt_REDUCE_IMPL(DEFAULT_PG, __VA_ARGS__)
  184. #define GGML_F32_VEC GGML_F32xt
  185. #define GGML_F32_VEC_ZERO GGML_F32xt_ZERO
  186. #define GGML_F32_VEC_SET1 GGML_F32xt_SET1
  187. #define GGML_F32_VEC_LOAD GGML_F32xt_LOAD
  188. #define GGML_F32_VEC_STORE GGML_F32xt_STORE
  189. #define GGML_F32_VEC_FMA GGML_F32xt_FMA
  190. #define GGML_F32_VEC_ADD GGML_F32xt_ADD
  191. #define GGML_F32_VEC_MUL GGML_F32xt_MUL
  192. #define GGML_F32_VEC_REDUCE GGML_F32xt_REDUCE
  193. // F16 NEON
  194. #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
  195. #define GGML_F16_STEP 32
  196. #define GGML_F16_EPR 8
  197. #define GGML_F16x8 float16x8_t
  198. #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
  199. #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
  200. #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
  201. #define GGML_F16x8_STORE vst1q_f16
  202. #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
  203. #define GGML_F16x8_ADD vaddq_f16
  204. #define GGML_F16x8_MUL vmulq_f16
  205. #define GGML_F16x8_REDUCE(res, x) \
  206. do { \
  207. int offset = GGML_F16_ARR >> 1; \
  208. for (int i = 0; i < offset; ++i) { \
  209. (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
  210. } \
  211. offset >>= 1; \
  212. for (int i = 0; i < offset; ++i) { \
  213. (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
  214. } \
  215. offset >>= 1; \
  216. for (int i = 0; i < offset; ++i) { \
  217. (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
  218. } \
  219. const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
  220. const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
  221. (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
  222. } while (0)
  223. #define GGML_F16_VEC GGML_F16x8
  224. #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
  225. #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
  226. #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
  227. #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
  228. #define GGML_F16_VEC_FMA GGML_F16x8_FMA
  229. #define GGML_F16_VEC_ADD GGML_F16x8_ADD
  230. #define GGML_F16_VEC_MUL GGML_F16x8_MUL
  231. #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
  232. #else
  233. // if FP16 vector arithmetic is not supported, we use FP32 instead
  234. // and take advantage of the vcvt_ functions to convert to/from FP16
  235. #define GGML_F16_STEP 16
  236. #define GGML_F16_EPR 4
  237. #define GGML_F32Cx4 float32x4_t
  238. #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
  239. #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
  240. #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
  241. #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
  242. #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
  243. #define GGML_F32Cx4_ADD vaddq_f32
  244. #define GGML_F32Cx4_MUL vmulq_f32
  245. #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
  246. #define GGML_F16_VEC GGML_F32Cx4
  247. #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
  248. #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
  249. #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
  250. #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
  251. #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
  252. #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
  253. #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
  254. #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
  255. #endif
  256. #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
  257. #define GGML_SIMD
  258. // F32 NEON
  259. #define GGML_F32_STEP 16
  260. #define GGML_F32_EPR 4
  261. #define GGML_F32x4 float32x4_t
  262. #define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
  263. #define GGML_F32x4_SET1(x) vdupq_n_f32(x)
  264. #define GGML_F32x4_LOAD vld1q_f32
  265. #define GGML_F32x4_STORE vst1q_f32
  266. #define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
  267. #define GGML_F32x4_ADD vaddq_f32
  268. #define GGML_F32x4_MUL vmulq_f32
  269. #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
  270. #define GGML_F32x4_REDUCE(res, x) \
  271. { \
  272. int offset = GGML_F32_ARR >> 1; \
  273. for (int i = 0; i < offset; ++i) { \
  274. (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
  275. } \
  276. offset >>= 1; \
  277. for (int i = 0; i < offset; ++i) { \
  278. (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
  279. } \
  280. offset >>= 1; \
  281. for (int i = 0; i < offset; ++i) { \
  282. (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
  283. } \
  284. (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
  285. }
  286. #define GGML_F32_VEC GGML_F32x4
  287. #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
  288. #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
  289. #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
  290. #define GGML_F32_VEC_STORE GGML_F32x4_STORE
  291. #define GGML_F32_VEC_FMA GGML_F32x4_FMA
  292. #define GGML_F32_VEC_ADD GGML_F32x4_ADD
  293. #define GGML_F32_VEC_MUL GGML_F32x4_MUL
  294. #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
  295. // F16 NEON
  296. #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
  297. #define GGML_F16_STEP 32
  298. #define GGML_F16_EPR 8
  299. #define GGML_F16x8 float16x8_t
  300. #define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
  301. #define GGML_F16x8_SET1(x) vdupq_n_f16(x)
  302. #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x))
  303. #define GGML_F16x8_STORE vst1q_f16
  304. #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
  305. #define GGML_F16x8_ADD vaddq_f16
  306. #define GGML_F16x8_MUL vmulq_f16
  307. #define GGML_F16x8_REDUCE(res, x) \
  308. do { \
  309. int offset = GGML_F16_ARR >> 1; \
  310. for (int i = 0; i < offset; ++i) { \
  311. (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
  312. } \
  313. offset >>= 1; \
  314. for (int i = 0; i < offset; ++i) { \
  315. (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
  316. } \
  317. offset >>= 1; \
  318. for (int i = 0; i < offset; ++i) { \
  319. (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
  320. } \
  321. const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
  322. const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
  323. (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
  324. } while (0)
  325. #define GGML_F16_VEC GGML_F16x8
  326. #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
  327. #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
  328. #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
  329. #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
  330. #define GGML_F16_VEC_FMA GGML_F16x8_FMA
  331. #define GGML_F16_VEC_ADD GGML_F16x8_ADD
  332. #define GGML_F16_VEC_MUL GGML_F16x8_MUL
  333. #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
  334. #else
  335. // if FP16 vector arithmetic is not supported, we use FP32 instead
  336. // and take advantage of the vcvt_ functions to convert to/from FP16
  337. #define GGML_F16_STEP 16
  338. #define GGML_F16_EPR 4
  339. #define GGML_F32Cx4 float32x4_t
  340. #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
  341. #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
  342. #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
  343. #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
  344. #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
  345. #define GGML_F32Cx4_ADD vaddq_f32
  346. #define GGML_F32Cx4_MUL vmulq_f32
  347. #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
  348. #define GGML_F16_VEC GGML_F32Cx4
  349. #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
  350. #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
  351. #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
  352. #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
  353. #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
  354. #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
  355. #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
  356. #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
  357. #endif
  358. #elif defined(__AVX512F__)
  359. #define GGML_SIMD
  360. // F32 AVX512
  361. #define GGML_F32_STEP 64
  362. #define GGML_F32_EPR 16
  363. #define GGML_F32x16 __m512
  364. #define GGML_F32x16_ZERO _mm512_setzero_ps()
  365. #define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
  366. #define GGML_F32x16_LOAD _mm512_loadu_ps
  367. #define GGML_F32x16_STORE _mm512_storeu_ps
  368. // _mm512_fmadd_ps is defined in AVX512F so no guard is required
  369. #define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
  370. #define GGML_F32x16_ADD _mm512_add_ps
  371. #define GGML_F32x16_MUL _mm512_mul_ps
  372. #define GGML_F32x16_REDUCE(res, x) \
  373. do { \
  374. int offset = GGML_F32_ARR >> 1; \
  375. for (int i = 0; i < offset; ++i) { \
  376. x[i] = _mm512_add_ps(x[i], x[offset+i]); \
  377. } \
  378. offset >>= 1; \
  379. for (int i = 0; i < offset; ++i) { \
  380. x[i] = _mm512_add_ps(x[i], x[offset+i]); \
  381. } \
  382. offset >>= 1; \
  383. for (int i = 0; i < offset; ++i) { \
  384. x[i] = _mm512_add_ps(x[i], x[offset+i]); \
  385. } \
  386. res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
  387. } while (0)
  388. // TODO: is this optimal ?
  389. #define GGML_F32_VEC GGML_F32x16
  390. #define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
  391. #define GGML_F32_VEC_SET1 GGML_F32x16_SET1
  392. #define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
  393. #define GGML_F32_VEC_STORE GGML_F32x16_STORE
  394. #define GGML_F32_VEC_FMA GGML_F32x16_FMA
  395. #define GGML_F32_VEC_ADD GGML_F32x16_ADD
  396. #define GGML_F32_VEC_MUL GGML_F32x16_MUL
  397. #define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
  398. // F16 AVX512
  399. // F16 AVX
  400. #define GGML_F16_STEP 64
  401. #define GGML_F16_EPR 16
  402. // AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
  403. #define GGML_F32Cx16 __m512
  404. #define GGML_F32Cx16_ZERO _mm512_setzero_ps()
  405. #define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
  406. // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
  407. // so F16C guard isn't required
  408. #define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
  409. #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
  410. #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
  411. #define GGML_F32Cx16_ADD _mm512_add_ps
  412. #define GGML_F32Cx16_MUL _mm512_mul_ps
  413. #define GGML_F32Cx16_REDUCE(res, x) \
  414. do { \
  415. int offset = GGML_F32_ARR >> 1; \
  416. for (int i = 0; i < offset; ++i) { \
  417. x[i] = _mm512_add_ps(x[i], x[offset+i]); \
  418. } \
  419. offset >>= 1; \
  420. for (int i = 0; i < offset; ++i) { \
  421. x[i] = _mm512_add_ps(x[i], x[offset+i]); \
  422. } \
  423. offset >>= 1; \
  424. for (int i = 0; i < offset; ++i) { \
  425. x[i] = _mm512_add_ps(x[i], x[offset+i]); \
  426. } \
  427. res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
  428. } while (0)
  429. #define GGML_F16_VEC GGML_F32Cx16
  430. #define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
  431. #define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
  432. #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
  433. #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
  434. #define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
  435. #define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
  436. #define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
  437. #define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
  438. #elif defined(__AVX__)
  439. #define GGML_SIMD
  440. // F32 AVX
  441. #define GGML_F32_STEP 32
  442. #define GGML_F32_EPR 8
  443. #define GGML_F32x8 __m256
  444. #define GGML_F32x8_ZERO _mm256_setzero_ps()
  445. #define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
  446. #define GGML_F32x8_LOAD _mm256_loadu_ps
  447. #define GGML_F32x8_STORE _mm256_storeu_ps
  448. #if defined(__FMA__)
  449. #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
  450. #else
  451. #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
  452. #endif
  453. #define GGML_F32x8_ADD _mm256_add_ps
  454. #define GGML_F32x8_MUL _mm256_mul_ps
  455. #define GGML_F32x8_REDUCE(res, x) \
  456. do { \
  457. int offset = GGML_F32_ARR >> 1; \
  458. for (int i = 0; i < offset; ++i) { \
  459. x[i] = _mm256_add_ps(x[i], x[offset+i]); \
  460. } \
  461. offset >>= 1; \
  462. for (int i = 0; i < offset; ++i) { \
  463. x[i] = _mm256_add_ps(x[i], x[offset+i]); \
  464. } \
  465. offset >>= 1; \
  466. for (int i = 0; i < offset; ++i) { \
  467. x[i] = _mm256_add_ps(x[i], x[offset+i]); \
  468. } \
  469. const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
  470. _mm256_extractf128_ps(x[0], 1)); \
  471. const __m128 t1 = _mm_hadd_ps(t0, t0); \
  472. res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
  473. } while (0)
  474. // TODO: is this optimal ?
  475. #define GGML_F32_VEC GGML_F32x8
  476. #define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
  477. #define GGML_F32_VEC_SET1 GGML_F32x8_SET1
  478. #define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
  479. #define GGML_F32_VEC_STORE GGML_F32x8_STORE
  480. #define GGML_F32_VEC_FMA GGML_F32x8_FMA
  481. #define GGML_F32_VEC_ADD GGML_F32x8_ADD
  482. #define GGML_F32_VEC_MUL GGML_F32x8_MUL
  483. #define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
  484. // F16 AVX
  485. #define GGML_F16_STEP 32
  486. #define GGML_F16_EPR 8
  487. // F16 arithmetic is not supported by AVX, so we use F32 instead
  488. #define GGML_F32Cx8 __m256
  489. #define GGML_F32Cx8_ZERO _mm256_setzero_ps()
  490. #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
  491. #if defined(__F16C__)
  492. // the _mm256_cvt intrinsics require F16C
  493. #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
  494. #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
  495. #else
  496. static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
  497. float tmp[8];
  498. for (int i = 0; i < 8; i++) {
  499. tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
  500. }
  501. return _mm256_loadu_ps(tmp);
  502. }
  503. static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
  504. float arr[8];
  505. _mm256_storeu_ps(arr, y);
  506. for (int i = 0; i < 8; i++)
  507. x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
  508. }
  509. #define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
  510. #define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
  511. #endif
  512. #define GGML_F32Cx8_FMA GGML_F32x8_FMA
  513. #define GGML_F32Cx8_ADD _mm256_add_ps
  514. #define GGML_F32Cx8_MUL _mm256_mul_ps
  515. #define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
  516. #define GGML_F16_VEC GGML_F32Cx8
  517. #define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
  518. #define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
  519. #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
  520. #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
  521. #define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
  522. #define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
  523. #define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
  524. #define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
  525. #elif defined(__POWER9_VECTOR__)
  526. #define GGML_SIMD
  527. // F32 POWER9
  528. #define GGML_F32_STEP 32
  529. #define GGML_F32_EPR 4
  530. #define GGML_F32x4 vector float
  531. #define GGML_F32x4_ZERO {0.0f}
  532. #define GGML_F32x4_SET1 vec_splats
  533. #define GGML_F32x4_LOAD(p) vec_xl(0, p)
  534. #define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
  535. #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
  536. #define GGML_F32x4_ADD vec_add
  537. #define GGML_F32x4_MUL vec_mul
  538. #define GGML_F32x4_REDUCE(res, x) \
  539. { \
  540. int offset = GGML_F32_ARR >> 1; \
  541. for (int i = 0; i < offset; ++i) { \
  542. x[i] = vec_add(x[i], x[offset+i]); \
  543. } \
  544. offset >>= 1; \
  545. for (int i = 0; i < offset; ++i) { \
  546. x[i] = vec_add(x[i], x[offset+i]); \
  547. } \
  548. offset >>= 1; \
  549. for (int i = 0; i < offset; ++i) { \
  550. x[i] = vec_add(x[i], x[offset+i]); \
  551. } \
  552. res = vec_extract(x[0], 0) + \
  553. vec_extract(x[0], 1) + \
  554. vec_extract(x[0], 2) + \
  555. vec_extract(x[0], 3); \
  556. }
  557. #define GGML_F32_VEC GGML_F32x4
  558. #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
  559. #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
  560. #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
  561. #define GGML_F32_VEC_STORE GGML_F32x4_STORE
  562. #define GGML_F32_VEC_FMA GGML_F32x4_FMA
  563. #define GGML_F32_VEC_ADD GGML_F32x4_ADD
  564. #define GGML_F32_VEC_MUL GGML_F32x4_MUL
  565. #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
  566. // F16 POWER9
  567. #define GGML_F16_STEP GGML_F32_STEP
  568. #define GGML_F16_EPR GGML_F32_EPR
  569. #define GGML_F16_VEC GGML_F32x4
  570. #define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
  571. #define GGML_F16_VEC_SET1 GGML_F32x4_SET1
  572. #define GGML_F16_VEC_FMA GGML_F32x4_FMA
  573. #define GGML_F16_VEC_ADD GGML_F32x4_ADD
  574. #define GGML_F16_VEC_MUL GGML_F32x4_MUL
  575. #define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// Use vec_xl, not vec_ld, in case the load address is not aligned.
// Even steps (i & 1 == 0) widen the LOW 4 halves of the 8-halfword chunk at p;
// odd steps widen the HIGH 4 halves of the PREVIOUS chunk (p - GGML_F16_EPR).
// NOTE(review): assumes the caller advances p by GGML_F16_EPR per step so that
// each 8-halfword chunk is loaded once and consumed in two steps -- confirm.
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
  vec_extract_fp32_from_shortl(vec_xl(0, p))
  580. static inline unsigned char ggml_endian_byte(int i) {
  581. uint16_t tmp_val = 1;
  582. return ((unsigned char *)&tmp_val)[i];
  583. }
#define GGML_ENDIAN_BYTE(i) ggml_endian_byte(i)
// Store only on odd steps: pack the (r[i-1], r[i]) pair of f32 vectors back
// into 8 halves and write them at p - GGML_F16_EPR (the chunk start).
// GGML_ENDIAN_BYTE selects which of the two registers supplies the first
// argument so the halfword order matches host endianness.
#define GGML_F16_VEC_STORE(p, r, i) \
  if (i & 0x1) \
    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
                                   r[i - GGML_ENDIAN_BYTE(0)]), \
            0, p - GGML_F16_EPR)
  590. #elif defined(__wasm_simd128__)
  591. #define GGML_SIMD
  592. // F32 WASM
  593. #define GGML_F32_STEP 16
  594. #define GGML_F32_EPR 4
  595. #define GGML_F32x4 v128_t
  596. #define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
  597. #define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
  598. #define GGML_F32x4_LOAD wasm_v128_load
  599. #define GGML_F32x4_STORE wasm_v128_store
  600. #define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
  601. #define GGML_F32x4_ADD wasm_f32x4_add
  602. #define GGML_F32x4_MUL wasm_f32x4_mul
  603. #define GGML_F32x4_REDUCE(res, x) \
  604. { \
  605. int offset = GGML_F32_ARR >> 1; \
  606. for (int i = 0; i < offset; ++i) { \
  607. x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  608. } \
  609. offset >>= 1; \
  610. for (int i = 0; i < offset; ++i) { \
  611. x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  612. } \
  613. offset >>= 1; \
  614. for (int i = 0; i < offset; ++i) { \
  615. x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
  616. } \
  617. res = wasm_f32x4_extract_lane(x[0], 0) + \
  618. wasm_f32x4_extract_lane(x[0], 1) + \
  619. wasm_f32x4_extract_lane(x[0], 2) + \
  620. wasm_f32x4_extract_lane(x[0], 3); \
  621. }
  622. #define GGML_F32_VEC GGML_F32x4
  623. #define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
  624. #define GGML_F32_VEC_SET1 GGML_F32x4_SET1
  625. #define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
  626. #define GGML_F32_VEC_STORE GGML_F32x4_STORE
  627. #define GGML_F32_VEC_FMA GGML_F32x4_FMA
  628. #define GGML_F32_VEC_ADD GGML_F32x4_ADD
  629. #define GGML_F32_VEC_MUL GGML_F32x4_MUL
  630. #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
  631. // F16 WASM
  632. #define GGML_F16_STEP 16
  633. #define GGML_F16_EPR 4
  634. inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
  635. float tmp[4];
  636. tmp[0] = GGML_CPU_FP16_TO_FP32(p[0]);
  637. tmp[1] = GGML_CPU_FP16_TO_FP32(p[1]);
  638. tmp[2] = GGML_CPU_FP16_TO_FP32(p[2]);
  639. tmp[3] = GGML_CPU_FP16_TO_FP32(p[3]);
  640. return wasm_v128_load(tmp);
  641. }
  642. inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
  643. float tmp[4];
  644. wasm_v128_store(tmp, x);
  645. p[0] = GGML_CPU_FP32_TO_FP16(tmp[0]);
  646. p[1] = GGML_CPU_FP32_TO_FP16(tmp[1]);
  647. p[2] = GGML_CPU_FP32_TO_FP16(tmp[2]);
  648. p[3] = GGML_CPU_FP32_TO_FP16(tmp[3]);
  649. }
// WASM has no native f16 arithmetic: f16 vectors are computed as f32x4,
// converting only at the load/store boundary (see __wasm_f16x4_load/store)
#define GGML_F16x4             v128_t
#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA         GGML_F32x4_FMA
#define GGML_F16x4_ADD         wasm_f32x4_add
#define GGML_F16x4_MUL         wasm_f32x4_mul
// pairwise tree-reduction of the GGML_F16_ARR accumulators into x[0],
// then a horizontal sum of the four lanes of x[0]
#define GGML_F16x4_REDUCE(res, x)                           \
{                                                           \
    int offset = GGML_F16_ARR >> 1;                         \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    offset >>= 1;                                           \
    for (int i = 0; i < offset; ++i) {                      \
        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
    }                                                       \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +  \
          wasm_f32x4_extract_lane(x[0], 1) +                \
          wasm_f32x4_extract_lane(x[0], 2) +                \
          wasm_f32x4_extract_lane(x[0], 3));                \
}

// map the generic GGML_F16_VEC_* API onto the f16-as-f32 implementation
#define GGML_F16_VEC                GGML_F16x4
#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
#elif defined(__SSE3__)

#define GGML_SIMD

// F32 SSE

// elements processed per step / per 128-bit register
#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
#if defined(__FMA__)
// fused a + b*c; TODO: Does this work?
#define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
// no FMA unit: emulate a + b*c with separate multiply and add
#define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD _mm_add_ps
#define GGML_F32x4_MUL _mm_mul_ps
// pairwise tree-reduction of the GGML_F32_ARR accumulators into x[0],
// then two hadd passes to sum the four lanes
#define GGML_F32x4_REDUCE(res, x)                          \
{                                                          \
    int offset = GGML_F32_ARR >> 1;                        \
    for (int i = 0; i < offset; ++i) {                     \
        x[i] = _mm_add_ps(x[i], x[offset+i]);              \
    }                                                      \
    offset >>= 1;                                          \
    for (int i = 0; i < offset; ++i) {                     \
        x[i] = _mm_add_ps(x[i], x[offset+i]);              \
    }                                                      \
    offset >>= 1;                                          \
    for (int i = 0; i < offset; ++i) {                     \
        x[i] = _mm_add_ps(x[i], x[offset+i]);              \
    }                                                      \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);             \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \
}
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 SSE

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4
  734. static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
  735. float tmp[4];
  736. tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
  737. tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
  738. tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
  739. tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
  740. return _mm_loadu_ps(tmp);
  741. }
  742. static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
  743. float arr[4];
  744. _mm_storeu_ps(arr, y);
  745. x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
  746. x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
  747. x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
  748. x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
  749. }
// "F32Cx4" = f16 data computed as f32 ("C" = converted), with fp16<->fp32
// conversion at the load/store boundary via the helpers above
#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

// map the generic GGML_F16_VEC_* API onto the converted-f32 implementation
#define GGML_F16_VEC                GGML_F32Cx4
#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#elif defined(__loongarch_asx)

#define GGML_SIMD

// F32 LASX

// elements processed per step / per 256-bit register
#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8         __m256
#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD __lasx_xvfadd_s
#define GGML_F32x8_MUL __lasx_xvfmul_s
// pairwise tree-reduction of the GGML_F32_ARR accumulators into x[0],
// then a scalar sum of its eight lanes through a reinterpreting pointer
#define GGML_F32x8_REDUCE(res, x)                                                                 \
do {                                                                                              \
    int offset = GGML_F32_ARR >> 1;                                                               \
    for (int i = 0; i < offset; ++i) {                                                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                                                \
    }                                                                                             \
    offset >>= 1;                                                                                 \
    for (int i = 0; i < offset; ++i) {                                                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                                                \
    }                                                                                             \
    offset >>= 1;                                                                                 \
    for (int i = 0; i < offset; ++i) {                                                            \
        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                                                \
    }                                                                                             \
    float *tmp_p = (float *)&x[0];                                                                \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE

// F16 LASX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8         __m256
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
// NOTE(review): uses the general-register replicate (xvreplgr2vr_w) while the
// plain F32 path uses xvreplfr2vr_s — confirm this broadcast is intended for a
// float argument
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
// Load 8 consecutive fp16 values and widen them to 8 f32 lanes.
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    __m256i a;
    // only the low 16 bytes of `a` are initialized with the 8 fp16 values
    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
    // NOTE(review): presumably rearranges the 64-bit groups so the widening
    // convert below sees all 8 halves — confirm against the LASX xvpermi_d
    // and xvfcvtl_s_h instruction descriptions
    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
    return __lasx_xvfcvtl_s_h(a);
}
// Narrow 8 f32 lanes to 8 fp16 values and store them at x.
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    // convert both halves of y to fp16 (xvfcvt_h_s packs two f32 vectors)
    __m256i a = __lasx_xvfcvt_h_s(y, y);
    // NOTE(review): presumably gathers the 8 fp16 results into the low 128
    // bits before the 16-byte memcpy below — confirm against LASX docs
    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
}
#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

// map the generic GGML_F16_VEC_* API onto the f16-as-f32 LASX implementation
#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
#elif defined(__loongarch_sx)

#define GGML_SIMD

// F32 LSX

// elements processed per step / per 128-bit register
#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4      __m128
#define GGML_F32x4_ZERO __lsx_vldi(0)
// NOTE(review): inserts x into lane 0 of a zero vector rather than
// broadcasting to all four lanes (contrast GGML_F32x8_SET1 on LASX, which
// uses the float-replicate intrinsic) — verify this is intentional
#define GGML_F32x4_SET1(x)      __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x)      __lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y)  __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
#define GGML_F32x4_MUL __lsx_vfmul_s
// pairwise tree-reduction of the GGML_F32_ARR accumulators into x[0],
// then a shift/add/shuffle horizontal sum of its four lanes
#define GGML_F32x4_REDUCE(res, x)                                         \
{                                                                         \
    int offset = GGML_F32_ARR >> 1;                                       \
    for (int i = 0; i < offset; ++i) {                                    \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
    }                                                                     \
    offset >>= 1;                                                         \
    for (int i = 0; i < offset; ++i) {                                    \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
    }                                                                     \
    offset >>= 1;                                                         \
    for (int i = 0; i < offset; ++i) {                                    \
        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                        \
    }                                                                     \
    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32);                      \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                         \
    tmp = __lsx_vsrli_d((__m128i) t0, 32);                                \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
    res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
}
// NOTE(review): vpickve2gr_w extracts the lane as an integer; confirm the
// cast to ggml_float yields the intended float value rather than converting
// the raw bit pattern numerically

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 LSX

#define GGML_F16_STEP 32
#define GGML_F16_EPR  4
  889. static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
  890. float tmp[4];
  891. tmp[0] = GGML_CPU_FP16_TO_FP32(x[0]);
  892. tmp[1] = GGML_CPU_FP16_TO_FP32(x[1]);
  893. tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
  894. tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);
  895. return __lsx_vld(tmp, 0);
  896. }
  897. static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
  898. float arr[4];
  899. __lsx_vst(y, arr, 0);
  900. x[0] = GGML_CPU_FP32_TO_FP16(arr[0]);
  901. x[1] = GGML_CPU_FP32_TO_FP16(arr[1]);
  902. x[2] = GGML_CPU_FP32_TO_FP16(arr[2]);
  903. x[3] = GGML_CPU_FP32_TO_FP16(arr[3]);
  904. }
// f16 data computed as f32 ("C" = converted), with fp16<->fp32 conversion
// at the load/store boundary via the helpers above
#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         __lsx_vfadd_s
#define GGML_F32Cx4_MUL         __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

// map the generic GGML_F16_VEC_* API onto the f16-as-f32 LSX implementation
#define GGML_F16_VEC                GGML_F32Cx4
#define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
#elif defined(__VXE__) || defined(__VXE2__)

#define GGML_SIMD

// F32 s390x

// elements processed per step / per 128-bit register
#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              float32x4_t
#define GGML_F32x4_ZERO         vec_splats(0.0f)
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul
// pairwise tree-reduction of the GGML_F32_ARR accumulators into x[0], then
// a lane fold: adding the reversed vector gives lane sums (0+3) and (1+2),
// so tmp[0] + tmp[1] is the total of all four lanes
#define GGML_F32x4_REDUCE(res, x)                \
{                                                \
    int offset = GGML_F32_ARR >> 1;              \
    for (int i = 0; i < offset; ++i) {           \
        x[i] = vec_add(x[i], x[offset + i]);     \
    }                                            \
    offset >>= 1;                                \
    for (int i = 0; i < offset; ++i) {           \
        x[i] = vec_add(x[i], x[offset + i]);     \
    }                                            \
    offset >>= 1;                                \
    for (int i = 0; i < offset; ++i) {           \
        x[i] = vec_add(x[i], x[offset + i]);     \
    }                                            \
    float32x4_t tmp = x[0] + vec_reve(x[0]);     \
    res = tmp[0] + tmp[1];                       \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE

// F16 s390x
// f16 is processed with the same geometry as f32 (converted on load/store)
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR  GGML_F32_EPR
// Load four consecutive fp16 values and widen them to an f32x4 vector.
// Uses the NNPA conversion instructions when available; otherwise converts
// scalar-by-scalar through a staging buffer.
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
#if defined(__NNPA__)
    uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
    // fp16 -> DLFloat16 intermediate, then widen the high half to fp32
    uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
    return vec_extend_to_fp32_hi(v_xd, 0);
#else
    float tmp[4];
    for (int i = 0; i < 4; i++) {
        tmp[i] = GGML_CPU_FP16_TO_FP32(x[i]);
    }
    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    return vec_xl(0, (const float *)(tmp));
#endif
}
// Narrow an f32x4 vector to four fp16 values and store them at x.
// Uses the NNPA conversion instructions when available; otherwise converts
// scalar-by-scalar through a staging buffer.
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
#if defined(__NNPA__)
    float32x4_t v_zero = vec_splats(0.0f);
    // fp32 -> DLFloat16 intermediate (second operand padded with zeros),
    // then convert to fp16 and extract the four result lanes
    uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
    uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
    x[0] = vec_extract(v_x, 0);
    x[1] = vec_extract(v_x, 1);
    x[2] = vec_extract(v_x, 2);
    x[3] = vec_extract(v_x, 3);
#else
    float arr[4];
    // note: keep type-cast here to prevent compiler bugs
    // see: https://github.com/ggml-org/llama.cpp/issues/12846
    vec_xst(v_y, 0, (float *)(arr));
    for (int i = 0; i < 4; i++) {
        x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
    }
#endif
}
// map the generic GGML_F16_VEC_* API onto the f16-as-f32 s390x implementation
#define GGML_F16_VEC                GGML_F32x4
#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE

#endif

// GGML_F32_ARR / GGML_F16_ARR
// number of registers to use per step
#ifdef GGML_SIMD
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
  1015. #ifdef __cplusplus
  1016. }
  1017. #endif