// Vectorized functions for fundamental operations

#pragma once

#include "ggml-impl.h"
#include "simd-mappings.h"

#include "ggml.h"
#include "ggml-cpu.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

// floating point type used to accumulate sums
typedef double ggml_float;

#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16

#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL  2
#define GGML_VEC_MAD_UNROLL  32
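
// Note (added for clarity): GGML_VEC_DOT_UNROLL and GGML_VEC_MAD_UNROLL set how many
// rows ggml_vec_dot_f16_unroll and ggml_vec_mad_f32_unroll (defined below) process per
// call; GGML_SOFT_MAX_UNROLL is consumed by the soft-max kernels outside this header.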

#ifdef __cplusplus
extern "C" {
#endif

//
// global data
//

// precomputed gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];

// precomputed quick gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
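
// The tables are indexed by the raw 16-bit pattern of an fp16 value, so a lookup amounts
// to reinterpreting the half-precision input as a uint16_t; this is the same pattern used
// by ggml_vec_gelu_f32 below. Illustrative sketch only:
//
//   // ggml_fp16_t h = GGML_CPU_FP32_TO_FP16(x);
//   // uint16_t    t; memcpy(&t, &h, sizeof(t));
//   // float       y = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);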

//
// fundamental operations
//

void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);

inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX2__)
    // process 8 floats per iteration with AVX2; any remainder falls through to the scalar loop
    for (; i + 7 < n; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        __m256 vz = _mm256_add_ps(vx, vy);
        _mm256_storeu_ps(z + i, vz);
    }
#endif
    // scalar path (and leftover elements when AVX2 is enabled)
    for (; i < n; ++i) {
        z[i] = x[i] + y[i];
    }
}

inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }

inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }

inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
    }
}

inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }

inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }

inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
    }

#if defined(GGML_SIMD)
#if defined(__riscv_v_intrinsic)
    // todo: RVV impl
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);

                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
            }
        }
    }

    // reduce the partial sums of each unrolled row into sumf[k]
    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#endif
#else
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#endif

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        s[i] = (float)sumf[i];
    }
}
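
// Illustrative use (sketch, not from the original source): computing the dot product of
// one fp16 vector y against GGML_VEC_DOT_UNROLL consecutive fp16 rows stored back-to-back
// with a row stride of `row_bytes`:
//
//   // float s[GGML_VEC_DOT_UNROLL];
//   // ggml_vec_dot_f16_unroll(n, /*xs=*/row_bytes, s, /*xv=*/rows, y);
//   // s[k] now holds dot(rows + k*row_bytes, y)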

inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // 32-bit elements per register: SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 8 * ggml_f32_epr; // choose 8 SVE registers

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

        ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

        ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
        GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

        ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
        GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

        ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
        GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

        ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
        GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

        ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
        GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

        ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
        GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
    }
    // leftovers
    // since the loop above unrolls by 8, the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below
    const int np2 = (n & ~(ggml_f32_epr - 1));
    for (int i = np; i < np2; i += ggml_f32_epr) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);
    }
    // the number of remaining elements is less than ggml_f32_epr; apply a predicated svmad to the available elements only
    if (np2 < n) {
        svbool_t pg = svwhilelt_b32(np2, n);
        ax1 = svld1_f32(pg, x + np2);
        ay1 = svld1_f32(pg, y + np2);
        ay1 = svmad_f32_m(pg, ax1, vx, ay1);
        svst1_f32(pg, y + np2, ay1);
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}
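
// Illustrative use (sketch, not from the original source): ggml_vec_mad_f32 is the
// axpy-style primitive y += v*x, e.g. accumulating a scaled source row into a
// destination row:
//
//   // ggml_vec_mad_f32(n, dst_row, src_row, weight);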

inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
#if defined(__riscv_v_intrinsic)
    // todo: RVV impl
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
#else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
#endif
}

// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + i*xs);
        v[i] = (const float *) ((const char *) vv + i*vs);
    }

#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    // route to the scalar implementation // TODO: write SVE code
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
        }
        __riscv_vse32_v_f32m8(&y[i], ay, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
            }
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
}
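
// Illustrative use (sketch, not from the original source): accumulate GGML_VEC_MAD_UNROLL
// scaled rows into y in one call, where row k of xv starts at k*xs bytes and its scalar
// coefficient is the first float found at k*vs bytes:
//
//   // ggml_vec_mad_f32_unroll(n, row_stride_bytes, coef_stride_bytes, y, rows, coefs);
//   // equivalent to: for each k, y[i] += rows_k[i] * coefs_k[0]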

// y[i] = x[i]*s + b
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    // scalar; TODO: write SVE code
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], vs, vb);
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
}

//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // 32-bit elements per register: SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 2 * ggml_f32_epr;

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ay1;
    svfloat32_t ay2;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_MUL(ay1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_MUL(ay2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
    }
    // leftovers
    // the number of remaining elements is less than ggml_f32_epr; apply a predicated svmul to the available elements only
    if (np < n) {
        svbool_t pg = svwhilelt_b32(np, n);
        ay1 = svld1_f32(pg, y + np);
        ay1 = svmul_f32_m(pg, ay1, vx);
        svst1_f32(pg, y + np, ay1);
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}

inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD)
#if defined(__riscv_v_intrinsic)
    // todo: RVV impl
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#endif
}

inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }

inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }
inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
    }
}

inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }
inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }
inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }
inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
    }
}

inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
    }
}

inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        // match the f32 version above: ELU is the identity for positive inputs
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
    }
}

inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
    }
}

inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
    }
}

inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
    }
}

// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
    }
}

inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
    }
}

inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

static const float GELU_COEF_A     = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
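
// Note (added for clarity): this is the usual tanh approximation of
// GELU(x) = 0.5*x*(1 + erf(x/sqrt(2))); the exact erf form is computed by
// ggml_vec_gelu_erf_f32 / ggml_vec_gelu_erf_f16 below using SQRT_2_INV.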

inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
        y[i] = GGML_CPU_FP32_TO_FP16(res);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
        }
    }
}
#else
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
    }
}

inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
//    const uint16_t * i16 = (const uint16_t *) x;
//    for (int i = 0; i < n; ++i) {
//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
//    }
//}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
}
#else
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
    }
}

// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
    return x/(1.0f + expf(-x));
}

inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    float v = GGML_CPU_FP16_TO_FP32(x);
    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
}
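
// Note (added for clarity): silu(x) = x*sigmoid(x) = x/(1 + exp(-x)). The vectorized
// ggml_vec_silu_f32, which is only declared near the top of this header, is implemented
// out of line; the ggml_v_silu / ggml_v_expf helpers defined below provide the SIMD
// building blocks for it.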

#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif

/* Below function was borrowed from the GitHub repository:
   https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
    // Constants
    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
    const svfloat32_t one = svdup_n_f32(1.0f);
    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
    const svint32_t inactive2 = svdup_n_s32(0);

    // Algorithm starts here
    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);      // y = x * log2(e)
    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);  // round to int (float)
    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);  // n
    t1 = svsub_f32_m(pg, t0, t1);                       // a = y - floor(y)
    t1 = svadd_f32_m(pg, t1, one);                      // b = a + 1
    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
    svfloat32_t t4 = svexpa_f32(t3);                    // c = fexpa(v)
    t4 = svscale_f32_m(pg, t4, t2);                     // fexpa(v) * 2^(n)

    // and_(t2.d, t1.d, not_mask17.d)
    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
    t5 = svsub_f32_m(pg, t1, t5);                       // z
    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);         // ln2 + half_ln2_sq * z
    t0 = svmla_f32_m(pg, one, t5, t0);                  // 1 + (ln2 * z) + (half_ln2_sq * z * z)
    t0 = svmul_f32_m(pg, t0, t4);                       // Final result
    return t0;
}
#endif

#if defined(__ARM_NEON) && defined(__aarch64__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static float32x4_t ggml_v_expf(float32x4_t x) {
    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
    const float32x4_t n = vsubq_f32(z, r);
    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
                                    vdupq_n_f32(0x1.7f7d1cp-20f));
    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
    const float32x4_t u = vmulq_f32(b, b);
    const float32x4_t j = vfmaq_f32(
        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
        return vfmaq_f32(k, j, k);
    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static float32x4_t ggml_v_silu(float32x4_t x) {
    const float32x4_t one = vdupq_n_f32(1.0f);
    const float32x4_t zero = vdupq_n_f32(0.0f);
    const float32x4_t neg_x = vsubq_f32(zero, x);
    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
    return vdivq_f32(x, one_plus_exp_neg_x);
}

#elif defined(__AVX512F__) && defined(__AVX512DQ__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m512 ggml_v_expf(__m512 x) {
    const __m512 r = _mm512_set1_ps(0x1.8p23f);
    const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
    const __m512 n = _mm512_sub_ps(z, r);
    const __m512 b =
        _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
                         _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
    const __mmask16 d =
        _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
    const __m512 u = _mm512_mul_ps(b, b);
    const __m512 j = _mm512_fmadd_ps(
        _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
                                        _mm512_set1_ps(0x1.573e2ep-5f)),
                        u,
                        _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
                                        _mm512_set1_ps(0x1.fffdb6p-2f))),
        u,
        _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
    const __m512 res = _mm512_scalef_ps(j, n);
    if (_mm512_kortestz(d, d))
        return res;
    const __m512 zero = _mm512_setzero_ps();
    const __m512 alt = _mm512_mask_blend_ps(
        _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
    return _mm512_mask_blend_ps(d, res, alt);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m512 ggml_v_silu(__m512 x) {
    const __m512 one = _mm512_set1_ps(1);
    const __m512 zero = _mm512_setzero_ps();
    const __m512 neg_x = _mm512_sub_ps(zero, x);
    const __m512 exp_neg_x = ggml_v_expf(neg_x);
    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
    return _mm512_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__AVX2__) && defined(__FMA__)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m256 ggml_v_expf(__m256 x) {
    const __m256 r = _mm256_set1_ps(0x1.8p23f);
    const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
    const __m256 n = _mm256_sub_ps(z, r);
    const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
                                      _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
    const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
    const __m256 k = _mm256_castsi256_ps(
        _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
    const __m256i c = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(126), _CMP_GT_OQ));
    const __m256 u = _mm256_mul_ps(b, b);
    const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
                                                                     _mm256_set1_ps(0x1.573e2ep-5f)), u,
                                                     _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
                                                                     _mm256_set1_ps(0x1.fffdb6p-2f))),
                                     u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
        return _mm256_fmadd_ps(j, k, k);
    const __m256i g = _mm256_and_si256(
        _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
        _mm256_set1_epi32(0x82000000u));
    const __m256 s1 =
        _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
    const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
    const __m256i d = _mm256_castps_si256(
        _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
                      _mm256_set1_ps(192), _CMP_GT_OQ));
    return _mm256_or_ps(
        _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
        _mm256_andnot_ps(
            _mm256_castsi256_ps(d),
            _mm256_or_ps(
                _mm256_and_ps(_mm256_castsi256_ps(c),
                              _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
                _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m256 ggml_v_silu(__m256 x) {
    const __m256 one = _mm256_set1_ps(1);
    const __m256 zero = _mm256_setzero_ps();
    const __m256 neg_x = _mm256_sub_ps(zero, x);
    const __m256 exp_neg_x = ggml_v_expf(neg_x);
    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
    return _mm256_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON

#if defined(__FMA__)
#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
#else
#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
#endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__riscv_v_intrinsic)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}

#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
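
// Reference (scalar) sketch of the range-reduction scheme shared by the ggml_v_expf
// variants above, added for clarity only; the SIMD code additionally handles the
// |n| > 126 overflow/underflow cases that this sketch omits:
//
//   // float v_expf_ref(float x) {
//   //     const float n = rintf(x * 0x1.715476p+0f);              // n = round(x*log2(e))
//   //     const float b = x - n*0x1.62e4p-1f - n*0x1.7f7d1cp-20f; // b = x - n*ln(2), split in two parts
//   //     const float u = b*b;
//   //     const float j = 0x1.ffffecp-1f*b +                      // polynomial ~ expm1(b)
//   //         ((0x1.fffdb6p-2f + 0x1.555e66p-3f*b) + (0x1.573e2ep-5f + 0x1.0e4020p-7f*b)*u)*u;
//   //     return ldexpf(1.0f + j, (int)n);                        // exp(x) = 2^n * (1 + expm1(b))
//   // }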

inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}
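
// Derivation note (added for clarity): with s(x) = 1/(1 + exp(-x)),
// silu(x) = x*s(x) and silu'(x) = s(x) + x*s(x)*(1 - s(x)) = s(x)*(1 + x*(1 - s(x))),
// which is the expression used by the backward helpers below (scaled by dy).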

inline static float ggml_silu_backward_f32(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}

inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}

inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}

inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}

inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
    }
}

inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i] * g[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
        }
    }
}
#else
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
    }
}

void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}

inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
    }
}

inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
    }
}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
    }
}
#else
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
    }
}

inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}

inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}

inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) { idx = i; }
    }
    *s = idx;
}

#ifdef __cplusplus
}
#endif