// Vectorized functions for fundamental operations

#pragma once

#include "ggml-impl.h"
#include "simd-mappings.h"

#include "ggml.h"
#include "ggml-cpu.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#endif

// floating point type used to accumulate sums
typedef double ggml_float;

#define GGML_GELU_FP16
#define GGML_GELU_QUICK_FP16

#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL  2
#define GGML_VEC_MAD_UNROLL  32

#ifdef __cplusplus
extern "C" {
#endif

//
// global data
//

// precomputed gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];

// precomputed quick gelu table for f16 (128 KB)
extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
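
// NOTE (descriptive, added for clarity): both tables have 1 << 16 entries and are indexed by the
// raw 16-bit pattern of a ggml_fp16_t value (see ggml_vec_gelu_f16 below, which reinterprets x as
// uint16_t), so each table covers every possible f16 input exactly once.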
//
// fundamental operations
//

void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);

void ggml_vec_silu_f32(const int n, float * y, const float * x);
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean); // also centers y (y = y - mean)
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
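
// Usage sketch (illustrative only): for a single dot product of two contiguous rows the byte
// strides are unused and nrc == 1, matching how ggml_vec_norm_f32 below calls it:
//
//     float s;
//     ggml_vec_dot_f32(n, &s, 0, x, 0, y, 0, 1);  // s = sum_i x[i]*y[i]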
inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }

inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }

inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
    int i = 0;
#if defined(__AVX2__)
    for (; i + 7 < n; i += 8) {
        __m256 vx = _mm256_loadu_ps(x + i);
        __m256 vy = _mm256_loadu_ps(y + i);
        __m256 vz = _mm256_add_ps(vx, vy);
        _mm256_storeu_ps(z + i, vz);
    }
#endif
    for (; i < n; ++i) {
        z[i] = x[i] + y[i];
    }
}
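
// NOTE (descriptive, added for clarity): ggml_vec_add_f32 above shows the pattern used throughout
// this header: an ISA-specific vector loop processes as many full registers as fit, and a scalar
// loop handles the remaining ("leftover") elements.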
inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) + GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }

inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) - GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }

inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(-GGML_CPU_FP16_TO_FP32(x[i]));
    }
}

inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }

inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) * GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }

inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
    for (int i = 0; i < n; ++i) {
        z[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(x[i]) / GGML_CPU_FP16_TO_FP32(y[i]));
    }
}

// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };

    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
    }

#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16; // f16 elements per SVE register (16-bit elements)
    const int ggml_f16_step = 8 * ggml_f16_epr;        // choose 8 SVE registers

    const int np = (n & ~(ggml_f16_step - 1));

    svfloat16_t sum_00 = svdup_n_f16(0.0f);
    svfloat16_t sum_01 = svdup_n_f16(0.0f);
    svfloat16_t sum_02 = svdup_n_f16(0.0f);
    svfloat16_t sum_03 = svdup_n_f16(0.0f);

    svfloat16_t sum_10 = svdup_n_f16(0.0f);
    svfloat16_t sum_11 = svdup_n_f16(0.0f);
    svfloat16_t sum_12 = svdup_n_f16(0.0f);
    svfloat16_t sum_13 = svdup_n_f16(0.0f);

    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;

    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);    // 8 elements
        ax1 = GGML_F16x_VEC_LOAD(x[0] + i + 0*ggml_f16_epr, 0);   // 8 elements
        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax1, ay1);             // sum_00 = sum_00 + ax1*ay1
        ax1 = GGML_F16x_VEC_LOAD(x[1] + i + 0*ggml_f16_epr, 0);   // 8 elements
        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax1, ay1);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);    // next 8 elements
        ax2 = GGML_F16x_VEC_LOAD(x[0] + i + 1*ggml_f16_epr, 1);   // next 8 elements
        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax2, ay2);
        ax2 = GGML_F16x_VEC_LOAD(x[1] + i + 1*ggml_f16_epr, 1);
        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax2, ay2);

        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
        ax3 = GGML_F16x_VEC_LOAD(x[0] + i + 2*ggml_f16_epr, 2);
        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax3, ay3);
        ax3 = GGML_F16x_VEC_LOAD(x[1] + i + 2*ggml_f16_epr, 2);
        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax3, ay3);

        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
        ax4 = GGML_F16x_VEC_LOAD(x[0] + i + 3*ggml_f16_epr, 3);
        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax4, ay4);
        ax4 = GGML_F16x_VEC_LOAD(x[1] + i + 3*ggml_f16_epr, 3);
        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax4, ay4);

        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
        ax5 = GGML_F16x_VEC_LOAD(x[0] + i + 4*ggml_f16_epr, 4);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, ax5, ay5);
        ax5 = GGML_F16x_VEC_LOAD(x[1] + i + 4*ggml_f16_epr, 4);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, ax5, ay5);

        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
        ax6 = GGML_F16x_VEC_LOAD(x[0] + i + 5*ggml_f16_epr, 5);
        sum_01 = GGML_F16x_VEC_FMA(sum_01, ax6, ay6);
        ax6 = GGML_F16x_VEC_LOAD(x[1] + i + 5*ggml_f16_epr, 5);
        sum_11 = GGML_F16x_VEC_FMA(sum_11, ax6, ay6);

        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
        ax7 = GGML_F16x_VEC_LOAD(x[0] + i + 6*ggml_f16_epr, 6);
        sum_02 = GGML_F16x_VEC_FMA(sum_02, ax7, ay7);
        ax7 = GGML_F16x_VEC_LOAD(x[1] + i + 6*ggml_f16_epr, 6);
        sum_12 = GGML_F16x_VEC_FMA(sum_12, ax7, ay7);

        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
        ax8 = GGML_F16x_VEC_LOAD(x[0] + i + 7*ggml_f16_epr, 7);
        sum_03 = GGML_F16x_VEC_FMA(sum_03, ax8, ay8);
        ax8 = GGML_F16x_VEC_LOAD(x[1] + i + 7*ggml_f16_epr, 7);
        sum_13 = GGML_F16x_VEC_FMA(sum_13, ax8, ay8);
    }

    const int np2 = (n & ~(ggml_f16_epr - 1));
    for (int k = np; k < np2; k += ggml_f16_epr) {
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);

        svfloat16_t rx = GGML_F16x_VEC_LOAD(x[0] + k, 0);
        sum_00 = GGML_F16x_VEC_FMA(sum_00, rx, ry);
        rx = GGML_F16x_VEC_LOAD(x[1] + k, 0);
        sum_10 = GGML_F16x_VEC_FMA(sum_10, rx, ry);
    }

    if (np2 < n) {
        svbool_t pg = svwhilelt_b16(np2, n);
        svfloat16_t hx_0 = svld1_f16(pg, (const __fp16 *)(x[0] + np2));
        svfloat16_t hx_1 = svld1_f16(pg, (const __fp16 *)(x[1] + np2));
        svfloat16_t hy   = svld1_f16(pg, (const __fp16 *)(y + np2));

        sum_00 = svmad_f16_x(pg, hx_0, hy, sum_00);
        sum_10 = svmad_f16_x(pg, hx_1, hy, sum_10);
    }

    GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
    GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
#elif defined(__riscv_v_intrinsic)
    // todo: RVV impl
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#else
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);

            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);

                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
            }
        }
    }

    // reduce sum0..sum3 to sum0
    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#endif
#else
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
        }
    }
#endif

    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
        s[i] = (float)sumf[i];
    }
}
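
// NOTE (descriptive, added for clarity): ggml_vec_dot_f16_unroll computes GGML_VEC_DOT_UNROLL dot
// products against the same y, where row i of x starts at (char *)xv + i*xs (xs is a byte stride);
// the results are written to s[0 .. GGML_VEC_DOT_UNROLL-1].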
inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 8 * ggml_f32_epr;        // choose 8 SVE registers

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    const int np = (n & ~(ggml_f32_step - 1));

    svfloat32_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat32_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;

    for (int i = 0; i < np; i += ggml_f32_step) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

        ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_FMA(ay2, ax2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);

        ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr);
        ay3 = GGML_F32_VEC_FMA(ay3, ax3, vx);
        GGML_F32_VEC_STORE(y + i + 2*ggml_f32_epr, ay3);

        ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr);
        ay4 = GGML_F32_VEC_FMA(ay4, ax4, vx);
        GGML_F32_VEC_STORE(y + i + 3*ggml_f32_epr, ay4);

        ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr);
        ay5 = GGML_F32_VEC_FMA(ay5, ax5, vx);
        GGML_F32_VEC_STORE(y + i + 4*ggml_f32_epr, ay5);

        ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr);
        ay6 = GGML_F32_VEC_FMA(ay6, ax6, vx);
        GGML_F32_VEC_STORE(y + i + 5*ggml_f32_epr, ay6);

        ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr);
        ay7 = GGML_F32_VEC_FMA(ay7, ax7, vx);
        GGML_F32_VEC_STORE(y + i + 6*ggml_f32_epr, ay7);

        ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr);
        ay8 = GGML_F32_VEC_FMA(ay8, ax8, vx);
        GGML_F32_VEC_STORE(y + i + 7*ggml_f32_epr, ay8);
    }

    // leftovers
    // since the loop above is unrolled 8x, the leftovers lie in the range [0, ggml_f32_step) and are handled by the loop below
    const int np2 = (n & ~(ggml_f32_epr - 1));
    for (int i = np; i < np2; i += ggml_f32_epr) {
        ax1 = GGML_F32_VEC_LOAD(x + i);
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_FMA(ay1, ax1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);
    }

    // at most ggml_f32_epr - 1 elements remain; apply a predicated svmad to the available elements only
    if (np2 < n) {
        svbool_t pg = svwhilelt_b32(np2, n);
        ax1 = svld1_f32(pg, x + np2);
        ay1 = svld1_f32(pg, y + np2);
        ay1 = svmad_f32_m(pg, ax1, vx, ay1);
        svst1_f32(pg, y + np2, ay1);
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ax[GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] += x[i]*v;
    }
#endif
}
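
// Usage sketch (illustrative only): ggml_vec_mad_f32 is an axpy-style update, y[i] += x[i]*v, e.g.:
//
//     ggml_vec_mad_f32(n, y, x, 2.0f);  // y += 2*x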
inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 8 * ggml_f16_epr;

    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);

    int np = (n & ~(ggml_f16_step - 1));

    svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8;
    svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8;

    for (int i = 0; i < np; i += ggml_f16_step) {
        ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_FMA(ay1, ax1, vx);
        GGML_F16x_VEC_STORE(y + i + 0 * ggml_f16_epr, ay1, 0);

        ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_FMA(ay2, ax2, vx);
        GGML_F16x_VEC_STORE(y + i + 1 * ggml_f16_epr, ay2, 1);

        ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2);
        ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2);
        ay3 = GGML_F16x_VEC_FMA(ay3, ax3, vx);
        GGML_F16x_VEC_STORE(y + i + 2 * ggml_f16_epr, ay3, 2);

        ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3);
        ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3);
        ay4 = GGML_F16x_VEC_FMA(ay4, ax4, vx);
        GGML_F16x_VEC_STORE(y + i + 3 * ggml_f16_epr, ay4, 3);

        ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4);
        ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4);
        ay5 = GGML_F16x_VEC_FMA(ay5, ax5, vx);
        GGML_F16x_VEC_STORE(y + i + 4 * ggml_f16_epr, ay5, 4);

        ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5);
        ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5);
        ay6 = GGML_F16x_VEC_FMA(ay6, ax6, vx);
        GGML_F16x_VEC_STORE(y + i + 5 * ggml_f16_epr, ay6, 5);

        ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6);
        ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6);
        ay7 = GGML_F16x_VEC_FMA(ay7, ax7, vx);
        GGML_F16x_VEC_STORE(y + i + 6 * ggml_f16_epr, ay7, 6);

        ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7);
        ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7);
        ay8 = GGML_F16x_VEC_FMA(ay8, ax8, vx);
        GGML_F16x_VEC_STORE(y + i + 7 * ggml_f16_epr, ay8, 7);
    }

    const int np2 = (n & ~(ggml_f16_epr - 1));
    for (int k = np; k < np2; k += ggml_f16_epr) {
        svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0);
        svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0);
        ry = GGML_F16x_VEC_FMA(ry, rx, vx);

        GGML_F16x_VEC_STORE(y + k, ry, 0);
    }

    if (np2 < n) {
        svbool_t pg = svwhilelt_b16(np2, n);
        svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2));
        svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2));
        hy = svmad_f16_x(pg, hx, vx, hy);
        svst1_f16(pg, (__fp16 *)(y + np2), hy);
    }
    np = n;
#elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
    const int np = n;
    _Float16 hv = (_Float16)v;
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e16m8(n - i);
        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
    }
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }
#else
    const int np = 0;
#endif

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
    }
}

// xs and vs are byte strides of x and v
inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];

    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
        x[i] = (const float *) ((const char *) xv + i*xs);
        v[i] = (const float *) ((const char *) vv + i*vs);
    }

#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    // route to scalar implementation   // TODO: write SVE code
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
            ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
        }
        __riscv_vse32_v_f32m8(&y[i], ay, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];

    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
    }

    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);

            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
            }

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = np; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
#else
    // scalar
    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
        for (int i = 0; i < n; ++i) {
            y[i] += x[k][i]*v[k][0];
        }
    }
#endif
}
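
// NOTE (descriptive, added for clarity): ggml_vec_mad_f32_unroll accumulates GGML_VEC_MAD_UNROLL
// scaled rows into y, i.e. y[i] += sum_k x[k][i]*v[k][0], where row k of x starts at
// (char *)xv + k*xs and its scale factor lives at (char *)vv + k*vs (xs and vs are byte strides).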
inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    // scalar; TODO: write SVE code
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vs = GGML_F32_VEC_SET1(s);
    GGML_F32_VEC vb = GGML_F32_VEC_SET1(b);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_FMA(vb, ay[j], vs);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = x[i]*s + b;
    }
#endif
}
//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
#if defined(GGML_USE_ACCELERATE)
    vDSP_vsmul(y, 1, &v, y, 1, n);
#elif defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
    const int sve_register_length = ggml_cpu_get_sve_cnt() * 8;
    const int ggml_f32_epr = sve_register_length / 32; // SVE128:4, SVE256:8, SVE512:16
    const int ggml_f32_step = 2 * ggml_f32_epr;

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
    const int np = (n & ~(ggml_f32_step - 1));
    svfloat32_t ay1;
    svfloat32_t ay2;
    for (int i = 0; i < np; i += ggml_f32_step) {
        ay1 = GGML_F32_VEC_LOAD(y + i);
        ay1 = GGML_F32_VEC_MUL(ay1, vx);
        GGML_F32_VEC_STORE(y + i, ay1);

        ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr);
        ay2 = GGML_F32_VEC_MUL(ay2, vx);
        GGML_F32_VEC_STORE(y + i + 1*ggml_f32_epr, ay2);
    }
    // leftovers
    // at most ggml_f32_epr - 1 elements remain per iteration; apply a predicated multiply to the available elements only
    for (int i = np; i < n; i += ggml_f32_epr) {
        svbool_t pg = svwhilelt_b32(i, n);
        ay1 = svld1_f32(pg, y + i);
        ay1 = svmul_f32_m(pg, ay1, vx);
        svst1_f32(pg, y + i, ay1);
    }
#elif defined(__riscv_v_intrinsic)
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
#else
    const int np = (n & ~(GGML_F32_STEP - 1));

    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);

    GGML_F32_VEC ay[GGML_F32_ARR];

    for (int i = 0; i < np; i += GGML_F32_STEP) {
        for (int j = 0; j < GGML_F32_ARR; j++) {
            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);

            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] *= v;
    }
#endif
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] *= v;
    }
#endif
}
inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
#if defined(GGML_SIMD) && defined(__ARM_FEATURE_SVE)
    const int sve_register_length = svcntb() * 8;
    const int ggml_f16_epr = sve_register_length / 16;
    const int ggml_f16_step = 2 * ggml_f16_epr;

    GGML_F16x_VEC vx = GGML_F16x_VEC_SET1(v);
    const int np = (n & ~(ggml_f16_step - 1));
    svfloat16_t ay1, ay2;

    for (int i = 0; i < np; i += ggml_f16_step) {
        ay1 = GGML_F16x_VEC_LOAD(y + i + 0*ggml_f16_epr, 0);
        ay1 = GGML_F16x_VEC_MUL(ay1, vx);
        GGML_F16x_VEC_STORE(y + i + 0*ggml_f16_epr, ay1, 0);

        ay2 = GGML_F16x_VEC_LOAD(y + i + 1*ggml_f16_epr, 1);
        ay2 = GGML_F16x_VEC_MUL(ay2, vx);
        GGML_F16x_VEC_STORE(y + i + 1*ggml_f16_epr, ay2, 1);
    }
    // leftovers
    // at most ggml_f16_epr - 1 elements remain; apply a predicated multiply to the available elements only
    if (np < n) {
        svbool_t pg = svwhilelt_b16(np, n);
        svfloat16_t hy = svld1_f16(pg, (__fp16 *)(y + np));
        svfloat16_t out = svmul_f16_m(pg, hy, vx);
        svst1_f16(pg, (__fp16 *)(y + np), out);
    }
#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
    for (int i = 0, vl; i < n; i += vl) {
        vl = __riscv_vsetvl_e16m2(n - i);
        vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
        vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
        vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
        vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
        __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
    }
#elif defined(GGML_SIMD)
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);

    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);

            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }

    // leftovers
    for (int i = np; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#else
    // scalar
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
    }
#endif
}
inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); }
inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; }

inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*v);
    }
}

inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }

inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sqrtf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); }

inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(logf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_sin_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]); }

inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(sinf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_cos_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]); }

inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(cosf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }

inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fabsf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }

inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
    }
}

inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }

inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16((GGML_CPU_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
    }
}

inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }

inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(tanhf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }

inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        const float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : expm1f(v));
    }
}

inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }

inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v : 0.f);
    }
}

inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }

inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
    }
}

inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }

inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(1.f / (1.f + expf(-GGML_CPU_FP16_TO_FP32(x[i]))));
    }
}

// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }

inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
    }
}

inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }

inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_CPU_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
    }
}

inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }

inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = GGML_CPU_FP32_TO_FP16(expf(GGML_CPU_FP16_TO_FP32(x[i])));
    }
}

static const float GELU_COEF_A     = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
static const float SQRT_2_INV      = 0.70710678118654752440084436210484f;

inline static float ggml_gelu_f32(float x) {
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
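
// NOTE (descriptive, added for clarity): ggml_gelu_f32 uses the standard tanh approximation of GELU,
//     gelu(x) ~= 0.5*x*(1 + tanh( sqrt(2/pi) * (x + 0.044715*x^3) ))
// with SQRT_2_OVER_PI = sqrt(2/pi) and GELU_COEF_A = 0.044715; SQRT_2_INV = 1/sqrt(2) is used by the
// erf-based variants below.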
inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_table_gelu_f16[i16[i]];
    }
}

inline static void ggml_vec_gelu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float res = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
        y[i] = GGML_CPU_FP32_TO_FP16(res);
    }
}

#ifdef GGML_GELU_FP16
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]);
        }
    }
}
#else
inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_erf_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f*xi*(1.0f + erff(xi*SQRT_2_INV));
    }
}

inline static float ggml_gelu_quick_f32(float x) {
    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
}

//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
//    const uint16_t * i16 = (const uint16_t *) x;
//    for (int i = 0; i < n; ++i) {
//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
//    }
//}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
    }
}
#else
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]);
    }
}
#endif

inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
    }
}

// Sigmoid Linear Unit (SiLU) function
inline static float ggml_silu_f32(float x) {
    return x/(1.0f + expf(-x));
}

inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
    float v = GGML_CPU_FP16_TO_FP32(x);
    return GGML_CPU_FP32_TO_FP16(v/(1.0f + expf(-v)));
}

#if __FINITE_MATH_ONLY__
#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
#endif
/* Below function was borrowed from the GitHub repository:
https://github.com/openvinotoolkit/openvino/blob/master/src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp */
#if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
inline static svfloat32_t exp_ps_sve(svbool_t pg, svfloat32_t src) {
    // Constants
    const svfloat32_t log2_e = svdup_n_f32(1.4426950409f);
    const svfloat32_t ln2 = svdup_n_f32(0.6931473921f);
    const svfloat32_t half_ln2_sq = svdup_n_f32(0.2413862043f);
    const svuint32_t not_mask17 = svdup_n_u32(~((1u << 17) - 1));
    const svfloat32_t one = svdup_n_f32(1.0f);
    const svfloat32_t inactive1 = svdup_n_f32(0.0f);
    const svint32_t inactive2 = svdup_n_s32(0);

    // Algorithm starts here
    svfloat32_t t0 = svmul_f32_m(pg, src, log2_e);  // y = x * log2(e)
    svfloat32_t t1 = svrintm_f32_m(inactive1, pg, t0);  // round to int (float)
    svint32_t t2 = svcvt_s32_f32_m(inactive2, pg, t1);  // n

    t1 = svsub_f32_m(pg, t0, t1);  // a = y - floor(y)
    t1 = svadd_f32_m(pg, t1, one);  // b = a + 1

    svuint32_t t3 = svlsr_n_u32_m(pg, svreinterpret_u32_f32(t1), 17);  // v = b >> 17 (u32)
    svfloat32_t t4 = svexpa_f32(t3);  // c = fexpa(v)
    t4 = svscale_f32_m(pg, t4, t2);  // fexpa(v) * 2^(n)

    // and_(t2.d, t1.d, not_mask17.d)
    svfloat32_t t5 = svreinterpret_f32_u32(svand_u32_m(pg, svreinterpret_u32_f32(t1), not_mask17));
    t5 = svsub_f32_m(pg, t1, t5);  // z
    t0 = svmla_f32_m(pg, ln2, t5, half_ln2_sq);  // ln2 + half_ln2_sq * z
    t0 = svmla_f32_m(pg, one, t5, t0);  // 1 + (ln2 * z) + (half_ln2_sq * z * z)
    t0 = svmul_f32_m(pg, t0, t4);  // Final result

    return t0;
}
#endif
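
// NOTE (descriptive, added for clarity): the ggml_v_expf variants below (SVE/NEON/AVX512/AVX2/SSE2/RVV)
// share one range-reduction scheme: n = round(x*log2(e)) is obtained via the 0x1.8p23f rounding
// constant, the reduced argument b = x - n*ln(2) is formed with a split (hi/lo) ln(2), exp(b) is
// evaluated with a short polynomial, and the result is scaled by 2^n by adding n to the float
// exponent bits; inputs with |n| > 126 (and > 192) take slower paths that avoid over/underflow in
// the exponent trick.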
  841. #if defined(__ARM_FEATURE_SVE) && defined(__aarch64__)
  842. inline static svfloat32_t ggml_v_expf(svbool_t pg, svfloat32_t x) {
  843. const svfloat32_t r = svdup_n_f32_x(pg, 0x1.8p23f);
  844. const svfloat32_t z = svmla_n_f32_x(pg, r, x, 0x1.715476p+0f);
  845. const svfloat32_t n = svsub_f32_x(pg, z, r);
  846. const svfloat32_t b = svmls_n_f32_x(pg, svmls_n_f32_x(pg, x, n, 0x1.62e4p-1f), n, 0x1.7f7d1cp-20f);
  847. const svuint32_t e = svlsl_n_u32_x(pg, svreinterpret_u32_f32(z), 23);
  848. const svfloat32_t k = svreinterpret_f32_u32(svadd_u32_x(pg, e, svreinterpret_u32_f32(svdup_n_f32_x(pg, 1))));
  849. const svbool_t c = svacgt_n_f32(pg, n, 126);
  850. const svfloat32_t u = svmul_f32_x(pg, b, b);
  851. const svfloat32_t j = svmla_f32_x(pg,
  852. svmul_n_f32_x(pg, b, 0x1.ffffecp-1f),
  853. svmla_f32_x(pg, svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.fffdb6p-2f), svdup_n_f32_x(pg, 0x1.555e66p-3f), b),
  854. svmla_f32_x(pg, svdup_n_f32_x(pg, 0x1.573e2ep-5f), svdup_n_f32_x(pg, 0x1.0e4020p-7f), b), u), u);
  855. const svuint32_t d = svdup_n_u32_z(svcmple_n_f32(pg, n, 0.0), 0x82000000);
  856. const svfloat32_t s1 = svreinterpret_f32_u32(svadd_n_u32_x(pg, d, 0x7f000000));
  857. const svfloat32_t s2 = svreinterpret_f32_u32(svsub_u32_x(pg, e, d));
  858. return svsel_f32(svacgt_f32(pg, n, svdup_n_f32_x(pg, 192)), svmul_f32_x(pg, s1, s1),
  859. svsel_f32(c, svmul_f32_x(pg, svmla_f32_x(pg, s2, s2, j), s1), svmla_f32_x(pg, k, k, j)));
  860. }
  861. // computes silu x/(1+exp(-x)) in single precision vector
  862. inline static svfloat32_t ggml_v_silu(svbool_t pg, svfloat32_t x) {
  863. const svfloat32_t one = svdup_n_f32_x(pg, 1.0f);
  864. const svfloat32_t zero = svdup_n_f32_x(pg, 0.0f);
  865. const svfloat32_t neg_x = svsub_f32_x(pg, zero, x);
  866. const svfloat32_t exp_neg_x = ggml_v_expf(pg, neg_x);
  867. const svfloat32_t one_plus_exp_neg_x = svadd_f32_x(pg, one, exp_neg_x);
  868. return svdiv_f32_x(pg, x, one_plus_exp_neg_x);
  869. }
  870. #elif defined(__ARM_NEON) && defined(__aarch64__)
  871. // adapted from arm limited optimized routine
  872. // the maximum error is 1.45358 plus 0.5 ulps
  873. // numbers above 88.38 will flush to infinity
  874. // numbers beneath -103.97 will flush to zero
  875. inline static float32x4_t ggml_v_expf(float32x4_t x) {
  876. const float32x4_t r = vdupq_n_f32(0x1.8p23f);
  877. const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
  878. const float32x4_t n = vsubq_f32(z, r);
  879. const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
  880. vdupq_n_f32(0x1.7f7d1cp-20f));
  881. const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
  882. const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
  883. const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
  884. const float32x4_t u = vmulq_f32(b, b);
  885. const float32x4_t j = vfmaq_f32(
  886. vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
  887. vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
  888. vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
  889. if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
  890. return vfmaq_f32(k, j, k);
  891. const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
  892. const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
  893. const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
  894. return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
  895. vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
  896. }
  897. // computes silu x/(1+exp(-x)) in single precision vector
  898. inline static float32x4_t ggml_v_silu(float32x4_t x) {
  899. const float32x4_t one = vdupq_n_f32(1.0f);
  900. const float32x4_t zero = vdupq_n_f32(0.0f);
  901. const float32x4_t neg_x = vsubq_f32(zero, x);
  902. const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
  903. const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
  904. return vdivq_f32(x, one_plus_exp_neg_x);
  905. }
  906. #elif defined(__AVX512F__) && defined(__AVX512DQ__)
  907. // adapted from arm limited optimized routine
  908. // the maximum error is 1.45358 plus 0.5 ulps
  909. // numbers above 88.38 will flush to infinity
  910. // numbers beneath -103.97 will flush to zero
  911. inline static __m512 ggml_v_expf(__m512 x) {
  912. const __m512 r = _mm512_set1_ps(0x1.8p23f);
  913. const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
  914. const __m512 n = _mm512_sub_ps(z, r);
  915. const __m512 b =
  916. _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
  917. _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
  918. const __mmask16 d =
  919. _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
  920. const __m512 u = _mm512_mul_ps(b, b);
  921. const __m512 j = _mm512_fmadd_ps(
  922. _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
  923. _mm512_set1_ps(0x1.573e2ep-5f)),
  924. u,
  925. _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
  926. _mm512_set1_ps(0x1.fffdb6p-2f))),
  927. u,
  928. _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
  929. const __m512 res = _mm512_scalef_ps(j, n);
  930. if (_mm512_kortestz(d, d))
  931. return res;
  932. const __m512 zero = _mm512_setzero_ps();
  933. const __m512 alt = _mm512_mask_blend_ps(
  934. _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
  935. return _mm512_mask_blend_ps(d, res, alt);
  936. }
  937. // computes silu x/(1+exp(-x)) in single precision vector
  938. inline static __m512 ggml_v_silu(__m512 x) {
  939. const __m512 one = _mm512_set1_ps(1);
  940. const __m512 zero = _mm512_setzero_ps();
  941. const __m512 neg_x = _mm512_sub_ps(zero, x);
  942. const __m512 exp_neg_x = ggml_v_expf(neg_x);
  943. const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
  944. return _mm512_div_ps(x, one_plus_exp_neg_x);
  945. }
  946. #elif defined(__AVX2__) && defined(__FMA__)
  947. // adapted from arm limited optimized routine
  948. // the maximum error is 1.45358 plus 0.5 ulps
  949. // numbers above 88.38 will flush to infinity
  950. // numbers beneath -103.97 will flush to zero
  951. inline static __m256 ggml_v_expf(__m256 x) {
  952. const __m256 r = _mm256_set1_ps(0x1.8p23f);
  953. const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
  954. const __m256 n = _mm256_sub_ps(z, r);
  955. const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
  956. _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
  957. const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
  958. const __m256 k = _mm256_castsi256_ps(
  959. _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
  960. const __m256i c = _mm256_castps_si256(
  961. _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
  962. _mm256_set1_ps(126), _CMP_GT_OQ));
  963. const __m256 u = _mm256_mul_ps(b, b);
  964. const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
  965. _mm256_set1_ps(0x1.573e2ep-5f)), u,
  966. _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
  967. _mm256_set1_ps(0x1.fffdb6p-2f))),
  968. u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
  969. if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
  970. return _mm256_fmadd_ps(j, k, k);
  971. const __m256i g = _mm256_and_si256(
  972. _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
  973. _mm256_set1_epi32(0x82000000u));
  974. const __m256 s1 =
  975. _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
  976. const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
  977. const __m256i d = _mm256_castps_si256(
  978. _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
  979. _mm256_set1_ps(192), _CMP_GT_OQ));
  980. return _mm256_or_ps(
  981. _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
  982. _mm256_andnot_ps(
  983. _mm256_castsi256_ps(d),
  984. _mm256_or_ps(
  985. _mm256_and_ps(_mm256_castsi256_ps(c),
  986. _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
  987. _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
  988. }
  989. // computes silu x/(1+exp(-x)) in single precision vector
  990. inline static __m256 ggml_v_silu(__m256 x) {
  991. const __m256 one = _mm256_set1_ps(1);
  992. const __m256 zero = _mm256_setzero_ps();
  993. const __m256 neg_x = _mm256_sub_ps(zero, x);
  994. const __m256 exp_neg_x = ggml_v_expf(neg_x);
  995. const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
  996. return _mm256_div_ps(x, one_plus_exp_neg_x);
  997. }
  998. #elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
  999. #if defined(__FMA__)
  1000. #define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
  1001. #define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
  1002. #else
  1003. #define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
  1004. #define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
  1005. #endif

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static __m128 ggml_v_expf(__m128 x) {
    const __m128 r = _mm_set1_ps(0x1.8p23f);
    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
    const __m128 n = _mm_sub_ps(z, r);
    const __m128 b =
        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
    const __m128i c =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
    const __m128 u = _mm_mul_ps(b, b);
    const __m128 j =
        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
    if (!_mm_movemask_epi8(c))
        return MADD128(j, k, k);
    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
                                    _mm_set1_epi32(0x82000000u));
    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
    const __m128i d =
        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
    return _mm_or_ps(
        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
        _mm_andnot_ps(_mm_castsi128_ps(d),
                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
}
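
// Illustrative only: a minimal scalar sketch of the scheme the vector kernels in this
// file use, written as a hypothetical helper (not part of the ggml API). It reduces
// exp(x) to 2^n * exp(b) with n = round(x*log2(e)) and b = x - n*ln(2) (ln(2) split into
// a high and a low part), approximates exp(b) - 1 with the same degree-5 polynomial as
// the SIMD code, and rebuilds the result with ldexpf instead of the SIMD exponent-bit
// tricks. Assumes <math.h> is available; the early returns stand in for the masked
// extended-range handling of the vector versions.
inline static float ggml_v_expf_scalar_sketch(float x) {
    if (x > 88.38f) {
        return INFINITY; // would overflow float
    }
    if (x < -103.97f) {
        return 0.0f;     // would underflow to zero
    }
    const float n = roundf(x * 0x1.715476p+0f);                           // round(x * log2(e))
    const float b = fmaf(n, -0x1.7f7d1cp-20f, fmaf(n, -0x1.62e4p-1f, x)); // x - n*ln(2)
    const float u = b * b;
    const float j = fmaf(fmaf(fmaf(0x1.0e4020p-7f, b, 0x1.573e2ep-5f), u,
                              fmaf(0x1.555e66p-3f, b, 0x1.fffdb6p-2f)),
                         u, 0x1.ffffecp-1f * b);                          // ~= expm1(b)
    return ldexpf(1.0f + j, (int) n);                                     // 2^n * exp(b)
}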

// computes silu x/(1+exp(-x)) in single precision vector
inline static __m128 ggml_v_silu(__m128 x) {
    const __m128 one = _mm_set1_ps(1);
    const __m128 zero = _mm_setzero_ps();
    const __m128 neg_x = _mm_sub_ps(zero, x);
    const __m128 exp_neg_x = ggml_v_expf(neg_x);
    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
    return _mm_div_ps(x, one_plus_exp_neg_x);
}

#elif defined(__riscv_v_intrinsic)

// adapted from arm limited optimized routine
// the maximum error is 1.45358 plus 0.5 ulps
// numbers above 88.38 will flush to infinity
// numbers beneath -103.97 will flush to zero
inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t r = __riscv_vfmv_v_f_f32m2(0x1.8p23f, vl);
#ifdef __riscv_xtheadvector
    // workaround for compiler bug (gcc 14.3.0: Error: unrecognized opcode `th.vmv1r.v v2,v4')
    vfloat32m2_t z = __riscv_vfadd_vf_f32m2(r, 0.0f, vl);
    z = __riscv_vfmacc_vf_f32m2(z, 0x1.715476p+0f, x, vl);
#else
    const vfloat32m2_t z = __riscv_vfmacc_vf_f32m2(r, 0x1.715476p+0f, x, vl);
#endif
    const vfloat32m2_t n = __riscv_vfsub_vv_f32m2(z, r, vl);
    const vfloat32m2_t b = __riscv_vfnmsac_vf_f32m2(__riscv_vfnmsac_vf_f32m2(x, 0x1.62e4p-1f, n, vl),
                                                    0x1.7f7d1cp-20f, n, vl);
    const vuint32m2_t e = __riscv_vsll_vx_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(z), 23, vl);
    const vfloat32m2_t k = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(e, 0x3f800000, vl)); // 1.0f
    const vbool16_t c = __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 126.0f, vl);
    const vfloat32m2_t u = __riscv_vfmul_vv_f32m2(b, b, vl);
    const vfloat32m2_t j = __riscv_vfmacc_vv_f32m2(
        __riscv_vfmul_vf_f32m2(b, 0x1.ffffecp-1f, vl),
        __riscv_vfmacc_vv_f32m2(
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.fffdb6p-2f, vl), 0x1.555e66p-3f, b, vl),
            __riscv_vfmacc_vf_f32m2(__riscv_vfmv_v_f_f32m2(0x1.573e2ep-5f, vl), 0x1.0e4020p-7f, b, vl),
            u, vl), u, vl);
    if (!__riscv_vcpop_m_b16(c, vl))
        return __riscv_vfmacc_vv_f32m2(k, j, k, vl);
    const vbool16_t dm = __riscv_vmfle_vf_f32m2_b16(n, 0.0f, vl);
    const vuint32m2_t d = __riscv_vmerge_vxm_u32m2(__riscv_vmv_v_x_u32m2(0, vl), 0x82000000, dm, vl);
    const vfloat32m2_t s1 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vadd_vx_u32m2(d, 0x7f000000, vl));
    const vfloat32m2_t s2 = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vsub_vv_u32m2(e, d, vl));
    const vfloat32m2_t r1 = __riscv_vmerge_vvm_f32m2(
        __riscv_vfmacc_vv_f32m2(k, k, j, vl),
        __riscv_vfmul_vv_f32m2(__riscv_vfmacc_vv_f32m2(s2, s2, j, vl), s1, vl),
        c, vl);
    return __riscv_vmerge_vvm_f32m2(
        r1, __riscv_vfmul_vv_f32m2(s1, s1, vl),
        __riscv_vmfgt_vf_f32m2_b16(__riscv_vfabs_v_f32m2(n, vl), 192.0f, vl),
        vl);
}

// computes silu x/(1+exp(-x)) in single precision vector
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
    const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
    const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
    const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
    return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
}
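
// Illustrative only: a minimal sketch (hypothetical helper, not the library's actual
// vectorized silu routine) of strip-mining ggml_v_silu_m2 over a buffer;
// __riscv_vsetvl_e32m2 returns how many 32-bit elements one LMUL=2 vector group
// holds for the remaining length, so the loop needs no scalar tail.
inline static void ggml_v_silu_m2_apply_sketch(const int n, float * y, const float * x) {
    for (int i = 0, vl; i < n; i += vl) {
        vl = (int) __riscv_vsetvl_e32m2((size_t) (n - i));
        vfloat32m2_t vx = __riscv_vle32_v_f32m2(x + i, vl);
        __riscv_vse32_v_f32m2(y + i, ggml_v_silu_m2(vx, vl), vl);
    }
}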

#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic

inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_silu_f16(x[i]);
    }
}

inline static float ggml_silu_backward_f32(float x, float dy) {
    const float s = 1.0f/(1.0f + expf(-x));
    return dy*s*(1.0f + x*(1.0f - s));
}
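
// Derivation of the silu backward expression used above (and in the fp16 variant below):
// with s = 1/(1 + exp(-x)) (the logistic sigmoid) and silu(x) = x*s, the product rule
// and s' = s*(1 - s) give silu'(x) = s + x*s*(1 - s) = s*(1 + x*(1 - s)),
// so the backward pass returns dy * s * (1 + x*(1 - s)).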

inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
    const float v = GGML_CPU_FP16_TO_FP32(x);
    const float s = 1.0f/(1.0f + expf(-v));
    return GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
}

inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
    }
}

inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
    for (int i = 0; i < n; ++i) {
        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
    }
}

inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
    }
}

inline static void ggml_vec_reglu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(x[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((v > 0.f) ? v * GGML_CPU_FP16_TO_FP32(g[i]) : 0.f);
    }
}
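
// When GGML_GELU_FP16 is defined, the geglu path below reads gelu(x) from the
// ggml_table_gelu_f16 lookup table, indexed by the bit pattern of x converted to fp16;
// x <= -10 and x >= 10 are handled directly (gelu(x) is, for practical purposes, 0 or x
// there), which also keeps full fp32 precision for large |x| instead of the fp16-quantized
// table value. The gelu_quick variants further down use ggml_table_gelu_quick_f16 the same way.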

#ifdef GGML_GELU_FP16
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        if (x[i] <= -10.0f) {
            y[i] = 0.0f;
        } else if (x[i] >= 10.0f) {
            y[i] = x[i] * g[i];
        } else {
            ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
            memcpy(&t, &fp16, sizeof(uint16_t));
            y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[t]) * g[i];
        }
    }
}
#else
inline static void ggml_vec_geglu_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_f16[i16[i]]) * v);
    }
}

void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g);

inline static void ggml_vec_swiglu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
    }
}
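
// geglu_erf evaluates the exact GELU definition gelu(x) = 0.5*x*(1 + erf(x/sqrt(2)))
// directly with erff (SQRT_2_INV being 1/sqrt(2)), then multiplies by the gate g.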

inline static void ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        float xi = x[i];
        y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
    }
}

inline static void ggml_vec_geglu_erf_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    for (int i = 0; i < n; ++i) {
        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
        float gi = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
    }
}

#ifdef GGML_GELU_QUICK_FP16
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    uint16_t t;
    for (int i = 0; i < n; ++i) {
        ggml_fp16_t fp16 = GGML_CPU_FP32_TO_FP16(x[i]);
        memcpy(&t, &fp16, sizeof(uint16_t));
        y[i] = GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]) * g[i];
    }
}
#else
inline static void ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
    for (int i = 0; i < n; ++i) {
        y[i] = ggml_gelu_quick_f32(x[i]) * g[i];
    }
}
#endif

inline static void ggml_vec_geglu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const ggml_fp16_t * g) {
    const uint16_t * i16 = (const uint16_t *) x;
    for (int i = 0; i < n; ++i) {
        float v = GGML_CPU_FP16_TO_FP32(g[i]);
        y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(ggml_table_gelu_quick_f16[i16[i]]) * v);
    }
}

inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    // accumulate in the wider ggml_float type to reduce rounding error over long vectors
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = (float)sum;
#else
    vDSP_sve(x, 1, s, n);
#endif
}

inline static void ggml_vec_cumsum_f32(const int n, float * y, const float * x) {
    for (int i = 0; i < n; ++i) {
        if (i == 0) {
            y[i] = x[i];
        } else {
            y[i] = y[i - 1] + x[i];
        }
    }
}

inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
    ggml_float sum = 0.0;
    for (int i = 0; i < n; ++i) {
        sum += (ggml_float)x[i];
    }
    *s = sum;
}

inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_CPU_FP16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        sum += GGML_BF16_TO_FP32(x[i]);
    }
    *s = sum;
}

inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
#ifndef GGML_USE_ACCELERATE
    float max = -INFINITY;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    *s = max;
#else
    vDSP_maxv(x, 1, s, n);
#endif
}

inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
    ggml_vec_norm_f32(n, s, x);
    *s = 1.f/(*s);
}

inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
    float max = -INFINITY;
    int idx = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
        if (max == x[i]) { idx = i; }
    }
    *s = idx;
}

#ifdef __cplusplus
}
#endif