1
0

sgemm.cpp 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032
  1. // Copyright 2024 Mozilla Foundation
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining
  4. // a copy of this software and associated documentation files (the
  5. // "Software"), to deal in the Software without restriction, including
  6. // without limitation the rights to use, copy, modify, merge, publish,
  7. // distribute, sublicense, and/or sell copies of the Software, and to
  8. // permit persons to whom the Software is furnished to do so, subject to
  9. // the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be
  12. // included in all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  15. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  16. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  17. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  18. // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  19. // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  20. // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21. // SOFTWARE.
  22. //
  23. // _ _ ___ _ _ ___
  24. // | |_(_)_ _ _ _| _ ) | /_\ / __|
  25. // | _| | ' \ || | _ \ |__ / _ \\__ \.
  26. // \__|_|_||_\_, |___/____/_/ \_\___/
  27. // |__/
  28. //
  29. // BASIC LINEAR ALGEBRA SUBPROGRAMS
  30. //
  31. //
  32. // This file implements multithreaded CPU matrix multiplication for the
  33. // common contiguous use case C = Aᵀ * B. These kernels are designed to
  34. // have excellent performance[1] for matrices that fit in the CPU cache
  35. // without imposing any overhead such as cache filling or malloc calls.
  36. //
  37. // This implementation does not guarantee any upper bound with rounding
  38. // errors, which grow along with k. Our goal's to maximally exploit the
  39. // hardware for performance, and then use whatever resources remain for
  40. // improving numerical accuracy.
  41. //
  42. // [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online].
  43. // Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024].
  44. #if defined(__GNUC__)
  45. #pragma GCC diagnostic ignored "-Wpedantic"
  46. #pragma GCC diagnostic ignored "-Wignored-attributes"
  47. #endif
  48. #include "sgemm.h"
  49. #include "ggml-impl.h"
  50. #include "ggml-quants.h"
  // NOINLINE asks the compiler to keep a function out of line; the spelling
  // differs between MSVC and GCC-compatible compilers.
  #if defined(_MSC_VER)
  #define NOINLINE __declspec(noinline)
  #else
  #define NOINLINE __attribute__((__noinline__))
  #endif

  // Number of vector registers assumed when the kernels pick their tile
  // sizes: 32 when targeting ARM NEON or AVX-512, 16 otherwise.
  #if !defined(__ARM_NEON) && !defined(__AVX512F__)
  #define VECTOR_REGISTERS 16
  #else
  #define VECTOR_REGISTERS 32
  #endif

  // Builds a 256-bit integer vector with `b` in the low 128 bits and `a` in
  // the high 128 bits.
  #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
  62. namespace {
  63. inline float unhalf(ggml_fp16_t d) {
  64. return GGML_FP16_TO_FP32(d);
  65. }
  66. ////////////////////////////////////////////////////////////////////////////////////////////////////
  67. // VECTORIZED ARITHMETIC OPERATIONS
  #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Lane-wise arithmetic on four packed floats (128-bit x86 vectors).
  inline __m128 add(__m128 x, __m128 y) {
      return _mm_add_ps(x, y);
  }
  inline __m128 sub(__m128 x, __m128 y) {
      return _mm_sub_ps(x, y);
  }
  inline __m128 mul(__m128 x, __m128 y) {
      return _mm_mul_ps(x, y);
  }
  #endif // __SSE__

  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Lane-wise arithmetic on eight packed floats (256-bit x86 vectors).
  inline __m256 add(__m256 x, __m256 y) {
      return _mm256_add_ps(x, y);
  }
  inline __m256 sub(__m256 x, __m256 y) {
      return _mm256_sub_ps(x, y);
  }
  inline __m256 mul(__m256 x, __m256 y) {
      return _mm256_mul_ps(x, y);
  }
  #endif // __AVX__

  #if defined(__AVX512F__)
  // Lane-wise arithmetic on sixteen packed floats (512-bit x86 vectors).
  inline __m512 add(__m512 x, __m512 y) {
      return _mm512_add_ps(x, y);
  }
  inline __m512 sub(__m512 x, __m512 y) {
      return _mm512_sub_ps(x, y);
  }
  inline __m512 mul(__m512 x, __m512 y) {
      return _mm512_mul_ps(x, y);
  }
  #endif // __AVX512F__

  #if defined(__ARM_NEON)
  // Lane-wise arithmetic on four packed floats (NEON).
  inline float32x4_t add(float32x4_t x, float32x4_t y) {
      return vaddq_f32(x, y);
  }
  inline float32x4_t sub(float32x4_t x, float32x4_t y) {
      return vsubq_f32(x, y);
  }
  inline float32x4_t mul(float32x4_t x, float32x4_t y) {
      return vmulq_f32(x, y);
  }
  #endif // __ARM_NEON

  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
  // Lane-wise arithmetic on eight packed half-precision floats (NEON).
  inline float16x8_t add(float16x8_t x, float16x8_t y) {
      return vaddq_f16(x, y);
  }
  inline float16x8_t sub(float16x8_t x, float16x8_t y) {
      return vsubq_f16(x, y);
  }
  inline float16x8_t mul(float16x8_t x, float16x8_t y) {
      return vmulq_f16(x, y);
  }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
  93. ////////////////////////////////////////////////////////////////////////////////////////////////////
  94. // VECTORIZED FUSED MULTIPLY ADD
  /**
   * Multiply-accumulate: yields a * b + c, lane by lane.
   *
   * This generic form is composed from `mul` and `add`. The explicit
   * specializations that follow replace it with the target's fused
   * multiply-add instruction when the compiler advertises one.
   */
  template <typename T, typename U>
  inline U madd(T a, T b, U c) {
      const T product = mul(a, b);
      return add(product, c);
  }

  #if defined(__FMA__)
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // 256-bit fused multiply-add.
  template <>
  inline __m256 madd(__m256 a, __m256 b, __m256 c) {
      return _mm256_fmadd_ps(a, b, c);
  }
  #endif
  #if defined(__AVX512F__)
  // 512-bit fused multiply-add.
  template <>
  inline __m512 madd(__m512 a, __m512 b, __m512 c) {
      return _mm512_fmadd_ps(a, b, c);
  }
  #endif
  #endif // __FMA__

  #if defined(__ARM_FEATURE_FMA)
  // NEON fused multiply-add; the intrinsic takes the addend first.
  template <>
  inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) {
      return vfmaq_f32(c, b, a);
  }
  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  // Half-precision NEON fused multiply-add; the intrinsic takes the addend first.
  template <>
  inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) {
      return vfmaq_f16(c, b, a);
  }
  #endif
  #endif // __ARM_FEATURE_FMA
  128. ////////////////////////////////////////////////////////////////////////////////////////////////////
  129. // VECTORIZED HORIZONTAL SUM
  #if defined(__ARM_NEON)
  // Sums the four float lanes of a NEON vector.
  inline float hsum(float32x4_t x) {
      return vaddvq_f32(x);
  }
  #endif // __ARM_NEON

  #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  // Sums eight half-precision lanes: both halves are widened to float and
  // added pairwise, then the four float lanes are reduced.
  inline float hsum(float16x8_t x) {
      const float32x4_t lower = vcvt_f32_f16(vget_low_f16(x));
      const float32x4_t upper = vcvt_f32_f16(vget_high_f16(x));
      return vaddvq_f32(vaddq_f32(lower, upper));
  }
  #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

  #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Sums the four float lanes of a 128-bit vector.
  inline float hsum(__m128 x) {
  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
      // Fold the upper pair onto the lower pair, then add lane 1 to lane 0.
      const __m128 folded = _mm_add_ps(x, _mm_movehl_ps(x, x));
      const __m128 total = _mm_add_ss(folded, _mm_movehdup_ps(folded));
      return _mm_cvtss_f32(total);
  #else
      // Add each lane to its neighbour, then add the upper pair sum to lane 0.
      const __m128 swapped = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128 pairs = _mm_add_ps(x, swapped);
      const __m128 upper = _mm_movehl_ps(swapped, pairs);
      const __m128 total = _mm_add_ss(pairs, upper);
      return _mm_cvtss_f32(total);
  #endif
  }
  #endif

  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Sums the eight float lanes of a 256-bit vector by adding its two
  // 128-bit halves and reducing the result.
  inline float hsum(__m256 x) {
      const __m128 high = _mm256_extractf128_ps(x, 1);
      const __m128 low = _mm256_castps256_ps128(x);
      return hsum(_mm_add_ps(high, low));
  }
  #endif // __AVX__

  #if defined(__AVX512F__)
  // Sums the sixteen float lanes of a 512-bit vector.
  inline float hsum(__m512 x) {
      return _mm512_reduce_add_ps(x);
  }
  #endif // __AVX512F__
  167. ////////////////////////////////////////////////////////////////////////////////////////////////////
  168. // VECTORIZED MEMORY LOADING
  // Reads one vector of type T from memory holding elements of type U.
  // Only the explicit specializations below are defined.
  template <typename T, typename U> T load(const U *);

  #if defined(__ARM_NEON)
  // Four floats into a NEON vector.
  template <> inline float32x4_t load(const float *p) {
      return vld1q_f32(p);
  }
  #if !defined(_MSC_VER)
  // Eight half-precision values, kept in half precision.
  template <> inline float16x8_t load(const ggml_fp16_t *p) {
      const float16_t *halves = (const float16_t *)p;
      return vld1q_f16(halves);
  }
  // Four half-precision values, widened to float.
  template <> inline float32x4_t load(const ggml_fp16_t *p) {
      const float16_t *halves = (const float16_t *)p;
      return vcvt_f32_f16(vld1_f16(halves));
  }
  #endif // _MSC_VER
  #endif // __ARM_NEON

  #if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Four floats, unaligned.
  template <> inline __m128 load(const float *p) {
      return _mm_loadu_ps(p);
  }
  #endif // __SSE__

  #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
  // Eight floats, unaligned.
  template <> inline __m256 load(const float *p) {
      return _mm256_loadu_ps(p);
  }
  #endif // __AVX__

  #if defined(__F16C__)
  // Eight half-precision values, widened to float.
  template <> inline __m256 load(const ggml_fp16_t *p) {
      const __m128i packed = _mm_loadu_si128((const __m128i *)p);
      return _mm256_cvtph_ps(packed);
  }
  #endif // __F16C__

  #if defined(__AVX512F__)
  // Sixteen floats, unaligned.
  template <> inline __m512 load(const float *p) {
      return _mm512_loadu_ps(p);
  }
  // Sixteen half-precision values, widened to float.
  template <> inline __m512 load(const ggml_fp16_t *p) {
      const __m256i packed = _mm256_loadu_si256((const __m256i *)p);
      return _mm512_cvtph_ps(packed);
  }
  #endif // __AVX512F__
  206. ////////////////////////////////////////////////////////////////////////////////////////////////////
  207. // FLOATING POINT MATRIX MULTIPLICATION
  208. template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
  209. class tinyBLAS {
  210. public:
  211. tinyBLAS(int64_t k,
  212. const TA *A, int64_t lda,
  213. const TB *B, int64_t ldb,
  214. TC *C, int64_t ldc,
  215. int ith, int nth)
  216. : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
  217. }
  218. void matmul(int64_t m, int64_t n, int task) {
  219. if (task == GGML_TASK_TYPE_COMPUTE)
  220. mnpack(0, m, 0, n);
  221. }
  222. private:
  223. NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  224. int64_t mc, nc, mp, np;
  225. switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
  226. #if VECTOR_REGISTERS == 32
  227. case 0x55:
  228. mc = 5;
  229. nc = 5;
  230. gemm<5, 5>(m0, m, n0, n);
  231. break;
  232. case 0x45:
  233. mc = 4;
  234. nc = 5;
  235. gemm<4, 5>(m0, m, n0, n);
  236. break;
  237. case 0x54:
  238. mc = 5;
  239. nc = 4;
  240. gemm<5, 4>(m0, m, n0, n);
  241. break;
  242. case 0x44:
  243. mc = 4;
  244. nc = 4;
  245. gemm<4, 4>(m0, m, n0, n);
  246. break;
  247. case 0x53:
  248. mc = 5;
  249. nc = 3;
  250. gemm<5, 3>(m0, m, n0, n);
  251. break;
  252. case 0x35:
  253. mc = 3;
  254. nc = 5;
  255. gemm<3, 5>(m0, m, n0, n);
  256. break;
  257. case 0x43:
  258. mc = 4;
  259. nc = 3;
  260. gemm<4, 3>(m0, m, n0, n);
  261. break;
  262. #else
  263. case 0x55:
  264. case 0x54:
  265. case 0x53:
  266. case 0x45:
  267. case 0x44:
  268. case 0x43:
  269. mc = 4;
  270. nc = 3;
  271. gemm<4, 3>(m0, m, n0, n);
  272. break;
  273. case 0x35:
  274. #endif
  275. case 0x34:
  276. mc = 3;
  277. nc = 4;
  278. gemm<3, 4>(m0, m, n0, n);
  279. break;
  280. case 0x52:
  281. mc = 5;
  282. nc = 2;
  283. gemm<5, 2>(m0, m, n0, n);
  284. break;
  285. case 0x33:
  286. mc = 3;
  287. nc = 3;
  288. gemm<3, 3>(m0, m, n0, n);
  289. break;
  290. case 0x25:
  291. mc = 2;
  292. nc = 5;
  293. gemm<2, 5>(m0, m, n0, n);
  294. break;
  295. case 0x42:
  296. mc = 4;
  297. nc = 2;
  298. gemm<4, 2>(m0, m, n0, n);
  299. break;
  300. case 0x24:
  301. mc = 2;
  302. nc = 4;
  303. gemm<2, 4>(m0, m, n0, n);
  304. break;
  305. case 0x32:
  306. mc = 3;
  307. nc = 2;
  308. gemm<3, 2>(m0, m, n0, n);
  309. break;
  310. case 0x23:
  311. mc = 2;
  312. nc = 3;
  313. gemm<2, 3>(m0, m, n0, n);
  314. break;
  315. case 0x51:
  316. mc = 5;
  317. nc = 1;
  318. gemm<5, 1>(m0, m, n0, n);
  319. break;
  320. case 0x41:
  321. mc = 4;
  322. nc = 1;
  323. gemm<4, 1>(m0, m, n0, n);
  324. break;
  325. case 0x22:
  326. mc = 2;
  327. nc = 2;
  328. gemm<2, 2>(m0, m, n0, n);
  329. break;
  330. case 0x15:
  331. mc = 1;
  332. nc = 5;
  333. gemm<1, 5>(m0, m, n0, n);
  334. break;
  335. case 0x14:
  336. mc = 1;
  337. nc = 4;
  338. gemm<1, 4>(m0, m, n0, n);
  339. break;
  340. case 0x31:
  341. mc = 3;
  342. nc = 1;
  343. gemm<3, 1>(m0, m, n0, n);
  344. break;
  345. case 0x13:
  346. mc = 1;
  347. nc = 3;
  348. gemm<1, 3>(m0, m, n0, n);
  349. break;
  350. case 0x21:
  351. mc = 2;
  352. nc = 1;
  353. gemm<2, 1>(m0, m, n0, n);
  354. break;
  355. case 0x12:
  356. mc = 1;
  357. nc = 2;
  358. gemm<1, 2>(m0, m, n0, n);
  359. break;
  360. case 0x11:
  361. mc = 1;
  362. nc = 1;
  363. gemm<1, 1>(m0, m, n0, n);
  364. break;
  365. default:
  366. return;
  367. }
  368. mp = m0 + (m - m0) / mc * mc;
  369. np = n0 + (n - n0) / nc * nc;
  370. mnpack(mp, m, n0, np);
  371. mnpack(m0, m, np, n);
  372. }
  373. template <int RM, int RN>
  374. NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
  375. int64_t ytiles = (m - m0) / RM;
  376. int64_t xtiles = (n - n0) / RN;
  377. int64_t tiles = xtiles * ytiles;
  378. int64_t duty = (tiles + nth - 1) / nth;
  379. int64_t start = duty * ith;
  380. int64_t end = start + duty;
  381. if (end > tiles)
  382. end = tiles;
  383. for (int64_t job = start; job < end; ++job) {
  384. int64_t ii = m0 + job / xtiles * RM;
  385. int64_t jj = n0 + job % xtiles * RN;
  386. D Cv[RN][RM] = {};
  387. for (int64_t l = 0; l < k; l += KN)
  388. for (int64_t j = 0; j < RN; ++j)
  389. for (int64_t i = 0; i < RM; ++i)
  390. Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
  391. load<V>(B + ldb * (jj + j) + l),
  392. Cv[j][i]);
  393. for (int64_t j = 0; j < RN; ++j)
  394. for (int64_t i = 0; i < RM; ++i)
  395. C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
  396. }
  397. }
  398. const TA *const A;
  399. const TB *const B;
  400. TC *const C;
  401. const int64_t k;
  402. const int64_t lda;
  403. const int64_t ldb;
  404. const int64_t ldc;
  405. const int ith;
  406. const int nth;
  407. };
  408. //////////////////////////////////////////////////////////////////////////////////////////
  409. // QUANT ZERO MATRIX MULTIPLICATION
  #if defined(__ARM_FEATURE_DOTPROD)
  /**
   * Quantized kernel for ARM targets with the dot product extension.
   *
   * `A` holds blocks of type TA (block_q8_0 or block_q4_0) and `B` holds
   * block_q8_0 blocks. `k` counts blocks per row: each step of the inner
   * loop consumes one block from each operand, takes the integer dot
   * product of their quants and scales it by the product of the two block
   * scales `d`.
   */
  template <typename TA>
  class tinyBLAS_Q0_ARM {
    public:
      tinyBLAS_Q0_ARM(int64_t k,
                      const TA *A, int64_t lda,
                      const block_q8_0 *B, int64_t ldb,
                      float *C, int64_t ldc,
                      int ith, int nth)
          : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
      }

      // Computes this thread's share of the m x n output. Does nothing unless
      // `task` is the compute phase.
      void matmul(int64_t m, int64_t n, int task) {
          if (task != GGML_TASK_TYPE_COMPUTE)
              return;
          mnpack(0, m, 0, n);
      }

    private:
      // Records the tile shape in (mc, nc), then runs the RM x RN kernel over
      // the region [m0, m) x [n0, n).
      template <int RM, int RN>
      inline void tile(int64_t m0, int64_t m, int64_t n0, int64_t n, int64_t &mc, int64_t &nc) {
          mc = RM;
          nc = RN;
          gemm<RM, RN>(m0, m, n0, n);
      }

      // Picks a tile shape from the remaining extent (each side capped at 3),
      // covers every whole tile of that shape, then recurses into the rows
      // and columns left over at the edges.
      NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          int64_t mc, nc;
          switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
          case 0x33: tile<3, 3>(m0, m, n0, n, mc, nc); break;
          case 0x32: tile<3, 2>(m0, m, n0, n, mc, nc); break;
          case 0x23: tile<2, 3>(m0, m, n0, n, mc, nc); break;
          case 0x22: tile<2, 2>(m0, m, n0, n, mc, nc); break;
          case 0x31: tile<3, 1>(m0, m, n0, n, mc, nc); break;
          case 0x13: tile<1, 3>(m0, m, n0, n, mc, nc); break;
          case 0x21: tile<2, 1>(m0, m, n0, n, mc, nc); break;
          case 0x12: tile<1, 2>(m0, m, n0, n, mc, nc); break;
          case 0x11: tile<1, 1>(m0, m, n0, n, mc, nc); break;
          default:
              return; // nothing left in at least one dimension
          }
          // First row / column not covered by a whole tile of the chosen shape.
          const int64_t mp = m0 + (m - m0) / mc * mc;
          const int64_t np = n0 + (n - n0) / nc * nc;
          mnpack(mp, m, n0, np); // leftover rows beneath the tiled columns
          mnpack(m0, m, np, n);  // leftover columns, over all rows
      }

      // Computes every whole RM x RN tile of [m0, m) x [n0, n) assigned to
      // this thread. Tiles are numbered row-major and split into `nth`
      // contiguous ranges; thread `ith` takes the ith range.
      template <int RM, int RN>
      NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          const int64_t tile_rows = (m - m0) / RM;
          const int64_t tile_cols = (n - n0) / RN;
          const int64_t total = tile_cols * tile_rows;
          const int64_t share = (total + nth - 1) / nth;
          const int64_t first = share * ith;
          int64_t last = first + share;
          if (last > total)
              last = total;
          for (int64_t t = first; t < last; ++t) {
              const int64_t ii = m0 + t / tile_cols * RM;
              const int64_t jj = n0 + t % tile_cols * RN;
              float32x4_t acc[RN][RM] = {};
              for (int64_t l = 0; l < k; ++l)
                  for (int64_t j = 0; j < RN; ++j)
                      for (int64_t i = 0; i < RM; ++i) {
                          const TA *ap = A + lda * (ii + i) + l;
                          const block_q8_0 *bp = B + ldb * (jj + j) + l;
                          // Integer dot product of the first 16 quants, then
                          // the last 16 accumulated on top.
                          const int32x4_t lower = vdotq_s32(vdupq_n_s32(0), load_lo(ap), load_lo(bp));
                          const int32x4_t dots = vdotq_s32(lower, load_hi(ap), load_hi(bp));
                          acc[j][i] = vmlaq_n_f32(acc[j][i],
                                                  vcvtq_f32_s32(dots),
                                                  unhalf(ap->d) * unhalf(bp->d));
                      }
              // Reduce each vector accumulator to the scalar it stands for.
              for (int64_t j = 0; j < RN; ++j)
                  for (int64_t i = 0; i < RM; ++i)
                      C[ldc * (jj + j) + (ii + i)] = hsum(acc[j][i]);
          }
      }

      // First 16 quants of a q8_0 block.
      inline int8x16_t load_lo(const block_q8_0 *b) {
          return vld1q_s8(b->qs);
      }

      // Last 16 quants of a q8_0 block.
      inline int8x16_t load_hi(const block_q8_0 *b) {
          return vld1q_s8(b->qs + 16);
      }

      // Low nibble of each of the 16 packed bytes of a q4_0 block, minus 8.
      inline int8x16_t load_lo(const block_q4_0 *b) {
          const uint8x16_t packed = vld1q_u8(b->qs);
          const uint8x16_t nibbles = vandq_u8(packed, vdupq_n_u8(0x0f));
          return vsubq_s8(vreinterpretq_s8_u8(nibbles), vdupq_n_s8(0x8));
      }

      // High nibble of each of the 16 packed bytes of a q4_0 block, minus 8.
      inline int8x16_t load_hi(const block_q4_0 *b) {
          const uint8x16_t packed = vld1q_u8(b->qs);
          const uint8x16_t nibbles = vshrq_n_u8(packed, 4);
          return vsubq_s8(vreinterpretq_s8_u8(nibbles), vdupq_n_s8(0x8));
      }

      const TA *const A;
      const block_q8_0 *const B;
      float *const C;
      const int64_t k;
      const int64_t lda;
      const int64_t ldb;
      const int64_t ldc;
      const int ith;
      const int nth;
  };
  #endif // __ARM_FEATURE_DOTPROD
  #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  /**
   * Quantized kernel for x86 targets with AVX or newer.
   *
   * `A` and `B` hold quantized blocks (the `load` overloads accept
   * block_q8_0 and block_q4_0). `k` counts blocks per row: each step of
   * the inner loop consumes one block from each operand, takes the integer
   * dot product of their quants and scales it by the product of the two
   * block scales `d`.
   */
  template <typename TA, typename TB, typename TC>
  class tinyBLAS_Q0_AVX {
    public:
      tinyBLAS_Q0_AVX(int64_t k,
                      const TA *A, int64_t lda,
                      const TB *B, int64_t ldb,
                      TC *C, int64_t ldc,
                      int ith, int nth)
          : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
      }

      // Computes this thread's share of the m x n output. Does nothing unless
      // `task` is the compute phase.
      void matmul(int64_t m, int64_t n, int task) {
          if (task != GGML_TASK_TYPE_COMPUTE)
              return;
          mnpack(0, m, 0, n);
      }

    private:
      // Records the tile shape in (mc, nc), then runs the RM x RN kernel over
      // the region [m0, m) x [n0, n).
      template <int RM, int RN>
      inline void tile(int64_t m0, int64_t m, int64_t n0, int64_t n, int64_t &mc, int64_t &nc) {
          mc = RM;
          nc = RN;
          gemm<RM, RN>(m0, m, n0, n);
      }

      // Picks a tile shape from the remaining extent (each side capped at 4),
      // covers every whole tile of that shape, then recurses into the rows
      // and columns left over at the edges.
      void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          int64_t mc, nc;
          switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
  #if VECTOR_REGISTERS == 32
          case 0x44: tile<4, 4>(m0, m, n0, n, mc, nc); break;
          case 0x43: tile<4, 3>(m0, m, n0, n, mc, nc); break;
          case 0x34: tile<3, 4>(m0, m, n0, n, mc, nc); break;
          case 0x33: tile<3, 3>(m0, m, n0, n, mc, nc); break;
          case 0x42: tile<4, 2>(m0, m, n0, n, mc, nc); break;
          case 0x24: tile<2, 4>(m0, m, n0, n, mc, nc); break;
  #else
          // With 16 registers the larger shapes collapse onto 4x2 and 2x4.
          case 0x44:
          case 0x43:
          case 0x42:
              tile<4, 2>(m0, m, n0, n, mc, nc);
              break;
          case 0x34:
          case 0x24:
              tile<2, 4>(m0, m, n0, n, mc, nc);
              break;
          case 0x33: // shares the 3x2 kernel with the next label
  #endif
          case 0x32: tile<3, 2>(m0, m, n0, n, mc, nc); break;
          case 0x23: tile<2, 3>(m0, m, n0, n, mc, nc); break;
          case 0x41: tile<4, 1>(m0, m, n0, n, mc, nc); break;
          case 0x22: tile<2, 2>(m0, m, n0, n, mc, nc); break;
          case 0x14: tile<1, 4>(m0, m, n0, n, mc, nc); break;
          case 0x31: tile<3, 1>(m0, m, n0, n, mc, nc); break;
          case 0x13: tile<1, 3>(m0, m, n0, n, mc, nc); break;
          case 0x21: tile<2, 1>(m0, m, n0, n, mc, nc); break;
          case 0x12: tile<1, 2>(m0, m, n0, n, mc, nc); break;
          case 0x11: tile<1, 1>(m0, m, n0, n, mc, nc); break;
          default:
              return; // nothing left in at least one dimension
          }
          // First row / column not covered by a whole tile of the chosen shape.
          const int64_t mp = m0 + (m - m0) / mc * mc;
          const int64_t np = n0 + (n - n0) / nc * nc;
          mnpack(mp, m, n0, np); // leftover rows beneath the tiled columns
          mnpack(m0, m, np, n);  // leftover columns, over all rows
      }

      // Computes every whole RM x RN tile of [m0, m) x [n0, n) assigned to
      // this thread. Tiles are numbered row-major and split into `nth`
      // contiguous ranges; thread `ith` takes the ith range.
      template <int RM, int RN>
      NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
          const int64_t tile_rows = (m - m0) / RM;
          const int64_t tile_cols = (n - n0) / RN;
          const int64_t total = tile_cols * tile_rows;
          const int64_t share = (total + nth - 1) / nth;
          const int64_t first = share * ith;
          int64_t last = first + share;
          if (last > total)
              last = total;
          for (int64_t t = first; t < last; ++t) {
              const int64_t ii = m0 + t / tile_cols * RM;
              const int64_t jj = n0 + t % tile_cols * RN;
              __m256 acc[RN][RM] = {};
              for (int64_t l = 0; l < k; ++l)
                  for (int64_t j = 0; j < RN; ++j)
                      for (int64_t i = 0; i < RM; ++i) {
                          const TA *ap = A + lda * (ii + i) + l;
                          const TB *bp = B + ldb * (jj + j) + l;
                          // The byte multiply-add wants an unsigned first operand, so
                          // |a| goes there and a's sign is transferred onto b.
  #if defined(__AVX2__)
                          const __m256i aq = load(ap);
                          const __m256i bq = load(bp);
                          const __m256 dots = updot(_mm256_sign_epi8(aq, aq),
                                                    _mm256_sign_epi8(bq, aq));
  #else
                          // Without AVX2 the same steps run on the two 128-bit halves.
                          const __m128i aq0 = load0(ap);
                          const __m128i aq1 = load1(ap);
                          const __m128i bq0 = load0(bp);
                          const __m128i bq1 = load1(bp);
                          const __m128i abs0 = _mm_sign_epi8(aq0, aq0);
                          const __m128i abs1 = _mm_sign_epi8(aq1, aq1);
                          const __m128i sgn0 = _mm_sign_epi8(bq0, aq0);
                          const __m128i sgn1 = _mm_sign_epi8(bq1, aq1);
                          const __m128i ones = _mm_set1_epi16(1);
                          const __m128i pair0 = _mm_maddubs_epi16(abs0, sgn0);
                          const __m128i pair1 = _mm_maddubs_epi16(abs1, sgn1);
                          const __m256 dots = _mm256_cvtepi32_ps(
                              MM256_SET_M128I(_mm_madd_epi16(ones, pair1), _mm_madd_epi16(ones, pair0)));
  #endif
                          const float scale = unhalf(ap->d) * unhalf(bp->d);
                          acc[j][i] = madd(_mm256_set1_ps(scale), dots, acc[j][i]);
                      }
              // Reduce each vector accumulator to the scalar it stands for.
              for (int64_t j = 0; j < RN; ++j)
                  for (int64_t i = 0; i < RM; ++i)
                      C[ldc * (jj + j) + (ii + i)] = hsum(acc[j][i]);
          }
      }

      // All 32 quants of a q8_0 block.
      inline __m256i load(const block_q8_0 *b) {
          const __m256i *quants = (const __m256i *)b->qs;
          return _mm256_loadu_si256(quants);
      }

      // First 16 quants of a q8_0 block.
      inline __m128i load0(const block_q8_0 *b) {
          const __m128i *quants = (const __m128i *)b->qs;
          return _mm_loadu_si128(quants);
      }

      // Last 16 quants of a q8_0 block.
      inline __m128i load1(const block_q8_0 *b) {
          const __m128i *quants = (const __m128i *)b->qs;
          return _mm_loadu_si128(quants + 1);
      }

      // All nibbles of a q4_0 block as bytes, minus 8.
      inline __m256i load(const block_q4_0 *b) {
          const __m256i nibbles = denibble(b->qs);
          return _mm256_sub_epi8(nibbles, _mm256_set1_epi8(8));
      }

      // Low nibble of each of the 16 packed bytes of a q4_0 block, minus 8.
      inline __m128i load0(const block_q4_0 *b) {
          const __m128i packed = _mm_loadu_si128((const __m128i *)(b->qs));
          const __m128i nibbles = _mm_and_si128(_mm_set1_epi8(15), packed);
          return _mm_sub_epi8(nibbles, _mm_set1_epi8(8));
      }

      // High nibble of each of the 16 packed bytes of a q4_0 block, minus 8.
      inline __m128i load1(const block_q4_0 *b) {
          const __m128i packed = _mm_loadu_si128((const __m128i *)(b->qs));
          const __m128i nibbles = _mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(packed, 4));
          return _mm_sub_epi8(nibbles, _mm_set1_epi8(8));
      }

      // Multiplies unsigned bytes `u` by signed bytes `s`, sums each group of
      // four adjacent products into a 32-bit lane and converts to float.
      inline __m256 updot(__m256i u, __m256i s) {
  #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
          const __m256i sums = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s);
  #else
          const __m256i sums = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s));
  #endif
          return _mm256_cvtepi32_ps(sums);
      }

      // Expands 16 packed bytes into 32 bytes: low nibbles in the low 128
      // bits, high nibbles in the high 128 bits.
      static inline __m256i denibble(const uint8_t *p) {
          const __m128i packed = _mm_loadu_si128((const __m128i *)p);
          const __m256i spread = _mm256_insertf128_si256(_mm256_castsi128_si256(packed),
                                                         _mm_srli_epi16(packed, 4), 1);
          return _mm256_and_si256(_mm256_set1_epi8(15), spread);
      }

      const TA *const A;
      const TB *const B;
      TC *const C;
      const int64_t k;
      const int64_t lda;
      const int64_t ldb;
      const int64_t ldc;
      const int ith;
      const int nth;
  };
  #endif // __AVX__
  756. } // namespace
  757. /**
  758. * Performs optimized matrix multiplication on CPU.
  759. *
  760. * This subroutine may compute C = Aᵀ * B with column major ordering.
  761. * Despite its name, this isn't a generalized implementation. Work is
  762. * only performed when a handwritten kernel is written and available.
  763. * Otherwise the caller should fall back to a general matmul routine.
  764. *
  765. * For example, for single-threaded single-precision GEMM you can say
  766. *
  767. * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc,
  768. * 0, 1, GGML_TASK_TYPE_COMPUTE,
  769. * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
  770. *
  771. * @param m is rows in `A` and `C`
  772. * @param n is cols in `B` and `C`
  773. * @param k is cols in `A` and rows in `B`
  774. * @param A is first input matrix (always transposed)
  775. * @param lda is row stride of `A`
  776. * @param B is second input matrix (never transposed)
  777. * @param ldb is row stride of `B`
  778. * @param C is input/output array of output matrices
  779. * @param ldc is row stride of `C`
  780. * @param ith is thread id (must be less than `nth`)
  781. * @param nth is number of threads (must be greater than zero)
  782. * @param task is GGML task type
  783. * @param Atype is GGML data type of `A`
  784. * @param Btype is GGML data type of `B`
  785. * @param Ctype is GGML data type of `C`
  786. * @return true if this function was able to service the matmul request
  787. */
  788. bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
  789. int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
  790. assert(m >= 0);
  791. assert(n >= 0);
  792. assert(k >= 0);
  793. assert(lda >= k);
  794. assert(ldb >= k);
  795. assert(ldc >= m);
  796. assert(nth > 0);
  797. assert(ith < nth);
  798. if (Ctype != GGML_TYPE_F32)
  799. return false;
  800. switch (Atype) {
  801. case GGML_TYPE_F32: {
  802. if (Btype != GGML_TYPE_F32)
  803. return false;
  804. #if defined(__AVX512F__)
  805. if (k % 16)
  806. return false;
  807. tinyBLAS<16, __m512, __m512, float, float, float> tb{
  808. k, (const float *)A, lda,
  809. (const float *)B, ldb,
  810. (float *)C, ldc,
  811. ith, nth};
  812. tb.matmul(m, n, task);
  813. return true;
  814. #elif defined(__AVX__) || defined(__AVX2__)
  815. if (k % 8)
  816. return false;
  817. tinyBLAS<8, __m256, __m256, float, float, float> tb{
  818. k, (const float *)A, lda,
  819. (const float *)B, ldb,
  820. (float *)C, ldc,
  821. ith, nth};
  822. tb.matmul(m, n, task);
  823. return true;
  824. #elif defined(__ARM_NEON)
  825. if (n < 4)
  826. return false;
  827. if (k % 4)
  828. return false;
  829. tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{
  830. k, (const float *)A, lda,
  831. (const float *)B, ldb,
  832. (float *)C, ldc,
  833. ith, nth};
  834. tb.matmul(m, n, task);
  835. return true;
  836. #else
  837. return false;
  838. #endif
  839. }
  840. case GGML_TYPE_F16: {
  841. #if defined(__AVX512F__)
  842. if (k % 16)
  843. return false;
  844. if (Btype != GGML_TYPE_F32)
  845. return false;
  846. tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{
  847. k, (const ggml_fp16_t *)A, lda,
  848. (const float *)B, ldb,
  849. (float *)C, ldc,
  850. ith, nth};
  851. tb.matmul(m, n, task);
  852. return true;
  853. #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__)
  854. if (k % 8)
  855. return false;
  856. if (Btype != GGML_TYPE_F32)
  857. return false;
  858. tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{
  859. k, (const ggml_fp16_t *)A, lda,
  860. (const float *)B, ldb,
  861. (float *)C, ldc,
  862. ith, nth};
  863. tb.matmul(m, n, task);
  864. return true;
  865. #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER)
  866. if (n < 8)
  867. return false;
  868. if (k % 8)
  869. return false;
  870. if (Btype != GGML_TYPE_F16)
  871. return false;
  872. tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{
  873. k, (const ggml_fp16_t *)A, lda,
  874. (const ggml_fp16_t *)B, ldb,
  875. (float *)C, ldc,
  876. ith, nth};
  877. tb.matmul(m, n, task);
  878. return true;
  879. #elif defined(__ARM_NEON) && !defined(_MSC_VER)
  880. if (k % 4)
  881. return false;
  882. if (Btype != GGML_TYPE_F32)
  883. return false;
  884. tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{
  885. k, (const ggml_fp16_t *)A, lda,
  886. (const float *)B, ldb,
  887. (float *)C, ldc,
  888. ith, nth};
  889. tb.matmul(m, n, task);
  890. return true;
  891. #else
  892. return false;
  893. #endif
  894. }
  895. case GGML_TYPE_Q8_0: {
  896. if (Btype != GGML_TYPE_Q8_0)
  897. return false;
  898. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  899. tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
  900. k, (const block_q8_0 *)A, lda,
  901. (const block_q8_0 *)B, ldb,
  902. (float *)C, ldc,
  903. ith, nth};
  904. tb.matmul(m, n, task);
  905. return true;
  906. #elif defined(__ARM_FEATURE_DOTPROD)
  907. tinyBLAS_Q0_ARM<block_q8_0> tb{
  908. k, (const block_q8_0 *)A, lda,
  909. (const block_q8_0 *)B, ldb,
  910. (float *)C, ldc,
  911. ith, nth};
  912. tb.matmul(m, n, task);
  913. return true;
  914. #else
  915. return false;
  916. #endif
  917. }
  918. case GGML_TYPE_Q4_0: {
  919. if (Btype != GGML_TYPE_Q8_0)
  920. return false;
  921. #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
  922. tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
  923. k, (const block_q4_0 *)A, lda,
  924. (const block_q8_0 *)B, ldb,
  925. (float *)C, ldc,
  926. ith, nth};
  927. tb.matmul(m, n, task);
  928. return true;
  929. #elif defined(__ARM_FEATURE_DOTPROD)
  930. tinyBLAS_Q0_ARM<block_q4_0> tb{
  931. k, (const block_q4_0 *)A, lda,
  932. (const block_q8_0 *)B, ldb,
  933. (float *)C, ldc,
  934. ith, nth};
  935. tb.matmul(m, n, task);
  936. return true;
  937. #else
  938. return false;
  939. #endif
  940. }
  941. default:
  942. return false;
  943. }
  944. (void)m;
  945. (void)n;
  946. (void)k;
  947. (void)A;
  948. (void)lda;
  949. (void)B;
  950. (void)ldb;
  951. (void)C;
  952. (void)ldc;
  953. (void)ith;
  954. (void)nth;
  955. (void)task;
  956. (void)Atype;
  957. (void)Btype;
  958. (void)Ctype;
  959. }