repack.cpp 71 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511
661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833
  1. #define GGML_COMMON_IMPL_CPP
  2. #define GGML_COMMON_DECL_CPP
  3. #include "ggml-common.h"
  4. #include "ggml-backend-impl.h"
  5. #include "ggml-impl.h"
  6. #include "ggml-cpu.h"
  7. #include "ggml-cpu-impl.h"
  8. #include "simd-mappings.h"
  9. #include "traits.h"
  10. #include "arch-fallback.h"
  11. #include <cmath>
  12. #include <cstring>
  13. #include <cassert>
  14. #include <cstdio> // for GGML_ASSERT
  15. #include "repack.h"
  16. #if defined(__GNUC__)
  17. #pragma GCC diagnostic ignored "-Woverlength-strings"
  18. #endif
  19. #define UNUSED GGML_UNUSED
  20. static inline int nearest_int(float fval) {
  21. assert(fabsf(fval) <= 4194303.f);
  22. float val = fval + 12582912.f;
  23. int i; memcpy(&i, &val, sizeof(int));
  24. return (i & 0x007fffff) - 0x00400000;
  25. }
  26. // Functions to create the interleaved data layout formats
  27. // interleave 4 block_q4_0s in blocks of blck_size_interleave
  28. // returns an interleaved block_q4_0x4
  29. // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
  30. // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
  31. //
  32. // - in : an array of block_q4_0 pointers
  33. // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
  34. // blck_size_interleave bytes
  35. // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
  36. // from bias offset form to pure sign form (this saves subtract
  37. // operations durin unpacking)
  38. //
  39. extern "C" {
  40. void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  41. assert(QK8_0 == 32);
  42. assert(k % QK8_0 == 0);
  43. const int nb = k / QK8_0;
  44. block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
  45. // scalar
  46. const int blck_size_interleave = 4;
  47. float srcv[4][QK8_0];
  48. float id[4];
  49. for (int i = 0; i < nb; i++) {
  50. for (int row_iter = 0; row_iter < 4; row_iter++) {
  51. float amax = 0.0f; // absolute max
  52. for (int j = 0; j < QK8_0; j++) {
  53. srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
  54. amax = MAX(amax, fabsf(srcv[row_iter][j]));
  55. }
  56. const float d = amax / ((1 << 7) - 1);
  57. id[row_iter] = d ? 1.0f / d : 0.0f;
  58. y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
  59. }
  60. for (int j = 0; j < QK8_0 * 4; j++) {
  61. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  62. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  63. src_offset += (j % blck_size_interleave);
  64. float x0 = srcv[src_id][src_offset] * id[src_id];
  65. y[i].qs[j] = roundf(x0);
  66. }
  67. }
  68. }
  69. void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  70. assert(QK8_0 == 32);
  71. assert(k % QK8_0 == 0);
  72. const int nb = k / QK8_0;
  73. block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
  74. // scalar
  75. const int blck_size_interleave = 8;
  76. float srcv[4][QK8_0];
  77. float id[4];
  78. for (int i = 0; i < nb; i++) {
  79. for (int row_iter = 0; row_iter < 4; row_iter++) {
  80. float amax = 0.0f; // absolute max
  81. for (int j = 0; j < QK8_0; j++) {
  82. srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
  83. amax = MAX(amax, fabsf(srcv[row_iter][j]));
  84. }
  85. const float d = amax / ((1 << 7) - 1);
  86. id[row_iter] = d ? 1.0f / d : 0.0f;
  87. y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
  88. }
  89. for (int j = 0; j < QK8_0 * 4; j++) {
  90. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  91. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  92. src_offset += (j % blck_size_interleave);
  93. float x0 = srcv[src_id][src_offset] * id[src_id];
  94. y[i].qs[j] = roundf(x0);
  95. }
  96. }
  97. }
  98. void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  99. assert(QK_K == 256);
  100. assert(k % QK_K == 0);
  101. const int nb = k / QK_K;
  102. block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
  103. // scalar
  104. const int blck_size_interleave = 8;
  105. float srcv[4][QK_K];
  106. float iscale[4];
  107. for (int i = 0; i < nb; i++) {
  108. for (int row_iter = 0; row_iter < 4; row_iter++) {
  109. float amax = 0.0f; // absolute max
  110. float max = 0;
  111. for (int j = 0; j < QK_K; j++) {
  112. srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
  113. // Update the maximum value of the corresponding super block
  114. if(amax < fabsf(srcv[row_iter][j])) {
  115. amax = fabsf(srcv[row_iter][j]);
  116. max = srcv[row_iter][j];
  117. }
  118. }
  119. iscale[row_iter] = amax ? -127.f/max : 0;
  120. y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
  121. }
  122. for (int j = 0; j < QK_K / 4; j++) {
  123. y[i].bsums[j] = 0;
  124. }
  125. // Quants values are interleaved in sequence of eight bytes from corresponding super blocks
  126. // Bsums values are interleaved in sequence of four bsums from each super block taken for interleaving
  127. // i.e first four bsums from the first super block, followed by first four bsums from second super block and so on
  128. for (int j = 0; j < QK_K * 4; j++) {
  129. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  130. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  131. src_offset += (j % blck_size_interleave);
  132. int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
  133. float x0 = srcv[src_id][src_offset] * iscale[src_id];
  134. y[i].qs[j] = nearest_int(x0);
  135. y[i].bsums[index] += y[i].qs[j];
  136. }
  137. }
  138. }
  139. } // extern "C"
  140. template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
  141. void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
  142. template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  143. assert(nrow == 4);
  144. UNUSED(nrow);
  145. ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
  146. }
  147. template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  148. assert(nrow == 4);
  149. UNUSED(nrow);
  150. ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
  151. }
  152. template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  153. assert(nrow == 4);
  154. UNUSED(nrow);
  155. ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
  156. }
  157. extern "C" {
  158. void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  159. const int qk = QK8_0;
  160. const int nb = n / qk;
  161. const int ncols_interleaved = 4;
  162. const int blocklen = 4;
  163. assert (n % qk == 0);
  164. assert (nc % ncols_interleaved == 0);
  165. UNUSED(s);
  166. UNUSED(bs);
  167. UNUSED(vx);
  168. UNUSED(vy);
  169. UNUSED(nr);
  170. UNUSED(nc);
  171. UNUSED(nb);
  172. UNUSED(ncols_interleaved);
  173. UNUSED(blocklen);
  174. float sumf[4];
  175. int sumi;
  176. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  177. for (int x = 0; x < nc / ncols_interleaved; x++) {
  178. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  179. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  180. for (int l = 0; l < nb; l++) {
  181. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  182. for (int j = 0; j < ncols_interleaved; j++) {
  183. sumi = 0;
  184. for (int i = 0; i < blocklen; ++i) {
  185. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  186. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  187. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  188. }
  189. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  190. }
  191. }
  192. }
  193. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  194. }
  195. }
  196. void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  197. const int qk = QK8_0;
  198. const int nb = n / qk;
  199. const int ncols_interleaved = 4;
  200. const int blocklen = 8;
  201. assert (n % qk == 0);
  202. assert (nc % ncols_interleaved == 0);
  203. UNUSED(s);
  204. UNUSED(bs);
  205. UNUSED(vx);
  206. UNUSED(vy);
  207. UNUSED(nr);
  208. UNUSED(nc);
  209. UNUSED(nb);
  210. UNUSED(ncols_interleaved);
  211. UNUSED(blocklen);
  212. float sumf[4];
  213. int sumi;
  214. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  215. for (int x = 0; x < nc / ncols_interleaved; x++) {
  216. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  217. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  218. for (int l = 0; l < nb; l++) {
  219. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  220. for (int j = 0; j < ncols_interleaved; j++) {
  221. sumi = 0;
  222. for (int i = 0; i < blocklen; ++i) {
  223. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  224. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  225. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  226. }
  227. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  228. }
  229. }
  230. }
  231. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  232. }
  233. }
  234. void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  235. const int qk = QK8_0;
  236. const int nb = n / qk;
  237. const int ncols_interleaved = 8;
  238. const int blocklen = 8;
  239. assert (n % qk == 0);
  240. assert (nc % ncols_interleaved == 0);
  241. UNUSED(s);
  242. UNUSED(bs);
  243. UNUSED(vx);
  244. UNUSED(vy);
  245. UNUSED(nr);
  246. UNUSED(nc);
  247. UNUSED(nb);
  248. UNUSED(ncols_interleaved);
  249. UNUSED(blocklen);
  250. {
  251. float sumf[8];
  252. int sumi;
  253. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  254. for (int x = 0; x < nc / ncols_interleaved; x++) {
  255. const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
  256. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  257. for (int l = 0; l < nb; l++) {
  258. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  259. for (int j = 0; j < ncols_interleaved; j++) {
  260. sumi = 0;
  261. for (int i = 0; i < blocklen; ++i) {
  262. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  263. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  264. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  265. }
  266. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  267. }
  268. }
  269. }
  270. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  271. }
  272. }
  273. }
  274. void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  275. const int qk = QK_K;
  276. const int nb = n / qk;
  277. const int ncols_interleaved = 8;
  278. const int blocklen = 8;
  279. static const uint32_t kmask1 = 0x3f3f3f3f;
  280. static const uint32_t kmask2 = 0x0f0f0f0f;
  281. static const uint32_t kmask3 = 0x03030303;
  282. assert (n % qk == 0);
  283. assert (nc % ncols_interleaved == 0);
  284. UNUSED(s);
  285. UNUSED(bs);
  286. UNUSED(vx);
  287. UNUSED(vy);
  288. UNUSED(nr);
  289. UNUSED(nc);
  290. UNUSED(nb);
  291. UNUSED(ncols_interleaved);
  292. UNUSED(blocklen);
  293. float sumf[8];
  294. float sum_minf[8];
  295. uint32_t utmp[32];
  296. int sumi1;
  297. int sumi2;
  298. int sumi;
  299. const block_q8_K * a_ptr = (const block_q8_K *) vy;
  300. for (int x = 0; x < nc / ncols_interleaved; x++) {
  301. const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
  302. for (int j = 0; j < ncols_interleaved; j++) {
  303. sumf[j] = 0.0;
  304. sum_minf[j] = 0.0;
  305. }
  306. for (int l = 0; l < nb; l++) {
  307. for (int sb = 0; sb < 8; sb++) {
  308. memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
  309. utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
  310. const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
  311. utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
  312. utmp[sb * 4 + 2] = uaux_0;
  313. utmp[sb * 4 + 0] &= kmask1;
  314. }
  315. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  316. uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
  317. uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
  318. for (int j = 0; j < ncols_interleaved; j++) {
  319. sumi1 = 0;
  320. sumi2 = 0;
  321. sumi = 0;
  322. for (int i = 0; i < blocklen; ++i) {
  323. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
  324. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
  325. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
  326. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
  327. sumi1 = sumi1 * scales_0[j];
  328. sumi2 = sumi2 * scales_1[j];
  329. sumi += sumi1 + sumi2;
  330. }
  331. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
  332. }
  333. }
  334. for (int sb = 0; sb < 8; sb++) {
  335. uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  336. for (int j = 0; j < ncols_interleaved; j++) {
  337. sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
  338. }
  339. }
  340. }
  341. for (int j = 0; j < ncols_interleaved; j++) {
  342. s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
  343. }
  344. }
  345. }
  346. void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  347. const int qk = QK_K;
  348. const int nb = n / qk;
  349. const int ncols_interleaved = 8;
  350. const int blocklen = 8;
  351. assert (n % qk == 0);
  352. assert (nc % ncols_interleaved == 0);
  353. UNUSED(s);
  354. UNUSED(bs);
  355. UNUSED(vx);
  356. UNUSED(vy);
  357. UNUSED(nr);
  358. UNUSED(nc);
  359. UNUSED(nb);
  360. UNUSED(ncols_interleaved);
  361. UNUSED(blocklen);
  362. float sumf[8];
  363. float sum_minf[8];
  364. int sumi1,sumi2,sumi3,sumi4;
  365. int sumi;
  366. const block_q8_K * a_ptr = (const block_q8_K *)vy;
  367. for(int x = 0; x < nc / ncols_interleaved; x++) {
  368. const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
  369. for (int j = 0; j < ncols_interleaved; j++) {
  370. sumf[j] = 0.0;
  371. sum_minf[j] = 0.0;
  372. }
  373. for (int l = 0; l < nb; l++) {
  374. for (int k = 0; k < (qk / (4 * blocklen)); k++) {
  375. const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
  376. const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
  377. const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
  378. const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
  379. for (int j = 0; j < ncols_interleaved; j++) {
  380. sumi1 = 0;
  381. sumi2 = 0;
  382. sumi3 = 0;
  383. sumi4 = 0;
  384. sumi = 0;
  385. int offset = ((k / 2) % 2) + j * 2;
  386. for (int i = 0; i < blocklen; ++i){
  387. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
  388. const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
  389. const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
  390. const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
  391. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
  392. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
  393. sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
  394. sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
  395. sumi1 = sumi1 * (scales_0[offset] & 0xF);
  396. sumi2 = sumi2 * (scales_1[offset] & 0xF);
  397. sumi3 = sumi3 * (scales_2[offset] & 0xF);
  398. sumi4 = sumi4 * (scales_3[offset] & 0xF);
  399. sumi += sumi1 + sumi2 + sumi3 + sumi4;
  400. }
  401. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
  402. }
  403. }
  404. for(int sb = 0; sb < 8; sb++) {
  405. const uint8_t *mins = b_ptr[l].scales + sb * 16;
  406. for(int j = 0; j < ncols_interleaved; j++){
  407. sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
  408. }
  409. }
  410. }
  411. for (int j = 0; j < ncols_interleaved; j++) {
  412. s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
  413. }
  414. }
  415. }
  416. void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  417. const int qk = QK8_0;
  418. const int nb = n / qk;
  419. const int ncols_interleaved = 4;
  420. const int blocklen = 4;
  421. assert (n % qk == 0);
  422. assert (nc % ncols_interleaved == 0);
  423. UNUSED(s);
  424. UNUSED(bs);
  425. UNUSED(vx);
  426. UNUSED(vy);
  427. UNUSED(nr);
  428. UNUSED(nc);
  429. UNUSED(nb);
  430. UNUSED(ncols_interleaved);
  431. UNUSED(blocklen);
  432. {
  433. float sumf[4];
  434. int sumi;
  435. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  436. for (int x = 0; x < nc / ncols_interleaved; x++) {
  437. const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
  438. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  439. for (int l = 0; l < nb; l++) {
  440. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  441. for (int j = 0; j < ncols_interleaved; j++) {
  442. sumi = 0;
  443. for (int i = 0; i < blocklen; ++i) {
  444. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  445. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  446. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
  447. }
  448. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  449. }
  450. }
  451. }
  452. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  453. }
  454. }
  455. }
  456. void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  457. const int qk = QK8_0;
  458. const int nb = n / qk;
  459. const int ncols_interleaved = 4;
  460. const int blocklen = 4;
  461. assert (n % qk == 0);
  462. assert (nr % 4 == 0);
  463. assert (nc % ncols_interleaved == 0);
  464. UNUSED(s);
  465. UNUSED(bs);
  466. UNUSED(vx);
  467. UNUSED(vy);
  468. UNUSED(nr);
  469. UNUSED(nc);
  470. UNUSED(nb);
  471. UNUSED(ncols_interleaved);
  472. UNUSED(blocklen);
  473. {
  474. float sumf[4][4];
  475. int sumi;
  476. for (int y = 0; y < nr / 4; y++) {
  477. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  478. for (int x = 0; x < nc / ncols_interleaved; x++) {
  479. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  480. for (int m = 0; m < 4; m++) {
  481. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  482. }
  483. for (int l = 0; l < nb; l++) {
  484. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  485. for (int m = 0; m < 4; m++) {
  486. for (int j = 0; j < ncols_interleaved; j++) {
  487. sumi = 0;
  488. for (int i = 0; i < blocklen; ++i) {
  489. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  490. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  491. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  492. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  493. }
  494. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  495. }
  496. }
  497. }
  498. }
  499. for (int m = 0; m < 4; m++) {
  500. for (int j = 0; j < ncols_interleaved; j++)
  501. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  502. }
  503. }
  504. }
  505. }
  506. }
  507. void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  508. const int qk = QK8_0;
  509. const int nb = n / qk;
  510. const int ncols_interleaved = 4;
  511. const int blocklen = 8;
  512. assert (n % qk == 0);
  513. assert (nr % 4 == 0);
  514. assert (nc % ncols_interleaved == 0);
  515. UNUSED(s);
  516. UNUSED(bs);
  517. UNUSED(vx);
  518. UNUSED(vy);
  519. UNUSED(nr);
  520. UNUSED(nc);
  521. UNUSED(nb);
  522. UNUSED(ncols_interleaved);
  523. UNUSED(blocklen);
  524. float sumf[4][4];
  525. int sumi;
  526. for (int y = 0; y < nr / 4; y++) {
  527. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  528. for (int x = 0; x < nc / ncols_interleaved; x++) {
  529. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  530. for (int m = 0; m < 4; m++) {
  531. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  532. }
  533. for (int l = 0; l < nb; l++) {
  534. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  535. for (int m = 0; m < 4; m++) {
  536. for (int j = 0; j < ncols_interleaved; j++) {
  537. sumi = 0;
  538. for (int i = 0; i < blocklen; ++i) {
  539. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  540. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  541. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  542. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  543. }
  544. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  545. }
  546. }
  547. }
  548. }
  549. for (int m = 0; m < 4; m++) {
  550. for (int j = 0; j < ncols_interleaved; j++)
  551. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  552. }
  553. }
  554. }
  555. }
  556. void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  557. const int qk = QK8_0;
  558. const int nb = n / qk;
  559. const int ncols_interleaved = 8;
  560. const int blocklen = 8;
  561. assert (n % qk == 0);
  562. assert (nr % 4 == 0);
  563. assert (nc % ncols_interleaved == 0);
  564. UNUSED(s);
  565. UNUSED(bs);
  566. UNUSED(vx);
  567. UNUSED(vy);
  568. UNUSED(nr);
  569. UNUSED(nc);
  570. UNUSED(nb);
  571. UNUSED(ncols_interleaved);
  572. UNUSED(blocklen);
  573. float sumf[4][8];
  574. int sumi;
  575. for (int y = 0; y < nr / 4; y++) {
  576. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  577. for (int x = 0; x < nc / ncols_interleaved; x++) {
  578. const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
  579. for (int m = 0; m < 4; m++) {
  580. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  581. }
  582. for (int l = 0; l < nb; l++) {
  583. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  584. for (int m = 0; m < 4; m++) {
  585. for (int j = 0; j < ncols_interleaved; j++) {
  586. sumi = 0;
  587. for (int i = 0; i < blocklen; ++i) {
  588. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  589. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  590. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  591. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  592. }
  593. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  594. }
  595. }
  596. }
  597. }
  598. for (int m = 0; m < 4; m++) {
  599. for (int j = 0; j < ncols_interleaved; j++)
  600. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  601. }
  602. }
  603. }
  604. }
  605. void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  606. const int qk = QK_K;
  607. const int nb = n / qk;
  608. const int ncols_interleaved = 8;
  609. const int blocklen = 8;
  610. static const uint32_t kmask1 = 0x3f3f3f3f;
  611. static const uint32_t kmask2 = 0x0f0f0f0f;
  612. static const uint32_t kmask3 = 0x03030303;
  613. assert (n % qk == 0);
  614. assert (nr % 4 == 0);
  615. assert (nc % ncols_interleaved == 0);
  616. UNUSED(s);
  617. UNUSED(bs);
  618. UNUSED(vx);
  619. UNUSED(vy);
  620. UNUSED(nr);
  621. UNUSED(nc);
  622. UNUSED(nb);
  623. UNUSED(ncols_interleaved);
  624. UNUSED(blocklen);
  625. float sumf[4][8];
  626. float sum_minf[4][8];
  627. uint32_t utmp[32];
  628. int sumi1;
  629. int sumi2;
  630. int sumi;
  631. for (int y = 0; y < nr / 4; y++) {
  632. const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
  633. for (int x = 0; x < nc / ncols_interleaved; x++) {
  634. const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
  635. for (int m = 0; m < 4; m++) {
  636. for (int j = 0; j < ncols_interleaved; j++) {
  637. sumf[m][j] = 0.0;
  638. sum_minf[m][j] = 0.0;
  639. }
  640. }
  641. for (int l = 0; l < nb; l++) {
  642. for (int sb = 0; sb < 8; sb++) {
  643. memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
  644. utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
  645. const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
  646. utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
  647. utmp[sb * 4 + 2] = uaux_0;
  648. utmp[sb * 4 + 0] &= kmask1;
  649. }
  650. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  651. uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
  652. uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
  653. for (int m = 0; m < 4; m++) {
  654. for (int j = 0; j < ncols_interleaved; j++) {
  655. sumi1 = 0;
  656. sumi2 = 0;
  657. sumi = 0;
  658. for (int i = 0; i < blocklen; ++i) {
  659. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
  660. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
  661. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
  662. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
  663. sumi1 = sumi1 * scales_0[j];
  664. sumi2 = sumi2 * scales_1[j];
  665. sumi += sumi1 + sumi2;
  666. }
  667. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
  668. }
  669. }
  670. }
  671. for (int sb = 0; sb < 8; sb++) {
  672. uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  673. for(int m = 0; m < 4; m++) {
  674. const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
  675. for(int j = 0; j < ncols_interleaved; j++) {
  676. sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
  677. }
  678. }
  679. }
  680. }
  681. for (int m = 0; m < 4; m++) {
  682. for (int j = 0; j < ncols_interleaved; j++) {
  683. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
  684. }
  685. }
  686. }
  687. }
  688. }
  689. void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  690. const int qk = QK_K;
  691. const int nb = n / qk;
  692. const int ncols_interleaved = 8;
  693. const int blocklen = 8;
  694. assert (n % qk == 0);
  695. assert (nr % 4 == 0);
  696. assert (nc % ncols_interleaved == 0);
  697. UNUSED(s);
  698. UNUSED(bs);
  699. UNUSED(vx);
  700. UNUSED(vy);
  701. UNUSED(nr);
  702. UNUSED(nc);
  703. UNUSED(nb);
  704. UNUSED(ncols_interleaved);
  705. UNUSED(blocklen);
  706. float sumf[4][8];
  707. float sum_minf[4][8];
  708. int sumi1, sumi2, sumi3, sumi4;
  709. int sumi;
  710. for (int y = 0; y < nr / 4; y++) {
  711. const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
  712. for (int x = 0; x < nc / ncols_interleaved; x++) {
  713. const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
  714. for (int m = 0; m < 4; m++) {
  715. for (int j = 0; j < ncols_interleaved; j++) {
  716. sumf[m][j] = 0.0;
  717. sum_minf[m][j] = 0.0;
  718. }
  719. }
  720. for (int l = 0; l < nb; l++) {
  721. for (int k = 0; k < (qk / (4 * blocklen)); k++) {
  722. const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
  723. const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
  724. const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
  725. const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
  726. for (int m = 0; m < 4; m++) {
  727. for (int j = 0; j < ncols_interleaved; j++) {
  728. sumi1 = 0;
  729. sumi2 = 0;
  730. sumi3 = 0;
  731. sumi4 = 0;
  732. sumi = 0;
  733. int offset = ((k / 2) % 2) + j * 2;
  734. for (int i = 0; i < blocklen; ++i){
  735. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
  736. const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
  737. const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
  738. const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
  739. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
  740. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
  741. sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
  742. sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
  743. sumi1 = sumi1 * (scales_0[offset] & 0xF);
  744. sumi2 = sumi2 * (scales_1[offset] & 0xF);
  745. sumi3 = sumi3 * (scales_2[offset] & 0xF);
  746. sumi4 = sumi4 * (scales_3[offset] & 0xF);
  747. sumi += sumi1 + sumi2 + sumi3 + sumi4;
  748. }
  749. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
  750. }
  751. }
  752. }
  753. for(int sb = 0; sb < 8; sb++) {
  754. const uint8_t *mins = b_ptr[l].scales + sb * 16;
  755. for(int m = 0; m < 4; m++) {
  756. const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
  757. for(int j = 0; j < ncols_interleaved; j++) {
  758. int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
  759. sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
  760. }
  761. }
  762. }
  763. }
  764. for (int m = 0; m < 4; m++) {
  765. for (int j = 0; j < ncols_interleaved; j++) {
  766. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
  767. }
  768. }
  769. }
  770. }
  771. }
  772. void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  773. const int qk = QK8_0;
  774. const int nb = n / qk;
  775. const int ncols_interleaved = 4;
  776. const int blocklen = 4;
  777. assert (n % qk == 0);
  778. assert (nr % 4 == 0);
  779. assert (nc % ncols_interleaved == 0);
  780. UNUSED(s);
  781. UNUSED(bs);
  782. UNUSED(vx);
  783. UNUSED(vy);
  784. UNUSED(nr);
  785. UNUSED(nc);
  786. UNUSED(nb);
  787. UNUSED(ncols_interleaved);
  788. UNUSED(blocklen);
  789. {
  790. float sumf[4][4];
  791. int sumi;
  792. for (int y = 0; y < nr / 4; y++) {
  793. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  794. for (int x = 0; x < nc / ncols_interleaved; x++) {
  795. const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
  796. for (int m = 0; m < 4; m++) {
  797. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  798. }
  799. for (int l = 0; l < nb; l++) {
  800. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  801. for (int m = 0; m < 4; m++) {
  802. for (int j = 0; j < ncols_interleaved; j++) {
  803. sumi = 0;
  804. for (int i = 0; i < blocklen; ++i) {
  805. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  806. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  807. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  808. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
  809. }
  810. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  811. }
  812. }
  813. }
  814. }
  815. for (int m = 0; m < 4; m++) {
  816. for (int j = 0; j < ncols_interleaved; j++)
  817. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  818. }
  819. }
  820. }
  821. }
  822. }
  823. } // extern "C"
  824. static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
  825. block_q4_0x4 out;
  826. for (int i = 0; i < 4; i++) {
  827. out.d[i] = in[i].d;
  828. }
  829. const int end = QK4_0 * 2 / blck_size_interleave;
  830. if (blck_size_interleave == 8) {
  831. const uint64_t xor_mask = 0x8888888888888888ULL;
  832. for (int i = 0; i < end; ++i) {
  833. int src_id = i % 4;
  834. int src_offset = (i / 4) * blck_size_interleave;
  835. int dst_offset = i * blck_size_interleave;
  836. uint64_t elems;
  837. // Using memcpy to avoid unaligned memory accesses
  838. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  839. elems ^= xor_mask;
  840. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  841. }
  842. } else if (blck_size_interleave == 4) {
  843. const uint32_t xor_mask = 0x88888888;
  844. for (int i = 0; i < end; ++i) {
  845. int src_id = i % 4;
  846. int src_offset = (i / 4) * blck_size_interleave;
  847. int dst_offset = i * blck_size_interleave;
  848. uint32_t elems;
  849. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
  850. elems ^= xor_mask;
  851. memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
  852. }
  853. } else {
  854. GGML_ASSERT(false);
  855. }
  856. return out;
  857. }
  858. // interleave 8 block_q4_0s in blocks of blck_size_interleave
  859. // returns an interleaved block_q4_0x8
  860. // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
  861. // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
  862. static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
  863. block_q4_0x8 out;
  864. for (int i = 0; i < 8; i++) {
  865. out.d[i] = in[i].d;
  866. }
  867. const int end = QK4_0 * 4 / blck_size_interleave;
  868. const uint64_t xor_mask = 0x8888888888888888ULL;
  869. for (int i = 0; i < end; ++i) {
  870. int src_id = i % 8;
  871. int src_offset = (i / 8) * blck_size_interleave;
  872. int dst_offset = i * blck_size_interleave;
  873. uint64_t elems;
  874. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  875. elems ^= xor_mask;
  876. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  877. }
  878. return out;
  879. }
  880. static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
  881. block_q4_Kx8 out;
  882. //Delta(scale) and dmin values of the eight Q4_K structures are copied onto the output interleaved structure
  883. for (int i = 0; i < 8; i++) {
  884. out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
  885. }
  886. for (int i = 0; i < 8; i++) {
  887. out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
  888. }
  889. const int end = QK_K * 4 / blck_size_interleave;
  890. // Interleave Q4_K quants by taking 8 bytes at a time
  891. for (int i = 0; i < end; ++i) {
  892. int src_id = i % 8;
  893. int src_offset = (i / 8) * blck_size_interleave;
  894. int dst_offset = i * blck_size_interleave;
  895. uint64_t elems;
  896. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  897. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  898. }
  899. // The below logic is designed so as to unpack and rearrange scales and mins values in Q4_K
  900. // Currently the Q4_K structure has 8 scales and 8 mins packed in 12 bytes ( 6 bits for each value)
  901. // The output Q4_Kx8 structure has 96 bytes
  902. // Every 12 byte is packed such that it contains scales and mins for corresponding sub blocks from Q4_K structure
  903. // For eg - First 12 bytes contains 8 scales and 8 mins - each of first sub block from different Q4_K structures
  904. uint8_t s[8], m[8];
  905. for (int i = 0; i < 4; i++) {
  906. for (int j = 0; j < 8; j++) {
  907. s[j] = in[j].scales[i] & 63;
  908. m[j] = in[j].scales[i + 4] & 63;
  909. }
  910. out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
  911. out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
  912. out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
  913. out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
  914. out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
  915. out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
  916. out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
  917. out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
  918. out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
  919. out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
  920. out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
  921. out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
  922. }
  923. for (int i = 0; i < 4; i++) {
  924. for (int j = 0; j < 8; j++) {
  925. s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
  926. m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
  927. }
  928. out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
  929. out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
  930. out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
  931. out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
  932. out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
  933. out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
  934. out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
  935. out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
  936. out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
  937. out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
  938. out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
  939. out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
  940. }
  941. return out;
  942. }
  943. static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
  944. block_q2_Kx8 out;
  945. // Delta(scale) and dmin values of the eight Q2_K structures are copied onto the output interleaved structure
  946. for (int i = 0; i < 8; i++) {
  947. out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
  948. }
  949. for (int i = 0; i < 8; i++) {
  950. out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
  951. }
  952. const int end = QK_K * 2 / blck_size_interleave;
  953. // Interleave Q2_K quants by taking 8 bytes at a time
  954. for (int i = 0; i < end; ++i) {
  955. int src_id = i % 8;
  956. int src_offset = (i / 8) * blck_size_interleave;
  957. int dst_offset = i * blck_size_interleave;
  958. uint64_t elems;
  959. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  960. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  961. }
  962. // The below logic is designed so as to unpack and rearrange scales and mins values in Q2_K
  963. // Currently the Q2_K structure has 16 scales and 16 mins packed in 16 bytes ( 4 bits for each value)
  964. // The output Q2_Kx8 structure has 128 bytes for storing scales and mins
  965. // Every 16 byte is packed such that it contains scales and mins for corresponding sub blocks from Q2_K structure
  966. // For eg - First 16 bytes contains 16 scales and 16 mins - each of first and second sub blocks from different Q2_K structures
  967. for(int i = 0; i < 128; i++){
  968. // Index for selecting which q2k super block
  969. int src1 = (i % 16) / 2;
  970. // Index for selecting scale
  971. int src2 = ((i / 16) * 2) + (i % 2);
  972. out.scales[i] = in[src1].scales[src2];
  973. }
  974. return out;
  975. }
  976. static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  977. GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
  978. GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
  979. constexpr int nrows_interleaved = 4;
  980. block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
  981. const block_q4_0 * src = (const block_q4_0 *)data;
  982. block_q4_0 dst_tmp[4];
  983. int nrow = ggml_nrows(t);
  984. int nblocks = t->ne[0] / QK4_0;
  985. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
  986. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  987. return -1;
  988. }
  989. for (int b = 0; b < nrow; b += nrows_interleaved) {
  990. for (int64_t x = 0; x < nblocks; x++) {
  991. for (int i = 0; i < nrows_interleaved; i++) {
  992. dst_tmp[i] = src[x + i * nblocks];
  993. }
  994. *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
  995. }
  996. src += nrows_interleaved * nblocks;
  997. }
  998. return 0;
  999. GGML_UNUSED(data_size);
  1000. }
  1001. static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1002. GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
  1003. GGML_ASSERT(interleave_block == 8);
  1004. constexpr int nrows_interleaved = 8;
  1005. block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
  1006. const block_q4_K * src = (const block_q4_K*) data;
  1007. block_q4_K dst_tmp[8];
  1008. int nrow = ggml_nrows(t);
  1009. int nblocks = t->ne[0] / QK_K;
  1010. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
  1011. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1012. return -1;
  1013. }
  1014. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1015. for (int64_t x = 0; x < nblocks; x++) {
  1016. for (int i = 0; i < nrows_interleaved; i++ ) {
  1017. dst_tmp[i] = src[x + i * nblocks];
  1018. }
  1019. *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
  1020. }
  1021. src += nrows_interleaved * nblocks;
  1022. }
  1023. return 0;
  1024. GGML_UNUSED(data_size);
  1025. }
  1026. static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1027. GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
  1028. GGML_ASSERT(interleave_block == 8);
  1029. constexpr int nrows_interleaved = 8;
  1030. block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
  1031. const block_q2_K * src = (const block_q2_K*) data;
  1032. block_q2_K dst_tmp[8];
  1033. int nrow = ggml_nrows(t);
  1034. int nblocks = t->ne[0] / QK_K;
  1035. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
  1036. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1037. return -1;
  1038. }
  1039. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1040. for (int64_t x = 0; x < nblocks; x++) {
  1041. for (int i = 0; i < nrows_interleaved; i++ ) {
  1042. dst_tmp[i] = src[x + i * nblocks];
  1043. }
  1044. *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
  1045. }
  1046. src += nrows_interleaved * nblocks;
  1047. }
  1048. return 0;
  1049. GGML_UNUSED(data_size);
  1050. }
  1051. static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1052. GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
  1053. GGML_ASSERT(interleave_block == 8);
  1054. constexpr int nrows_interleaved = 8;
  1055. block_q4_0x8 * dst = (block_q4_0x8*)t->data;
  1056. const block_q4_0 * src = (const block_q4_0*) data;
  1057. block_q4_0 dst_tmp[8];
  1058. int nrow = ggml_nrows(t);
  1059. int nblocks = t->ne[0] / QK4_0;
  1060. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
  1061. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1062. return -1;
  1063. }
  1064. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1065. for (int64_t x = 0; x < nblocks; x++) {
  1066. for (int i = 0; i < nrows_interleaved; i++ ) {
  1067. dst_tmp[i] = src[x + i * nblocks];
  1068. }
  1069. *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
  1070. }
  1071. src += nrows_interleaved * nblocks;
  1072. }
  1073. return 0;
  1074. GGML_UNUSED(data_size);
  1075. }
  1076. static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
  1077. block_iq4_nlx4 out;
  1078. for (int i = 0; i < 4; i++) {
  1079. out.d[i] = in[i].d;
  1080. }
  1081. const int end = QK4_NL * 2 / blck_size_interleave;
  1082. // TODO: this branch seems wrong
  1083. //if (blck_size_interleave == 8) {
  1084. // for (int i = 0; i < end; ++i) {
  1085. // int src_id = i % 4;
  1086. // int src_offset = (i / 4) * blck_size_interleave;
  1087. // int dst_offset = i * blck_size_interleave;
  1088. // // Using memcpy to avoid unaligned memory accesses
  1089. // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
  1090. // }
  1091. //} else
  1092. if (blck_size_interleave == 4) {
  1093. for (int i = 0; i < end; ++i) {
  1094. int src_id = i % 4;
  1095. int src_offset = (i / 4) * blck_size_interleave;
  1096. int dst_offset = i * blck_size_interleave;
  1097. memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
  1098. }
  1099. } else {
  1100. GGML_ASSERT(false);
  1101. }
  1102. return out;
  1103. }
  1104. static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1105. GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
  1106. //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
  1107. GGML_ASSERT(interleave_block == 4);
  1108. block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
  1109. const block_iq4_nl * src = (const block_iq4_nl *)data;
  1110. block_iq4_nl dst_tmp[4];
  1111. int nrow = ggml_nrows(t);
  1112. int nrows_interleaved = 4;
  1113. int nblocks = t->ne[0] / QK4_0;
  1114. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
  1115. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1116. return -1;
  1117. }
  1118. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1119. for (int64_t x = 0; x < nblocks; x++) {
  1120. for (int i = 0; i < nrows_interleaved; i++) {
  1121. dst_tmp[i] = src[x + i * nblocks];
  1122. }
  1123. *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
  1124. }
  1125. src += nrows_interleaved * nblocks;
  1126. }
  1127. return 0;
  1128. GGML_UNUSED(data_size);
  1129. }
  1130. namespace ggml::cpu::repack {
  1131. // repack
  1132. template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
  1133. int repack(struct ggml_tensor *, const void *, size_t);
  1134. // TODO: generalise.
  1135. template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1136. return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
  1137. }
  1138. template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1139. return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
  1140. }
  1141. template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1142. return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
  1143. }
  1144. template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1145. return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
  1146. }
  1147. template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1148. return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
  1149. }
  1150. template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1151. return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
  1152. }
  1153. // TODO: needs to be revisited
  1154. //template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  1155. // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
  1156. //}
// gemv
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemv(int, float *, size_t, const void *, const void *, int, int);

template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
// gemm
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemm(int, float *, size_t, const void *, const void *, int, int);

template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}
class tensor_traits_base : public ggml::cpu::tensor_traits {
  public:
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
};
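
// tensor_traits<...> implements the per-tensor hooks for weights stored in a
// repacked layout: work_size() reserves scratch for quantizing src1 rows to
// PARAM_TYPE (plus row-mapping bookkeeping for MUL_MAT_ID), and
// compute_forward() dispatches to the gemm/gemv specializations above.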
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {

    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
        // not really a GGML_TYPE_Q8_0, but it has the same size.
        switch (op->op) {
            case GGML_OP_MUL_MAT:
                {
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                    return true;
                }
            case GGML_OP_MUL_MAT_ID:
                {
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                    size = GGML_PAD(size, sizeof(int64_t)); // + padding for the next block.

                    const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
                    const int64_t ne12 = op->src[1]->ne[2]; // n_tokens

                    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
                    size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
                    return true;
                }
            default:
                // GGML_ABORT("fatal error");
                break;
        }
        return false;
    }
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
        switch (op->op) {
            case GGML_OP_MUL_MAT:
                forward_mul_mat(params, op);
                return true;
            case GGML_OP_MUL_MAT_ID:
                forward_mul_mat_id(params, op);
                return true;
            default:
                // GGML_ABORT("fatal error");
                break;
        }
        return false;
    }
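
    // forward_mul_mat: quantize src1 rows into wdata (4 rows at a time via
    // ggml_quantize_mat_t, remainder via from_float), synchronize all threads,
    // then have each thread process an NB_COLS-aligned slice of src0 rows:
    // gemm covers the groups of 4 src1 rows, gemv covers the leftover rows.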
    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        ggml_tensor *       dst  = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        GGML_ASSERT(ne0 == ne01);
        GGML_ASSERT(ne1 == ne11);
        GGML_ASSERT(ne2 == ne12);
        GGML_ASSERT(ne3 == ne13);

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);

        char * wdata = static_cast<char *>(params->wdata);
        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);

        assert(params->wsize >= nbw1 * ne11);

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        int64_t i11_processed = 0;
        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
            ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
        }

        i11_processed = ne11 - ne11 % 4;
        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
        }

        ggml_barrier(params->threadpool);

        const void * src1_wdata      = params->wdata;
        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);

        int64_t src0_start = (ith * ne01) / nth;
        int64_t src0_end   = ((ith + 1) * ne01) / nth;
        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
        if (src0_start >= src0_end) {
            return;
        }

        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
        if (ne11 > 3) {
            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
        }
        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
                    src0_end - src0_start);
        }
    }
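
    // forward_mul_mat_id (MoE path): convert src1 to PARAM_TYPE, let thread 0
    // group the src1 rows by the expert selected in `ids`, synchronize, then
    // run one gemv per mapped row against that expert's slice of src0 rows.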
    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        const ggml_tensor * ids  = op->src[2];
        ggml_tensor *       dst  = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        // we don't support permuted src0 or src1
        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
        GGML_ASSERT(nb10 == ggml_type_size(src1->type));

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(ne03 == 1);
        GGML_ASSERT(ne13 == 1);
        GGML_ASSERT(ne3  == 1);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        // row groups
        const int n_ids = ids->ne[0]; // n_expert_used
        const int n_as  = ne02;       // n_expert

        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
        const size_t nbw2 = nbw1*ne11;
        const size_t nbw3 = nbw2*ne12;

        struct mmid_row_mapping {
            int32_t i1;
            int32_t i2;
        };

        GGML_ASSERT(params->wsize >=
                (GGML_PAD(nbw3, sizeof(int64_t)) +
                 n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
                );

        auto * wdata          = (char *)params->wdata;
        auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));

        // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
        auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]

        // src1: float32 => param type
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
                from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
                           (void *) (wdata + i12 * nbw2 + i11 * nbw1),
                           ne10);
            }
        }

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]

        if (ith == 0) {
            // initialize matrix_row_counts
            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));

            // group rows by src0 matrix
            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
                for (int32_t id = 0; id < n_ids; ++id) {
                    const int32_t i02 =
                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);

                    GGML_ASSERT(i02 >= 0 && i02 < n_as);

                    MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
                    matrix_row_counts[i02] += 1;
                }
            }
        }

        ggml_barrier(params->threadpool);

        // compute each matrix multiplication in sequence
        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
            const int64_t cne1 = matrix_row_counts[cur_a];

            if (cne1 == 0) {
                continue;
            }

            const auto * src0_cur = (const char *) src0->data + cur_a*nb02;

            //const int64_t nr0 = ne01; // src0 rows
            const int64_t nr1 = cne1; // src1 rows

            int64_t src0_cur_start = (ith * ne01) / nth;
            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;

            src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
            src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;

            if (src0_cur_start >= src0_cur_end) {
                return;
            }

            for (int ir1 = 0; ir1 < nr1; ir1++) {
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);

                const int id = row_mapping.i1; // selected expert index

                const int64_t i11 = id % ne11;
                const int64_t i12 = row_mapping.i2; // row index in src1

                const int64_t i1 = id;  // selected expert index
                const int64_t i2 = i12; // row

                const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);

                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                        (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
                        src0_cur + src0_cur_start * nb01,
                        src1_col, 1, src0_cur_end - src0_cur_start);
            }
        }
#undef MMID_MATRIX_ROW
    }
    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                       (int) NB_COLS, (int) INTER_SIZE);
        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
    }
};

}  // namespace ggml::cpu::repack
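
// Pick the repacked layout for a tensor based on its type, the divisibility of
// its row count (ne[1]) by the interleave width, and the CPU features detected
// at runtime; returns nullptr when no repacked variant applies.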
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {

    // instance for Q4
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
            if (cur->ne[1] % 4 == 0) {
                return &q4_0_4x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &q4_0_4x4_q8_0;
            }
        }
    } else if (cur->type == GGML_TYPE_Q4_K) {
        if (ggml_cpu_has_avx2()) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &iq4_nl_4x4_q8_0;
            }
        }
    }

    return nullptr;
}
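
// Buffer hooks: init_tensor records the chosen tensor_traits in tensor->extra;
// set_tensor then repacks the incoming data instead of doing a plain memcpy,
// so uploaded weights land in the interleaved layout the kernels above expect.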
static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));

    GGML_UNUSED(buffer);

    return GGML_STATUS_SUCCESS;
}

static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                                      const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
    auto OK            = tensor_traits->repack(tensor, data, size);

    GGML_ASSERT(OK == 0);
    GGML_UNUSED(buffer);
}
static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU_REPACK";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);

    if (buffer == nullptr) {
        return nullptr;
    }

    buffer->buft              = buft;
    buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
    buffer->iface.set_tensor  = ggml_backend_cpu_repack_buffer_set_tensor;
    buffer->iface.get_tensor  = nullptr;
    buffer->iface.cpy_tensor  = nullptr;
    return buffer;
}

static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}
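
// extra_buffer_type reports which ops this buffer type can accelerate
// (MUL_MAT / MUL_MAT_ID with a repacked src0 and an F32 src1 held in host
// memory) and hands back the tensor_traits stored in src0->extra.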
namespace ggml::cpu::repack {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
        if (    op->op == GGML_OP_MUL_MAT &&
                op->src[0]->buffer &&
                (ggml_n_dims(op->src[0]) == 2) &&
                op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
                ggml_repack_get_optimal_repack_type(op->src[0])
                ) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
            //    return true;
            //}
            // may be possible if Q8_0 packed...
        } else if (op->op == GGML_OP_MUL_MAT_ID
                && op->src[0]->buffer
                && (ggml_n_dims(op->src[0]) == 3)
                && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
                && ggml_repack_get_optimal_repack_type(op->src[0])
                ) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
            //    return true;
            //}
        }
        return false;
    }

    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
            }
        }
        return nullptr;
    }
};
}  // namespace ggml::cpu::repack
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
        /* .iface    = */ {
            /* .get_name         = */ ggml_backend_cpu_repack_buffer_type_get_name,
            /* .alloc_buffer     = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
            /* .get_max_size     = */ nullptr, // defaults to SIZE_MAX
            /* .get_alloc_size   = */ nullptr, // defaults to ggml_nbytes
            /* .is_host          = */ nullptr,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
    };

    return &ggml_backend_cpu_buffer_type_repack;
}
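
// Usage sketch (not part of this file): weights meant for these kernels are
// allocated in this buffer type so that the init_tensor/set_tensor hooks above
// run when the data is uploaded. A minimal, hypothetical example using the
// public ggml-backend API might look like:
//
//   ggml_backend_buffer_type_t buft = ggml_backend_cpu_repack_buffer_type();
//   ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
//   // uploading triggers ggml_backend_cpu_repack_buffer_set_tensor(), which
//   // repacks `data` into the interleaved layout instead of memcpy-ing it:
//   ggml_backend_tensor_set(weight, data, 0, ggml_nbytes(weight));
//
// Here `ctx`, `weight` and `data` are assumed to be set up by the caller.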