repack.cpp

  1. #define GGML_COMMON_IMPL_CPP
  2. #define GGML_COMMON_DECL_CPP
  3. #include "ggml-common.h"
  4. #include "ggml-backend-impl.h"
  5. #include "ggml-impl.h"
  6. #include "ggml-cpu.h"
  7. #include "ggml-cpu-impl.h"
  8. #include "traits.h"
  9. #if defined(__APPLE__)
  10. #include "apple-fallback.h"
  11. #endif
  12. #include <cmath>
  13. #include <cstring>
  14. #include <cassert>
  15. #include <cstdlib> // for qsort
  16. #include <cstdio> // for GGML_ASSERT
  17. #include "repack.h"
  18. #if defined(__GNUC__)
  19. #pragma GCC diagnostic ignored "-Woverlength-strings"
  20. #endif
  21. #define UNUSED GGML_UNUSED
  22. static inline int nearest_int(float fval) {
  23. assert(fabsf(fval) <= 4194303.f);
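// Rounding trick: adding 1.5 * 2^23 forces the value into a range where the float
// mantissa holds the rounded integer; masking the low 23 bits and removing the
// 2^22 bias recovers round-to-nearest of fval without calling lrintf().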
  24. float val = fval + 12582912.f;
  25. int i; memcpy(&i, &val, sizeof(int));
  26. return (i & 0x007fffff) - 0x00400000;
  27. }
  28. // Functions to create the interleaved data layout formats
  29. // interleave 4 block_q4_0s in blocks of blck_size_interleave
  30. // returns an interleaved block_q4_0x4
  31. // in the interleaved block_q4_0x4, place deltas for 4 block_q4_0 blocks
  32. // first, then interleave quants from 4 block_q4_0s in blocks of blck_size_interleave
  33. //
  34. // - in : an array of block_q4_0 pointers
  35. // - blck_size_interleave : the block_q4_0 quants bytes are interleaved in blocks of
  36. // blck_size_interleave bytes
  37. // - xor_mask : the mask to convert the nibbles in block_q4_0 quants bytes
  38. // from bias offset form to pure sign form (this saves subtract
  39. // operations during unpacking)
  40. //
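// Example: a block_q4_0 nibble stores v + 8 (range 0..15); XOR-ing with 0x8 flips the top
// bit of each nibble, turning the offset encoding into a two's-complement 4-bit value, so
// the GEMM/GEMV kernels can sign-extend it directly instead of subtracting 8 per nibble.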
  41. extern "C" {
  42. void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  43. assert(QK8_0 == 32);
  44. assert(k % QK8_0 == 0);
  45. const int nb = k / QK8_0;
  46. block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
  47. // scalar
  48. const int blck_size_interleave = 4;
  49. float srcv[4][QK8_0];
  50. float id[4];
  51. for (int i = 0; i < nb; i++) {
  52. for (int row_iter = 0; row_iter < 4; row_iter++) {
  53. float amax = 0.0f; // absolute max
  54. for (int j = 0; j < QK8_0; j++) {
  55. srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
  56. amax = MAX(amax, fabsf(srcv[row_iter][j]));
  57. }
  58. const float d = amax / ((1 << 7) - 1);
  59. id[row_iter] = d ? 1.0f / d : 0.0f;
  60. y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
  61. }
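// Interleave: destination index j walks groups of blck_size_interleave bytes taken
// round-robin from the 4 source rows (src_id selects the row, src_offset the position
// within that row).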
  62. for (int j = 0; j < QK8_0 * 4; j++) {
  63. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  64. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  65. src_offset += (j % blck_size_interleave);
  66. float x0 = srcv[src_id][src_offset] * id[src_id];
  67. y[i].qs[j] = roundf(x0);
  68. }
  69. }
  70. }
  71. GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_0_4x4)
  72. void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  73. assert(QK8_0 == 32);
  74. assert(k % QK8_0 == 0);
  75. const int nb = k / QK8_0;
  76. block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
  77. // scalar
  78. const int blck_size_interleave = 8;
  79. float srcv[4][QK8_0];
  80. float id[4];
  81. for (int i = 0; i < nb; i++) {
  82. for (int row_iter = 0; row_iter < 4; row_iter++) {
  83. float amax = 0.0f; // absolute max
  84. for (int j = 0; j < QK8_0; j++) {
  85. srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
  86. amax = MAX(amax, fabsf(srcv[row_iter][j]));
  87. }
  88. const float d = amax / ((1 << 7) - 1);
  89. id[row_iter] = d ? 1.0f / d : 0.0f;
  90. y[i].d[row_iter] = GGML_FP32_TO_FP16(d);
  91. }
  92. for (int j = 0; j < QK8_0 * 4; j++) {
  93. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  94. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  95. src_offset += (j % blck_size_interleave);
  96. float x0 = srcv[src_id][src_offset] * id[src_id];
  97. y[i].qs[j] = roundf(x0);
  98. }
  99. }
  100. }
  101. GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_0_4x8)
  102. void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  103. assert(QK_K == 256);
  104. assert(k % QK_K == 0);
  105. const int nb = k / QK_K;
  106. block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
  107. // scalar
  108. const int blck_size_interleave = 8;
  109. float srcv[4][QK_K];
  110. float iscale[4];
  111. for (int i = 0; i < nb; i++) {
  112. for (int row_iter = 0; row_iter < 4; row_iter++) {
  113. float amax = 0.0f; // absolute max
  114. float max = 0;
  115. for (int j = 0; j < QK_K; j++) {
  116. srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
  117. // Update the maximum value of the corresponding super block
  118. if(amax < fabsf(srcv[row_iter][j])) {
  119. amax = fabsf(srcv[row_iter][j]);
  120. max = srcv[row_iter][j];
  121. }
  122. }
  123. iscale[row_iter] = amax ? -127.f/max : 0;
  124. y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
  125. }
  126. for (int j = 0; j < QK_K / 4; j++) {
  127. y[i].bsums[j] = 0;
  128. }
  129. // Quant values are interleaved in sequences of eight bytes taken from the corresponding super blocks
  130. // Bsums are interleaved in sequences of four bsums from each super block taken for interleaving,
  131. // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on
  132. for (int j = 0; j < QK_K * 4; j++) {
  133. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  134. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  135. src_offset += (j % blck_size_interleave);
  136. int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
  137. float x0 = srcv[src_id][src_offset] * iscale[src_id];
  138. y[i].qs[j] = nearest_int(x0);
  139. y[i].bsums[index] += y[i].qs[j];
  140. }
  141. }
  142. }
  143. GGML_CPU_NATIVE_IMPL(ggml_quantize_mat_q8_K_4x8)
  144. } // extern "C"
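// Compile-time dispatch from (INTER_SIZE, PARAM_TYPE) to the matching 4-row quantization kernel above.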
  145. template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
  146. void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
  147. template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  148. assert(nrow == 4);
  149. UNUSED(nrow);
  150. ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
  151. }
  152. template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  153. assert(nrow == 4);
  154. UNUSED(nrow);
  155. ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
  156. }
  157. template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  158. assert(nrow == 4);
  159. UNUSED(nrow);
  160. ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
  161. }
  162. extern "C" {
  163. void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  164. const int qk = QK8_0;
  165. const int nb = n / qk;
  166. const int ncols_interleaved = 4;
  167. const int blocklen = 4;
  168. assert (n % qk == 0);
  169. assert (nc % ncols_interleaved == 0);
  170. UNUSED(s);
  171. UNUSED(bs);
  172. UNUSED(vx);
  173. UNUSED(vy);
  174. UNUSED(nr);
  175. UNUSED(nc);
  176. UNUSED(nb);
  177. UNUSED(ncols_interleaved);
  178. UNUSED(blocklen);
  179. float sumf[4];
  180. int sumi;
  181. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  182. for (int x = 0; x < nc / ncols_interleaved; x++) {
  183. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  184. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  185. for (int l = 0; l < nb; l++) {
  186. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  187. for (int j = 0; j < ncols_interleaved; j++) {
  188. sumi = 0;
  189. for (int i = 0; i < blocklen; ++i) {
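// v0/v1 are the low/high nibbles kept pre-scaled by 16 (via << 4 and & 0xF0); the
// final >> 4 after the multiply-accumulate removes that factor with a single shift.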
  190. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  191. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  192. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  193. }
  194. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
  195. }
  196. }
  197. }
  198. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  199. }
  200. }
  201. GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_4x4_q8_0)
  202. void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  203. const int qk = QK8_0;
  204. const int nb = n / qk;
  205. const int ncols_interleaved = 4;
  206. const int blocklen = 8;
  207. assert (n % qk == 0);
  208. assert (nc % ncols_interleaved == 0);
  209. UNUSED(s);
  210. UNUSED(bs);
  211. UNUSED(vx);
  212. UNUSED(vy);
  213. UNUSED(nr);
  214. UNUSED(nc);
  215. UNUSED(nb);
  216. UNUSED(ncols_interleaved);
  217. UNUSED(blocklen);
  218. float sumf[4];
  219. int sumi;
  220. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  221. for (int x = 0; x < nc / ncols_interleaved; x++) {
  222. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  223. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  224. for (int l = 0; l < nb; l++) {
  225. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  226. for (int j = 0; j < ncols_interleaved; j++) {
  227. sumi = 0;
  228. for (int i = 0; i < blocklen; ++i) {
  229. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  230. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  231. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  232. }
  233. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
  234. }
  235. }
  236. }
  237. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  238. }
  239. }
  240. GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_4x8_q8_0)
  241. void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  242. const int qk = QK8_0;
  243. const int nb = n / qk;
  244. const int ncols_interleaved = 8;
  245. const int blocklen = 8;
  246. assert (n % qk == 0);
  247. assert (nc % ncols_interleaved == 0);
  248. UNUSED(s);
  249. UNUSED(bs);
  250. UNUSED(vx);
  251. UNUSED(vy);
  252. UNUSED(nr);
  253. UNUSED(nc);
  254. UNUSED(nb);
  255. UNUSED(ncols_interleaved);
  256. UNUSED(blocklen);
  257. {
  258. float sumf[8];
  259. int sumi;
  260. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  261. for (int x = 0; x < nc / ncols_interleaved; x++) {
  262. const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
  263. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  264. for (int l = 0; l < nb; l++) {
  265. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  266. for (int j = 0; j < ncols_interleaved; j++) {
  267. sumi = 0;
  268. for (int i = 0; i < blocklen; ++i) {
  269. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  270. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  271. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  272. }
  273. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
  274. }
  275. }
  276. }
  277. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  278. }
  279. }
  280. }
  281. GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_0_8x8_q8_0)
  282. void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  283. const int qk = QK_K;
  284. const int nb = n / qk;
  285. const int ncols_interleaved = 8;
  286. const int blocklen = 8;
  287. static const uint32_t kmask1 = 0x3f3f3f3f;
  288. static const uint32_t kmask2 = 0x0f0f0f0f;
  289. static const uint32_t kmask3 = 0x03030303;
  290. assert (n % qk == 0);
  291. assert (nc % ncols_interleaved == 0);
  292. UNUSED(s);
  293. UNUSED(bs);
  294. UNUSED(vx);
  295. UNUSED(vy);
  296. UNUSED(nr);
  297. UNUSED(nc);
  298. UNUSED(nb);
  299. UNUSED(ncols_interleaved);
  300. UNUSED(blocklen);
  301. float sumf[8];
  302. float sum_minf[8];
  303. uint32_t utmp[32];
  304. int sumi1;
  305. int sumi2;
  306. int sumi;
  307. const block_q8_K * a_ptr = (const block_q8_K *) vy;
  308. for (int x = 0; x < nc / ncols_interleaved; x++) {
  309. const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
  310. for (int j = 0; j < ncols_interleaved; j++) {
  311. sumf[j] = 0.0;
  312. sum_minf[j] = 0.0;
  313. }
  314. for (int l = 0; l < nb; l++) {
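// Unpack the packed 6-bit scales and mins of each sub block for all 8 interleaved
// columns into utmp (reversing the 12-byte-per-sub-block packing done in
// make_block_q4_Kx8): for each sb, 8 scales followed by 8 mins.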
  315. for (int sb = 0; sb < 8; sb++) {
  316. memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
  317. utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
  318. const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
  319. utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
  320. utmp[sb * 4 + 2] = uaux_0;
  321. utmp[sb * 4 + 0] &= kmask1;
  322. }
  323. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  324. uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
  325. uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
  326. for (int j = 0; j < ncols_interleaved; j++) {
  327. sumi1 = 0;
  328. sumi2 = 0;
  329. sumi = 0;
  330. for (int i = 0; i < blocklen; ++i) {
  331. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
  332. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
  333. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
  334. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
  335. sumi1 = sumi1 * scales_0[j];
  336. sumi2 = sumi2 * scales_1[j];
  337. sumi += sumi1 + sumi2;
  338. }
  339. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
  340. }
  341. }
  342. for (int sb = 0; sb < 8; sb++) {
  343. uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  344. for (int j = 0; j < ncols_interleaved; j++) {
  345. sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
  346. }
  347. }
  348. }
  349. for (int j = 0; j < ncols_interleaved; j++) {
  350. s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
  351. }
  352. }
  353. }
  354. GGML_CPU_NATIVE_IMPL(ggml_gemv_q4_K_8x8_q8_K)
  355. void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  356. const int qk = QK8_0;
  357. const int nb = n / qk;
  358. const int ncols_interleaved = 4;
  359. const int blocklen = 4;
  360. assert (n % qk == 0);
  361. assert (nc % ncols_interleaved == 0);
  362. UNUSED(s);
  363. UNUSED(bs);
  364. UNUSED(vx);
  365. UNUSED(vy);
  366. UNUSED(nr);
  367. UNUSED(nc);
  368. UNUSED(nb);
  369. UNUSED(ncols_interleaved);
  370. UNUSED(blocklen);
  371. {
  372. float sumf[4];
  373. int sumi;
  374. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  375. for (int x = 0; x < nc / ncols_interleaved; x++) {
  376. const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
  377. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  378. for (int l = 0; l < nb; l++) {
  379. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  380. for (int j = 0; j < ncols_interleaved; j++) {
  381. sumi = 0;
  382. for (int i = 0; i < blocklen; ++i) {
  383. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  384. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  385. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
  386. }
  387. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d);
  388. }
  389. }
  390. }
  391. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  392. }
  393. }
  394. }
  395. GGML_CPU_NATIVE_IMPL(ggml_gemv_iq4_nl_4x4_q8_0)
  396. void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  397. const int qk = QK8_0;
  398. const int nb = n / qk;
  399. const int ncols_interleaved = 4;
  400. const int blocklen = 4;
  401. assert (n % qk == 0);
  402. assert (nr % 4 == 0);
  403. assert (nc % ncols_interleaved == 0);
  404. UNUSED(s);
  405. UNUSED(bs);
  406. UNUSED(vx);
  407. UNUSED(vy);
  408. UNUSED(nr);
  409. UNUSED(nc);
  410. UNUSED(nb);
  411. UNUSED(ncols_interleaved);
  412. UNUSED(blocklen);
  413. {
  414. float sumf[4][4];
  415. int sumi;
  416. for (int y = 0; y < nr / 4; y++) {
  417. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  418. for (int x = 0; x < nc / ncols_interleaved; x++) {
  419. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  420. for (int m = 0; m < 4; m++) {
  421. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  422. }
  423. for (int l = 0; l < nb; l++) {
  424. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  425. for (int m = 0; m < 4; m++) {
  426. for (int j = 0; j < ncols_interleaved; j++) {
  427. sumi = 0;
  428. for (int i = 0; i < blocklen; ++i) {
  429. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  430. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  431. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  432. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  433. }
  434. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
  435. }
  436. }
  437. }
  438. }
  439. for (int m = 0; m < 4; m++) {
  440. for (int j = 0; j < ncols_interleaved; j++)
  441. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  442. }
  443. }
  444. }
  445. }
  446. }
  447. GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_4x4_q8_0)
  448. void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  449. const int qk = QK8_0;
  450. const int nb = n / qk;
  451. const int ncols_interleaved = 4;
  452. const int blocklen = 8;
  453. assert (n % qk == 0);
  454. assert (nr % 4 == 0);
  455. assert (nc % ncols_interleaved == 0);
  456. UNUSED(s);
  457. UNUSED(bs);
  458. UNUSED(vx);
  459. UNUSED(vy);
  460. UNUSED(nr);
  461. UNUSED(nc);
  462. UNUSED(nb);
  463. UNUSED(ncols_interleaved);
  464. UNUSED(blocklen);
  465. float sumf[4][4];
  466. int sumi;
  467. for (int y = 0; y < nr / 4; y++) {
  468. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  469. for (int x = 0; x < nc / ncols_interleaved; x++) {
  470. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  471. for (int m = 0; m < 4; m++) {
  472. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  473. }
  474. for (int l = 0; l < nb; l++) {
  475. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  476. for (int m = 0; m < 4; m++) {
  477. for (int j = 0; j < ncols_interleaved; j++) {
  478. sumi = 0;
  479. for (int i = 0; i < blocklen; ++i) {
  480. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  481. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  482. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  483. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  484. }
  485. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
  486. }
  487. }
  488. }
  489. }
  490. for (int m = 0; m < 4; m++) {
  491. for (int j = 0; j < ncols_interleaved; j++)
  492. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  493. }
  494. }
  495. }
  496. }
  497. GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_4x8_q8_0)
  498. void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  499. const int qk = QK8_0;
  500. const int nb = n / qk;
  501. const int ncols_interleaved = 8;
  502. const int blocklen = 8;
  503. assert (n % qk == 0);
  504. assert (nr % 4 == 0);
  505. assert (nc % ncols_interleaved == 0);
  506. UNUSED(s);
  507. UNUSED(bs);
  508. UNUSED(vx);
  509. UNUSED(vy);
  510. UNUSED(nr);
  511. UNUSED(nc);
  512. UNUSED(nb);
  513. UNUSED(ncols_interleaved);
  514. UNUSED(blocklen);
  515. float sumf[4][8];
  516. int sumi;
  517. for (int y = 0; y < nr / 4; y++) {
  518. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  519. for (int x = 0; x < nc / ncols_interleaved; x++) {
  520. const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
  521. for (int m = 0; m < 4; m++) {
  522. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  523. }
  524. for (int l = 0; l < nb; l++) {
  525. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  526. for (int m = 0; m < 4; m++) {
  527. for (int j = 0; j < ncols_interleaved; j++) {
  528. sumi = 0;
  529. for (int i = 0; i < blocklen; ++i) {
  530. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  531. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  532. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  533. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  534. }
  535. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
  536. }
  537. }
  538. }
  539. }
  540. for (int m = 0; m < 4; m++) {
  541. for (int j = 0; j < ncols_interleaved; j++)
  542. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  543. }
  544. }
  545. }
  546. }
  547. GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_0_8x8_q8_0)
  548. void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  549. const int qk = QK_K;
  550. const int nb = n / qk;
  551. const int ncols_interleaved = 8;
  552. const int blocklen = 8;
  553. static const uint32_t kmask1 = 0x3f3f3f3f;
  554. static const uint32_t kmask2 = 0x0f0f0f0f;
  555. static const uint32_t kmask3 = 0x03030303;
  556. assert (n % qk == 0);
  557. assert (nr % 4 == 0);
  558. assert (nc % ncols_interleaved == 0);
  559. UNUSED(s);
  560. UNUSED(bs);
  561. UNUSED(vx);
  562. UNUSED(vy);
  563. UNUSED(nr);
  564. UNUSED(nc);
  565. UNUSED(nb);
  566. UNUSED(ncols_interleaved);
  567. UNUSED(blocklen);
  568. float sumf[4][8];
  569. float sum_minf[4][8];
  570. uint32_t utmp[32];
  571. int sumi1;
  572. int sumi2;
  573. int sumi;
  574. for (int y = 0; y < nr / 4; y++) {
  575. const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
  576. for (int x = 0; x < nc / ncols_interleaved; x++) {
  577. const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
  578. for (int m = 0; m < 4; m++) {
  579. for (int j = 0; j < ncols_interleaved; j++) {
  580. sumf[m][j] = 0.0;
  581. sum_minf[m][j] = 0.0;
  582. }
  583. }
  584. for (int l = 0; l < nb; l++) {
  585. for (int sb = 0; sb < 8; sb++) {
  586. memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
  587. utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
  588. const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
  589. utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
  590. utmp[sb * 4 + 2] = uaux_0;
  591. utmp[sb * 4 + 0] &= kmask1;
  592. }
  593. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  594. uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
  595. uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
  596. for (int m = 0; m < 4; m++) {
  597. for (int j = 0; j < ncols_interleaved; j++) {
  598. sumi1 = 0;
  599. sumi2 = 0;
  600. sumi = 0;
  601. for (int i = 0; i < blocklen; ++i) {
  602. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
  603. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
  604. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
  605. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
  606. sumi1 = sumi1 * scales_0[j];
  607. sumi2 = sumi2 * scales_1[j];
  608. sumi += sumi1 + sumi2;
  609. }
  610. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
  611. }
  612. }
  613. }
  614. for (int sb = 0; sb < 8; sb++) {
  615. uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  616. for(int m = 0; m < 4; m++) {
  617. const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
  618. for(int j = 0; j < ncols_interleaved; j++) {
  619. sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
  620. }
  621. }
  622. }
  623. }
  624. for (int m = 0; m < 4; m++) {
  625. for (int j = 0; j < ncols_interleaved; j++) {
  626. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
  627. }
  628. }
  629. }
  630. }
  631. }
  632. GGML_CPU_NATIVE_IMPL(ggml_gemm_q4_K_8x8_q8_K)
  633. void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  634. const int qk = QK8_0;
  635. const int nb = n / qk;
  636. const int ncols_interleaved = 4;
  637. const int blocklen = 4;
  638. assert (n % qk == 0);
  639. assert (nr % 4 == 0);
  640. assert (nc % ncols_interleaved == 0);
  641. UNUSED(s);
  642. UNUSED(bs);
  643. UNUSED(vx);
  644. UNUSED(vy);
  645. UNUSED(nr);
  646. UNUSED(nc);
  647. UNUSED(nb);
  648. UNUSED(ncols_interleaved);
  649. UNUSED(blocklen);
  650. {
  651. float sumf[4][4];
  652. int sumi;
  653. for (int y = 0; y < nr / 4; y++) {
  654. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  655. for (int x = 0; x < nc / ncols_interleaved; x++) {
  656. const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
  657. for (int m = 0; m < 4; m++) {
  658. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  659. }
  660. for (int l = 0; l < nb; l++) {
  661. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  662. for (int m = 0; m < 4; m++) {
  663. for (int j = 0; j < ncols_interleaved; j++) {
  664. sumi = 0;
  665. for (int i = 0; i < blocklen; ++i) {
  666. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  667. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  668. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  669. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
  670. }
  671. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_FP16_TO_FP32(a_ptr[l].d[m]);
  672. }
  673. }
  674. }
  675. }
  676. for (int m = 0; m < 4; m++) {
  677. for (int j = 0; j < ncols_interleaved; j++)
  678. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  679. }
  680. }
  681. }
  682. }
  683. }
  684. GGML_CPU_NATIVE_IMPL(ggml_gemm_iq4_nl_4x4_q8_0)
  685. } // extern "C"
  686. static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
  687. block_q4_0x4 out;
  688. for (int i = 0; i < 4; i++) {
  689. out.d[i] = in[i].d;
  690. }
  691. const int end = QK4_0 * 2 / blck_size_interleave;
  692. if (blck_size_interleave == 8) {
  693. const uint64_t xor_mask = 0x8888888888888888ULL;
  694. for (int i = 0; i < end; ++i) {
  695. int src_id = i % 4;
  696. int src_offset = (i / 4) * blck_size_interleave;
  697. int dst_offset = i * blck_size_interleave;
  698. uint64_t elems;
  699. // Using memcpy to avoid unaligned memory accesses
  700. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  701. elems ^= xor_mask;
  702. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  703. }
  704. } else if (blck_size_interleave == 4) {
  705. const uint32_t xor_mask = 0x88888888;
  706. for (int i = 0; i < end; ++i) {
  707. int src_id = i % 4;
  708. int src_offset = (i / 4) * blck_size_interleave;
  709. int dst_offset = i * blck_size_interleave;
  710. uint32_t elems;
  711. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
  712. elems ^= xor_mask;
  713. memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
  714. }
  715. } else {
  716. GGML_ASSERT(false);
  717. }
  718. return out;
  719. }
  720. // interleave 8 block_q4_0s in blocks of blck_size_interleave
  721. // returns an interleaved block_q4_0x8
  722. // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
  723. // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
  724. static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
  725. block_q4_0x8 out;
  726. for (int i = 0; i < 8; i++) {
  727. out.d[i] = in[i].d;
  728. }
  729. const int end = QK4_0 * 4 / blck_size_interleave;
  730. const uint64_t xor_mask = 0x8888888888888888ULL;
  731. for (int i = 0; i < end; ++i) {
  732. int src_id = i % 8;
  733. int src_offset = (i / 8) * blck_size_interleave;
  734. int dst_offset = i * blck_size_interleave;
  735. uint64_t elems;
  736. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  737. elems ^= xor_mask;
  738. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  739. }
  740. return out;
  741. }
  742. static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
  743. block_q4_Kx8 out;
  744. // The delta (scale) and dmin values of the eight Q4_K structures are copied into the output interleaved structure
  745. for (int i = 0; i < 8; i++) {
  746. out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
  747. }
  748. for (int i = 0; i < 8; i++) {
  749. out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
  750. }
  751. const int end = QK_K * 4 / blck_size_interleave;
  752. // Interleave Q4_K quants by taking 8 bytes at a time
  753. for (int i = 0; i < end; ++i) {
  754. int src_id = i % 8;
  755. int src_offset = (i / 8) * blck_size_interleave;
  756. int dst_offset = i * blck_size_interleave;
  757. uint64_t elems;
  758. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  759. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  760. }
  761. // The logic below unpacks and rearranges the scale and min values of Q4_K
  762. // Each Q4_K structure packs 8 scales and 8 mins into 12 bytes (6 bits per value)
  763. // The output Q4_Kx8 structure holds 96 bytes of packed scales
  764. // Each 12-byte group contains the scales and mins of the corresponding sub block from every Q4_K structure
  765. // e.g. the first 12 bytes hold the 8 scales and 8 mins of the first sub block of each of the 8 Q4_K structures
  766. uint8_t s[8], m[8];
  767. for (int i = 0; i < 4; i++) {
  768. for (int j = 0; j < 8; j++) {
  769. s[j] = in[j].scales[i] & 63;
  770. m[j] = in[j].scales[i + 4] & 63;
  771. }
  772. out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
  773. out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
  774. out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
  775. out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
  776. out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
  777. out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
  778. out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
  779. out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
  780. out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
  781. out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
  782. out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
  783. out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
  784. }
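// Sub blocks 4..7: in the source layout their upper 2 bits live in scales[0..3] (scales)
// and scales[4..7] (mins), and the lower 4 bits in scales[8..11]; reassemble the 6-bit
// values, then pack them the same way as above.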
  785. for (int i = 0; i < 4; i++) {
  786. for (int j = 0; j < 8; j++) {
  787. s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
  788. m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
  789. }
  790. out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
  791. out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
  792. out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
  793. out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
  794. out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
  795. out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
  796. out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
  797. out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
  798. out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
  799. out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
  800. out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
  801. out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
  802. }
  803. return out;
  804. }
  805. static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  806. GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
  807. GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
  808. constexpr int nrows_interleaved = 4;
  809. block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
  810. const block_q4_0 * src = (const block_q4_0 *)data;
  811. block_q4_0 dst_tmp[4];
  812. int nrow = ggml_nrows(t);
  813. int nblocks = t->ne[0] / QK4_0;
  814. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
  815. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  816. return -1;
  817. }
  818. for (int b = 0; b < nrow; b += nrows_interleaved) {
  819. for (int64_t x = 0; x < nblocks; x++) {
  820. for (int i = 0; i < nrows_interleaved; i++) {
  821. dst_tmp[i] = src[x + i * nblocks];
  822. }
  823. *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
  824. }
  825. src += nrows_interleaved * nblocks;
  826. }
  827. return 0;
  828. GGML_UNUSED(data_size);
  829. }
  830. static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  831. GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
  832. GGML_ASSERT(interleave_block == 8);
  833. constexpr int nrows_interleaved = 8;
  834. block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
  835. const block_q4_K * src = (const block_q4_K*) data;
  836. block_q4_K dst_tmp[8];
  837. int nrow = ggml_nrows(t);
  838. int nblocks = t->ne[0] / QK_K;
  839. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
  840. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  841. return -1;
  842. }
  843. for (int b = 0; b < nrow; b += nrows_interleaved) {
  844. for (int64_t x = 0; x < nblocks; x++) {
  845. for (int i = 0; i < nrows_interleaved; i++ ) {
  846. dst_tmp[i] = src[x + i * nblocks];
  847. }
  848. *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
  849. }
  850. src += nrows_interleaved * nblocks;
  851. }
  852. return 0;
  853. GGML_UNUSED(data_size);
  854. }
  855. static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  856. GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
  857. GGML_ASSERT(interleave_block == 8);
  858. constexpr int nrows_interleaved = 8;
  859. block_q4_0x8 * dst = (block_q4_0x8*)t->data;
  860. const block_q4_0 * src = (const block_q4_0*) data;
  861. block_q4_0 dst_tmp[8];
  862. int nrow = ggml_nrows(t);
  863. int nblocks = t->ne[0] / QK4_0;
  864. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
  865. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  866. return -1;
  867. }
  868. for (int b = 0; b < nrow; b += nrows_interleaved) {
  869. for (int64_t x = 0; x < nblocks; x++) {
  870. for (int i = 0; i < nrows_interleaved; i++ ) {
  871. dst_tmp[i] = src[x + i * nblocks];
  872. }
  873. *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
  874. }
  875. src += nrows_interleaved * nblocks;
  876. }
  877. return 0;
  878. GGML_UNUSED(data_size);
  879. }
  880. static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
  881. block_iq4_nlx4 out;
  882. for (int i = 0; i < 4; i++) {
  883. out.d[i] = in[i].d;
  884. }
  885. const int end = QK4_NL * 2 / blck_size_interleave;
  886. // TODO: this branch seems wrong
  887. //if (blck_size_interleave == 8) {
  888. // for (int i = 0; i < end; ++i) {
  889. // int src_id = i % 4;
  890. // int src_offset = (i / 4) * blck_size_interleave;
  891. // int dst_offset = i * blck_size_interleave;
  892. // // Using memcpy to avoid unaligned memory accesses
  893. // memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
  894. // }
  895. //} else
  896. if (blck_size_interleave == 4) {
  897. for (int i = 0; i < end; ++i) {
  898. int src_id = i % 4;
  899. int src_offset = (i / 4) * blck_size_interleave;
  900. int dst_offset = i * blck_size_interleave;
  901. memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
  902. }
  903. } else {
  904. GGML_ASSERT(false);
  905. }
  906. return out;
  907. }
  908. static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  909. GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
  910. //GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
  911. GGML_ASSERT(interleave_block == 4);
  912. block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data;
  913. const block_iq4_nl * src = (const block_iq4_nl *)data;
  914. block_iq4_nl dst_tmp[4];
  915. int nrow = ggml_nrows(t);
  916. int nrows_interleaved = 4;
  917. int nblocks = t->ne[0] / QK4_0;
  918. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
  919. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  920. return -1;
  921. }
  922. for (int b = 0; b < nrow; b += nrows_interleaved) {
  923. for (int64_t x = 0; x < nblocks; x++) {
  924. for (int i = 0; i < nrows_interleaved; i++) {
  925. dst_tmp[i] = src[x + i * nblocks];
  926. }
  927. *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
  928. }
  929. src += nrows_interleaved * nblocks;
  930. }
  931. return 0;
  932. GGML_UNUSED(data_size);
  933. }
  934. namespace ggml::cpu::repack {
  935. // repack
  936. template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
  937. int repack(struct ggml_tensor *, const void *, size_t);
  938. // TODO: generalise.
  939. template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  940. return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
  941. }
  942. template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  943. return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
  944. }
  945. template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
  946. return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
  947. }
  948. template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
  949. return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
  950. }
  951. template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  952. return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
  953. }
  954. // TODO: needs to be revisited
  955. //template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
  956. // return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
  957. //}
  958. // gemv
  959. template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
  960. void gemv(int, float *, size_t, const void *, const void *, int, int);
  961. template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  962. ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
  963. }
  964. template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  965. ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
  966. }
  967. template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  968. ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
  969. }
  970. template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  971. ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
  972. }
  973. template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  974. ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
  975. }
  976. // gemm
  977. template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
  978. void gemm(int, float *, size_t, const void *, const void *, int, int);
  979. template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  980. ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
  981. }
  982. template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  983. ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
  984. }
  985. template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  986. ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
  987. }
  988. template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  989. ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
  990. }
  991. template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
  992. ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
  993. }
  994. class tensor_traits_base : public ggml::cpu::tensor_traits {
  995. public:
  996. virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
  997. };
  998. template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
  999. bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
  1000. // not really GGML_TYPE_Q8_0, but same size.
  1001. switch (op->op) {
  1002. case GGML_OP_MUL_MAT:
  1003. size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
  1004. return true;
  1005. case GGML_OP_MUL_MAT_ID:
  1006. size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
  1007. size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
  1008. size += sizeof(int64_t) * (1+op->src[0]->ne[2]) * op->src[1]->ne[2];
  1009. return true;
  1010. default:
  1011. // GGML_ABORT("fatal error");
  1012. break;
  1013. }
  1014. return false;
  1015. }
  1016. bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
  1017. switch (op->op) {
  1018. case GGML_OP_MUL_MAT:
  1019. forward_mul_mat(params, op);
  1020. return true;
  1021. case GGML_OP_MUL_MAT_ID:
  1022. forward_mul_mat_id(params, op);
  1023. return true;
  1024. default:
  1025. // GGML_ABORT("fatal error");
  1026. break;
  1027. }
  1028. return false;
  1029. }
  1030. void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
  1031. const ggml_tensor * src0 = op->src[0];
  1032. const ggml_tensor * src1 = op->src[1];
  1033. ggml_tensor * dst = op;
  1034. GGML_TENSOR_BINARY_OP_LOCALS
  1035. const int ith = params->ith;
  1036. const int nth = params->nth;
  1037. GGML_ASSERT(ne0 == ne01);
  1038. GGML_ASSERT(ne1 == ne11);
  1039. GGML_ASSERT(ne2 == ne12);
  1040. GGML_ASSERT(ne3 == ne13);
  1041. // dst cannot be transposed or permuted
  1042. GGML_ASSERT(nb0 == sizeof(float));
  1043. GGML_ASSERT(nb0 <= nb1);
  1044. GGML_ASSERT(nb1 <= nb2);
  1045. GGML_ASSERT(nb2 <= nb3);
  1046. GGML_ASSERT(src1->type == GGML_TYPE_F32);
  1047. GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
  1048. // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);
  1049. char * wdata = static_cast<char *>(params->wdata);
  1050. const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
  1051. assert(params->wsize >= nbw1 * ne11);
  1052. const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
  1053. int64_t i11_processed = 0;
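// Quantize src1 to PARAM_TYPE: rows are processed 4 at a time into the interleaved
// x4 layout; any remaining rows (ne11 % 4) are converted individually below.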
  1054. for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
  1055. ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
  1056. }
  1057. i11_processed = ne11 - ne11 % 4;
  1058. for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
  1059. from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
  1060. }
  1061. ggml_barrier(params->threadpool);
  1062. const void * src1_wdata = params->wdata;
  1063. const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
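// Split the src0 rows across threads, rounding each thread's range up to a multiple of
// NB_COLS so every thread always covers whole interleaved column groups.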
  1064. int64_t src0_start = (ith * ne01) / nth;
  1065. int64_t src0_end = ((ith + 1) * ne01) / nth;
  1066. src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
  1067. src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
  1068. if (src0_start >= src0_end) {
  1069. return;
  1070. }
  1071. // If there are more than three rows in src1, use gemm; otherwise, use gemv.
  1072. if (ne11 > 3) {
  1073. gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
  1074. (float *) ((char *) dst->data) + src0_start, ne01,
  1075. (const char *) src0->data + src0_start * nb01,
  1076. (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
  1077. }
  1078. for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
  1079. gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
  1080. (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
  1081. (const char *) src0->data + src0_start * nb01,
  1082. (const char *) src1_wdata + (src1_col_stride * iter), 1,
  1083. src0_end - src0_start);
  1084. }
  1085. }
  1086. void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
  1087. const ggml_tensor * src0 = op->src[0];
  1088. const ggml_tensor * src1 = op->src[1];
  1089. const ggml_tensor * ids = op->src[2];
  1090. ggml_tensor * dst = op;
  1091. GGML_TENSOR_BINARY_OP_LOCALS
  1092. const int ith = params->ith;
  1093. const int nth = params->nth;
  1094. const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
  1095. // we don't support permuted src0 or src1
  1096. GGML_ASSERT(nb00 == ggml_type_size(src0->type));
  1097. GGML_ASSERT(nb10 == ggml_type_size(src1->type));
  1098. // dst cannot be transposed or permuted
  1099. GGML_ASSERT(nb0 == sizeof(float));
  1100. GGML_ASSERT(nb0 <= nb1);
  1101. GGML_ASSERT(nb1 <= nb2);
  1102. GGML_ASSERT(nb2 <= nb3);
  1103. GGML_ASSERT(ne03 == 1);
  1104. GGML_ASSERT(ne13 == 1);
  1105. GGML_ASSERT(ne3 == 1);
  1106. GGML_ASSERT(src1->type == GGML_TYPE_F32);
  1107. // row groups
  1108. const int n_ids = ids->ne[0]; // n_expert_used
  1109. const int n_as = ne02; // n_expert
  1110. const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
  1111. const size_t nbw2 = nbw1*ne11;
  1112. const size_t nbw3 = nbw2*ne12;
  1113. struct mmid_row_mapping {
  1114. int32_t i1;
  1115. int32_t i2;
  1116. };
  1117. GGML_ASSERT(params->wsize >= (GGML_PAD(nbw3, sizeof(int64_t)) + n_as * sizeof(int64_t) +
  1118. n_as * ne12 * sizeof(mmid_row_mapping)));
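// wdata layout: [src1 quantized to PARAM_TYPE (nbw3 bytes, padded)]
//               [matrix_row_counts: int64_t[n_as]]
//               [matrix_rows: mmid_row_mapping[n_as][ne12]]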
  1119. auto * wdata = (char *) params->wdata;
  1120. auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));
  1121. auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
  1122. struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
  1123. // src1: float32 => param type
  1124. for (int64_t i12 = 0; i12 < ne12; ++i12) {
  1125. for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
  1126. from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
  1127. (void *) (wdata + i12 * nbw2 + i11 * nbw1),
  1128. ne10);
  1129. }
  1130. }
  1131. #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
  1132. if (ith == 0) {
  1133. // initialize matrix_row_counts
  1134. memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
  1135. // group rows by src0 matrix
  1136. for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
  1137. for (int32_t id = 0; id < n_ids; ++id) {
  1138. const int32_t i02 =
  1139. *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
  1140. GGML_ASSERT(i02 >= 0 && i02 < n_as);
  1141. MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
  1142. matrix_row_counts[i02] += 1;
  1143. }
  1144. }
  1145. }
  1146. ggml_barrier(params->threadpool);
  1147. // compute each matrix multiplication in sequence
  1148. for (int cur_a = 0; cur_a < n_as; ++cur_a) {
  1149. const int64_t cne1 = matrix_row_counts[cur_a];
  1150. if (cne1 == 0) {
  1151. continue;
  1152. }
  1153. const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
  1154. //const int64_t nr0 = ne01; // src0 rows
  1155. const int64_t nr1 = cne1; // src1 rows
  1156. int64_t src0_cur_start = (ith * ne01) / nth;
  1157. int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
  1158. src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
  1159. src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
  1160. if (src0_cur_start >= src0_cur_end) {
  1161. return;
  1162. }
  1163. for (int ir1 = 0; ir1 < nr1; ir1++) {
  1164. struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
  1165. const int id = row_mapping.i1; // selected expert index
  1166. const int64_t i11 = id % ne11;
  1167. const int64_t i12 = row_mapping.i2; // row index in src1
  1168. const int64_t i1 = id; // selected expert index
  1169. const int64_t i2 = i12; // row
  1170. const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
  1171. gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
  1172. (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
  1173. src0_cur + src0_cur_start * nb01,
  1174. src1_col, 1, src0_cur_end - src0_cur_start);
  1175. }
  1176. }
  1177. #undef MMID_MATRIX_ROW
  1178. }
  1179. int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
  1180. GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
  1181. (int) NB_COLS, (int) INTER_SIZE);
  1182. return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
  1183. }
  1184. };
  1185. // instance for Q4
  1186. static const tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
  1187. static const tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
  1188. static const tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
  1189. static const tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
  1190. // instance for IQ4
  1191. static const tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
  1192. } // namespace ggml::cpu::repack
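// Pick the interleaved layout best suited to the detected CPU features for this tensor,
// or return nullptr if no repacked kernel applies (the tensor is then left unpacked).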
  1193. static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
  1194. if (cur->type == GGML_TYPE_Q4_0) {
  1195. if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
  1196. if (cur->ne[1] % 8 == 0) {
  1197. return &ggml::cpu::repack::q4_0_8x8_q8_0;
  1198. }
  1199. }
  1200. if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
  1201. if (cur->ne[1] % 4 == 0) {
  1202. return &ggml::cpu::repack::q4_0_4x8_q8_0;
  1203. }
  1204. }
  1205. if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
  1206. if (cur->ne[1] % 4 == 0) {
  1207. return &ggml::cpu::repack::q4_0_4x4_q8_0;
  1208. }
  1209. }
  1210. } else if (cur->type == GGML_TYPE_Q4_K) {
  1211. if (ggml_cpu_has_avx2()) {
  1212. if (cur->ne[1] % 8 == 0) {
  1213. return &ggml::cpu::repack::q4_K_8x8_q8_K;
  1214. }
  1215. }
  1216. } else if (cur->type == GGML_TYPE_IQ4_NL) {
  1217. if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
  1218. if (cur->ne[1] % 4 == 0) {
  1219. return &ggml::cpu::repack::iq4_nl_4x4_q8_0;
  1220. }
  1221. }
  1222. }
  1223. return nullptr;
  1224. }
  1225. static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
  1226. tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
  1227. GGML_UNUSED(buffer);
  1228. return GGML_STATUS_SUCCESS;
  1229. }
  1230. static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
  1231. const void * data, size_t offset, size_t size) {
  1232. GGML_ASSERT(offset == 0);
  1233. GGML_ASSERT(size == ggml_nbytes(tensor));
  1234. auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
  1235. auto OK = tensor_traits->repack(tensor, data, size);
  1236. GGML_ASSERT(OK == 0);
  1237. GGML_UNUSED(buffer);
  1238. }
  1239. static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
  1240. return "CPU_REPACK";
  1241. GGML_UNUSED(buft);
  1242. }
  1243. static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
  1244. ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
  1245. if (buffer == nullptr) {
  1246. return nullptr;
  1247. }
  1248. buffer->buft = buft;
  1249. buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
  1250. buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor;
  1251. buffer->iface.get_tensor = nullptr;
  1252. buffer->iface.cpy_tensor = nullptr;
  1253. return buffer;
  1254. }
  1255. static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
  1256. return TENSOR_ALIGNMENT;
  1257. GGML_UNUSED(buft);
  1258. }
  1259. namespace ggml::cpu::repack {
  1260. class extra_buffer_type : ggml::cpu::extra_buffer_type {
  1261. bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
  1262. if ( op->op == GGML_OP_MUL_MAT &&
  1263. op->src[0]->buffer &&
  1264. (ggml_n_dims(op->src[0]) == 2) &&
  1265. op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
  1266. ggml_repack_get_optimal_repack_type(op->src[0])
  1267. ) {
  1268. if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
  1269. return false;
  1270. }
  1271. if (op->src[1]->type == GGML_TYPE_F32) {
  1272. return true;
  1273. }
  1274. //if (op->src[1]->type == GGML_TYPE_Q8_0) {
  1275. // return true;
  1276. //}
  1277. // may be possible if Q8_0 packed...
  1278. } else if (op->op == GGML_OP_MUL_MAT_ID
  1279. && op->src[0]->buffer
  1280. && (ggml_n_dims(op->src[0]) == 3)
  1281. && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
  1282. && ggml_repack_get_optimal_repack_type(op->src[0])
  1283. ) {
  1284. if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
  1285. return false;
  1286. }
  1287. if (op->src[1]->type == GGML_TYPE_F32) {
  1288. return true;
  1289. }
  1290. //if (op->src[1]->type == GGML_TYPE_Q8_0) {
  1291. // return true;
  1292. //}
  1293. }
  1294. return false;
  1295. }
  1296. ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
  1297. if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
  1298. if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
  1299. return (ggml::cpu::tensor_traits *) op->src[0]->extra;
  1300. }
  1301. }
  1302. return nullptr;
  1303. }
  1304. };
  1305. } // namespace ggml::cpu::repack
  1306. ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
  1307. static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
  1308. /* .iface = */ {
  1309. /* .get_name = */ ggml_backend_cpu_repack_buffer_type_get_name,
  1310. /* .alloc_buffer = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
  1311. /* .get_alignment = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
  1312. /* .get_max_size = */ nullptr, // defaults to SIZE_MAX
  1313. /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
  1314. /* .is_host = */ nullptr,
  1315. },
  1316. /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
  1317. /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
  1318. };
  1319. return &ggml_backend_cpu_buffer_type_repack;
  1320. }
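// Usage sketch (illustrative only, assuming the public ggml-backend API; not part of this file):
// weights allocated from this buffer type are repacked transparently when their data is set.
//
//   ggml_backend_buffer_type_t buft = ggml_backend_cpu_repack_buffer_type();
//   ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
//   ggml_backend_tensor_set(w, w_data, 0, ggml_nbytes(w)); // calls iface.set_tensor -> repack()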