// repack.cpp
  1. #define GGML_COMMON_IMPL_CPP
  2. #define GGML_COMMON_DECL_CPP
  3. #include "ggml-common.h"
  4. #include "ggml-backend-impl.h"
  5. #include "ggml-impl.h"
  6. #include "ggml-cpu.h"
  7. #include "ggml-cpu-impl.h"
  8. #include "simd-mappings.h"
  9. #include "traits.h"
  10. #include "arch-fallback.h"
  11. #include <cmath>
  12. #include <cstring>
  13. #include <cassert>
  14. #include <cstdio> // for GGML_ASSERT
  15. #include "repack.h"
  16. #if defined(__GNUC__)
  17. #pragma GCC diagnostic ignored "-Woverlength-strings"
  18. #endif
  19. #define UNUSED GGML_UNUSED
  20. static inline int nearest_int(float fval) {
  21. assert(fabsf(fval) <= 4194303.f);
  22. float val = fval + 12582912.f;
  23. int i; memcpy(&i, &val, sizeof(int));
  24. return (i & 0x007fffff) - 0x00400000;
  25. }
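// Worked example of the magic-number rounding above: 12582912.0f == 1.5f * 2^23, so for
// |fval| <= 4194303 the sum fval + 12582912.0f lies in [2^23, 2^24), where consecutive
// floats are exactly 1 apart and the addition itself performs the round-to-nearest.
// The rounded integer then sits in the low mantissa bits, offset by 0x00400000:
//
//   nearest_int(5.3f):
//     5.3f + 12582912.0f -> 12582917.0f            (rounded to the nearest representable float)
//     mantissa bits       = 12582917 - 2^23        = 0x400005
//     (i & 0x007fffff) - 0x00400000 = 0x400005 - 0x400000 = 5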
// Functions to create the interleaved data layout formats
//
// interleave 4 block_q4_0s in blocks of blck_size_interleave
// returns an interleaved block_q4_0x4
// in the interleaved block_q4_0x4, the deltas for the 4 block_q4_0 blocks are placed
// first, then the quants of the 4 block_q4_0s are interleaved in chunks of blck_size_interleave bytes
//
// - in                   : a pointer to 4 consecutive block_q4_0 blocks
// - blck_size_interleave : the block_q4_0 quant bytes are interleaved in chunks of
//                          blck_size_interleave bytes
// - xor_mask             : the mask used to convert the nibbles in the block_q4_0 quant bytes
//                          from bias-offset form to pure sign form (this saves subtract
//                          operations during unpacking); a worked nibble example follows below
//
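// Worked nibble example for the xor-mask step (Q4_0 stores each weight as a 4-bit value q
// in [0,15] meaning q - 8):
//
//   stored nibble q = 0x3            -> weight 3 - 8 = -5
//   after repacking: q ^ 0x8 = 0xB   -> as a signed 4-bit value, 0xB is -5
//
// In the GEMV/GEMM kernels below, (int8_t)(byte << 4) then sign-extends that nibble as
// -5 * 16, and the final ">> 4" removes the factor of 16, so no per-element "- 8" is needed.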
  39. extern "C" {
  40. void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  41. assert(QK8_0 == 32);
  42. assert(k % QK8_0 == 0);
  43. const int nb = k / QK8_0;
  44. block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
  45. // scalar
  46. const int blck_size_interleave = 4;
  47. float srcv[4][QK8_0];
  48. float id[4];
  49. for (int i = 0; i < nb; i++) {
  50. for (int row_iter = 0; row_iter < 4; row_iter++) {
  51. float amax = 0.0f; // absolute max
  52. for (int j = 0; j < QK8_0; j++) {
  53. srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
  54. amax = MAX(amax, fabsf(srcv[row_iter][j]));
  55. }
  56. const float d = amax / ((1 << 7) - 1);
  57. id[row_iter] = d ? 1.0f / d : 0.0f;
  58. y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
  59. }
  60. for (int j = 0; j < QK8_0 * 4; j++) {
  61. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  62. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  63. src_offset += (j % blck_size_interleave);
  64. float x0 = srcv[src_id][src_offset] * id[src_id];
  65. y[i].qs[j] = roundf(x0);
  66. }
  67. }
  68. }
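// Layout produced above (blck_size_interleave == 4): y[i].qs holds the four rows' quants
// interleaved in 4-byte groups, i.e.
//   qs[ 0.. 3] = row0[0..3],  qs[ 4.. 7] = row1[0..3],
//   qs[ 8..11] = row2[0..3],  qs[12..15] = row3[0..3],
//   qs[16..19] = row0[4..7],  ... and so on across all 32 columns.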
  69. void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  70. assert(QK8_0 == 32);
  71. assert(k % QK8_0 == 0);
  72. const int nb = k / QK8_0;
  73. block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
  74. // scalar
  75. const int blck_size_interleave = 8;
  76. float srcv[4][QK8_0];
  77. float id[4];
  78. for (int i = 0; i < nb; i++) {
  79. for (int row_iter = 0; row_iter < 4; row_iter++) {
  80. float amax = 0.0f; // absolute max
  81. for (int j = 0; j < QK8_0; j++) {
  82. srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
  83. amax = MAX(amax, fabsf(srcv[row_iter][j]));
  84. }
  85. const float d = amax / ((1 << 7) - 1);
  86. id[row_iter] = d ? 1.0f / d : 0.0f;
  87. y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
  88. }
  89. for (int j = 0; j < QK8_0 * 4; j++) {
  90. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  91. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  92. src_offset += (j % blck_size_interleave);
  93. float x0 = srcv[src_id][src_offset] * id[src_id];
  94. y[i].qs[j] = roundf(x0);
  95. }
  96. }
  97. }
  98. void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
  99. assert(QK_K == 256);
  100. assert(k % QK_K == 0);
  101. const int nb = k / QK_K;
  102. block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
  103. // scalar
  104. const int blck_size_interleave = 8;
  105. float srcv[4][QK_K];
  106. float iscale[4];
  107. for (int i = 0; i < nb; i++) {
  108. for (int row_iter = 0; row_iter < 4; row_iter++) {
  109. float amax = 0.0f; // absolute max
  110. float max = 0;
  111. for (int j = 0; j < QK_K; j++) {
  112. srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
  113. // Update the maximum value of the corresponding super block
  114. if(amax < fabsf(srcv[row_iter][j])) {
  115. amax = fabsf(srcv[row_iter][j]);
  116. max = srcv[row_iter][j];
  117. }
  118. }
  119. iscale[row_iter] = amax ? -127.f/max : 0;
  120. y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
  121. }
  122. for (int j = 0; j < QK_K / 4; j++) {
  123. y[i].bsums[j] = 0;
  124. }
        // Quant values are interleaved in sequences of eight bytes from the corresponding super blocks.
        // Bsums values are interleaved in groups of four bsums from each super block taken for interleaving,
        // i.e. the first four bsums from the first super block, followed by the first four bsums from the second super block, and so on.
  128. for (int j = 0; j < QK_K * 4; j++) {
  129. int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
  130. int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
  131. src_offset += (j % blck_size_interleave);
  132. int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
  133. float x0 = srcv[src_id][src_offset] * iscale[src_id];
  134. y[i].qs[j] = nearest_int(x0);
  135. y[i].bsums[index] += y[i].qs[j];
  136. }
  137. }
  138. }
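// Index bookkeeping for the loop above: output quant j is taken from row
// src_id = (j % 32) / 8 at source position src_offset = (j / 32) * 8 + (j % 8).
// The `index` expression then works out so that the running sum for the m-th group of
// 16 quants of row r (m = src_offset / 16) lands in
//   bsums[16 * (m / 4) + 4 * r + (m % 4)]
// which is exactly the "four bsums per super block, rows interleaved" layout described
// in the comment above.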
  139. } // extern "C"
  140. template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
  141. void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
  142. template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  143. assert(nrow == 4);
  144. UNUSED(nrow);
  145. ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
  146. }
  147. template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  148. assert(nrow == 4);
  149. UNUSED(nrow);
  150. ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
  151. }
  152. template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
  153. assert(nrow == 4);
  154. UNUSED(nrow);
  155. ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
  156. }
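// A minimal usage sketch of the dispatch above (illustrative only: the wrapper name and
// the caller-managed destination buffer of n_per_row/QK8_0 block_q8_0x4 blocks are
// assumptions, not the real call sites in ggml-cpu):
static void example_quantize_4_rows_q8_0x4(const float * src, void * dst, int64_t n_per_row) {
    // src points at 4 contiguous rows of n_per_row floats (n_per_row a multiple of QK8_0);
    // dst receives the interleaved Q8_0x4 blocks consumed by the 4x8 kernels below
    ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(src, dst, /*nrow=*/4, n_per_row);
}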
  157. extern "C" {
  158. void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  159. const int qk = QK8_0;
  160. const int nb = n / qk;
  161. const int ncols_interleaved = 4;
  162. const int blocklen = 4;
  163. assert(nr == 1);
  164. assert(n % qk == 0);
  165. assert(nc % ncols_interleaved == 0);
  166. UNUSED(s);
  167. UNUSED(bs);
  168. UNUSED(vx);
  169. UNUSED(vy);
  170. UNUSED(nr);
  171. UNUSED(nc);
  172. UNUSED(nb);
  173. UNUSED(ncols_interleaved);
  174. UNUSED(blocklen);
  175. float sumf[4];
  176. int sumi;
  177. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  178. for (int x = 0; x < nc / ncols_interleaved; x++) {
  179. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  180. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  181. for (int l = 0; l < nb; l++) {
  182. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  183. for (int j = 0; j < ncols_interleaved; j++) {
  184. sumi = 0;
  185. for (int i = 0; i < blocklen; ++i) {
  186. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  187. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  188. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  189. }
  190. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  191. }
  192. }
  193. }
  194. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  195. }
  196. }
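// Per output column j of the current 4-column group, the loops above compute
//
//   s[x*4 + j] = sum over blocks l of  d_B(l, j) * d_A(l) * ( sum_i q_B(l, j, i) * q_A(l, i) )
//
// i.e. a plain dot product between the single activation row (nr == 1) and each of the four
// interleaved Q4_0 columns, with the per-block Q4_0 and Q8_0 deltas applied block by block.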
  197. void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  198. const int qk = QK8_0;
  199. const int nb = n / qk;
  200. const int ncols_interleaved = 4;
  201. const int blocklen = 8;
  202. assert (n % qk == 0);
  203. assert (nc % ncols_interleaved == 0);
  204. UNUSED(s);
  205. UNUSED(bs);
  206. UNUSED(vx);
  207. UNUSED(vy);
  208. UNUSED(nr);
  209. UNUSED(nc);
  210. UNUSED(nb);
  211. UNUSED(ncols_interleaved);
  212. UNUSED(blocklen);
  213. float sumf[4];
  214. int sumi;
  215. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  216. for (int x = 0; x < nc / ncols_interleaved; x++) {
  217. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  218. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  219. for (int l = 0; l < nb; l++) {
  220. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  221. for (int j = 0; j < ncols_interleaved; j++) {
  222. sumi = 0;
  223. for (int i = 0; i < blocklen; ++i) {
  224. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  225. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  226. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  227. }
  228. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  229. }
  230. }
  231. }
  232. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  233. }
  234. }
  235. void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  236. const int qk = QK8_0;
  237. const int nb = n / qk;
  238. const int ncols_interleaved = 8;
  239. const int blocklen = 8;
  240. assert (n % qk == 0);
  241. assert (nc % ncols_interleaved == 0);
  242. UNUSED(s);
  243. UNUSED(bs);
  244. UNUSED(vx);
  245. UNUSED(vy);
  246. UNUSED(nr);
  247. UNUSED(nc);
  248. UNUSED(nb);
  249. UNUSED(ncols_interleaved);
  250. UNUSED(blocklen);
  251. float sumf[8];
  252. int sumi;
  253. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  254. for (int x = 0; x < nc / ncols_interleaved; x++) {
  255. const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
  256. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  257. for (int l = 0; l < nb; l++) {
  258. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  259. for (int j = 0; j < ncols_interleaved; j++) {
  260. sumi = 0;
  261. for (int i = 0; i < blocklen; ++i) {
  262. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  263. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  264. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
  265. }
  266. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  267. }
  268. }
  269. }
  270. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  271. }
  272. }
  273. void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  274. const int qk = QK_K;
  275. const int nb = n / qk;
  276. const int ncols_interleaved = 8;
  277. const int blocklen = 8;
  278. static const uint32_t kmask1 = 0x3f3f3f3f;
  279. static const uint32_t kmask2 = 0x0f0f0f0f;
  280. static const uint32_t kmask3 = 0x03030303;
  281. assert (n % qk == 0);
  282. assert (nc % ncols_interleaved == 0);
  283. UNUSED(s);
  284. UNUSED(bs);
  285. UNUSED(vx);
  286. UNUSED(vy);
  287. UNUSED(nr);
  288. UNUSED(nc);
  289. UNUSED(nb);
  290. UNUSED(ncols_interleaved);
  291. UNUSED(blocklen);
  292. float sumf[8];
  293. float sum_minf[8];
  294. uint32_t utmp[32];
  295. int sumi1;
  296. int sumi2;
  297. int sumi;
  298. const block_q8_K * a_ptr = (const block_q8_K *) vy;
  299. for (int x = 0; x < nc / ncols_interleaved; x++) {
  300. const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
  301. for (int j = 0; j < ncols_interleaved; j++) {
  302. sumf[j] = 0.0;
  303. sum_minf[j] = 0.0;
  304. }
  305. for (int l = 0; l < nb; l++) {
  306. for (int sb = 0; sb < 8; sb++) {
  307. memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
  308. utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
  309. const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
  310. utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
  311. utmp[sb * 4 + 2] = uaux_0;
  312. utmp[sb * 4 + 0] &= kmask1;
  313. }
  314. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  315. uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
  316. uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
  317. for (int j = 0; j < ncols_interleaved; j++) {
  318. sumi1 = 0;
  319. sumi2 = 0;
  320. sumi = 0;
  321. for (int i = 0; i < blocklen; ++i) {
  322. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
  323. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
  324. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
  325. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
  326. sumi1 = sumi1 * scales_0[j];
  327. sumi2 = sumi2 * scales_1[j];
  328. sumi += sumi1 + sumi2;
  329. }
  330. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
  331. }
  332. }
  333. for (int sb = 0; sb < 8; sb++) {
  334. uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  335. for (int j = 0; j < ncols_interleaved; j++) {
  336. sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
  337. }
  338. }
  339. }
  340. for (int j = 0; j < ncols_interleaved; j++) {
  341. s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
  342. }
  343. }
  344. }
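// Scale handling in the loop above: each 12-byte group of b_ptr[l].scales packs the 6-bit
// scales and 6-bit mins of one Q4_K sub-block for all 8 interleaved columns (the same 6-bit
// packing as a plain block_q4_K, but indexed by column). The kmask1/2/3 shuffle expands each
// group into 16 bytes of utmp: bytes 0..7 hold the 8 scales and bytes 8..15 hold the 8 mins,
// which is why scales_0/scales_1 and mins can then be read as plain bytes.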
  345. void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  346. const int qk = QK_K;
  347. const int nb = n / qk;
  348. const int ncols_interleaved = 8;
  349. const int blocklen = 8;
  350. assert (n % qk == 0);
  351. assert (nc % ncols_interleaved == 0);
  352. UNUSED(s);
  353. UNUSED(bs);
  354. UNUSED(vx);
  355. UNUSED(vy);
  356. UNUSED(nr);
  357. UNUSED(nc);
  358. UNUSED(nb);
  359. UNUSED(ncols_interleaved);
  360. UNUSED(blocklen);
  361. float sumf[8];
  362. float sum_minf[8];
  363. int sumi1,sumi2,sumi3,sumi4;
  364. int sumi;
  365. const block_q8_K * a_ptr = (const block_q8_K *)vy;
  366. for(int x = 0; x < nc / ncols_interleaved; x++) {
  367. const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
  368. for (int j = 0; j < ncols_interleaved; j++) {
  369. sumf[j] = 0.0;
  370. sum_minf[j] = 0.0;
  371. }
  372. for (int l = 0; l < nb; l++) {
  373. for (int k = 0; k < (qk / (4 * blocklen)); k++) {
  374. const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
  375. const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
  376. const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
  377. const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
  378. for (int j = 0; j < ncols_interleaved; j++) {
  379. sumi1 = 0;
  380. sumi2 = 0;
  381. sumi3 = 0;
  382. sumi4 = 0;
  383. sumi = 0;
  384. int offset = ((k / 2) % 2) + j * 2;
  385. for (int i = 0; i < blocklen; ++i){
  386. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
  387. const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
  388. const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
  389. const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
  390. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
  391. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
  392. sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
  393. sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
  394. sumi1 = sumi1 * (scales_0[offset] & 0xF);
  395. sumi2 = sumi2 * (scales_1[offset] & 0xF);
  396. sumi3 = sumi3 * (scales_2[offset] & 0xF);
  397. sumi4 = sumi4 * (scales_3[offset] & 0xF);
  398. sumi += sumi1 + sumi2 + sumi3 + sumi4;
  399. }
  400. sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
  401. }
  402. }
  403. for(int sb = 0; sb < 8; sb++) {
  404. const uint8_t *mins = b_ptr[l].scales + sb * 16;
  405. for(int j = 0; j < ncols_interleaved; j++){
  406. sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
  407. }
  408. }
  409. }
  410. for (int j = 0; j < ncols_interleaved; j++) {
  411. s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
  412. }
  413. }
  414. }
  415. void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  416. const int qk = QK8_0;
  417. const int nb = n / qk;
  418. const int ncols_interleaved = 4;
  419. const int blocklen = 4;
  420. assert(nr == 1);
  421. assert(n % qk == 0);
  422. assert(nc % ncols_interleaved == 0);
  423. UNUSED(bs);
  424. UNUSED(nr);
  425. float sumf[4];
  426. int sumi;
  427. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  428. for (int x = 0; x < nc / ncols_interleaved; x++) {
  429. const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
  430. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  431. for (int l = 0; l < nb; l++) {
  432. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  433. for (int j = 0; j < ncols_interleaved; j++) {
  434. sumi = 0;
  435. for (int i = 0; i < blocklen; ++i) {
  436. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  437. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  438. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
  439. }
  440. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  441. }
  442. }
  443. }
  444. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  445. }
  446. }
  447. void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  448. const int qk = QK8_0;
  449. const int nb = n / qk;
  450. const int ncols_interleaved = 8;
  451. const int blocklen = 8;
  452. assert(nr == 1);
  453. assert(n % qk == 0);
  454. assert(nc % ncols_interleaved == 0);
  455. UNUSED(bs);
  456. UNUSED(nr);
  457. float sumf[8];
  458. int sumi;
  459. const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
  460. for (int x = 0; x < nc / ncols_interleaved; x++) {
  461. const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
  462. for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
  463. for (int l = 0; l < nb; l++) {
  464. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  465. for (int j = 0; j < ncols_interleaved; j++) {
  466. sumi = 0;
  467. for (int i = 0; i < blocklen; ++i) {
  468. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  469. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  470. sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
  471. }
  472. sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
  473. }
  474. }
  475. }
  476. for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
  477. }
  478. }
  479. void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  480. const int qk = QK8_0;
  481. const int nb = n / qk;
  482. const int ncols_interleaved = 4;
  483. const int blocklen = 4;
  484. assert (n % qk == 0);
  485. assert (nr % 4 == 0);
  486. assert (nc % ncols_interleaved == 0);
  487. UNUSED(s);
  488. UNUSED(bs);
  489. UNUSED(vx);
  490. UNUSED(vy);
  491. UNUSED(nr);
  492. UNUSED(nc);
  493. UNUSED(nb);
  494. UNUSED(ncols_interleaved);
  495. UNUSED(blocklen);
  496. {
  497. float sumf[4][4];
  498. int sumi;
  499. for (int y = 0; y < nr / 4; y++) {
  500. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  501. for (int x = 0; x < nc / ncols_interleaved; x++) {
  502. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  503. for (int m = 0; m < 4; m++) {
  504. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  505. }
  506. for (int l = 0; l < nb; l++) {
  507. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  508. for (int m = 0; m < 4; m++) {
  509. for (int j = 0; j < ncols_interleaved; j++) {
  510. sumi = 0;
  511. for (int i = 0; i < blocklen; ++i) {
  512. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  513. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  514. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  515. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  516. }
  517. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  518. }
  519. }
  520. }
  521. }
  522. for (int m = 0; m < 4; m++) {
  523. for (int j = 0; j < ncols_interleaved; j++)
  524. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  525. }
  526. }
  527. }
  528. }
  529. }
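// Activation indexing in the GEMM above: the block_q8_0x4 produced by
// ggml_quantize_mat_q8_0_4x4 interleaves the four rows in 4-byte groups, so row m's quant
// for column k*blocklen + i sits at qs[k * 4 * blocklen + m * blocklen + i], and the partner
// column qk/2 further along (used for the high nibbles) sits another qk/2 * 4 bytes ahead.
// The 4x8 and 8x8 variants below follow the same pattern with blocklen == 8.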
  530. void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  531. const int qk = QK8_0;
  532. const int nb = n / qk;
  533. const int ncols_interleaved = 4;
  534. const int blocklen = 8;
  535. assert (n % qk == 0);
  536. assert (nr % 4 == 0);
  537. assert (nc % ncols_interleaved == 0);
  538. UNUSED(s);
  539. UNUSED(bs);
  540. UNUSED(vx);
  541. UNUSED(vy);
  542. UNUSED(nr);
  543. UNUSED(nc);
  544. UNUSED(nb);
  545. UNUSED(ncols_interleaved);
  546. UNUSED(blocklen);
  547. float sumf[4][4];
  548. int sumi;
  549. for (int y = 0; y < nr / 4; y++) {
  550. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  551. for (int x = 0; x < nc / ncols_interleaved; x++) {
  552. const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
  553. for (int m = 0; m < 4; m++) {
  554. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  555. }
  556. for (int l = 0; l < nb; l++) {
  557. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  558. for (int m = 0; m < 4; m++) {
  559. for (int j = 0; j < ncols_interleaved; j++) {
  560. sumi = 0;
  561. for (int i = 0; i < blocklen; ++i) {
  562. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  563. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  564. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  565. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  566. }
  567. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  568. }
  569. }
  570. }
  571. }
  572. for (int m = 0; m < 4; m++) {
  573. for (int j = 0; j < ncols_interleaved; j++)
  574. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  575. }
  576. }
  577. }
  578. }
  579. void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  580. const int qk = QK8_0;
  581. const int nb = n / qk;
  582. const int ncols_interleaved = 8;
  583. const int blocklen = 8;
  584. assert (n % qk == 0);
  585. assert (nr % 4 == 0);
  586. assert (nc % ncols_interleaved == 0);
  587. UNUSED(s);
  588. UNUSED(bs);
  589. UNUSED(vx);
  590. UNUSED(vy);
  591. UNUSED(nr);
  592. UNUSED(nc);
  593. UNUSED(nb);
  594. UNUSED(ncols_interleaved);
  595. UNUSED(blocklen);
  596. float sumf[4][8];
  597. int sumi;
  598. for (int y = 0; y < nr / 4; y++) {
  599. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  600. for (int x = 0; x < nc / ncols_interleaved; x++) {
  601. const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
  602. for (int m = 0; m < 4; m++) {
  603. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  604. }
  605. for (int l = 0; l < nb; l++) {
  606. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  607. for (int m = 0; m < 4; m++) {
  608. for (int j = 0; j < ncols_interleaved; j++) {
  609. sumi = 0;
  610. for (int i = 0; i < blocklen; ++i) {
  611. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
  612. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
  613. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  614. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
  615. }
  616. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  617. }
  618. }
  619. }
  620. }
  621. for (int m = 0; m < 4; m++) {
  622. for (int j = 0; j < ncols_interleaved; j++)
  623. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  624. }
  625. }
  626. }
  627. }
  628. void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  629. const int qk = QK_K;
  630. const int nb = n / qk;
  631. const int ncols_interleaved = 8;
  632. const int blocklen = 8;
  633. static const uint32_t kmask1 = 0x3f3f3f3f;
  634. static const uint32_t kmask2 = 0x0f0f0f0f;
  635. static const uint32_t kmask3 = 0x03030303;
  636. assert (n % qk == 0);
  637. assert (nr % 4 == 0);
  638. assert (nc % ncols_interleaved == 0);
  639. UNUSED(s);
  640. UNUSED(bs);
  641. UNUSED(vx);
  642. UNUSED(vy);
  643. UNUSED(nr);
  644. UNUSED(nc);
  645. UNUSED(nb);
  646. UNUSED(ncols_interleaved);
  647. UNUSED(blocklen);
  648. float sumf[4][8];
  649. float sum_minf[4][8];
  650. uint32_t utmp[32];
  651. int sumi1;
  652. int sumi2;
  653. int sumi;
  654. for (int y = 0; y < nr / 4; y++) {
  655. const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
  656. for (int x = 0; x < nc / ncols_interleaved; x++) {
  657. const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
  658. for (int m = 0; m < 4; m++) {
  659. for (int j = 0; j < ncols_interleaved; j++) {
  660. sumf[m][j] = 0.0;
  661. sum_minf[m][j] = 0.0;
  662. }
  663. }
  664. for (int l = 0; l < nb; l++) {
  665. for (int sb = 0; sb < 8; sb++) {
  666. memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
  667. utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
  668. const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
  669. utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
  670. utmp[sb * 4 + 2] = uaux_0;
  671. utmp[sb * 4 + 0] &= kmask1;
  672. }
  673. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  674. uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
  675. uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
  676. for (int m = 0; m < 4; m++) {
  677. for (int j = 0; j < ncols_interleaved; j++) {
  678. sumi1 = 0;
  679. sumi2 = 0;
  680. sumi = 0;
  681. for (int i = 0; i < blocklen; ++i) {
  682. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
  683. const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
  684. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
  685. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
  686. sumi1 = sumi1 * scales_0[j];
  687. sumi2 = sumi2 * scales_1[j];
  688. sumi += sumi1 + sumi2;
  689. }
  690. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
  691. }
  692. }
  693. }
  694. for (int sb = 0; sb < 8; sb++) {
  695. uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
  696. for(int m = 0; m < 4; m++) {
  697. const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
  698. for(int j = 0; j < ncols_interleaved; j++) {
  699. sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
  700. }
  701. }
  702. }
  703. }
  704. for (int m = 0; m < 4; m++) {
  705. for (int j = 0; j < ncols_interleaved; j++) {
  706. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
  707. }
  708. }
  709. }
  710. }
  711. }
  712. void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  713. const int qk = QK_K;
  714. const int nb = n / qk;
  715. const int ncols_interleaved = 8;
  716. const int blocklen = 8;
  717. assert (n % qk == 0);
  718. assert (nr % 4 == 0);
  719. assert (nc % ncols_interleaved == 0);
  720. UNUSED(s);
  721. UNUSED(bs);
  722. UNUSED(vx);
  723. UNUSED(vy);
  724. UNUSED(nr);
  725. UNUSED(nc);
  726. UNUSED(nb);
  727. UNUSED(ncols_interleaved);
  728. UNUSED(blocklen);
  729. float sumf[4][8];
  730. float sum_minf[4][8];
  731. int sumi1, sumi2, sumi3, sumi4;
  732. int sumi;
  733. for (int y = 0; y < nr / 4; y++) {
  734. const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
  735. for (int x = 0; x < nc / ncols_interleaved; x++) {
  736. const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
  737. for (int m = 0; m < 4; m++) {
  738. for (int j = 0; j < ncols_interleaved; j++) {
  739. sumf[m][j] = 0.0;
  740. sum_minf[m][j] = 0.0;
  741. }
  742. }
  743. for (int l = 0; l < nb; l++) {
  744. for (int k = 0; k < (qk / (4 * blocklen)); k++) {
  745. const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
  746. const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
  747. const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
  748. const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
  749. for (int m = 0; m < 4; m++) {
  750. for (int j = 0; j < ncols_interleaved; j++) {
  751. sumi1 = 0;
  752. sumi2 = 0;
  753. sumi3 = 0;
  754. sumi4 = 0;
  755. sumi = 0;
  756. int offset = ((k / 2) % 2) + j * 2;
  757. for (int i = 0; i < blocklen; ++i){
  758. const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
  759. const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
  760. const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
  761. const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
  762. sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
  763. sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
  764. sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
  765. sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
  766. sumi1 = sumi1 * (scales_0[offset] & 0xF);
  767. sumi2 = sumi2 * (scales_1[offset] & 0xF);
  768. sumi3 = sumi3 * (scales_2[offset] & 0xF);
  769. sumi4 = sumi4 * (scales_3[offset] & 0xF);
  770. sumi += sumi1 + sumi2 + sumi3 + sumi4;
  771. }
  772. sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
  773. }
  774. }
  775. }
  776. for(int sb = 0; sb < 8; sb++) {
  777. const uint8_t *mins = b_ptr[l].scales + sb * 16;
  778. for(int m = 0; m < 4; m++) {
  779. const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
  780. for(int j = 0; j < ncols_interleaved; j++) {
  781. int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
  782. sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
  783. }
  784. }
  785. }
  786. }
  787. for (int m = 0; m < 4; m++) {
  788. for (int j = 0; j < ncols_interleaved; j++) {
  789. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
  790. }
  791. }
  792. }
  793. }
  794. }
  795. void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  796. const int qk = QK8_0;
  797. const int nb = n / qk;
  798. const int ncols_interleaved = 4;
  799. const int blocklen = 4;
  800. assert (n % qk == 0);
  801. assert (nr % 4 == 0);
  802. assert (nc % ncols_interleaved == 0);
  803. UNUSED(s);
  804. UNUSED(bs);
  805. UNUSED(vx);
  806. UNUSED(vy);
  807. UNUSED(nr);
  808. UNUSED(nc);
  809. UNUSED(nb);
  810. UNUSED(ncols_interleaved);
  811. UNUSED(blocklen);
  812. {
  813. float sumf[4][4];
  814. int sumi;
  815. for (int y = 0; y < nr / 4; y++) {
  816. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  817. for (int x = 0; x < nc / ncols_interleaved; x++) {
  818. const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
  819. for (int m = 0; m < 4; m++) {
  820. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  821. }
  822. for (int l = 0; l < nb; l++) {
  823. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  824. for (int m = 0; m < 4; m++) {
  825. for (int j = 0; j < ncols_interleaved; j++) {
  826. sumi = 0;
  827. for (int i = 0; i < blocklen; ++i) {
  828. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  829. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  830. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  831. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
  832. }
  833. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  834. }
  835. }
  836. }
  837. }
  838. for (int m = 0; m < 4; m++) {
  839. for (int j = 0; j < ncols_interleaved; j++)
  840. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  841. }
  842. }
  843. }
  844. }
  845. }
  846. void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
  847. const int qk = QK8_0;
  848. const int nb = n / qk;
  849. const int ncols_interleaved = 8;
  850. const int blocklen = 8;
  851. assert(n % qk == 0);
  852. assert(nr % 4 == 0);
  853. assert(nc % ncols_interleaved == 0);
  854. float sumf[4][8];
  855. int sumi;
  856. for (int y = 0; y < nr / 4; y++) {
  857. const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
  858. for (int x = 0; x < nc / ncols_interleaved; x++) {
  859. const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
  860. for (int m = 0; m < 4; m++) {
  861. for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
  862. }
  863. for (int l = 0; l < nb; l++) {
  864. for (int k = 0; k < (qk / (2 * blocklen)); k++) {
  865. for (int m = 0; m < 4; m++) {
  866. for (int j = 0; j < ncols_interleaved; j++) {
  867. sumi = 0;
  868. for (int i = 0; i < blocklen; ++i) {
  869. const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
  870. const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
  871. sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
  872. (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
  873. }
  874. sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
  875. }
  876. }
  877. }
  878. }
  879. for (int m = 0; m < 4; m++) {
  880. for (int j = 0; j < ncols_interleaved; j++)
  881. s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
  882. }
  883. }
  884. }
  885. }
  886. } // extern "C"
  887. static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
  888. block_q4_0x4 out;
  889. for (int i = 0; i < 4; i++) {
  890. out.d[i] = in[i].d;
  891. }
  892. const int end = QK4_0 * 2 / blck_size_interleave;
  893. if (blck_size_interleave == 8) {
  894. const uint64_t xor_mask = 0x8888888888888888ULL;
  895. for (int i = 0; i < end; ++i) {
  896. int src_id = i % 4;
  897. int src_offset = (i / 4) * blck_size_interleave;
  898. int dst_offset = i * blck_size_interleave;
  899. uint64_t elems;
  900. // Using memcpy to avoid unaligned memory accesses
  901. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  902. elems ^= xor_mask;
  903. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  904. }
  905. } else if (blck_size_interleave == 4) {
  906. const uint32_t xor_mask = 0x88888888;
  907. for (int i = 0; i < end; ++i) {
  908. int src_id = i % 4;
  909. int src_offset = (i / 4) * blck_size_interleave;
  910. int dst_offset = i * blck_size_interleave;
  911. uint32_t elems;
  912. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
  913. elems ^= xor_mask;
  914. memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
  915. }
  916. } else {
  917. GGML_ASSERT(false);
  918. }
  919. return out;
  920. }
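// Interleaving produced above, e.g. for blck_size_interleave == 8 (end == 8):
//   out.qs[ 0.. 7] = in[0].qs[0..7] ^ mask,   out.qs[ 8..15] = in[1].qs[0..7] ^ mask,
//   out.qs[16..23] = in[2].qs[0..7] ^ mask,   out.qs[24..31] = in[3].qs[0..7] ^ mask,
//   out.qs[32..39] = in[0].qs[8..15] ^ mask,  ...
// i.e. the four source blocks repeat every 4 chunks, with src_offset advancing by 8 per cycle.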
  921. // interleave 8 block_q4_0s in blocks of blck_size_interleave
  922. // returns an interleaved block_q4_0x8
  923. // in the interleaved block_q4_0x8, place deltas for 8 block_q4_0 blocks
  924. // first, then interleave quants from 8 block_q4_0s in blocks of blck_size_interleave
  925. static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
  926. block_q4_0x8 out;
  927. for (int i = 0; i < 8; i++) {
  928. out.d[i] = in[i].d;
  929. }
  930. const int end = QK4_0 * 4 / blck_size_interleave;
  931. const uint64_t xor_mask = 0x8888888888888888ULL;
  932. for (int i = 0; i < end; ++i) {
  933. int src_id = i % 8;
  934. int src_offset = (i / 8) * blck_size_interleave;
  935. int dst_offset = i * blck_size_interleave;
  936. uint64_t elems;
  937. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  938. elems ^= xor_mask;
  939. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  940. }
  941. return out;
  942. }
  943. static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
  944. block_q4_Kx8 out;
    // Delta (scale) and dmin values of the eight Q4_K structures are copied into the output interleaved structure
  946. for (int i = 0; i < 8; i++) {
  947. out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
  948. }
  949. for (int i = 0; i < 8; i++) {
  950. out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
  951. }
  952. const int end = QK_K * 4 / blck_size_interleave;
  953. // Interleave Q4_K quants by taking 8 bytes at a time
  954. for (int i = 0; i < end; ++i) {
  955. int src_id = i % 8;
  956. int src_offset = (i / 8) * blck_size_interleave;
  957. int dst_offset = i * blck_size_interleave;
  958. uint64_t elems;
  959. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  960. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  961. }
    // The logic below unpacks and rearranges the scale and min values of the Q4_K blocks.
    // A Q4_K block packs 8 scales and 8 mins into 12 bytes (6 bits per value).
    // The output Q4_Kx8 structure reserves 96 bytes for them: every 12-byte group holds the
    // scales and mins of the corresponding sub-block from each of the eight Q4_K blocks.
    // E.g. the first 12 bytes hold 8 scales and 8 mins, one per first sub-block of the eight Q4_K structures.
  967. uint8_t s[8], m[8];
  968. for (int i = 0; i < 4; i++) {
  969. for (int j = 0; j < 8; j++) {
  970. s[j] = in[j].scales[i] & 63;
  971. m[j] = in[j].scales[i + 4] & 63;
  972. }
  973. out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
  974. out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
  975. out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
  976. out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
  977. out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
  978. out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
  979. out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
  980. out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
  981. out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
  982. out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
  983. out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
  984. out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
  985. }
  986. for (int i = 0; i < 4; i++) {
  987. for (int j = 0; j < 8; j++) {
  988. s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
  989. m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
  990. }
  991. out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
  992. out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
  993. out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
  994. out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
  995. out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
  996. out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
  997. out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
  998. out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
  999. out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
  1000. out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
  1001. out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
  1002. out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
  1003. }
  1004. return out;
  1005. }
  1006. static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
  1007. block_q2_Kx8 out;
    // Delta (scale) and dmin values of the eight Q2_K structures are copied into the output interleaved structure
  1009. for (int i = 0; i < 8; i++) {
  1010. out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
  1011. }
  1012. for (int i = 0; i < 8; i++) {
  1013. out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
  1014. }
  1015. const int end = QK_K * 2 / blck_size_interleave;
  1016. // Interleave Q2_K quants by taking 8 bytes at a time
  1017. for (int i = 0; i < end; ++i) {
  1018. int src_id = i % 8;
  1019. int src_offset = (i / 8) * blck_size_interleave;
  1020. int dst_offset = i * blck_size_interleave;
  1021. uint64_t elems;
  1022. memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
  1023. memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
  1024. }
    // The logic below unpacks and rearranges the scale and min values of the Q2_K blocks.
    // A Q2_K block packs 16 scales and 16 mins into 16 bytes (4 bits per value).
    // The output Q2_Kx8 structure reserves 128 bytes for them: every 16-byte group holds the
    // scales and mins of the corresponding sub-blocks from each of the eight Q2_K blocks.
    // E.g. the first 16 bytes hold the scale/min bytes of the first and second sub-blocks of the eight Q2_K structures.
  1030. for(int i = 0; i < 128; i++){
  1031. // Index for selecting which q2k super block
  1032. int src1 = (i % 16) / 2;
  1033. // Index for selecting scale
  1034. int src2 = ((i / 16) * 2) + (i % 2);
  1035. out.scales[i] = in[src1].scales[src2];
  1036. }
  1037. return out;
  1038. }
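// Scales mapping produced by the loop above: out.scales takes two consecutive scale/min
// bytes per source block, cycling through the eight blocks, e.g.
//   out.scales[0]  = in[0].scales[0],  out.scales[1]  = in[0].scales[1],
//   out.scales[2]  = in[1].scales[0],  out.scales[3]  = in[1].scales[1],  ...
//   out.scales[16] = in[0].scales[2],  out.scales[17] = in[0].scales[3],  ...
// so every 16-byte group carries two consecutive Q2_K sub-blocks for all eight blocks.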
  1039. static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1040. GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
  1041. GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
  1042. constexpr int nrows_interleaved = 4;
  1043. block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
  1044. const block_q4_0 * src = (const block_q4_0 *)data;
  1045. block_q4_0 dst_tmp[4];
  1046. int nrow = ggml_nrows(t);
  1047. int nblocks = t->ne[0] / QK4_0;
  1048. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
  1049. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1050. return -1;
  1051. }
  1052. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1053. for (int64_t x = 0; x < nblocks; x++) {
  1054. for (int i = 0; i < nrows_interleaved; i++) {
  1055. dst_tmp[i] = src[x + i * nblocks];
  1056. }
  1057. *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
  1058. }
  1059. src += nrows_interleaved * nblocks;
  1060. }
  1061. return 0;
  1062. GGML_UNUSED(data_size);
  1063. }
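// A minimal sketch of how a repack routine like the one above is driven (illustrative only:
// the wrapper is an assumption for clarity, and the real call sites live in the
// ggml::cpu::repack buffer-type plumbing further down this file):
static int example_repack_4_q4_0_rows(struct ggml_tensor * t, const void * q4_0_data, size_t size) {
    // t must be a GGML_TYPE_Q4_0 tensor with ne[1] a multiple of 4 and ne[0] a multiple of 8;
    // q4_0_data holds the rows as plain block_q4_0, size bytes in total
    return repack_q4_0_to_q4_0_4_bl(t, /*interleave_block=*/4, q4_0_data, size);
}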
  1064. static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1065. GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
  1066. GGML_ASSERT(interleave_block == 8);
  1067. constexpr int nrows_interleaved = 8;
  1068. block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
  1069. const block_q4_K * src = (const block_q4_K*) data;
  1070. block_q4_K dst_tmp[8];
  1071. int nrow = ggml_nrows(t);
  1072. int nblocks = t->ne[0] / QK_K;
  1073. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
  1074. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1075. return -1;
  1076. }
  1077. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1078. for (int64_t x = 0; x < nblocks; x++) {
  1079. for (int i = 0; i < nrows_interleaved; i++ ) {
  1080. dst_tmp[i] = src[x + i * nblocks];
  1081. }
  1082. *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
  1083. }
  1084. src += nrows_interleaved * nblocks;
  1085. }
  1086. return 0;
  1087. GGML_UNUSED(data_size);
  1088. }
  1089. static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1090. GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
  1091. GGML_ASSERT(interleave_block == 8);
  1092. constexpr int nrows_interleaved = 8;
  1093. block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
  1094. const block_q2_K * src = (const block_q2_K*) data;
  1095. block_q2_K dst_tmp[8];
  1096. int nrow = ggml_nrows(t);
  1097. int nblocks = t->ne[0] / QK_K;
  1098. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
  1099. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1100. return -1;
  1101. }
  1102. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1103. for (int64_t x = 0; x < nblocks; x++) {
  1104. for (int i = 0; i < nrows_interleaved; i++ ) {
  1105. dst_tmp[i] = src[x + i * nblocks];
  1106. }
  1107. *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
  1108. }
  1109. src += nrows_interleaved * nblocks;
  1110. }
  1111. return 0;
  1112. GGML_UNUSED(data_size);
  1113. }
  1114. static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
  1115. GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
  1116. GGML_ASSERT(interleave_block == 8);
  1117. constexpr int nrows_interleaved = 8;
  1118. block_q4_0x8 * dst = (block_q4_0x8*)t->data;
  1119. const block_q4_0 * src = (const block_q4_0*) data;
  1120. block_q4_0 dst_tmp[8];
  1121. int nrow = ggml_nrows(t);
  1122. int nblocks = t->ne[0] / QK4_0;
  1123. GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
  1124. if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
  1125. return -1;
  1126. }
  1127. for (int b = 0; b < nrow; b += nrows_interleaved) {
  1128. for (int64_t x = 0; x < nblocks; x++) {
  1129. for (int i = 0; i < nrows_interleaved; i++ ) {
  1130. dst_tmp[i] = src[x + i * nblocks];
  1131. }
  1132. *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
  1133. }
  1134. src += nrows_interleaved * nblocks;
  1135. }
  1136. return 0;
  1137. GGML_UNUSED(data_size);
  1138. }
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
    block_iq4_nlx4 out;

    for (int i = 0; i < 4; i++) {
        out.d[i] = in[i].d;
    }

    const int end = QK4_NL * 2 / blck_size_interleave;

    // TODO: this branch seems wrong
    //if (blck_size_interleave == 8) {
    //    for (int i = 0; i < end; ++i) {
    //        int src_id = i % 4;
    //        int src_offset = (i / 4) * blck_size_interleave;
    //        int dst_offset = i * blck_size_interleave;
    //
    //        // Using memcpy to avoid unaligned memory accesses
    //        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
    //    }
    //} else
    if (blck_size_interleave == 4) {
        for (int i = 0; i < end; ++i) {
            int src_id = i % 4;
            int src_offset = (i / 4) * blck_size_interleave;
            int dst_offset = i * blck_size_interleave;

            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
        }
    } else {
        GGML_ASSERT(false);
    }

    return out;
}
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
    GGML_ASSERT(interleave_block == 4);

    const block_iq4_nl * src = (const block_iq4_nl *) data;
    block_iq4_nlx4 * dst = (block_iq4_nlx4 *) t->data;
    block_iq4_nl dst_tmp[4];
    int nrow = ggml_nrows(t);
    int nrows_interleaved = 4;
    int nblocks = t->ne[0] / QK4_NL;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

    if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
        return -1;
    }

    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    GGML_UNUSED(data_size);
}
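
// Same interleaving scheme as make_block_iq4_nlx4, but across eight IQ4_NL
// blocks and with 8-byte (uint64_t) chunks.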
static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
    block_iq4_nlx8 out;

    for (int i = 0; i < 8; i++) {
        out.d[i] = in[i].d;
    }

    const int end = QK4_NL * 4 / blck_size_interleave;

    if (blck_size_interleave == 8) {
        for (int i = 0; i < end; ++i) {
            int src_id = i % 8;
            int src_offset = (i / 8) * blck_size_interleave;
            int dst_offset = i * blck_size_interleave;

            memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
        }
    } else {
        GGML_ASSERT(false);
    }

    return out;
}
static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
    GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
    GGML_ASSERT(interleave_block == 8);

    const block_iq4_nl * src = (const block_iq4_nl *) data;
    block_iq4_nlx8 * dst = (block_iq4_nlx8 *) t->data;
    block_iq4_nl dst_tmp[8];
    int nrow = ggml_nrows(t);
    int nrows_interleaved = 8;
    int nblocks = t->ne[0] / QK4_NL;

    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));

    if (t->ne[1] % nrows_interleaved != 0) {
        return -1;
    }

    for (int b = 0; b < nrow; b += nrows_interleaved) {
        for (int64_t x = 0; x < nblocks; x++) {
            for (int i = 0; i < nrows_interleaved; i++) {
                dst_tmp[i] = src[x + i * nblocks];
            }
            *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
        }
        src += nrows_interleaved * nblocks;
    }
    return 0;

    GGML_UNUSED(data_size);
}
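
// The ggml::cpu::repack namespace below maps <block type, INTER_SIZE, NB_COLS>
// (plus a quantized activation type for gemv/gemm) onto the concrete kernels
// defined above, so that tensor_traits<> further down can be written once and
// instantiated per layout.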
namespace ggml::cpu::repack {

// repack
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
int repack(struct ggml_tensor *, const void *, size_t);

// TODO: generalise.
template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
}

template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
}

template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
}

template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
}

template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
}

// TODO: needs to be revisited
//template <> int repack<block_iq4_nl, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
//    return repack_iq4_nl_to_iq4_nl_4_bl(t, 8, data, data_size);
//}

template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
    return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
}
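
// Judging from the call sites in forward_mul_mat()/forward_mul_mat_id() below:
// n is the shared inner dimension (ne00), s and bs the output pointer and row
// stride, vx the repacked weights, vy the quantized activations, nr the number
// of activation rows and nc the number of weight rows handled per call.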
// gemv
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemv(int, float *, size_t, const void *, const void *, int, int);

template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
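
// The gemm variants consume activation rows in groups of four (forward_mul_mat
// passes nr = ne11 - ne11 % 4); the leftover rows are handled by gemv above.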
// gemm
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
void gemm(int, float *, size_t, const void *, const void *, int, int);

template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
}

template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
    ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
}
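
// tensor_traits_base / tensor_traits<> plug the kernels above into the CPU
// extra-buffer mechanism: work_size() reports the scratch needed to quantize
// src1, compute_forward() dispatches MUL_MAT / MUL_MAT_ID, and repack()
// converts freshly loaded weight data into the interleaved layout.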
class tensor_traits_base : public ggml::cpu::tensor_traits {
  public:
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
};

template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {

    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
        // not really a GGML_TYPE_Q8_0, but it has the same size.
        switch (op->op) {
            case GGML_OP_MUL_MAT:
                {
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                    return true;
                }
            case GGML_OP_MUL_MAT_ID:
                {
                    size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
                    size = GGML_PAD(size, sizeof(int64_t));  // + padding for the next block

                    const int64_t ne02 = op->src[0]->ne[2];  // n_as, n_expert
                    const int64_t ne12 = op->src[1]->ne[2];  // n_tokens

                    const size_t sizeof_mmid_row_mapping = sizeof(int64_t);

                    size += sizeof_mmid_row_mapping * ne02 * (ne12 + 1);
                    return true;
                }
            default:
                // GGML_ABORT("fatal error");
                break;
        }
        return false;
    }
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
        switch (op->op) {
            case GGML_OP_MUL_MAT:
                forward_mul_mat(params, op);
                return true;
            case GGML_OP_MUL_MAT_ID:
                forward_mul_mat_id(params, op);
                return true;
            default:
                // GGML_ABORT("fatal error");
                break;
        }
        return false;
    }
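
    // forward_mul_mat: quantize src1 into wdata in PARAM_TYPE (four rows at a
    // time via ggml_quantize_mat_t, the remainder with from_float), synchronize,
    // then split the src0 rows across threads on NB_COLS boundaries and run gemm
    // for the 4-row batches of src1 plus gemv for the leftover rows.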
    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        ggml_tensor *       dst  = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        GGML_ASSERT(ne0 == ne01);
        GGML_ASSERT(ne1 == ne11);
        GGML_ASSERT(ne2 == ne12);
        GGML_ASSERT(ne3 == ne13);

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
        // GGML_ASSERT(ggml_n_dims(op->src[1]) == 2);

        char *       wdata = static_cast<char *>(params->wdata);
        const size_t nbw1  = ggml_row_size(PARAM_TYPE, ne10);

        assert(params->wsize >= nbw1 * ne11);

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        int64_t i11_processed = 0;
        for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
            ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), 4, ne10);
        }

        i11_processed = ne11 - ne11 % 4;
        for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
        }

        ggml_barrier(params->threadpool);

        const void * src1_wdata      = params->wdata;
        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
        int64_t      src0_start      = (ith * ne01) / nth;
        int64_t      src0_end        = ((ith + 1) * ne01) / nth;

        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
        src0_end   = (src0_end   % NB_COLS) ? src0_end   + NB_COLS - (src0_end   % NB_COLS) : src0_end;
        if (src0_start >= src0_end) {
            return;
        }

        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
        if (ne11 > 3) {
            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
        }
        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
                    src0_end - src0_start);
        }
    }
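
    // forward_mul_mat_id (indirect / MoE matmul): quantize src1, have thread 0
    // group the src1 rows by the expert selected in ids (matrix_row_counts /
    // matrix_rows), then for each expert run gemv row by row into the matching
    // slice of dst.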
    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        const ggml_tensor * ids  = op->src[2];
        ggml_tensor *       dst  = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        // we don't support permuted src0 or src1
        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
        GGML_ASSERT(nb10 == ggml_type_size(src1->type));

        // dst cannot be transposed or permuted
        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(ne03 == 1);
        GGML_ASSERT(ne13 == 1);
        GGML_ASSERT(ne3  == 1);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        // row groups
        const int n_ids = ids->ne[0]; // n_expert_used
        const int n_as  = ne02;       // n_expert

        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
        const size_t nbw2 = nbw1 * ne11;
        const size_t nbw3 = nbw2 * ne12;

        struct mmid_row_mapping {
            int32_t i1;
            int32_t i2;
        };

        GGML_ASSERT(params->wsize >=
                (GGML_PAD(nbw3, sizeof(int64_t)) +
                 n_as * (ne12 + 1) * sizeof(mmid_row_mapping))
                );

        auto * wdata          = (char *) params->wdata;
        auto * wdata_src1_end = (char *) wdata + GGML_PAD(nbw3, sizeof(int64_t));

        // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
        auto * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
        struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]

        // src1: float32 => param type
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
                from_float((float *) ((char *) src1->data + i12 * nb12 + i11 * nb11),
                           (void *)  (wdata + i12 * nbw2 + i11 * nbw1),
                           ne10);
            }
        }

#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]

        if (ith == 0) {
            // initialize matrix_row_counts
            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));

            // group rows by src0 matrix
            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
                for (int32_t id = 0; id < n_ids; ++id) {
                    const int32_t i02 =
                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);

                    GGML_ASSERT(i02 >= 0 && i02 < n_as);

                    MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
                    matrix_row_counts[i02] += 1;
                }
            }
        }

        ggml_barrier(params->threadpool);

        // compute each matrix multiplication in sequence
        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
            const int64_t cne1 = matrix_row_counts[cur_a];

            if (cne1 == 0) {
                continue;
            }

            const auto * src0_cur = (const char *) src0->data + cur_a * nb02;

            //const int64_t nr0 = ne01; // src0 rows
            const int64_t nr1 = cne1; // src1 rows

            int64_t src0_cur_start = (ith * ne01) / nth;
            int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;

            src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
            src0_cur_end   = (src0_cur_end   % NB_COLS) ? src0_cur_end   + NB_COLS - (src0_cur_end   % NB_COLS) : src0_cur_end;

            if (src0_cur_start >= src0_cur_end) {
                return;
            }

            for (int ir1 = 0; ir1 < nr1; ir1++) {
                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);

                const int id = row_mapping.i1; // selected expert index

                const int64_t i11 = id % ne11;
                const int64_t i12 = row_mapping.i2; // row index in src1

                const int64_t i1 = id;  // selected expert index
                const int64_t i2 = i12; // row

                const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);

                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                        (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
                        src0_cur + src0_cur_start * nb01,
                        src1_col, 1, src0_cur_end - src0_cur_start);
            }
        }
#undef MMID_MATRIX_ROW
    }
    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                       (int) NB_COLS, (int) INTER_SIZE);
        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
    }
};

}  // namespace ggml::cpu::repack
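
// Picks the interleaved variant for a weight tensor based on its type, the CPU
// features available at runtime and whether ne[1] is divisible by the row-group
// size; returns nullptr when no repacked kernel applies, in which case the
// tensor keeps its plain layout.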
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {

    // instance for Q4
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    // instance for Q2
    static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;

    // instance for IQ4
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;

    if (cur->type == GGML_TYPE_Q4_0) {
        if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
            if (cur->ne[1] % 4 == 0) {
                return &q4_0_4x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &q4_0_4x4_q8_0;
            }
        }
    } else if (cur->type == GGML_TYPE_Q4_K) {
        if (ggml_cpu_has_avx2()) {
            if (cur->ne[1] % 8 == 0) {
                return &q4_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_Q2_K) {
        if (ggml_cpu_has_avx512()) {
            if (cur->ne[1] % 8 == 0) {
                return &q2_K_8x8_q8_K;
            }
        }
    } else if (cur->type == GGML_TYPE_IQ4_NL) {
        if (ggml_cpu_has_avx2()) {
            if (cur->ne[1] % 8 == 0) {
                return &iq4_nl_8x8_q8_0;
            }
        }
        if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
            if (cur->ne[1] % 4 == 0) {
                return &iq4_nl_4x4_q8_0;
            }
        }
    }

    return nullptr;
}
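
// Buffer glue: init_tensor stashes the selected traits object in tensor->extra,
// and set_tensor performs the actual repack when the weight data is uploaded
// into the buffer.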
static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));

    GGML_UNUSED(buffer);

    return GGML_STATUS_SUCCESS;
}

static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                                      const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
    auto OK            = tensor_traits->repack(tensor, data, size);

    GGML_ASSERT(OK == 0);
    GGML_UNUSED(buffer);
}

static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "CPU_REPACK";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);

    if (buffer == nullptr) {
        return nullptr;
    }

    buffer->buft              = buft;
    buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
    buffer->iface.set_tensor  = ggml_backend_cpu_repack_buffer_set_tensor;
    buffer->iface.get_tensor  = nullptr;
    buffer->iface.cpy_tensor  = nullptr;
    return buffer;
}

static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}
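
// extra_buffer_type advertises which ops this buffer type can accelerate:
// MUL_MAT on 2-D and MUL_MAT_ID on 3-D repacked weights, with F32 activations
// that live in a host buffer.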
namespace ggml::cpu::repack {
class extra_buffer_type : ggml::cpu::extra_buffer_type {
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
        if (    op->op == GGML_OP_MUL_MAT &&
                op->src[0]->buffer &&
                (ggml_n_dims(op->src[0]) == 2) &&
                op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
                ggml_repack_get_optimal_repack_type(op->src[0])
                ) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
            //    return true;
            //}
            // may be possible if Q8_0 packed...
        } else if (op->op == GGML_OP_MUL_MAT_ID
                && op->src[0]->buffer
                && (ggml_n_dims(op->src[0]) == 3)
                && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
                && ggml_repack_get_optimal_repack_type(op->src[0])
                ) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
            if (op->src[1]->type == GGML_TYPE_F32) {
                return true;
            }
            //if (op->src[1]->type == GGML_TYPE_Q8_0) {
            //    return true;
            //}
        }
        return false;
    }

    ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
        if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
            }
        }
        return nullptr;
    }
};
}  // namespace ggml::cpu::repack
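
// The repack buffer type is a singleton that borrows storage from the default
// CPU buffer type and overrides init_tensor/set_tensor so weights get repacked
// on upload.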
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
        /* .iface    = */ {
            /* .get_name       = */ ggml_backend_cpu_repack_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_cpu_repack_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_repack_buffer_type_get_alignment,
            /* .get_max_size   = */ nullptr, // defaults to SIZE_MAX
            /* .get_alloc_size = */ nullptr, // defaults to ggml_nbytes
            /* .is_host        = */ nullptr,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ new ggml::cpu::repack::extra_buffer_type(),
    };

    return &ggml_backend_cpu_buffer_type_repack;
}
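
// Usage sketch (hypothetical, not part of this file): a loader that wants the
// repacked layout would allocate its weight tensors from this buffer type and
// then upload the original quantized data, letting set_tensor repack it, e.g.:
//
//   ggml_backend_buffer_type_t buft = ggml_backend_cpu_repack_buffer_type();
//   ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
//   ggml_backend_tensor_set(w, w_data, 0, ggml_nbytes(w));  // triggers repack()
//
// The exact integration depends on the caller; the helpers named above are the
// generic ggml-backend APIs, not something defined here.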