// ggml-quants.h (19 KB)
  1. #pragma once
  2. #include "ggml-impl.h"
  3. // GGML internal header
  4. #include <stdint.h>
  5. #include <stddef.h>
  6. #define QK4_0 32
  7. typedef struct {
  8. ggml_fp16_t d; // delta
  9. uint8_t qs[QK4_0 / 2]; // nibbles / quants
  10. } block_q4_0;
  11. static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
  12. #define QK4_1 32
  13. typedef struct {
  14. ggml_fp16_t d; // delta
  15. ggml_fp16_t m; // min
  16. uint8_t qs[QK4_1 / 2]; // nibbles / quants
  17. } block_q4_1;
  18. static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
  19. #define QK5_0 32
  20. typedef struct {
  21. ggml_fp16_t d; // delta
  22. uint8_t qh[4]; // 5-th bit of quants
  23. uint8_t qs[QK5_0 / 2]; // nibbles / quants
  24. } block_q5_0;
  25. static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
  26. #define QK5_1 32
  27. typedef struct {
  28. ggml_fp16_t d; // delta
  29. ggml_fp16_t m; // min
  30. uint8_t qh[4]; // 5-th bit of quants
  31. uint8_t qs[QK5_1 / 2]; // nibbles / quants
  32. } block_q5_1;
  33. static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
  34. #define QK8_0 32
  35. typedef struct {
  36. ggml_fp16_t d; // delta
  37. int8_t qs[QK8_0]; // quants
  38. } block_q8_0;
  39. static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
  // Number of quants held by one Q8_1 block.
  #define QK8_1 32

  // Q8_1 block: a float scale, the pre-multiplied sum of the quants, and QK8_1
  // signed 8-bit quants.
  typedef struct {
      float  d;          // delta
      float  s;          // d * sum(qs[i])
      int8_t qs[QK8_1];  // quants
  } block_q8_1;

  // Guards against compiler-inserted padding: the struct must be exactly as
  // large as the sum of its members.
  static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
  //
  // Super-block quantization structures
  //

  // Super-block size.
  // Defining GGML_QKK_64 at build time selects QK_K = 64 with K_SCALE_SIZE = 4;
  // otherwise QK_K = 256 with K_SCALE_SIZE = 12. QK_K sizes the arrays of every
  // *_K and iq* block declared in this header.
  #ifdef GGML_QKK_64
  #define QK_K 64
  #define K_SCALE_SIZE 4
  #else
  #define QK_K 256
  #define K_SCALE_SIZE 12
  #endif
  58. // 2-bit quantization
  59. // weight is represented as x = a * q + b
  60. // 16 blocks of 16 elements each
  61. // Effectively 2.625 bits per weight
  62. typedef struct {
  63. uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
  64. uint8_t qs[QK_K/4]; // quants
  65. ggml_fp16_t d; // super-block scale for quantized scales
  66. ggml_fp16_t dmin; // super-block scale for quantized mins
  67. } block_q2_K;
  68. static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
  69. // 3-bit quantization
  70. // weight is represented as x = a * q
  71. // 16 blocks of 16 elements each
  72. // Effectively 3.4375 bits per weight
  73. #ifdef GGML_QKK_64
  74. typedef struct {
  75. uint8_t hmask[QK_K/8]; // quants - high bit
  76. uint8_t qs[QK_K/4]; // quants - low 2 bits
  77. uint8_t scales[2];
  78. ggml_fp16_t d; // super-block scale
  79. } block_q3_K;
  80. static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
  81. #else
  82. typedef struct {
  83. uint8_t hmask[QK_K/8]; // quants - high bit
  84. uint8_t qs[QK_K/4]; // quants - low 2 bits
  85. uint8_t scales[12]; // scales, quantized with 6 bits
  86. ggml_fp16_t d; // super-block scale
  87. } block_q3_K;
  88. static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
  89. #endif
  90. // 4-bit quantization
  91. // 8 blocks of 32 elements each
  92. // weight is represented as x = a * q + b
  93. // Effectively 4.5 bits per weight
  94. #ifdef GGML_QKK_64
  95. typedef struct {
  96. ggml_fp16_t d[2]; // super-block scales/mins
  97. uint8_t scales[2]; // 4-bit block scales/mins
  98. uint8_t qs[QK_K/2]; // 4--bit quants
  99. } block_q4_K;
  100. static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
  101. #else
  102. typedef struct {
  103. ggml_fp16_t d; // super-block scale for quantized scales
  104. ggml_fp16_t dmin; // super-block scale for quantized mins
  105. uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
  106. uint8_t qs[QK_K/2]; // 4--bit quants
  107. } block_q4_K;
  108. static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
  109. #endif
  110. // 5-bit quantization
  111. // 8 blocks of 32 elements each
  112. // weight is represented as x = a * q + b
  113. // Effectively 5.5 bits per weight
  114. #ifdef GGML_QKK_64
  115. typedef struct {
  116. ggml_fp16_t d; // super-block scale
  117. int8_t scales[QK_K/16]; // 8-bit block scales
  118. uint8_t qh[QK_K/8]; // quants, high bit
  119. uint8_t qs[QK_K/2]; // quants, low 4 bits
  120. } block_q5_K;
  121. static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
  122. #else
  123. typedef struct {
  124. ggml_fp16_t d; // super-block scale for quantized scales
  125. ggml_fp16_t dmin; // super-block scale for quantized mins
  126. uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
  127. uint8_t qh[QK_K/8]; // quants, high bit
  128. uint8_t qs[QK_K/2]; // quants, low 4 bits
  129. } block_q5_K;
  130. static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
  131. #endif
  132. // 6-bit quantization
  133. // weight is represented as x = a * q
  134. // 16 blocks of 16 elements each
  135. // Effectively 6.5625 bits per weight
  136. typedef struct {
  137. uint8_t ql[QK_K/2]; // quants, lower 4 bits
  138. uint8_t qh[QK_K/4]; // quants, upper 2 bits
  139. int8_t scales[QK_K/16]; // scales, quantized with 8 bits
  140. ggml_fp16_t d; // super-block scale
  141. } block_q6_K;
  142. static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
  143. // This is only used for intermediate quantization and dot products
  144. typedef struct {
  145. float d; // delta
  146. int8_t qs[QK_K]; // quants
  147. int16_t bsums[QK_K/16]; // sum of quants in groups of 16
  148. } block_q8_K;
  149. static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
  150. // (Almost) "true" 2-bit quantization.
  151. // Due to the need to use blocks as per ggml design, it ends up using
  152. // 2.0625 bpw because of the 16-bit scale for each block of 256.
  153. typedef struct {
  154. ggml_fp16_t d;
  155. uint16_t qs[QK_K/8];
  156. } block_iq2_xxs;
  157. static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
  158. // 2.3125 bpw quants
  159. typedef struct {
  160. ggml_fp16_t d;
  161. uint16_t qs[QK_K/8];
  162. uint8_t scales[QK_K/32];
  163. } block_iq2_xs;
  164. static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
  165. // 2.5625 bpw quants
  166. typedef struct {
  167. ggml_fp16_t d;
  168. uint8_t qs[QK_K/4];
  169. uint8_t qh[QK_K/32];
  170. uint8_t scales[QK_K/32];
  171. } block_iq2_s;
  172. static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
  173. // (Almost) "true" 3-bit quantization.
  174. // Due to the need to use blocks as per ggml design, it ends up using
  175. // 3.0625 bpw because of the 16-bit scale for each block of 256.
  176. typedef struct {
  177. ggml_fp16_t d;
  178. uint8_t qs[3*QK_K/8];
  179. } block_iq3_xxs;
  180. static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
  // 3.4375 bpw
  //
  // IQ3S_N_SCALE is the length of the scales array in block_iq3_s: 2 when
  // QK_K is 64, QK_K/64 otherwise.
  // The expansion is parenthesized so the macro behaves as a single operand
  // inside larger expressions (e.g. `n / IQ3S_N_SCALE`, `n % IQ3S_N_SCALE`).
  #if QK_K == 64
  #define IQ3S_N_SCALE 2
  #else
  #define IQ3S_N_SCALE (QK_K/64)
  #endif
  187. typedef struct {
  188. ggml_fp16_t d;
  189. uint8_t qs[QK_K/4];
  190. uint8_t qh[QK_K/32];
  191. uint8_t signs[QK_K/8];
  192. uint8_t scales[IQ3S_N_SCALE];
  193. } block_iq3_s;
  194. static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
  195. typedef struct {
  196. ggml_fp16_t d;
  197. uint8_t qs[QK_K/8];
  198. uint8_t scales[QK_K/16];
  199. } block_iq1_s;
  200. static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
  201. // Non-linear quants
  202. #define QK4_NL 32
  203. typedef struct {
  204. ggml_fp16_t d;
  205. uint8_t qs[QK4_NL/2];
  206. } block_iq4_nl;
  207. static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
  208. #if QK_K == 64
  209. #define block_iq4_xs block_iq4_nl
  210. //typedef struct block_iq4_nl block_iq4_xs;
  211. #else
  212. typedef struct {
  213. ggml_fp16_t d;
  214. uint16_t scales_h;
  215. uint8_t scales_l[QK_K/64];
  216. uint8_t qs[QK_K/2];
  217. } block_iq4_xs;
  218. static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
  219. #endif
  220. #ifdef __cplusplus
  221. extern "C" {
  222. #endif
  223. // Quantization
  224. void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
  225. void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
  226. void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
  227. void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
  228. void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
  229. void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
  230. void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
  231. void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
  232. void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
  233. void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
  234. void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
  235. void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
  236. void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
  237. void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
  238. void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
  239. void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k);
  240. void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k);
  241. void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  242. void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  243. void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  244. void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  245. void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  246. void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  247. void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  248. void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  249. void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  250. void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  251. void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  252. void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  253. void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  254. void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  255. void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  256. void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  257. void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
  258. // Dequantization
  259. void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  260. void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  261. void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  262. void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  263. void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  264. //void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  265. void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  266. void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  267. void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  268. void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  269. void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  270. void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  271. void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  272. void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  273. void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  274. void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  275. void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  276. void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  277. void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  278. void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
  279. // Dot product
  280. void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  281. void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  282. void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  283. void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  284. void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  285. void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  286. void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  287. void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  288. void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  289. void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  290. void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  291. void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  292. void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  293. void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  294. void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  295. void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  296. void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  297. void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
  //
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
  //
  // All entry points share one signature: float input in src, untyped output in
  // dst, the matrix shape as nrows x n_per_row, a histogram buffer hist and the
  // importance matrix imatrix.
  // NOTE(review): the size_t result is presumably the number of bytes written
  // to dst, and imatrix is presumably optional for some formats - confirm
  // against the implementation.
  size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq2_xs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq2_s(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq1_s(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq4_nl(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq4_xs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_iq3_s(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);

  size_t quantize_q2_K(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q3_K(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q4_K(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q5_K(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q6_K(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);

  size_t quantize_q4_0(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q4_1(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q5_0(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  size_t quantize_q5_1(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
  // Paired init/free entry points. The iq2xs pair is keyed by a ggml_type, the
  // iq3xs pair by a grid size.
  // NOTE(review): these presumably build and release lookup tables used by the
  // iq2/iq3 quantizers - confirm against the implementation.
  void iq2xs_init_impl(enum ggml_type type);
  void iq2xs_free_impl(enum ggml_type type);
  void iq3xs_init_impl(int grid_size);
  void iq3xs_free_impl(int grid_size);

  #ifdef __cplusplus
  }
  #endif