// llama-quant.cpp

#include "llama-quant.h"

#include "llama-impl.h"
#include "llama-model.h"
#include "llama-model-loader.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <mutex>
#include <regex>
#include <thread>
#include <unordered_map>
// Quantization types. Changes to this struct must be replicated in quantize.cpp
struct tensor_quantization {
    std::string name;
    ggml_type quant = GGML_TYPE_COUNT;
};
static void zeros(std::ofstream & file, size_t n) {
    char zero = 0;
    for (size_t i = 0; i < n; ++i) {
        file.write(&zero, 1);
    }
}
struct quantize_state_impl {
    const llama_model                 & model;
    const llama_model_quantize_params * params;

    int n_attention_wv = 0;
    int n_ffn_down     = 0;
    int n_ffn_gate     = 0;
    int n_ffn_up       = 0;
    int i_attention_wv = 0;
    int i_ffn_down     = 0;
    int i_ffn_gate     = 0;
    int i_ffn_up       = 0;

    int n_k_quantized = 0;
    int n_fallback    = 0;

    bool has_imatrix = false;

    // used to figure out if a model shares tok_embd with the output weight
    bool has_output = false;

    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
        : model(model)
        , params(params)
        {}
};
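
// Converts (dequantizes or up-converts) a tensor to F32 into `output`, optionally splitting the
// work across `workers`. Whole quantization blocks are assigned per thread so no block is split.
// Illustrative example (hypothetical numbers, not from the original source): with 1,000,704
// elements of a type whose block size is 256 there are 3909 blocks; with nthread = 4 each thread
// converts 977 blocks and the last thread additionally picks up the 1 spare block left over by
// the integer division.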
static void llama_tensor_dequantize_impl(
    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
) {
    if (output.size() < nelements) {
        output.resize(nelements);
    }
    float * f32_output = (float *) output.data();

    const ggml_type_traits * qtype = ggml_get_type_traits(tensor->type);
    if (ggml_is_quantized(tensor->type)) {
        if (qtype->to_float == NULL) {
            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
        }
    } else if (tensor->type != GGML_TYPE_F16 &&
               tensor->type != GGML_TYPE_BF16) {
        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
    }

    if (nthread < 2) {
        if (tensor->type == GGML_TYPE_F16) {
            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
        } else if (tensor->type == GGML_TYPE_BF16) {
            ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
        } else if (ggml_is_quantized(tensor->type)) {
            qtype->to_float(tensor->data, f32_output, nelements);
        } else {
            GGML_ABORT("fatal error"); // unreachable
        }
        return;
    }

    size_t block_size;
    if (tensor->type == GGML_TYPE_F16 ||
        tensor->type == GGML_TYPE_BF16) {
        block_size = 1;
    } else {
        block_size = (size_t)ggml_blck_size(tensor->type);
    }

    size_t block_size_bytes = ggml_type_size(tensor->type);

    GGML_ASSERT(nelements % block_size == 0);
    size_t nblocks = nelements / block_size;
    size_t blocks_per_thread = nblocks / nthread;
    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

    size_t in_buff_offs = 0;
    size_t out_buff_offs = 0;

    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
            if (typ == GGML_TYPE_F16) {
                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
            } else if (typ == GGML_TYPE_BF16) {
                ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
            } else {
                qtype->to_float(inbuf, outbuf, nels);
            }
        };
        workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
        in_buff_offs += thr_block_bytes;
        out_buff_offs += thr_elems;
    }
    for (auto & w : workers) { w.join(); }
    workers.clear();
}
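
// Chooses the final ggml type for one tensor given the requested ftype. Roughly, in order:
// user overrides for the output / token-embedding tensors, then per-tensor heuristics keyed on
// the tensor name (attn_v, attn_k, attn_q, ffn_down, attn_output, attn_qkv, ffn_gate, ffn_up),
// and finally a fallback to a compatible type when the row size is not divisible by the block
// size of the chosen quant. (Summary comment; the authoritative behavior is the code below.)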
static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
    const std::string name = ggml_get_name(tensor);

    // TODO: avoid hardcoded tensor names - use the TN_* constants
    const llm_arch arch = qs.model.arch;
    const auto       tn = LLM_TN(arch);

    auto use_more_bits = [](int i_layer, int n_layers) -> bool {
        return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2;
    };
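    // Illustrative example (not from the original source): for n_layers = 32, use_more_bits()
    // selects layers 0-3 (first eighth), layers 28-31 (last eighth), and every third layer in
    // between starting at layer 6 (6, 9, ..., 27), i.e. 16 of the 32 layers get the larger type.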
    const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
    auto layer_info = [n_expert] (int i_layer, int n_layer, const char * name) {
        if (n_expert > 1) {
            // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
            // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work
            // for getting the current layer as I initially thought, and we need to resort to parsing the
            // tensor name.
            if (sscanf(name, "blk.%d.", &i_layer) != 1) {
                throw std::runtime_error(format("Failed to determine layer for tensor %s", name));
            }
            if (i_layer < 0 || i_layer >= n_layer) {
                throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name, n_layer));
            }
        }
        return std::make_pair(i_layer, n_layer);
    };
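    // Illustrative example: for an expert model, a (hypothetical) name such as
    // "blk.17.ffn_down_exps.weight" yields i_layer = 17 via the sscanf above; for dense models
    // the running counter passed in (e.g. qs.i_ffn_down) is returned unchanged.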
    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
    // with the quantization of the output tensor
    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
            new_type = qs.params->output_tensor_type;
        } else {
            const int64_t nx = tensor->ne[0];
            const int64_t qk_k = ggml_blck_size(new_type);

            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                new_type = GGML_TYPE_Q8_0;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M   ||
                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q5_K;
            }
            else if (new_type != GGML_TYPE_Q8_0) {
                new_type = GGML_TYPE_Q6_K;
            }
        }
    } else if (name == "token_embd.weight") {
        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
            new_type = qs.params->token_embedding_type;
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
                new_type = GGML_TYPE_Q2_K;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                new_type = GGML_TYPE_IQ3_S;
            }
            else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
                new_type = GGML_TYPE_Q4_K;
            }
        }
    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
        if (name.find("attn_v.weight") != std::string::npos) {
            if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            ++qs.i_attention_wv;
        }
        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (name.find("ffn_down") != std::string::npos) {
            if (qs.i_ffn_down < qs.n_ffn_down/8) {
                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
            }
            ++qs.i_ffn_down;
        }
        else if (name.find("attn_output.weight") != std::string::npos) {
            if (qs.model.hparams.n_expert == 8) {
                new_type = GGML_TYPE_Q5_K;
            } else {
                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
            }
        }
    } else if (name.find("attn_v.weight") != std::string::npos) {
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
        if (qs.model.type == LLM_TYPE_70B) {
            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
            // nearly negligible increase in model size by quantizing this tensor with more bits:
            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
        }
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        ++qs.i_attention_wv;
    } else if (name.find("attn_k.weight") != std::string::npos) {
        if (qs.model.hparams.n_expert == 8) {
            // for the 8-expert model, bumping this to Q8_0 trades just ~128MB
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("attn_q.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = GGML_TYPE_IQ2_S;
        }
    } else if (name.find("ffn_down") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
            if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
            new_type = i_layer < n_layer/8 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
            new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
            new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
                new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
                && qs.has_imatrix && i_layer < n_layer/8) {
            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
        }
        ++qs.i_ffn_down;
    } else if (name.find("attn_output.weight") != std::string::npos) {
        if (arch != LLM_ARCH_FALCON) {
            if (qs.model.hparams.n_expert == 8) {
                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL  ||
                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S   ||
                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                    new_type = GGML_TYPE_Q5_K;
                }
            } else {
                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
            }
        } else {
            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
        }
    }
    else if (name.find("attn_qkv.weight") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
    else if (name.find("ffn_gate") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_gate;
    }
    else if (name.find("ffn_up") != std::string::npos) {
        auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
        int i_layer = info.first, n_layer = info.second;
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
            new_type = GGML_TYPE_IQ3_XXS;
        }
        ++qs.i_ffn_up;
    }
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
    //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
    //}
    bool convert_incompatible_tensor = false;
    {
        const int64_t nx = tensor->ne[0];
        const int64_t ny = tensor->ne[1];
        const int64_t qk_k = ggml_blck_size(new_type);

        if (nx % qk_k != 0) {
            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
            convert_incompatible_tensor = true;
        } else {
            ++qs.n_k_quantized;
        }
    }

    if (convert_incompatible_tensor) {
        switch (new_type) {
            case GGML_TYPE_TQ1_0:
            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
            case GGML_TYPE_IQ2_XXS:
            case GGML_TYPE_IQ2_XS:
            case GGML_TYPE_IQ2_S:
            case GGML_TYPE_IQ3_XXS:
            case GGML_TYPE_IQ3_S:
            case GGML_TYPE_IQ1_S:
            case GGML_TYPE_IQ1_M:
            case GGML_TYPE_Q2_K:
            case GGML_TYPE_Q3_K:
            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
        }
        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
            new_type = GGML_TYPE_F16;
        }
        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
        ++qs.n_fallback;
    }

    return new_type;
}
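
// Quantizes one 2D slice of F32 data into new_data. With nthread >= 2, rows are handed out in
// chunks of `chunk_size / n_per_row` rows via a shared counter protected by a mutex, each chunk
// is validated with ggml_validate_row_data(), and the per-thread byte counts are accumulated
// into new_size. (Summary comment describing the function below.)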
static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
    if (nthread < 2) {
        // single-thread
        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
            throw std::runtime_error("quantized data validation failed");
        }
        return new_size;
    }

    std::mutex mutex;
    int64_t counter = 0;
    size_t new_size = 0;
    bool valid = true;
    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
            nrows, n_per_row, imatrix]() {
        const int64_t nrows_per_chunk = chunk_size / n_per_row;
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int64_t first_row = counter; counter += nrows_per_chunk;
            if (first_row >= nrows) {
                if (local_size > 0) {
                    new_size += local_size;
                }
                break;
            }
            lock.unlock();
            const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
            local_size += this_size;

            // validate the quantized data
            const size_t row_size  = ggml_row_size(new_type, n_per_row);
            void * this_data = (char *) new_data + first_row * row_size;
            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
                std::unique_lock<std::mutex> lock(mutex);
                valid = false;
                break;
            }
        }
    };
    for (int it = 0; it < nthread - 1; ++it) {
        workers.emplace_back(compute);
    }
    compute();
    for (auto & w : workers) { w.join(); }
    workers.clear();
    if (!valid) {
        throw std::runtime_error("quantized data validation failed");
    }
    return new_size;
}
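
// End-to-end quantization of a GGUF file: map the requested ftype to a default ggml type, load
// the input metadata with llama_model_loader, copy and patch the KV pairs, then stream every
// tensor through (optionally) dequantize -> llama_tensor_get_type -> llama_tensor_quantize_impl
// and write the result plus alignment padding to one or more output splits.
// (Summary comment describing the function below.)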
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
    ggml_type default_type;
    llama_ftype ftype = params->ftype;

    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break;
        case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
        case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
        case LLAMA_FTYPE_MOSTLY_F16:  default_type = GGML_TYPE_F16;  break;
        case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
        case LLAMA_FTYPE_ALL_F32:     default_type = GGML_TYPE_F32;  break;

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K_S:
        case LLAMA_FTYPE_MOSTLY_Q2_K:    default_type = GGML_TYPE_Q2_K;    break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  default_type = GGML_TYPE_Q3_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q4_K_S:
        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  default_type = GGML_TYPE_Q4_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  default_type = GGML_TYPE_Q5_K;    break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:    default_type = GGML_TYPE_Q6_K;    break;
        case LLAMA_FTYPE_MOSTLY_TQ1_0:   default_type = GGML_TYPE_TQ1_0;   break;
        case LLAMA_FTYPE_MOSTLY_TQ2_0:   default_type = GGML_TYPE_TQ2_0;   break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_S:   default_type = GGML_TYPE_IQ2_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ2_M:   default_type = GGML_TYPE_IQ2_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break;
        case LLAMA_FTYPE_MOSTLY_IQ1_S:   default_type = GGML_TYPE_IQ1_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ1_M:   default_type = GGML_TYPE_IQ1_M;   break;
        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  default_type = GGML_TYPE_IQ4_NL;  break;
        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  default_type = GGML_TYPE_IQ4_XS;  break;
        case LLAMA_FTYPE_MOSTLY_IQ3_S:   default_type = GGML_TYPE_IQ3_S;   break;
        case LLAMA_FTYPE_MOSTLY_IQ3_M:   default_type = GGML_TYPE_IQ3_S;   break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
    int nthread = params->nthread;

    if (nthread <= 0) {
        nthread = std::thread::hardware_concurrency();
    }

    // mmap consistently increases speed on Linux, and also increases speed on Windows with
    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
#if defined(__linux__) || defined(_WIN32)
    constexpr bool use_mmap = true;
#else
    constexpr bool use_mmap = false;
#endif

    llama_model_kv_override * kv_overrides = nullptr;
    if (params->kv_overrides) {
        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
        kv_overrides = v->data();
    }

    std::vector<std::string> splits = {};
    llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr);
    ml.init_mappings(false); // no prefetching

    llama_model model(llama_model_default_params());

    model.load_arch   (ml);
    model.load_hparams(ml);
    model.load_stats  (ml);

    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
    }
    const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
    if (params->imatrix) {
        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
        if (imatrix_data) {
            LLAMA_LOG_INFO("================================ Have weights data with %d entries\n", int(imatrix_data->size()));
            qs.has_imatrix = true;

            // check imatrix for nans or infs
            for (const auto & kv : *imatrix_data) {
                for (float f : kv.second) {
                    if (!std::isfinite(f)) {
                        throw std::runtime_error(format("imatrix contains non-finite value %f\n", f));
                    }
                }
            }
        }
    }

    const size_t align = GGUF_DEFAULT_ALIGNMENT;
    gguf_context_ptr ctx_out { gguf_init_empty() };

    // copy the KV pairs from the input file
    gguf_set_kv     (ctx_out.get(), ml.meta.get());
    gguf_set_val_u32(ctx_out.get(), "general.quantization_version", GGML_QNT_VERSION); // TODO: use LLM_KV
    gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV

    // Remove split metadata
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str());
    gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());

    if (params->kv_overrides) {
        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
        for (const auto & o : overrides) {
            if (o.key[0] == 0) break;
            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
                // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
                gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
            } else {
                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
            }
        }
    }
    // make a list of weights
    std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
    tensors.reserve(ml.weights_map.size());
    for (const auto & it : ml.weights_map) {
        tensors.push_back(&it.second);
    }

    // keep_split requires that the weights are sorted by split index
    if (params->keep_split) {
        std::sort(tensors.begin(), tensors.end(), [](const llama_model_loader::llama_tensor_weight * a, const llama_model_loader::llama_tensor_weight * b) {
            if (a->idx == b->idx) {
                return a->offs < b->offs;
            }
            return a->idx < b->idx;
        });
    }

    for (const auto * it : tensors) {
        const struct ggml_tensor * tensor = it->tensor;

        const std::string name = ggml_get_name(tensor);

        // TODO: avoid hardcoded tensor names - use the TN_* constants
        if (name.find("attn_v.weight")    != std::string::npos ||
            name.find("attn_qkv.weight")  != std::string::npos ||
            name.find("attn_kv_b.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }
    }

    qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

    // sanity checks for models that have attention layers
    if (qs.n_attention_wv != 0)
    {
        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
        // attention layers have a non-zero number of kv heads
        int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
        if (llama_model_has_encoder(&model)) {
            n_attn_layer *= 3;
        }
        GGML_ASSERT((qs.n_attention_wv == n_attn_layer) && "n_attention_wv is unexpected");
    }
    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<std::thread> workers;
    workers.reserve(nthread);

    int idx = 0;

    std::vector<no_init<uint8_t>> read_data;
    std::vector<no_init<uint8_t>> work;
    std::vector<no_init<float>> f32_conv_buf;

    uint16_t n_split = 1;

    // Assume split index is continuous
    if (params->keep_split) {
        for (const auto * it : tensors) {
            n_split = std::max(uint16_t(it->idx + 1), n_split);
        }
    }
    std::vector<gguf_context_ptr> ctx_outs(n_split);
    ctx_outs[0] = std::move(ctx_out);

    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
        gguf_add_tensor(ctx_outs[i_split].get(), tensor);
    }

    // Set split info if needed
    if (n_split > 1) {
        for (size_t i = 0; i < ctx_outs.size(); ++i) {
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
            gguf_set_val_u16(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
            gguf_set_val_i32(ctx_outs[i].get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
        }
    }
    int cur_split = -1;
    std::ofstream fout;
    auto close_ofstream = [&]() {
        // Write metadata and close file handler
        if (fout.is_open()) {
            fout.seekp(0);
            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split].get()));
            gguf_get_meta_data(ctx_outs[cur_split].get(), data.data());
            fout.write((const char *) data.data(), data.size());
            fout.close();
        }
    };
    auto new_ofstream = [&](int index) {
        cur_split = index;
        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
        std::string fname = fname_out;
        if (params->keep_split) {
            std::vector<char> split_path(llama_path_max(), 0);
            llama_split_path(split_path.data(), split_path.size(), fname_out.c_str(), cur_split, n_split);
            fname = std::string(split_path.data());
        }

        fout = std::ofstream(fname, std::ios::binary);
        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split].get());
        // placeholder for the meta data
        ::zeros(fout, meta_size);
    };
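
    // Note: new_ofstream() writes a zero-filled placeholder for the GGUF metadata; once the final
    // tensor types, sizes and offsets are known, close_ofstream() seeks back to offset 0 and
    // overwrites the placeholder with the real metadata before closing the file.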
    const auto tn = LLM_TN(model.arch);
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
        }

        const std::string name = ggml_get_name(tensor);

        if (!ml.use_mmap) {
            if (read_data.size() < ggml_nbytes(tensor)) {
                read_data.resize(ggml_nbytes(tensor));
            }
            tensor->data = read_data.data();
        }
        ml.load_data_for(tensor);

        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
               ++idx, ml.n_tensors,
               ggml_get_name(tensor),
               llama_format_tensor_shape(tensor).c_str(),
               ggml_type_name(tensor->type));

        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?

        // quantize only 2D and 3D tensors (experts)
        quantize &= (ggml_n_dims(tensor) >= 2);

        // do not quantize norm tensors
        quantize &= name.find("_norm.weight") == std::string::npos;

        quantize &= params->quantize_output_tensor || name != "output.weight";
        quantize &= !params->only_copy;

        // do not quantize expert gating tensors
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;

        // do not quantize positional embeddings and token types (BERT)
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD,    "weight");
        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");

        // do not quantize Mamba's small yet 2D weights
        // NOTE: can't use LLM_TN here because the layer number is not known
        quantize &= name.find("ssm_conv1d.weight") == std::string::npos;

        // do not quantize RWKV's small yet 2D weights
        quantize &= name.find("time_mix_first.weight") == std::string::npos;
        quantize &= name.find("time_mix_w0.weight") == std::string::npos;
        quantize &= name.find("time_mix_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_v0.weight") == std::string::npos;
        quantize &= name.find("time_mix_v1.weight") == std::string::npos;
        quantize &= name.find("time_mix_v2.weight") == std::string::npos;
        quantize &= name.find("time_mix_a0.weight") == std::string::npos;
        quantize &= name.find("time_mix_a1.weight") == std::string::npos;
        quantize &= name.find("time_mix_a2.weight") == std::string::npos;
        quantize &= name.find("time_mix_g1.weight") == std::string::npos;
        quantize &= name.find("time_mix_g2.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
        quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos;

        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

        ggml_type new_type;
        void * new_data;
        size_t new_size;

        if (quantize) {
            new_type = default_type;

            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
                // unless the user specifies a type
                if (params->tensor_types) {
                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                    const std::string tensor_name(tensor->name);
                    for (const auto & [tname, qtype] : tensor_types) {
                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                            if (qtype != new_type) {
                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                                new_type = qtype;
                                break; // if two or more types are specified for the tensor, first match wins
                            }
                        }
                    }
                }
            }
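            // Illustrative example (hypothetical values, not from the original source): passing a
            // std::vector<tensor_quantization>{ { "ffn_down", GGML_TYPE_Q6_K } } via
            // params->tensor_types would force every tensor whose name matches the "ffn_down"
            // regex to Q6_K, overriding the type chosen by llama_tensor_get_type() above.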
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
            }
            if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) {
                new_type = params->output_tensor_type;
            }

            // If we've decided to quantize to the same type the tensor is already
            // in then there's nothing to do.
            quantize = tensor->type != new_type;
        }
        if (!quantize) {
            new_type = tensor->type;
            new_data = tensor->data;
            new_size = ggml_nbytes(tensor);
            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
        } else {
            const int64_t nelements = ggml_nelements(tensor);

            const float * imatrix = nullptr;
            if (imatrix_data) {
                auto it = imatrix_data->find(tensor->name);
                if (it == imatrix_data->end()) {
                    LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
                } else {
                    if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
                        imatrix = it->second.data();
                    } else {
                        LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
                                int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);

                        // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
                        // this is a significant error and it may be good idea to abort the process if this happens,
                        // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
                        // tok_embd should be ignored in this case, since it always causes this warning
                        if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                            throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
                                    int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
                        }
                    }
                }
            }
            if ((new_type == GGML_TYPE_IQ2_XXS ||
                 new_type == GGML_TYPE_IQ2_XS  ||
                 new_type == GGML_TYPE_IQ2_S   ||
                 new_type == GGML_TYPE_IQ1_S   ||
                (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
                (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                LLAMA_LOG_ERROR("\n\n============================================================\n");
                LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
                LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
                LLAMA_LOG_ERROR("============================================================\n\n");
                throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
            }

            float * f32_data;

            if (tensor->type == GGML_TYPE_F32) {
                f32_data = (float *) tensor->data;
            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
            } else {
                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                f32_data = (float *) f32_conv_buf.data();
            }

            LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
            fflush(stdout);

            if (work.size() < (size_t)nelements * 4) {
                work.resize(nelements * 4); // upper bound on size
            }
            new_data = work.data();

            const int64_t n_per_row = tensor->ne[0];
            const int64_t nrows     = tensor->ne[1];

            static const int64_t min_chunk_size = 32 * 512;
            const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

            const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
            const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
            const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
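
            // Illustrative example (hypothetical shape): for a 4096 x 4096 tensor, n_per_row = 4096
            // is below min_chunk_size = 16384, so chunk_size = 4096 * 4 = 16384 elements (4 rows per
            // chunk) and nchunk = 4096*4096 / 16384 = 1024 chunks spread across the worker threads.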
            // quantize each expert separately since they have different importance matrices
            new_size = 0;
            for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
                const float * f32_data_03 = f32_data + i03 * nelements_matrix;
                void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;

                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
            }
            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
        }
        total_size_org += ggml_nbytes(tensor);
        total_size_new += new_size;

        // update the gguf meta data as we go
        gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);

        // write tensor data + padding
        fout.write((const char *) new_data, new_size);
        zeros(fout, GGML_PAD(new_size, align) - new_size);
    }
    close_ofstream();

    LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);

    if (qs.n_fallback > 0) {
        LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
                __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
    }
}
//
// interface implementation
//

llama_model_quantize_params llama_model_quantize_default_params() {
    llama_model_quantize_params result = {
        /*.nthread                =*/ 0,
        /*.ftype                  =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type     =*/ GGML_TYPE_COUNT,
        /*.token_embedding_type   =*/ GGML_TYPE_COUNT,
        /*.allow_requantize       =*/ false,
        /*.quantize_output_tensor =*/ true,
        /*.only_copy              =*/ false,
        /*.pure                   =*/ false,
        /*.keep_split             =*/ false,
        /*.imatrix                =*/ nullptr,
        /*.kv_overrides           =*/ nullptr,
        /*.tensor_type            =*/ nullptr,
    };

    return result;
}
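
// Minimal usage sketch (illustrative; file names are hypothetical):
//
//   llama_model_quantize_params qparams = llama_model_quantize_default_params();
//   qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
//   qparams.nthread = 8;
//   if (llama_model_quantize("model-f16.gguf", "model-Q4_K_M.gguf", &qparams) != 0) {
//       // quantization failed; the error has already been logged
//   }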
uint32_t llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        const llama_model_quantize_params * params) {
    try {
        llama_model_quantize_impl(fname_inp, fname_out, params);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
        return 1;
    }

    return 0;
}