// imatrix.cpp

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "gguf.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <map>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n %s \\\n"
        " -m model.gguf -f some-text.txt [-o imatrix.gguf] [--process-output] \\\n"
        " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
        " [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] \\\n"
        " [--parse-special]\n" , argv[0]);
    LOG("\n");
}
static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
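
// Per-tensor running statistics:
//  - values: sums of squared activations, flattened as [ne0 * n_mat] (one block of ne0 per expert/matrix)
//  - counts: number of activation rows accumulated for each expert/matrix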
struct Stats {
    std::vector<float>   values;
    std::vector<int64_t> counts;
};
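
// Collects importance-matrix statistics through the ggml backend scheduler eval callback
// and serializes them either in the GGUF-based format or in the legacy binary format.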
class IMatrixCollector {
public:
    IMatrixCollector() = default;
    void set_params(common_params params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
    void save_imatrix_legacy(int32_t ncall = -1) const;
    void save_imatrix(int32_t n_chunk = -1) const;
    bool load_imatrix_legacy(const char * fname);
    bool load_imatrix(const char * file_name);
private:
    std::unordered_map<std::string, Stats> m_stats;
    common_params                          m_params;
    std::mutex                             m_mutex;
    std::vector<std::string>               m_datasets;
    int32_t                                m_last_chunk = 0;
    std::vector<char>                      m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
};
// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
    std::string wname;
    const char * p = strchr(name, '#');
    if (p != NULL) {
        p = p + 1;
        const char * q = strchr(p, '#');
        if (q != NULL) {
            wname = std::string(p, q - p);
        } else {
            wname = p;
        }
    } else {
        wname = name;
    }
    return wname;
}
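
// Eval callback: invoked by the scheduler for every graph node.
// It is called once with ask == true to decide whether the node is interesting,
// and then again with ask == false once the data is available, at which point the
// squared activations of src1 (the input of the matrix multiplication) are accumulated.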
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
    std::string wname = filter_tensor_name(src0->name);

    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
        return true;
    }

    std::lock_guard<std::mutex> lock(m_mutex);

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
        const size_t src1_nbytes = ggml_nbytes(src1);
        m_src1_data.resize(src1_nbytes);
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
    }

    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));

    // TODO: 4d? (is that even used in practice?)
    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
        GGML_ASSERT(false);
    }

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        // ids  -> [n_experts_used, n_tokens]
        // src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor * ids = t->src[2];
        const int64_t n_as = src0->ne[2];
        const int64_t n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!

        GGML_ASSERT(ids->ne[1] == src1->ne[2]);

        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto & e = m_stats[wname];
        if (e.counts.size() == 1 && n_as > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(n_as, e.counts[0]);
        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0]*n_as, 0);
            e.counts.resize(n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_as) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int64_t ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];

            for (int64_t idx = 0; idx < n_ids; ++idx) {
                for (int64_t row = 0; row < src1->ne[2]; ++row) {
                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);

                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check

                    if (excur != ex) continue;

                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
                    const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);

                    e.counts[ex]++;

                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[e_start + j])) {
                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
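            // when this expert's accumulated rows cross a chunk boundary, periodically write the
            // in-progress imatrix; dividing by chunk_step keeps the check firing even when several
            // boundaries (a multiple of --output-frequency / --save-frequency) are crossed at once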
            const int32_t n_chunk = e.counts[ex] / chunk_size;
            if (n_chunk > m_last_chunk) {
                const int32_t chunk_step = n_chunk - m_last_chunk;
                m_last_chunk = n_chunk;
                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                    save_imatrix();
                }
                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                    save_imatrix(m_last_chunk);
                }
            }
        }
    } else {
        auto & e = m_stats[wname];

        const int64_t n_mat = src1->ne[2] * src1->ne[3];

        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_mat, 0);
            e.counts.resize(n_mat, 0);
        }
        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_mat) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
            exit(1); //GGML_ABORT("fatal error");
        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);

        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
                const int64_t mat_id = i3 * src1->ne[2] + i2;
                const int64_t mat_start = mat_id * src1->ne[0];

                for (int64_t row = 0; row < src1->ne[1]; ++row) {
                    // address the row through the byte strides (nb), not the element counts (ne)
                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
                    e.counts[mat_id]++;
                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[mat_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[mat_start + j])) {
                            LOG_ERR("%f detected in %s\n", (float)e.values[mat_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }

                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
                if (n_chunk > m_last_chunk) {
                    const int32_t chunk_step = n_chunk - m_last_chunk;
                    m_last_chunk = n_chunk;
                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                        save_imatrix();
                    }
                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                        save_imatrix(m_last_chunk);
                    }
                }
            }
        }
    }

    return true;
}
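
// Legacy imatrix format (flat binary): n_entries, then for each entry the name length, the name,
// ncall, nval and nval floats (averaged squared activations scaled by ncall), followed by the
// total chunk count and the dataset filename.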
void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
    auto fname = m_params.out_file;

    if (ncall > 0) {
        fname += ".at_";
        fname += std::to_string(ncall);
    }

    // warn when writing imatrix entries that do not have full data
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    int n_entries = 0;
    std::vector<std::string> to_store;

    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
        const int n_all = kv.second.counts.size();

        if (n_all == 0) {
            continue;
        }

        int n_zeros = 0;
        for (const int c : kv.second.counts) {
            if (c == 0) {
                n_zeros++;
            }
        }

        if (n_zeros != 0 && is_first) {
            LOG_INF("\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
        }

        n_entries++;
        to_store.push_back(kv.first);
    }

    if (to_store.size() < m_stats.size()) {
        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

    // deterministic tensor name order
    std::sort(to_store.begin(), to_store.end());

    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    std::ofstream out(fname, std::ios::binary);
    out.write((const char *) &n_entries, sizeof(n_entries));
    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        const int32_t len = name.size();
        out.write((const char *) &len, sizeof(len));
        out.write(name.c_str(), len);
        // ceiling division to avoid accidental zeros
        const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size;
        out.write((const char *) &ncall, sizeof(ncall));
        const int32_t nval = stat.values.size();
        const int32_t nmat = stat.counts.size();
        out.write((const char *) &nval, sizeof(nval));
        if (nval > 0 && nmat > 0) {
            std::vector<float> tmp(nval);
            for (int32_t i = 0; i < nval; i++) {
                float count = static_cast<float>(stat.counts[i / (nval / nmat)]);
                float value = stat.values[i];
                if (count == 0.0f) {
                    // store 1 for partial data
                    value = 1.0f;
                    count = 1.0f;
                }
                tmp[i] = (value / count) * static_cast<float>(ncall);
            }
            out.write((const char *) tmp.data(), nval * sizeof(float));
        }
    }
    // Write the number of calls the matrix was computed with
    out.write((const char *) &m_last_chunk, sizeof(m_last_chunk));

    // Write the input filename at the end of the file to later on specify it in quantize
    {
        const char * dataset_file = m_params.prompt_file.c_str();
        int32_t len = m_params.prompt_file.size();
        // When there is no prompt but there were other imatrix files loaded, use the last dataset
        if (m_params.prompt_file.empty() && !m_datasets.empty()) {
            const std::string & dataset_str = m_datasets[m_datasets.size() - 1];
            dataset_file = dataset_str.c_str();
            len = dataset_str.size();
        }
        out.write((const char *) &len, sizeof(len));
        out.write(dataset_file, len);
    }

    LOGV(1, "\n");
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
}
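
// GGUF-based imatrix format: stores two tensors per entry, <name>.in_sum2 (raw sums of squared activations)
// and <name>.counts (accumulated rows per expert/matrix), plus the dataset paths, chunk count and chunk size
// as KV metadata.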
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;

    // TODO: use the new format in more cases
    if (!string_ends_with(fname, ".gguf")) {
        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
        this->save_imatrix_legacy(n_chunk);
        return;
    }

    if (n_chunk > 0) {
        fname += ".at_";
        fname += std::to_string(n_chunk);
    }

    // write imatrix entries even if they don't have full data. (can be corrected when reading)
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    std::vector<std::string> to_store;
    size_t data_size = 0;

    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
        const int n_all = kv.second.counts.size();

        int n_zeros = 0;
        for (const auto c : kv.second.counts) {
            if (c == 0) {
                n_zeros++;
            }
        }

        if (n_zeros != 0 && is_first) {
            LOG_INF("\n");
            is_first = false;
        }

        if (n_zeros > 0) {
            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
        }

        to_store.push_back(kv.first);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
    }

    // deterministic tensor name order
    std::sort(to_store.begin(), to_store.end());

    struct ggml_init_params params = {
        /* .mem_size   = */ data_size,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct gguf_context * ctx_gguf = gguf_init_empty();

    {
        std::vector<const char *> datasets;
        datasets.reserve(m_datasets.size() + 1);
        for (size_t i = 0; i < m_datasets.size(); ++i) {
            datasets.push_back(m_datasets[i].c_str());
        }
        if (!m_params.prompt_file.empty()) {
            datasets.push_back(m_params.prompt_file.c_str());
        }

        gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
        // Write the dataset paths
        gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size());
        // Write the number of chunks the matrix was computed with
        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
    }

    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        const int32_t nval = (int32_t) stat.values.size();
        const int32_t nmat = (int32_t) stat.counts.size();
        if (nval > 0 && nmat > 0) {
            struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
            struct ggml_tensor * counts  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
            ggml_format_name(in_sum2, "%s.in_sum2", name.c_str());
            ggml_format_name(counts, "%s.counts", name.c_str());

            for (int32_t j = 0; j < nval; ++j) {
                ((float *) in_sum2->data)[j] = (float) stat.values[j];
            }
            for (int32_t j = 0; j < nmat; ++j) {
                ((float *) counts->data)[j] = (float) stat.counts[j];
            }

            gguf_add_tensor(ctx_gguf, in_sum2);
            gguf_add_tensor(ctx_gguf, counts);
        }
    }

    gguf_write_to_file(ctx_gguf, fname.c_str(), false);

    LOGV(1, "\n");
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());

    gguf_free(ctx_gguf);
    ggml_free(ctx);
}
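
// Load statistics from the legacy binary format; the chunk size is not stored in the file,
// so it is assumed to match the current --ctx-size / --parallel settings.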
bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
        LOG_ERR("%s: failed to open %s\n", __func__, fname);
        return false;
    }
    int n_entries;
    in.read((char *) &n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        LOG_ERR("%s: no data in file %s\n", __func__, fname);
        return false;
    }

    // Guess the chunk size because it's not stored in the file
    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    for (int i = 0; i < n_entries; ++i) {
        int32_t len = 0;
        in.read((char *) &len, sizeof(len));
        std::vector<char> name_as_vec(len + 1);
        in.read((char *) name_as_vec.data(), len);
        if (in.fail()) {
            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{ name_as_vec.data() };
        auto & e = m_stats[std::move(name)];
        int32_t ncall = 0;
        in.read((char *) &ncall, sizeof(ncall));
        int32_t nval = 0;
        in.read((char *) &nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
            m_stats = {};
            return false;
        }

        if (e.values.empty()) {
            e.values.resize(nval, 0.0f);
            e.counts.resize(1, 0);
        }

        std::vector<float> tmp(nval);
        in.read((char *) tmp.data(), nval * sizeof(float));
        if (in.fail()) {
            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
            m_stats = {};
            return false;
        }

        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
        for (int i = 0; i < nval; i++) {
            e.values[i] += tmp[i] * chunk_size;
        }
        // The legacy format doesn't distinguish the counts for different experts
        for (size_t j = 0; j < e.counts.size(); ++j) {
            e.counts[j] += ncall * chunk_size;
        }
    }

    {
        // TODO: extract into its own method; this is also used by the GGUF-based format
        // Calculate the last chunk count
        int64_t max_count = 0;
        for (const auto & stats : m_stats) {
            for (int64_t count : stats.second.counts) {
                if (count > max_count) {
                    max_count = count;
                }
            }
        }
        m_last_chunk = max_count / (chunk_size);
    }

    {
        // Read the number of calls the matrix was computed with
        int32_t n_calls;
        in.read((char *) &n_calls, sizeof(n_calls));
        // ignore it because it's not important
    }

    // Read the dataset path to include it when writing to GGUF
    if (!in.fail()) {
        int32_t len = 0;
        in.read((char *) &len, sizeof(len));
        if (!in.fail()) {
            std::vector<char> dataset;
            dataset.resize(len + 1, 0);
            in.read(dataset.data(), len);
            if (!in.fail()) {
                m_datasets.push_back(dataset.data());
            }
        }
    }

    return true;
}
// Using GGUF as the file format, for greater extensibility
bool IMatrixCollector::load_imatrix(const char * file_name) {
    struct ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // the data is needed
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
    if (!ctx_gguf) {
        return this->load_imatrix_legacy(file_name);
    }
    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
    if (n_entries < 1) {
        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
        gguf_free(ctx_gguf);
        ggml_free(ctx);
        return false;
    }

    const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
        m_datasets.reserve(m_datasets.size() + n);
        for (int64_t i = 0; i < n; ++i) {
            m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
        }
    }

    const std::string in_sum2_suffix{ ".in_sum2" };
    const std::string counts_suffix{ ".counts" };

    // Could re-use m_stats instead, but this allows
    // checking for completeness of *each* loaded imatrix file
    // and also makes it easier to re-use a similar implementation in quantize.cpp
    // Using an ordered map to get a deterministic iteration order.
    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name = cur->name;

        if (name.empty()) { continue; }

        if (string_remove_suffix(name, in_sum2_suffix)) {
            // in_sum2
            sums_counts_for[std::move(name)].first = cur;
        } else if (string_remove_suffix(name, counts_suffix)) {
            // counts
            sums_counts_for[std::move(name)].second = cur;
        } else {
            // ignore other tensors
        }
    }

    for (const auto & sc : sums_counts_for) {
        const std::string & name = sc.first;
        const struct ggml_tensor * in_sum2 = sc.second.first;
        const struct ggml_tensor * counts  = sc.second.second;

        if (!in_sum2 || !counts) {
            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        auto & e = m_stats[name];

        int64_t nval = ggml_nelements(in_sum2);
        if (e.values.empty()) {
            e.values.resize(nval, 0.0f);
        } else if ((size_t) nval != e.values.size()) {
            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        int64_t ncounts = ggml_nelements(counts);
        if (e.counts.empty()) {
            e.counts.resize(ncounts, 0);
        } else if (e.counts.size() == 1 && ncounts > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(ncounts, e.counts[0]);
        } else if ((size_t) ncounts != e.counts.size()) {
            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        // Recreate the state as expected by save_imatrix()
        for (int64_t j = 0; j < nval; j++) {
            e.values[j] += ((const float *) in_sum2->data)[j];
        }
        for (int64_t j = 0; j < ncounts; j++) {
            e.counts[j] += std::lround(((const float *) counts->data)[j]);
        }
    }

    // TODO: extract into its own method; this is also used by the legacy format
    // Calculate the last chunk count
    int64_t max_count = 0;
    for (const auto & stats : m_stats) {
        for (int64_t count : stats.second.counts) {
            if (count > max_count) {
                max_count = count;
            }
        }
    }
    m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);

    gguf_free(ctx_gguf);
    ggml_free(ctx);
    return true;
}
static IMatrixCollector g_collector;

static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    return g_collector.collect_imatrix(t, ask, user_data);
}
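
// Helpers for the optional perplexity computation over the evaluated chunks (skipped with --no-ppl)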
struct results_log_softmax {
    double log_softmax;
    float  logit;
    float  prob;
};

static std::vector<float> softmax(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) {
        max_logit = std::max(max_logit, v);
    }
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
        const float logit = logits[i] - max_logit;
        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    for (size_t i = 0; i < probs.size(); i++) {
        probs[i] /= sum_exp;
    }
    return probs;
}

static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        max_logit = std::max(max_logit, logits[i]);
    }
    double sum_exp = 0.0;
    for (int i = 0; i < n_vocab; ++i) {
        sum_exp += expf(logits[i] - max_logit);
    }
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}
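
// Accumulate the negative log-likelihood (and its square, for the error estimate) over a batch of tokens;
// worker threads pull token indices from a shared counter and merge their local sums under the mutex when done.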
static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history) {

    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
        double local_nll  = 0;
        double local_nll2 = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
                nll += local_nll; nll2 += local_nll2;
                break;
            }
            lock.unlock();
            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
            const double v = -results.log_softmax;
            local_nll += v;
            local_nll2 += v*v;

            logit_history[i] = results.logit;
            prob_history[i]  = results.prob;
        }
    };
    for (auto & w : workers) {
        w = std::thread(compute);
    }
    compute();
    for (auto & w : workers) {
        w.join();
    }
}
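
// Tokenize the input text and evaluate it chunk by chunk; the imatrix statistics are gathered by the
// eval callback during llama_decode(), while this function optionally computes perplexity over the same chunks.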
static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);

    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));

    auto tim1 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, params.parse_special);

    auto tim2 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
            LOG_ERR("%s: there will not be enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
    }
    if (int(tokens.size()) < 2*n_ctx) {
        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
        return false;
    }

    std::vector<float> logit_history;
    std::vector<float> prob_history;

    if (params.compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }

    const int n_chunk_max = tokens.size() / n_ctx;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_vocab_n_tokens(vocab);
    const int n_batch = params.n_batch;

    int count = 0;
    double nll = 0.0;
    double nll2 = 0.0;

    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
    const int n_seq = std::max(1, n_batch / n_ctx);

    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);

    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);

    std::vector<float> logits;
    if (params.compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }

    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

    for (int i = 0; i < n_chunk; i += n_seq) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

        const int n_seq_batch = std::min(n_seq, n_chunk - i);

        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
        llama_memory_clear(llama_get_memory(ctx), true);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

            // clear the batch
            common_batch_clear(batch);

            for (int seq = 0; seq < n_seq_batch; seq++) {
                int seq_start = batch_start + seq*n_ctx;

                // save original token and restore it after eval
                const auto token_org = tokens[seq_start];

                // add BOS token for the first batch of each chunk
                if (add_bos && j == 0) {
                    tokens[seq_start] = llama_vocab_bos(vocab);
                }
                for (int k = 0; k < batch_size; ++k) {
                    // NOTE: specifying all logits to get activations for the output.weight tensor
                    //       and also for the perplexity calculation.
                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
                    //       (not possible when this skips FFN computation of the last layer)
                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
                }

                // restore the original token in case it was set to BOS
                tokens[seq_start] = token_org;
            }

            if (llama_decode(ctx, batch)) {
                LOG_ERR("%s : failed to eval\n", __func__);
                llama_batch_free(batch);
                return false;
            }

            if (params.compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }

        if (i == 0) {
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk / n_seq);
            if (total_seconds >= 60*60) {
                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
            LOG("%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            const int first = n_ctx/2;
            for (int seq = 0; seq < n_seq_batch; seq++) {
                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);

                llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;

                process_logits(n_vocab, all_logits + first*n_vocab,
                        tokens_data, n_ctx - 1 - first,
                        workers, nll, nll2,
                        logit_history.data() + start + seq*n_ctx + first,
                        prob_history.data()  + start + seq*n_ctx + first);

                count += n_ctx - first - 1;

                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            }
            fflush(stdout);

            logits.clear();
        }
    }

    LOG("\n");

    if (params.compute_ppl) {
        nll2 /= count;
        nll /= count;
        const double ppl = exp(nll);
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
            LOG("Unexpected negative standard deviation of log(prob)\n");
        }
    }

    llama_batch_free(batch);

    return true;
}
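
// Entry point: parse the arguments, optionally merge previously computed imatrix files,
// then run the model over the prompt with the collection callback installed and save the result.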
int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "imatrix.gguf";

    params.n_ctx = 512;
    params.escape = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
        return 1;
    }

    common_init();

    const int32_t n_ctx = params.n_ctx;

    if (n_ctx <= 0) {
        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }
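
    // when the batch is larger than the context, evaluate several chunks in parallel as separate
    // sequences and size the KV cache for all of them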
    {
        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
        const int32_t n_kv = n_seq * n_ctx;

        params.n_parallel = n_seq;
        params.n_ctx      = n_kv;
        params.n_batch    = std::min(params.n_batch, n_kv);
    }

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.prompt.empty()) {
        LOG_INF("No prompt provided; combining precomputed matrices only.\n");

        if (params.in_files.empty()) {
            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
            return 1;
        }

        if (params.in_files.size() == 1) {
            LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str());
        } else if (params.in_files.size() > 1) {
            LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        }

        g_collector.save_imatrix();

        return 0;
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ik_collect_imatrix;
    params.cb_eval_user_data = NULL;
    params.warmup = false;

    // init
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_model_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    if (!compute_imatrix(ctx, params, n_ctx)) {
        return 1;
    }

    g_collector.save_imatrix();

    LOG("\n");
    llama_perf_context_print(ctx);

    llama_backend_free();

    return 0;
}