// llama-model.h
  1. #pragma once
  2. #include "llama.h"
  3. #include "llama-arch.h"
  4. #include "llama-graph.h"
  5. #include "llama-hparams.h"
  6. #include "llama-memory.h"
  7. #include "llama-vocab.h"
  8. #include <map>
  9. #include <memory>
  10. #include <string>
  11. #include <unordered_map>
  12. #include <vector>
  13. struct llama_cparams;
  14. struct llama_ubatch;
  15. struct llama_model_loader;
  16. // available models
// available models
// NOTE: plain (non-class) enum — values are implicit and sequential, so the
//       relative order of entries determines their numeric value; append new
//       entries in size order where possible, but do not reorder existing ones.
enum llm_type {
    LLM_TYPE_UNKNOWN,
    LLM_TYPE_14M,
    LLM_TYPE_17M,
    LLM_TYPE_22M,
    LLM_TYPE_33M,
    LLM_TYPE_60M,
    LLM_TYPE_70M,
    LLM_TYPE_80M,
    LLM_TYPE_109M,
    LLM_TYPE_137M,
    LLM_TYPE_140M,
    LLM_TYPE_160M,
    LLM_TYPE_190M,
    LLM_TYPE_220M,
    LLM_TYPE_250M,
    LLM_TYPE_256M,
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_350M,
    LLM_TYPE_360M,
    LLM_TYPE_410M,
    LLM_TYPE_450M,
    LLM_TYPE_475M,
    LLM_TYPE_558M,
    LLM_TYPE_700M,
    LLM_TYPE_770M,
    LLM_TYPE_780M,
    LLM_TYPE_950M,
    LLM_TYPE_0_3B,
    LLM_TYPE_0_5B,
    LLM_TYPE_0_6B,
    LLM_TYPE_1B,
    LLM_TYPE_1_2B,
    LLM_TYPE_1_3B,
    LLM_TYPE_1_4B,
    LLM_TYPE_1_5B,
    LLM_TYPE_1_6B,
    LLM_TYPE_1_7B,
    LLM_TYPE_1_8B,
    LLM_TYPE_2B,
    LLM_TYPE_2_6B,
    LLM_TYPE_2_8B,
    LLM_TYPE_2_9B,
    LLM_TYPE_3B,
    LLM_TYPE_4B,
    LLM_TYPE_6B,
    LLM_TYPE_6_9B,
    LLM_TYPE_7B,
    LLM_TYPE_8B,
    LLM_TYPE_9B,
    LLM_TYPE_11B,
    LLM_TYPE_12B,
    LLM_TYPE_13B,
    LLM_TYPE_14B,
    LLM_TYPE_15B,
    LLM_TYPE_16B,
    LLM_TYPE_20B,
    LLM_TYPE_26B,
    LLM_TYPE_27B,
    LLM_TYPE_30B,
    LLM_TYPE_32B,
    LLM_TYPE_34B,
    LLM_TYPE_35B,
    LLM_TYPE_36B,
    LLM_TYPE_40B,
    LLM_TYPE_65B,
    LLM_TYPE_70B,
    LLM_TYPE_120B,
    LLM_TYPE_142B,
    LLM_TYPE_236B,
    LLM_TYPE_290B,
    LLM_TYPE_314B,
    LLM_TYPE_405B,
    LLM_TYPE_671B,
    LLM_TYPE_SMALL,
    LLM_TYPE_MEDIUM,
    LLM_TYPE_LARGE,
    LLM_TYPE_XL,
    // MoE variants: "AxB" / "NxM" encode active/expert parameter splits
    LLM_TYPE_A1_7B,
    LLM_TYPE_A2_7B,
    LLM_TYPE_8x7B,
    LLM_TYPE_8x22B,
    LLM_TYPE_16x12B,
    LLM_TYPE_16x3_8B,
    LLM_TYPE_10B_128x3_66B,
    LLM_TYPE_57B_A14B,
    LLM_TYPE_17B_16E,  // llama4 Scout
    LLM_TYPE_17B_128E, // llama4 Maverick
    LLM_TYPE_A13B,
    LLM_TYPE_7B_A1B,
    LLM_TYPE_8B_A1B, // lfm2moe
    LLM_TYPE_16B_A1B,
    LLM_TYPE_21B_A3B, // Ernie MoE small
    LLM_TYPE_30B_A3B,
    LLM_TYPE_80B_A3B, // Qwen3 Next
    LLM_TYPE_100B_A6B,
    LLM_TYPE_106B_A12B, // GLM-4.5-Air
    LLM_TYPE_230B_A10B, // Minimax M2
    LLM_TYPE_235B_A22B,
    LLM_TYPE_300B_A47B, // Ernie MoE big
    LLM_TYPE_355B_A32B, // GLM-4.5
    LLM_TYPE_E2B,
    LLM_TYPE_E4B,
};
// human-readable name for a RoPE scaling type (e.g. for logging / model info)
std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
// weights for one PosNet layer: a resnet-style conv pair, a self-attention
// sub-block, and a final normalization; tensors a given model does not use
// stay nullptr (presumably used by audio/codec models — confirm in llama-model.cpp)
struct llama_layer_posnet {
    // resnet
    struct ggml_tensor * norm1   = nullptr;
    struct ggml_tensor * norm1_b = nullptr;

    struct ggml_tensor * conv1   = nullptr;
    struct ggml_tensor * conv1_b = nullptr;

    struct ggml_tensor * norm2   = nullptr;
    struct ggml_tensor * norm2_b = nullptr;

    struct ggml_tensor * conv2   = nullptr;
    struct ggml_tensor * conv2_b = nullptr;

    // attention
    struct ggml_tensor * attn_norm   = nullptr;
    struct ggml_tensor * attn_norm_b = nullptr;

    struct ggml_tensor * attn_q = nullptr;
    struct ggml_tensor * attn_q_b = nullptr;

    struct ggml_tensor * attn_k = nullptr;
    struct ggml_tensor * attn_k_b = nullptr;

    struct ggml_tensor * attn_v = nullptr;
    struct ggml_tensor * attn_v_b = nullptr;

    struct ggml_tensor * attn_o = nullptr;
    struct ggml_tensor * attn_o_b = nullptr;

    // normalize
    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;
};
// weights for one ConvNeXt block: depthwise conv (dw), norm, two pointwise
// projections (pw1/pw2) and a learnable per-channel scale (gamma)
struct llama_layer_convnext {
    struct ggml_tensor * dw   = nullptr;
    struct ggml_tensor * dw_b = nullptr;

    struct ggml_tensor * norm   = nullptr;
    struct ggml_tensor * norm_b = nullptr;

    struct ggml_tensor * pw1   = nullptr;
    struct ggml_tensor * pw1_b = nullptr;

    struct ggml_tensor * pw2   = nullptr;
    struct ggml_tensor * pw2_b = nullptr;

    struct ggml_tensor * gamma = nullptr;
};
// weights for a short-convolution layer: input projection -> conv -> output projection
struct llama_layer_shortconv {
    struct ggml_tensor * in_proj  = nullptr;
    struct ggml_tensor * conv     = nullptr;
    struct ggml_tensor * out_proj = nullptr;
};
// weights for a NextN / multi-token-prediction (MTP) head
struct llama_layer_nextn {
    struct ggml_tensor * eh_proj          = nullptr;
    struct ggml_tensor * embed_tokens     = nullptr;
    struct ggml_tensor * enorm            = nullptr;
    struct ggml_tensor * hnorm            = nullptr;
    struct ggml_tensor * shared_head_head = nullptr;
    struct ggml_tensor * shared_head_norm = nullptr;
};
// per-layer weight tensors; this is a union-of-all-architectures struct:
// each architecture populates only the tensors it needs during load_tensors(),
// the rest stay nullptr
struct llama_layer {
    // normalization
    struct ggml_tensor * attn_norm       = nullptr;
    struct ggml_tensor * attn_norm_b     = nullptr;
    struct ggml_tensor * attn_norm_2     = nullptr;
    struct ggml_tensor * attn_norm_2_b   = nullptr;
    struct ggml_tensor * attn_q_norm     = nullptr;
    struct ggml_tensor * attn_q_norm_b   = nullptr;
    struct ggml_tensor * attn_k_norm     = nullptr;
    struct ggml_tensor * attn_k_norm_b   = nullptr;
    struct ggml_tensor * attn_out_norm   = nullptr;
    struct ggml_tensor * attn_out_norm_b = nullptr;
    struct ggml_tensor * attn_q_a_norm   = nullptr;
    struct ggml_tensor * attn_kv_a_norm  = nullptr;
    struct ggml_tensor * attn_sub_norm   = nullptr;
    struct ggml_tensor * attn_post_norm  = nullptr;
    struct ggml_tensor * ffn_sub_norm    = nullptr;
    struct ggml_tensor * attn_norm_cross = nullptr;
    struct ggml_tensor * attn_norm_enc   = nullptr;
    struct ggml_tensor * ssm_norm        = nullptr;
    struct ggml_tensor * ssm_dt_norm     = nullptr;
    struct ggml_tensor * ssm_b_norm      = nullptr;
    struct ggml_tensor * ssm_c_norm      = nullptr;

    // attention
    struct ggml_tensor * wq        = nullptr;
    struct ggml_tensor * wk        = nullptr;
    struct ggml_tensor * wv        = nullptr;
    struct ggml_tensor * wo        = nullptr;
    struct ggml_tensor * wqkv      = nullptr; // fused QKV projection (alternative to wq/wk/wv)
    struct ggml_tensor * wq_a      = nullptr;
    struct ggml_tensor * wq_b      = nullptr;
    struct ggml_tensor * wkv_a_mqa = nullptr;
    struct ggml_tensor * wkv_b     = nullptr;
    struct ggml_tensor * wk_b      = nullptr;
    struct ggml_tensor * wv_b      = nullptr;
    struct ggml_tensor * wq_cross  = nullptr; // cross-attention (encoder-decoder models)
    struct ggml_tensor * wk_cross  = nullptr;
    struct ggml_tensor * wv_cross  = nullptr;
    struct ggml_tensor * wo_cross  = nullptr;
    struct ggml_tensor * wq_enc    = nullptr; // encoder-side attention
    struct ggml_tensor * wk_enc    = nullptr;
    struct ggml_tensor * wv_enc    = nullptr;
    struct ggml_tensor * wo_enc    = nullptr;
    struct ggml_tensor * wqkv_gate = nullptr;

    // attention bias
    struct ggml_tensor * bq   = nullptr;
    struct ggml_tensor * bk   = nullptr;
    struct ggml_tensor * bv   = nullptr;
    struct ggml_tensor * bo   = nullptr;
    struct ggml_tensor * bqkv = nullptr;

    // relative position bias
    struct ggml_tensor * attn_rel_b       = nullptr;
    struct ggml_tensor * attn_rel_b_enc   = nullptr;
    struct ggml_tensor * attn_rel_b_cross = nullptr;

    // normalization
    struct ggml_tensor * ffn_norm        = nullptr;
    struct ggml_tensor * ffn_norm_b      = nullptr;
    struct ggml_tensor * ffn_post_norm   = nullptr;
    struct ggml_tensor * layer_out_norm  = nullptr;
    struct ggml_tensor * layer_out_norm_b = nullptr;
    struct ggml_tensor * ffn_norm_exps   = nullptr;
    struct ggml_tensor * ffn_norm_enc    = nullptr;

    // ff
    struct ggml_tensor * ffn_gate     = nullptr; // w1
    struct ggml_tensor * ffn_down     = nullptr; // w2
    struct ggml_tensor * ffn_up       = nullptr; // w3
    struct ggml_tensor * ffn_gate_enc = nullptr;
    struct ggml_tensor * ffn_down_enc = nullptr;
    struct ggml_tensor * ffn_up_enc   = nullptr;

    // ff MoE
    struct ggml_tensor * ffn_gate_inp    = nullptr; // expert router
    struct ggml_tensor * ffn_gate_exps   = nullptr;
    struct ggml_tensor * ffn_down_exps   = nullptr;
    struct ggml_tensor * ffn_up_exps     = nullptr;
    struct ggml_tensor * ffn_gate_inp_b  = nullptr;
    struct ggml_tensor * ffn_gate_exps_b = nullptr;
    struct ggml_tensor * ffn_down_exps_b = nullptr;
    struct ggml_tensor * ffn_up_exps_b   = nullptr;

    // ff shared expert (shexp)
    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
    struct ggml_tensor * ffn_gate_shexp     = nullptr;
    struct ggml_tensor * ffn_down_shexp     = nullptr;
    struct ggml_tensor * ffn_up_shexp       = nullptr;

    // ff adjugate experts (chexps)
    struct ggml_tensor * ffn_gate_chexps = nullptr;
    struct ggml_tensor * ffn_down_chexps = nullptr;
    struct ggml_tensor * ffn_up_chexps   = nullptr;

    // ff bias
    struct ggml_tensor * ffn_gate_b      = nullptr;
    struct ggml_tensor * ffn_down_b      = nullptr; // b2
    struct ggml_tensor * ffn_up_b        = nullptr; // b3
    struct ggml_tensor * ffn_act         = nullptr;
    struct ggml_tensor * ffn_exp_probs_b = nullptr;

    // mamba proj
    struct ggml_tensor * ssm_in  = nullptr;
    struct ggml_tensor * ssm_x   = nullptr;
    struct ggml_tensor * ssm_dt  = nullptr;
    struct ggml_tensor * ssm_out = nullptr;

    // mamba
    struct ggml_tensor * ssm_conv1d = nullptr;
    struct ggml_tensor * ssm_a      = nullptr;
    struct ggml_tensor * ssm_d      = nullptr;

    // mamba bias
    struct ggml_tensor * ssm_conv1d_b = nullptr;
    struct ggml_tensor * ssm_dt_b     = nullptr;

    // qwen3next
    struct ggml_tensor * ssm_beta_alpha = nullptr;

    // rwkv
    struct ggml_tensor * time_mix_w1         = nullptr;
    struct ggml_tensor * time_mix_w2         = nullptr;
    struct ggml_tensor * time_mix_lerp_x     = nullptr;
    struct ggml_tensor * time_mix_lerp_w     = nullptr;
    struct ggml_tensor * time_mix_lerp_k     = nullptr;
    struct ggml_tensor * time_mix_lerp_v     = nullptr;
    struct ggml_tensor * time_mix_lerp_r     = nullptr;
    struct ggml_tensor * time_mix_lerp_g     = nullptr;
    struct ggml_tensor * time_mix_lerp_fused = nullptr;

    struct ggml_tensor * time_mix_first        = nullptr;
    struct ggml_tensor * time_mix_decay        = nullptr;
    struct ggml_tensor * time_mix_decay_w1     = nullptr;
    struct ggml_tensor * time_mix_decay_w2     = nullptr;
    struct ggml_tensor * time_mix_key          = nullptr;
    struct ggml_tensor * time_mix_key_b        = nullptr;
    struct ggml_tensor * time_mix_value        = nullptr;
    struct ggml_tensor * time_mix_value_b      = nullptr;
    struct ggml_tensor * time_mix_receptance   = nullptr;
    struct ggml_tensor * time_mix_receptance_b = nullptr;
    struct ggml_tensor * time_mix_gate         = nullptr;

    // rwkv7
    struct ggml_tensor * time_mix_w0   = nullptr;
    struct ggml_tensor * time_mix_a0   = nullptr;
    struct ggml_tensor * time_mix_a1   = nullptr;
    struct ggml_tensor * time_mix_a2   = nullptr;
    struct ggml_tensor * time_mix_v0   = nullptr;
    struct ggml_tensor * time_mix_v1   = nullptr;
    struct ggml_tensor * time_mix_v2   = nullptr;
    struct ggml_tensor * time_mix_g1   = nullptr;
    struct ggml_tensor * time_mix_g2   = nullptr;
    struct ggml_tensor * time_mix_k_k  = nullptr;
    struct ggml_tensor * time_mix_k_a  = nullptr;
    struct ggml_tensor * time_mix_r_k  = nullptr;

    struct ggml_tensor * time_mix_ln     = nullptr;
    struct ggml_tensor * time_mix_ln_b   = nullptr;
    struct ggml_tensor * time_mix_output = nullptr;

    struct ggml_tensor * channel_mix_lerp_k = nullptr;
    struct ggml_tensor * channel_mix_lerp_r = nullptr;

    struct ggml_tensor * channel_mix_key        = nullptr;
    struct ggml_tensor * channel_mix_receptance = nullptr;
    struct ggml_tensor * channel_mix_value      = nullptr;

    // long rope factors
    struct ggml_tensor * rope_long  = nullptr;
    struct ggml_tensor * rope_short = nullptr;
    struct ggml_tensor * rope_freqs = nullptr;

    // bitnet scale
    struct ggml_tensor * wq_scale       = nullptr;
    struct ggml_tensor * wk_scale       = nullptr;
    struct ggml_tensor * wv_scale       = nullptr;
    struct ggml_tensor * wo_scale       = nullptr;
    struct ggml_tensor * ffn_gate_scale = nullptr;
    struct ggml_tensor * ffn_up_scale   = nullptr;
    struct ggml_tensor * ffn_down_scale = nullptr;

    // altup & laurel
    struct ggml_tensor * per_layer_inp_gate  = nullptr;
    struct ggml_tensor * per_layer_proj      = nullptr;
    struct ggml_tensor * per_layer_post_norm = nullptr;
    struct ggml_tensor * altup_correct_coef  = nullptr;
    struct ggml_tensor * altup_correct_scale = nullptr;
    struct ggml_tensor * altup_predict_coef  = nullptr;
    struct ggml_tensor * altup_router        = nullptr;
    struct ggml_tensor * altup_router_norm   = nullptr;
    struct ggml_tensor * laurel_l            = nullptr;
    struct ggml_tensor * laurel_r            = nullptr;
    struct ggml_tensor * laurel_post_norm    = nullptr;

    // openai-moe
    struct ggml_tensor * attn_sinks = nullptr;

    // cogvlm
    struct ggml_tensor * visexp_attn_wqkv = nullptr;
    struct ggml_tensor * visexp_attn_wo   = nullptr;
    struct ggml_tensor * visexp_ffn_gate  = nullptr;
    struct ggml_tensor * visexp_ffn_down  = nullptr;
    struct ggml_tensor * visexp_ffn_up    = nullptr;

    // xIELU activation parameters for Apertus
    struct ggml_tensor * ffn_act_alpha_n = nullptr;
    struct ggml_tensor * ffn_act_alpha_p = nullptr;
    struct ggml_tensor * ffn_act_beta    = nullptr;
    struct ggml_tensor * ffn_act_eps     = nullptr;

    // nested per-architecture sub-layer weight groups
    struct llama_layer_posnet    posnet;
    struct llama_layer_convnext  convnext;
    struct llama_layer_shortconv shortconv;
    struct llama_layer_nextn     nextn;
};
// a loaded model: metadata, vocab, global weight tensors and the per-layer
// weights; populated by the load_*() methods from a llama_model_loader
struct llama_model {
    llm_type type = LLM_TYPE_UNKNOWN;
    llm_arch arch = LLM_ARCH_UNKNOWN;

    std::string name = "n/a";

    llama_hparams hparams = {};
    llama_vocab   vocab;

    // for classifier models
    std::vector<std::string> classifier_labels;

    // global (non per-layer) tensors
    struct ggml_tensor * tok_embd   = nullptr;
    struct ggml_tensor * type_embd  = nullptr;
    struct ggml_tensor * pos_embd   = nullptr;
    struct ggml_tensor * tok_norm   = nullptr;
    struct ggml_tensor * tok_norm_b = nullptr;

    struct ggml_tensor * output_norm     = nullptr;
    struct ggml_tensor * output_norm_b   = nullptr;
    struct ggml_tensor * output          = nullptr;
    struct ggml_tensor * output_b        = nullptr;
    struct ggml_tensor * output_norm_enc = nullptr;

    // classifier
    struct ggml_tensor * cls       = nullptr;
    struct ggml_tensor * cls_b     = nullptr;
    struct ggml_tensor * cls_out   = nullptr;
    struct ggml_tensor * cls_out_b = nullptr;

    struct ggml_tensor * conv1d   = nullptr;
    struct ggml_tensor * conv1d_b = nullptr;

    // gemma3n altup
    struct ggml_tensor * tok_embd_per_layer  = nullptr;
    struct ggml_tensor * altup_proj          = nullptr;
    struct ggml_tensor * altup_unembd_proj   = nullptr;
    struct ggml_tensor * per_layer_model_proj = nullptr;
    struct ggml_tensor * per_layer_proj_norm = nullptr;

    std::vector<llama_layer> layers;

    // Dense linear projections for SentenceTransformers models like embeddinggemma
    // For Sentence Transformers models structure see
    // https://sbert.net/docs/sentence_transformer/usage/custom_models.html#structure-of-sentence-transformer-models
    struct ggml_tensor * dense_2_out_layers = nullptr;
    struct ggml_tensor * dense_3_out_layers = nullptr;

    llama_model_params params;

    // gguf metadata
    std::unordered_map<std::string, std::string> gguf_kv;

    // list of devices used in this model
    std::vector<ggml_backend_dev_t> devices;

    // for quantize-stats only
    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;

    // load timing (microseconds)
    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

    explicit llama_model(const struct llama_model_params & params);
    ~llama_model();

    void load_stats  (llama_model_loader & ml);
    void load_arch   (llama_model_loader & ml);
    void load_hparams(llama_model_loader & ml);
    void load_vocab  (llama_model_loader & ml);
    bool load_tensors(llama_model_loader & ml); // returns false if cancelled by progress_callback

    std::string arch_name() const;
    std::string type_name() const;

    std::string desc() const;

    size_t size() const; // file size
    size_t n_tensors() const;
    size_t n_devices() const;

    std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;

    // total number of parameters in the model
    uint64_t n_elements() const;

    void print_info() const;

    // device/buffer-type selection per layer and for the output
    ggml_backend_dev_t dev_layer(int il) const;
    ggml_backend_dev_t dev_output() const;

    ggml_backend_buffer_type_t select_buft(int il) const;

    bool has_tensor_overrides() const;

    const struct ggml_tensor * get_tensor(const char * name) const;

    // RoPE parameters resolved per layer for the given context params
    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;

    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;

    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory(const llama_memory_params & params, const llama_cparams & cparams) const;

    // TODO: move this to new llm_arch_model_i interface
    ggml_cgraph * build_graph(const llm_graph_params & params) const;

private:
    struct impl;
    std::unique_ptr<impl> pimpl; // hides loader/backend state from this header
};
// human-readable name for an llm_type value
const char * llm_type_name(llm_type type);

// For internal test use
// TODO: remove
const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);