// llama-arch.h

#pragma once

#include "ggml.h" // ggml_op

#include <string>

//
// gguf constants (sync with gguf.py)
//

enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_LLAMA4,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GROK,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_NOMIC_BERT_MOE,
    LLM_ARCH_NEO_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_QWEN3,
    LLM_ARCH_QWEN3MOE,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
    LLM_ARCH_MINICPM3,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
    LLM_ARCH_GEMMA3,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
    LLM_ARCH_OLMOE,
    LLM_ARCH_OPENELM,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_GLM4,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_RWKV7,
    LLM_ARCH_ARWKV7,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_PLM,
    LLM_ARCH_BAILINGMOE,
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
    LLM_ARCH_UNKNOWN,
};

enum llm_kv {
    LLM_KV_GENERAL_TYPE,
    LLM_KV_GENERAL_ARCHITECTURE,
    LLM_KV_GENERAL_QUANTIZATION_VERSION,
    LLM_KV_GENERAL_ALIGNMENT,
    LLM_KV_GENERAL_FILE_TYPE,
    LLM_KV_GENERAL_NAME,
    LLM_KV_GENERAL_AUTHOR,
    LLM_KV_GENERAL_VERSION,
    LLM_KV_GENERAL_URL,
    LLM_KV_GENERAL_DESCRIPTION,
    LLM_KV_GENERAL_LICENSE,
    LLM_KV_GENERAL_SOURCE_URL,
    LLM_KV_GENERAL_SOURCE_HF_REPO,

    LLM_KV_VOCAB_SIZE,
    LLM_KV_CONTEXT_LENGTH,
    LLM_KV_EMBEDDING_LENGTH,
    LLM_KV_FEATURES_LENGTH,
    LLM_KV_BLOCK_COUNT,
    LLM_KV_LEADING_DENSE_BLOCK_COUNT,
    LLM_KV_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
    LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH,
    LLM_KV_USE_PARALLEL_RESIDUAL,
    LLM_KV_TENSOR_DATA_LAYOUT,
    LLM_KV_EXPERT_COUNT,
    LLM_KV_EXPERT_USED_COUNT,
    LLM_KV_EXPERT_SHARED_COUNT,
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,
    LLM_KV_MOE_EVERY_N_LAYERS,
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
    LLM_KV_ATTN_LOGIT_SOFTCAPPING,
    LLM_KV_FINAL_LOGIT_SOFTCAPPING,
    LLM_KV_SWIN_NORM,
    LLM_KV_RESCALE_EVERY_N_LAYERS,
    LLM_KV_TIME_MIX_EXTRA_DIM,
    LLM_KV_TIME_DECAY_EXTRA_DIM,
    LLM_KV_RESIDUAL_SCALE,
    LLM_KV_EMBEDDING_SCALE,
    LLM_KV_TOKEN_SHIFT_COUNT,
    LLM_KV_INTERLEAVE_MOE_LAYER_STEP,

    LLM_KV_ATTENTION_HEAD_COUNT,
    LLM_KV_ATTENTION_HEAD_COUNT_KV,
    LLM_KV_ATTENTION_MAX_ALIBI_BIAS,
    LLM_KV_ATTENTION_CLAMP_KQV,
    LLM_KV_ATTENTION_KEY_LENGTH,
    LLM_KV_ATTENTION_VALUE_LENGTH,
    LLM_KV_ATTENTION_LAYERNORM_EPS,
    LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
    LLM_KV_ATTENTION_GROUPNORM_EPS,
    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
    LLM_KV_ATTENTION_CAUSAL,
    LLM_KV_ATTENTION_Q_LORA_RANK,
    LLM_KV_ATTENTION_KV_LORA_RANK,
    LLM_KV_ATTENTION_DECAY_LORA_RANK,
    LLM_KV_ATTENTION_ICLR_LORA_RANK,
    LLM_KV_ATTENTION_VALUE_RESIDUAL_MIX_LORA_RANK,
    LLM_KV_ATTENTION_GATE_LORA_RANK,
    LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
    LLM_KV_ATTENTION_SLIDING_WINDOW,
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
    LLM_KV_ATTENTION_LAYER_INDICES,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_SCALE_LINEAR,
    LLM_KV_ROPE_SCALING_TYPE,
    LLM_KV_ROPE_SCALING_FACTOR,
    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
    LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
    LLM_KV_ROPE_SCALING_FINETUNED,
    LLM_KV_ROPE_SCALING_YARN_LOG_MUL,

    LLM_KV_SPLIT_NO,
    LLM_KV_SPLIT_COUNT,
    LLM_KV_SPLIT_TENSORS_COUNT,

    LLM_KV_SSM_INNER_SIZE,
    LLM_KV_SSM_CONV_KERNEL,
    LLM_KV_SSM_STATE_SIZE,
    LLM_KV_SSM_TIME_STEP_RANK,
    LLM_KV_SSM_DT_B_C_RMS,

    LLM_KV_WKV_HEAD_SIZE,

    LLM_KV_TOKENIZER_MODEL,
    LLM_KV_TOKENIZER_PRE,
    LLM_KV_TOKENIZER_LIST,
    LLM_KV_TOKENIZER_TOKEN_TYPE,
    LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
    LLM_KV_TOKENIZER_SCORES,
    LLM_KV_TOKENIZER_MERGES,
    LLM_KV_TOKENIZER_BOS_ID,
    LLM_KV_TOKENIZER_EOS_ID,
    LLM_KV_TOKENIZER_EOT_ID,
    LLM_KV_TOKENIZER_EOM_ID,
    LLM_KV_TOKENIZER_UNK_ID,
    LLM_KV_TOKENIZER_SEP_ID,
    LLM_KV_TOKENIZER_PAD_ID,
    LLM_KV_TOKENIZER_CLS_ID,
    LLM_KV_TOKENIZER_MASK_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
    LLM_KV_TOKENIZER_ADD_PREFIX,
    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
    LLM_KV_TOKENIZER_FIM_PRE_ID,
    LLM_KV_TOKENIZER_FIM_SUF_ID,
    LLM_KV_TOKENIZER_FIM_MID_ID,
    LLM_KV_TOKENIZER_FIM_PAD_ID,
    LLM_KV_TOKENIZER_FIM_REP_ID,
    LLM_KV_TOKENIZER_FIM_SEP_ID,

    LLM_KV_ADAPTER_TYPE,
    LLM_KV_ADAPTER_LORA_ALPHA,

    LLM_KV_POSNET_EMBEDDING_LENGTH,
    LLM_KV_POSNET_BLOCK_COUNT,

    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
    LLM_KV_CONVNEXT_BLOCK_COUNT,

    LLM_KV_CLASSIFIER_OUTPUT_LABELS,

    // deprecated:
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,
};

enum llm_tensor {
    LLM_TENSOR_TOKEN_EMBD,
    LLM_TENSOR_TOKEN_EMBD_NORM,
    LLM_TENSOR_TOKEN_TYPES,
    LLM_TENSOR_POS_EMBD,
    LLM_TENSOR_OUTPUT,
    LLM_TENSOR_OUTPUT_NORM,
    LLM_TENSOR_ROPE_FREQS,
    LLM_TENSOR_ROPE_FACTORS_LONG,
    LLM_TENSOR_ROPE_FACTORS_SHORT,
    LLM_TENSOR_ATTN_Q,
    LLM_TENSOR_ATTN_K,
    LLM_TENSOR_ATTN_V,
    LLM_TENSOR_ATTN_QKV,
    LLM_TENSOR_ATTN_OUT,
    LLM_TENSOR_ATTN_NORM,
    LLM_TENSOR_ATTN_NORM_2,
    LLM_TENSOR_ATTN_OUT_NORM,
    LLM_TENSOR_ATTN_POST_NORM,
    LLM_TENSOR_ATTN_ROT_EMBD,
    LLM_TENSOR_FFN_GATE_INP,
    LLM_TENSOR_FFN_GATE_INP_SHEXP,
    LLM_TENSOR_FFN_NORM,
    LLM_TENSOR_FFN_POST_NORM,
    LLM_TENSOR_FFN_GATE,
    LLM_TENSOR_FFN_DOWN,
    LLM_TENSOR_FFN_UP,
    LLM_TENSOR_FFN_ACT,
    LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
    LLM_TENSOR_FFN_GATE_EXP,
    LLM_TENSOR_FFN_UP_EXP,
    LLM_TENSOR_FFN_NORM_EXPS,
    LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
    LLM_TENSOR_FFN_GATE_EXPS,
    LLM_TENSOR_FFN_UP_EXPS,
    LLM_TENSOR_FFN_DOWN_SHEXP,
    LLM_TENSOR_FFN_GATE_SHEXP,
    LLM_TENSOR_FFN_UP_SHEXP,
    LLM_TENSOR_FFN_EXP_PROBS_B,
    LLM_TENSOR_ATTN_Q_NORM,
    LLM_TENSOR_ATTN_K_NORM,
    LLM_TENSOR_LAYER_OUT_NORM,
    LLM_TENSOR_POST_ATTN_NORM,
    LLM_TENSOR_POST_MLP_NORM,
    LLM_TENSOR_SSM_IN,
    LLM_TENSOR_SSM_CONV1D,
    LLM_TENSOR_SSM_X,
    LLM_TENSOR_SSM_DT,
    LLM_TENSOR_SSM_A,
    LLM_TENSOR_SSM_D,
    LLM_TENSOR_SSM_OUT,
    LLM_TENSOR_TIME_MIX_W0,
    LLM_TENSOR_TIME_MIX_W1,
    LLM_TENSOR_TIME_MIX_W2,
    LLM_TENSOR_TIME_MIX_A0,
    LLM_TENSOR_TIME_MIX_A1,
    LLM_TENSOR_TIME_MIX_A2,
    LLM_TENSOR_TIME_MIX_V0,
    LLM_TENSOR_TIME_MIX_V1,
    LLM_TENSOR_TIME_MIX_V2,
    LLM_TENSOR_TIME_MIX_G1,
    LLM_TENSOR_TIME_MIX_G2,
    LLM_TENSOR_TIME_MIX_K_K,
    LLM_TENSOR_TIME_MIX_K_A,
    LLM_TENSOR_TIME_MIX_R_K,
    LLM_TENSOR_TIME_MIX_LERP_X,
    LLM_TENSOR_TIME_MIX_LERP_W,
    LLM_TENSOR_TIME_MIX_LERP_K,
    LLM_TENSOR_TIME_MIX_LERP_V,
    LLM_TENSOR_TIME_MIX_LERP_R,
    LLM_TENSOR_TIME_MIX_LERP_G,
    LLM_TENSOR_TIME_MIX_LERP_FUSED,
    LLM_TENSOR_TIME_MIX_FIRST,
    LLM_TENSOR_TIME_MIX_DECAY,
    LLM_TENSOR_TIME_MIX_DECAY_W1,
    LLM_TENSOR_TIME_MIX_DECAY_W2,
    LLM_TENSOR_TIME_MIX_KEY,
    LLM_TENSOR_TIME_MIX_VALUE,
    LLM_TENSOR_TIME_MIX_RECEPTANCE,
    LLM_TENSOR_TIME_MIX_GATE,
    LLM_TENSOR_TIME_MIX_LN,
    LLM_TENSOR_TIME_MIX_OUTPUT,
    LLM_TENSOR_CHANNEL_MIX_LERP_K,
    LLM_TENSOR_CHANNEL_MIX_LERP_R,
    LLM_TENSOR_CHANNEL_MIX_KEY,
    LLM_TENSOR_CHANNEL_MIX_RECEPTANCE,
    LLM_TENSOR_CHANNEL_MIX_VALUE,
    LLM_TENSOR_ATTN_Q_A,
    LLM_TENSOR_ATTN_Q_B,
    LLM_TENSOR_ATTN_KV_A_MQA,
    LLM_TENSOR_ATTN_KV_B,
    LLM_TENSOR_ATTN_K_B,
    LLM_TENSOR_ATTN_V_B,
    LLM_TENSOR_ATTN_Q_A_NORM,
    LLM_TENSOR_ATTN_KV_A_NORM,
    LLM_TENSOR_ATTN_SUB_NORM,
    LLM_TENSOR_FFN_SUB_NORM,
    LLM_TENSOR_DEC_ATTN_NORM,
    LLM_TENSOR_DEC_ATTN_Q,
    LLM_TENSOR_DEC_ATTN_K,
    LLM_TENSOR_DEC_ATTN_V,
    LLM_TENSOR_DEC_ATTN_OUT,
    LLM_TENSOR_DEC_ATTN_REL_B,
    LLM_TENSOR_DEC_CROSS_ATTN_NORM,
    LLM_TENSOR_DEC_CROSS_ATTN_Q,
    LLM_TENSOR_DEC_CROSS_ATTN_K,
    LLM_TENSOR_DEC_CROSS_ATTN_V,
    LLM_TENSOR_DEC_CROSS_ATTN_OUT,
    LLM_TENSOR_DEC_CROSS_ATTN_REL_B,
    LLM_TENSOR_DEC_FFN_NORM,
    LLM_TENSOR_DEC_FFN_GATE,
    LLM_TENSOR_DEC_FFN_DOWN,
    LLM_TENSOR_DEC_FFN_UP,
    LLM_TENSOR_DEC_OUTPUT_NORM,
    LLM_TENSOR_ENC_ATTN_NORM,
    LLM_TENSOR_ENC_ATTN_Q,
    LLM_TENSOR_ENC_ATTN_K,
    LLM_TENSOR_ENC_ATTN_V,
    LLM_TENSOR_ENC_ATTN_OUT,
    LLM_TENSOR_ENC_ATTN_REL_B,
    LLM_TENSOR_ENC_FFN_NORM,
    LLM_TENSOR_ENC_FFN_GATE,
    LLM_TENSOR_ENC_FFN_DOWN,
    LLM_TENSOR_ENC_FFN_UP,
    LLM_TENSOR_ENC_OUTPUT_NORM,
    LLM_TENSOR_CLS,
    LLM_TENSOR_CLS_OUT,
    LLM_TENSOR_CONV1D,
    LLM_TENSOR_CONVNEXT_DW,
    LLM_TENSOR_CONVNEXT_NORM,
    LLM_TENSOR_CONVNEXT_PW1,
    LLM_TENSOR_CONVNEXT_PW2,
    LLM_TENSOR_CONVNEXT_GAMMA,
    LLM_TENSOR_POS_NET_CONV1,
    LLM_TENSOR_POS_NET_CONV2,
    LLM_TENSOR_POS_NET_NORM,
    LLM_TENSOR_POS_NET_NORM1,
    LLM_TENSOR_POS_NET_NORM2,
    LLM_TENSOR_POS_NET_ATTN_NORM,
    LLM_TENSOR_POS_NET_ATTN_Q,
    LLM_TENSOR_POS_NET_ATTN_K,
    LLM_TENSOR_POS_NET_ATTN_V,
    LLM_TENSOR_POS_NET_ATTN_OUT,
};

enum llm_tensor_layer {
    LLM_TENSOR_LAYER_INPUT,
    LLM_TENSOR_LAYER_REPEATING,
    LLM_TENSOR_LAYER_OUTPUT,
};

struct LLM_KV {
    LLM_KV(llm_arch arch, const char * suffix = nullptr);

    llm_arch arch;
    const char * suffix;

    std::string operator()(llm_kv kv) const;
};
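//
// usage (illustrative sketch; the exact key strings are defined in the .cpp
// and synced with gguf.py — the results shown below follow the gguf naming
// scheme and should be treated as assumptions):
//
//   const auto kv = LLM_KV(LLM_ARCH_LLAMA);
//
//   kv(LLM_KV_GENERAL_NAME)   -> "general.name"
//   kv(LLM_KV_CONTEXT_LENGTH) -> "llama.context_length"
//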
// helper to handle gguf constants
// usage:
//
//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
//   std::string name = tn(LLM_TENSOR_OUTPUT);                 -> "output"
//   std::string name = tn(LLM_TENSOR_TOKEN_EMBD, "bias");     -> "token_embd.bias"
//   std::string name = tn(LLM_TENSOR_ATTN_NORM, "weight", 3); -> "blk.3.attn_norm.weight"
//
struct LLM_TN_IMPL {
    const llm_arch arch;
    const llm_tensor tensor;
    const char * const suffix;
    const int bid;
    const int xid;

    std::string str() const;

    operator std::string() const {
        return str();
    }

    friend bool operator==(const std::string & str, const LLM_TN_IMPL & tn) {
        return str == tn.str();
    }

    friend bool operator!=(const std::string & str, const LLM_TN_IMPL & tn) {
        return str != tn.str();
    }
};
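// the implicit std::string conversion and the friend operators above let a
// constructed name be compared directly against a loaded tensor name, e.g.
// (illustrative sketch; `cur` is assumed to be a ggml_tensor from the file):
//
//   const auto tn = LLM_TN(LLM_ARCH_LLAMA);
//
//   std::string name = ggml_get_name(cur);
//   if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
//       // found the output projection
//   }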
struct LLM_TN {
    LLM_TN(llm_arch arch) : arch(arch) {}

    llm_arch arch;

    LLM_TN_IMPL operator()(llm_tensor tensor, const char * suffix, int bid = -1, int xid = -1) const {
        return { arch, tensor, suffix, bid, xid };
    }

    LLM_TN_IMPL operator()(llm_tensor tensor, int bid = -1, int xid = -1) const {
        return { arch, tensor, nullptr, bid, xid };
    }
};
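
// per-tensor-type metadata: the layer class a tensor belongs to and the ggml
// op that consumes it (used, e.g., when checking whether a backend supports
// the op before placing the tensor on it)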
struct llm_tensor_info {
    llm_tensor_layer layer;
    ggml_op op;
};

const char * llm_arch_name(llm_arch arch);

llm_arch llm_arch_from_string(const std::string & name);

const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);

bool llm_arch_is_recurrent(const llm_arch & arch);
bool llm_arch_is_hybrid   (const llm_arch & arch);
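
// example (illustrative):
//
//   llm_arch arch = llm_arch_from_string("llama"); // LLM_ARCH_LLAMA
//   const char * name = llm_arch_name(arch);       // "llama"
//
//   // recurrent architectures (e.g. Mamba, RWKV) carry per-sequence state
//   // instead of a conventional KV cache:
//   bool recurrent = llm_arch_is_recurrent(LLM_ARCH_MAMBA); // expected: true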