// llama-model.cpp

#include "llama-model.h"

#include "llama-impl.h"
#include "llama-model-loader.h"

#include "unicode.h" // TODO: remove

#include <algorithm>
#include <cassert>
#include <functional>
#include <sstream>
#include <stdexcept>

static const size_t kiB = 1024;
static const size_t MiB = 1024*kiB;
static const size_t GiB = 1024*MiB;

const char * llm_type_name(llm_type type) {
    switch (type) {
        case MODEL_14M: return "14M";
        case MODEL_17M: return "17M";
        case MODEL_22M: return "22M";
        case MODEL_33M: return "33M";
        case MODEL_60M: return "60M";
        case MODEL_70M: return "70M";
        case MODEL_80M: return "80M";
        case MODEL_109M: return "109M";
        case MODEL_137M: return "137M";
        case MODEL_160M: return "160M";
        case MODEL_220M: return "220M";
        case MODEL_250M: return "250M";
        case MODEL_270M: return "270M";
        case MODEL_335M: return "335M";
        case MODEL_410M: return "410M";
        case MODEL_450M: return "450M";
        case MODEL_770M: return "770M";
        case MODEL_780M: return "780M";
        case MODEL_0_5B: return "0.5B";
        case MODEL_1B: return "1B";
        case MODEL_1_3B: return "1.3B";
        case MODEL_1_4B: return "1.4B";
        case MODEL_1_5B: return "1.5B";
        case MODEL_1_6B: return "1.6B";
        case MODEL_2B: return "2B";
        case MODEL_2_8B: return "2.8B";
        case MODEL_3B: return "3B";
        case MODEL_4B: return "4B";
        case MODEL_6B: return "6B";
        case MODEL_6_9B: return "6.9B";
        case MODEL_7B: return "7B";
        case MODEL_8B: return "8B";
        case MODEL_9B: return "9B";
        case MODEL_11B: return "11B";
        case MODEL_12B: return "12B";
        case MODEL_13B: return "13B";
        case MODEL_14B: return "14B";
        case MODEL_15B: return "15B";
        case MODEL_16B: return "16B";
        case MODEL_20B: return "20B";
        case MODEL_30B: return "30B";
        case MODEL_32B: return "32B";
        case MODEL_34B: return "34B";
        case MODEL_35B: return "35B";
        case MODEL_40B: return "40B";
        case MODEL_65B: return "65B";
        case MODEL_70B: return "70B";
        case MODEL_236B: return "236B";
        case MODEL_314B: return "314B";
        case MODEL_671B: return "671B";
        case MODEL_SMALL: return "0.1B";
        case MODEL_MEDIUM: return "0.4B";
        case MODEL_LARGE: return "0.8B";
        case MODEL_XL: return "1.5B";
        case MODEL_A1_7B: return "A1.7B";
        case MODEL_A2_7B: return "A2.7B";
        case MODEL_8x7B: return "8x7B";
        case MODEL_8x22B: return "8x22B";
        case MODEL_16x12B: return "16x12B";
        case MODEL_16x3_8B: return "16x3.8B";
        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
        case MODEL_57B_A14B: return "57B.A14B";
        case MODEL_27B: return "27B";
        default: return "?B";
    }
}

static std::string llama_model_ftype_name(llama_ftype ftype) {
    if (ftype & LLAMA_FTYPE_GUESSED) {
        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
    }
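
    // e.g. (LLAMA_FTYPE_MOSTLY_Q4_K_M | LLAMA_FTYPE_GUESSED) prints as
    // "Q4_K - Medium (guessed)": the flag is masked off for the recursive call
    // above and the suffix is appended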
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32: return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";

        default: return "unknown, may not work";
    }
}

static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
    switch (type) {
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
        default: return "unknown";
    }
}

std::string llama_model_arch_name(const llama_model & model) {
    return llm_arch_name(model.arch);
}

std::string llama_model_type_name(const llama_model & model) {
    return llm_type_name(model.type);
}

std::string llama_model_ftype_name(const llama_model & model) {
    return llama_model_ftype_name(model.ftype);
}
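
// helper for select_buft() below: build a throwaway op with `fn` in a no-alloc
// ggml context, attach its sources to a zero-sized buffer of type `buft`, and
// ask device `dev` whether it supports the resulting op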
template<typename F>
static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    ggml_context_ptr ctx { ggml_init(params) };
    if (!ctx) {
        throw std::runtime_error(format("failed to create ggml context"));
    }

    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
    ggml_tensor * op_tensor = fn(ctx.get());
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        if (op_tensor->src[i] != nullptr) {
            assert(op_tensor->src[i]->buffer == nullptr);
            op_tensor->src[i]->buffer = buf.get();
        }
    }

    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);

    return op_supported;
}

template<typename F>
static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
    for (const auto & cur : buft_list) {
        ggml_backend_dev_t cur_dev = cur.first;
        ggml_backend_buffer_type_t cur_buft = cur.second;
        if (buft_supported(cur_buft, cur_dev, fn)) {
            return cur_buft;
        }
    }

    throw std::runtime_error(format("no suitable buffer type found"));
}

ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
    return select_buft(
            *model.dev_layer.at(il).buft_list,
            [&](ggml_context * ctx) {
                ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
                return ggml_add(ctx, cur, layer_dir);
            });
}

struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name) {
    auto it = std::find_if(model.tensors_by_name.begin(), model.tensors_by_name.end(),
            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
                return it.first == name;
            });
    if (it == model.tensors_by_name.end()) {
        return nullptr;
    }

    return it->second;
}
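
// heuristic upper bound on the number of graph nodes: at least 8192, growing
// with the tensor count for very large models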
size_t llama_model_max_nodes(const llama_model & model) {
    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
}

static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE,     "none"     },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR,   "linear"   },
    { LLAMA_ROPE_SCALING_TYPE_YARN,     "yarn"     },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};
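
// reverse lookup into the table above: e.g. "yarn" -> LLAMA_ROPE_SCALING_TYPE_YARN;
// names that do not appear map to LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED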
static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
        if (kv.second == name) {
            return (llama_rope_scaling_type) kv.first;
        }
    }

    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
}

// NOTE: avoid ever using this except for building the token_to_piece caches
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache
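    // first pass: reuse whatever capacity the string already has as the output
    // buffer; a negative return value signals it was too small, with the
    // magnitude giving the required size (handled below)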
    const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        piece.resize(-n_chars);
        int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        piece.resize(n_chars);
    }

    return piece;
}

void llm_load_stats(llama_model_loader & ml, llama_model & model) {
    model.n_elements = ml.n_elements;
    model.n_bytes = ml.n_bytes;
}

void llm_load_arch(llama_model_loader & ml, llama_model & model) {
    model.arch = ml.get_arch();
    if (model.arch == LLM_ARCH_UNKNOWN) {
        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
    }
}

void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
    auto & hparams = model.hparams;

    const gguf_context * ctx = ml.meta.get();

    // get metadata as string
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        enum gguf_type type = gguf_get_kv_type(ctx, i);
        if (type == GGUF_TYPE_ARRAY) {
            continue;
        }
        const char * name = gguf_get_key(ctx, i);
        const std::string value = gguf_kv_to_str(ctx, i);
        model.gguf_kv.emplace(name, value);
    }

    // get general kv
    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

    // get hparams kv
    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);

    // everything past this point is not vocab-related
    if (hparams.vocab_only) {
        return;
    }

    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);

    if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);

        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT,      hparams.posnet.n_layer);

        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT,      hparams.convnext.n_layer);
    }

    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
    if (hparams.n_expert > 0) {
        GGML_ASSERT(hparams.n_expert_used > 0);
    } else {
        GGML_ASSERT(hparams.n_expert_used == 0);
    }

    // zero-out the array hparams
    std::fill(hparams.n_head_arr.begin(),    hparams.n_head_arr.end(),    0);
    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
    std::fill(hparams.n_ff_arr.begin(),      hparams.n_ff_arr.end(),      0);

    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);

    // n_head_kv is optional, default to n_head
    hparams.n_head_kv_arr = hparams.n_head_arr;

    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);

    bool rope_finetuned = false;
    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
    hparams.rope_finetuned = rope_finetuned;

    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);

    // rope_freq_base (optional)
    hparams.rope_freq_base_train = 10000.0f;
    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

    std::string rope_scaling("linear");
    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

    // rope_freq_scale (inverse of the kv) is optional
    float ropescale = 0.0f;
    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
        // try the old key name
        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
    }
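    // the stored kv is the scaling factor, so invert it here: e.g. a rope
    // scaling factor of 4.0 yields rope_freq_scale_train = 0.25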
    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);

    // non-transformer models do not have attention heads
    if (hparams.n_head() > 0) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim
        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);

        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);

        // sanity check for n_rot (optional)
        hparams.n_rot = hparams.n_embd_head_k;
        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
            if (hparams.n_rot != hparams.n_embd_head_k) {
                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
            }
        }
    } else {
        hparams.n_rot = 0;
        hparams.n_embd_head_k = 0;
        hparams.n_embd_head_v = 0;
    }

    using e_model = llm_type; // TMP

    // arch-specific KVs
    switch (model.arch) {
        case LLM_ARCH_LLAMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 8) {
                    switch (hparams.n_layer) {
                        case 32: model.type = e_model::MODEL_8x7B; break;
                        case 56: model.type = e_model::MODEL_8x22B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                } else {
                    switch (hparams.n_layer) {
                        case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
                        case 22: model.type = e_model::MODEL_1B; break;
                        case 26: model.type = e_model::MODEL_3B; break;
                        case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
                        // granite uses a vocab with len 49152
                        case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
                        case 36: model.type = e_model::MODEL_8B; break; // granite
                        case 40: model.type = e_model::MODEL_13B; break;
                        case 48: model.type = e_model::MODEL_34B; break;
                        case 60: model.type = e_model::MODEL_30B; break;
                        case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                }
            } break;
        case LLM_ARCH_DECI:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);

                switch (hparams.n_layer) {
                    case 52: model.type = e_model::MODEL_1B; break;
                    case 40: model.type = e_model::MODEL_2B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MINICPM3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);

                switch (hparams.n_layer) {
                    case 62: model.type = e_model::MODEL_4B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GROK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 64: model.type = e_model::MODEL_314B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_FALCON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 60: model.type = e_model::MODEL_40B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BAICHUAN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                if (model.type == e_model::MODEL_13B) {
                    // TODO: become GGUF KV parameter
                    hparams.f_max_alibi_bias = 8.0f;
                }
            } break;
        case LLM_ARCH_STARCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    case 42: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_15B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_REFACT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_1B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);

                switch (hparams.n_layer) {
                    case 3:
                        model.type = e_model::MODEL_17M; break; // bge-micro
                    case 6:
                        model.type = e_model::MODEL_22M; break; // MiniLM-L6
                    case 12:
                        switch (hparams.n_embd) {
                            case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
                            case 768: model.type = e_model::MODEL_109M; break; // bge-base
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        model.type = e_model::MODEL_335M; break; // bge-large
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_JINA_BERT_V2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                hparams.f_max_alibi_bias = 8.0f;

                switch (hparams.n_layer) {
                    case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NOMIC_BERT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                    model.type = e_model::MODEL_137M;
                }
            } break;
        case LLM_ARCH_BLOOM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 30:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            case 4096: model.type = e_model::MODEL_7B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                // TODO: become GGUF KV parameter
                hparams.f_max_alibi_bias = 8.0f;
            } break;
        case LLM_ARCH_MPT:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_30B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_STABLELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_12B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN2VL:
            {
                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
            }
            // fall through
        case LLM_ARCH_QWEN2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                    case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
                    case 48: model.type = e_model::MODEL_14B; break;
                    case 64: model.type = e_model::MODEL_32B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_QWEN2MOE:
            {
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_A2_7B; break;
                    case 28: model.type = e_model::MODEL_57B_A14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }

                // for backward compatibility; see: https://github.com/ggerganov/llama.cpp/pull/8931
                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
                    hparams.n_swa = 2047;
                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-mini-128k-instruct
                    hparams.n_swa = 262144;
                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-medium-128k-instruct
                    hparams.n_swa = 131072;
                }
                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (!found_swa && hparams.n_swa == 0) {
                    throw std::runtime_error("invalid value for sliding_window");
                }
            } break;
        case LLM_ARCH_PHIMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_16x3_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPT2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 12: model.type = e_model::MODEL_SMALL; break;
                    case 24: model.type = e_model::MODEL_MEDIUM; break;
                    case 36: model.type = e_model::MODEL_LARGE; break;
                    case 48: model.type = e_model::MODEL_XL; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CODESHELL:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 42: model.type = e_model::MODEL_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ORION:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_20B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 18: model.type = e_model::MODEL_2B; break;
                    case 28: model.type = e_model::MODEL_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA2:
            {
                hparams.n_swa = 4096; // default value of gemma 2
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                hparams.attn_soft_cap = true;

                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_2B; break;
                    case 42: model.type = e_model::MODEL_9B; break;
                    case 46: model.type = e_model::MODEL_27B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_STARCODER2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 30: model.type = e_model::MODEL_3B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_15B; break;
                    case 52: model.type = e_model::MODEL_20B; break; // granite
                    case 88: model.type = e_model::MODEL_34B; break; // granite
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MAMBA:
            {
                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);

                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 24:
                        switch (hparams.n_embd) {
                            case 768: model.type = e_model::MODEL_SMALL; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 48:
                        switch (hparams.n_embd) {
                            case 1024: model.type = e_model::MODEL_MEDIUM; break;
                            case 1536: model.type = e_model::MODEL_LARGE; break;
                            case 2048: model.type = e_model::MODEL_XL; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 64:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_XVERSE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    case 80: model.type = e_model::MODEL_65B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_35B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COHERE2:
            {
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DBRX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);

                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_16x12B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);

                switch (hparams.n_layer) {
                    case 22: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_A1_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OPENELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_270M; break;
                    case 20: model.type = e_model::MODEL_450M; break;
                    case 28: model.type = e_model::MODEL_1B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
                switch (hparams.n_layer) {
                    case 6:
                        switch (hparams.n_ff()) {
                            case 512: model.type = e_model::MODEL_14M; break;
                            case 2048: model.type = e_model::MODEL_70M; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: model.type = e_model::MODEL_160M; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 16:
                        switch (hparams.n_ff()) {
                            case 8192: model.type = e_model::MODEL_1B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: model.type = e_model::MODEL_410M; break;
                            case 8192: model.type = e_model::MODEL_1_4B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 32:
                        switch (hparams.n_ff()) {
                            case 10240: model.type = e_model::MODEL_2_8B; break;
                            case 16384: model.type = e_model::MODEL_6_9B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 36:
                        switch (hparams.n_ff()) {
                            case 20480: model.type = e_model::MODEL_12B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 44:
                        switch (hparams.n_ff()) {
                            case 24576: model.type = e_model::MODEL_20B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ARCTIC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                if (hparams.n_expert == 128) {
                    switch (hparams.n_layer) {
                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                } else {
                    model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);

                switch (hparams.n_layer) {
                    case 28: model.type = e_model::MODEL_20B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                bool is_lite = (hparams.n_layer == 27);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                if (!is_lite) {
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                }
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);

                switch (hparams.n_layer) {
                    case 27: model.type = e_model::MODEL_16B; break;
                    case 60: model.type = e_model::MODEL_236B; break;
                    case 61: model.type = e_model::MODEL_671B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHATGLM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: model.type = e_model::MODEL_6B; break;
                    case 40: model.type = e_model::MODEL_9B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BITNET:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);

                uint32_t dec_start_token_id;
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
                    hparams.dec_start_token_id = dec_start_token_id;
                }

                switch (hparams.n_layer) {
                    case 6: model.type = e_model::MODEL_60M; break; // t5-small
                    case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: model.type = e_model::MODEL_220M; break; // t5-base
                            case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: model.type = e_model::MODEL_770M; break; // t5-large
                            case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
                            case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
                            case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
                            case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
                            case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
                model.type = e_model::MODEL_UNKNOWN;
            } break;
        case LLM_ARCH_JAIS:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1_3B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    /* TODO: add variants */
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_4B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);

                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1_6B; break;
                    case 32:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            case 4096: model.type = e_model::MODEL_7B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 61: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_3B; break;
                    // Add additional layer/vocab/etc checks here for other model sizes
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);

                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_34B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
            } break;
        default: throw std::runtime_error("unsupported model architecture");
    }

    model.ftype = ml.ftype;

    if (hparams.f_max_alibi_bias > 0.0f) {
        hparams.use_alibi = true;
    }

    hparams.rope_type = llama_rope_type(&model);
}
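
// reads the tokenizer metadata and populates the vocab: the tokenizer model
// selects the vocab type (SPM/WPM/BPE/UGM/RWKV) and its per-type default
// special tokens; BPE models additionally get merges and a pre-tokenizer type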
void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
    auto & vocab = model.vocab;

    struct gguf_context * ctx = ml.meta.get();

    const auto kv = LLM_KV(model.arch);

    // determine vocab type
    {
        std::string tokenizer_model;
        std::string tokenizer_pre;

        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
            vocab.type = LLAMA_VOCAB_TYPE_NONE;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = LLAMA_TOKEN_NULL;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
            vocab.linefeed_id = LLAMA_TOKEN_NULL;

            // read vocab size from metadata
            if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
                vocab.n_vocab = 0;
                LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
            }
            return;
        }

        if (tokenizer_model == "llama") {
            vocab.type = LLAMA_VOCAB_TYPE_SPM;

            // default special tokens
            vocab.special_bos_id = 1;
            vocab.special_eos_id = 2;
            vocab.special_unk_id = 0;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "bert") {
            vocab.type = LLAMA_VOCAB_TYPE_WPM;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = 100;
            vocab.special_sep_id = 102;
            vocab.special_pad_id = 0;
            vocab.special_cls_id = 101;
            vocab.special_mask_id = 103;
        } else if (tokenizer_model == "gpt2") {
            vocab.type = LLAMA_VOCAB_TYPE_BPE;

            // read bpe merges and populate bpe ranks
            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
            if (merges_keyidx == -1) {
                throw std::runtime_error("cannot find tokenizer merges in model file\n");
            }

            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
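            // each merge is stored as a single "first second" string; the split
            // search starts at index 1, presumably so a first token that is
            // itself a space is not split at position 0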
            for (int i = 0; i < n_merges; i++) {
                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
                GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

                std::string first;
                std::string second;

                const size_t pos = word.find(' ', 1);

                if (pos != std::string::npos) {
                    first = word.substr(0, pos);
                    second = word.substr(pos + 1);
                }

                vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
            }

            // default special tokens
            vocab.special_bos_id = 11;
            vocab.special_eos_id = 11;
            vocab.special_unk_id = LLAMA_TOKEN_NULL;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "t5") {
            vocab.type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = 1;
            vocab.special_unk_id = 2;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = 0;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
                // correct endianness of data in precompiled_charsmap binary blob
                uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
                uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
                for (size_t i = 0; i < xcda_array_size; ++i) {
                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
                }
#endif
  1122. }
  1123. } else if (tokenizer_model == "rwkv") {
  1124. vocab.type = LLAMA_VOCAB_TYPE_RWKV;
  1125. // default special tokens
  1126. vocab.special_bos_id = LLAMA_TOKEN_NULL;
  1127. vocab.special_eos_id = LLAMA_TOKEN_NULL;
  1128. vocab.special_unk_id = LLAMA_TOKEN_NULL;
  1129. vocab.special_sep_id = LLAMA_TOKEN_NULL;
  1130. vocab.special_pad_id = LLAMA_TOKEN_NULL;
  1131. } else {
  1132. throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
  1133. }
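
        // Note: each GGUF merge entry is a single space-separated string, e.g. "Ġ t"
        // yields the pair ("Ġ", "t") with rank i. A minimal sketch of the split
        // performed above (hypothetical helper, not part of this file):
        //
        //     std::pair<std::string, std::string> split_merge(const std::string & word) {
        //         const size_t pos = word.find(' ', 1); // start at 1: the first symbol may itself begin with a space
        //         return { word.substr(0, pos), word.substr(pos + 1) };
        //     }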
        // for now, only BPE models have pre-tokenizers
        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
            if (tokenizer_pre.empty()) {
                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
                LLAMA_LOG_WARN("%s: \n", __func__);
                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
                LLAMA_LOG_WARN("%s: \n", __func__);
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (tokenizer_pre == "default") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (
                    tokenizer_pre == "llama3"    ||
                    tokenizer_pre == "llama-v3"  ||
                    tokenizer_pre == "llama-bpe" ||
                    tokenizer_pre == "falcon3") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (
                    tokenizer_pre == "deepseek-llm") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-coder") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-v3") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "falcon") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
            } else if (
                    tokenizer_pre == "mpt") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
            } else if (
                    tokenizer_pre == "starcoder") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
            } else if (
                    tokenizer_pre == "gpt-2"        ||
                    tokenizer_pre == "phi-2"        ||
                    tokenizer_pre == "jina-es"      ||
                    tokenizer_pre == "jina-de"      ||
                    tokenizer_pre == "gigachat"     ||
                    tokenizer_pre == "jina-v1-en"   ||
                    tokenizer_pre == "jina-v2-es"   ||
                    tokenizer_pre == "jina-v2-de"   ||
                    tokenizer_pre == "jina-v2-code" ||
                    tokenizer_pre == "roberta-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "refact") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
            } else if (
                    tokenizer_pre == "command-r") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "stablelm2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
            } else if (
                    tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
            } else if (
                    tokenizer_pre == "dbrx") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else if (
                    tokenizer_pre == "smaug-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
            } else if (
                    tokenizer_pre == "poro-chat") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "chatglm-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                vocab.special_bos_id = LLAMA_TOKEN_NULL;
            } else if (
                    tokenizer_pre == "viking") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "jais") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
            } else if (
                    tokenizer_pre == "tekken") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
                vocab.tokenizer_clean_spaces = false;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (
                    tokenizer_pre == "smollm") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "codeshell") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
            } else if (
                    tokenizer_pre == "bloom") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
            } else if (
                    tokenizer_pre == "gpt3-finnish") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
            } else if (
                    tokenizer_pre == "exaone") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
            } else if (
                    tokenizer_pre == "chameleon") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                vocab.tokenizer_add_bos = true;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "minerva-7b") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
            } else if (
                    tokenizer_pre == "megrez") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
        } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = true;
            vocab.tokenizer_clean_spaces = false;
            vocab.tokenizer_add_bos = true;
            vocab.tokenizer_add_eos = false;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
            vocab.tokenizer_add_bos = true;
            vocab.tokenizer_add_eos = false;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_bos = false;
            vocab.tokenizer_add_eos = true;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = false;
            vocab.tokenizer_add_bos = false;
            vocab.tokenizer_add_eos = false;
        } else {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }

        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX,      vocab.tokenizer_add_space_prefix,         false);
        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
    }
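
    // The tokenizer flags chosen above control how raw text is framed at encode time.
    // Roughly (illustrative sketch, not the actual tokenizer entry point):
    //
    //     std::vector<llama_token> out;
    //     if (vocab.tokenizer_add_bos) out.push_back(vocab.special_bos_id);
    //     // ... BPE / unigram / wordpiece encoding of the text ...
    //     if (vocab.tokenizer_add_eos) out.push_back(vocab.special_eos_id);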
    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
    if (token_idx == -1) {
        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
    }

    const float * scores = nullptr;
    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
    if (score_idx != -1) {
        scores = (const float *) gguf_get_arr_data(ctx, score_idx);
    }

    const int * toktypes = nullptr;
    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
    if (toktype_idx != -1) {
        toktypes = (const int *) gguf_get_arr_data(ctx, toktype_idx);
    }

    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);

    vocab.n_vocab = n_vocab;
    vocab.id_to_token.resize(n_vocab);

    for (uint32_t i = 0; i < n_vocab; i++) {
        std::string word = gguf_get_arr_str(ctx, token_idx, i);
        if (word.empty()) {
            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
            word = "[EMPTY_" + std::to_string(i) + "]";
        }

        vocab.token_to_id[word] = i;
        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

        auto & token_data = vocab.id_to_token[i];
        token_data.text  = std::move(word);
        token_data.score = scores ? scores[i] : 0.0f;
        token_data.attr  = LLAMA_TOKEN_ATTR_NORMAL;

        if (toktypes) { // TODO: remove, required until per token attributes are available from GGUF file
            switch (toktypes[i]) {
                case LLAMA_TOKEN_TYPE_UNKNOWN:      token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN;      break;
                case LLAMA_TOKEN_TYPE_UNUSED:       token_data.attr = LLAMA_TOKEN_ATTR_UNUSED;       break;
                case LLAMA_TOKEN_TYPE_NORMAL:       token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;       break;
                case LLAMA_TOKEN_TYPE_CONTROL:      token_data.attr = LLAMA_TOKEN_ATTR_CONTROL;      break;
                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
                case LLAMA_TOKEN_TYPE_BYTE:         token_data.attr = LLAMA_TOKEN_ATTR_BYTE;         break;
                case LLAMA_TOKEN_TYPE_UNDEFINED:    token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
                default:                            token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED;    break;
            }
        }
    }
    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

    vocab.init_tokenizer();
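
    // token_to_id and id_to_token are now inverse views of the same vocabulary (the
    // size assert above guarantees no duplicate texts). The invariant, as a sketch:
    //
    //     for (uint32_t id = 0; id < n_vocab; ++id) {
    //         GGML_ASSERT(vocab.token_to_id.at(vocab.id_to_token[id].text) == (int32_t) id);
    //     }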
    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        try {
            vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
        } catch (const std::exception & e) {
            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
            vocab.linefeed_id = vocab.special_pad_id;
        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        vocab.linefeed_id = ids[0];
    } else {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A

        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        if (ids.empty()) {
            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
            vocab.linefeed_id = vocab.special_pad_id;
        } else {
            vocab.linefeed_id = ids[0];
        }
    }
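
    // "\xC4\x8A" is the UTF-8 encoding of U+010A ("Ċ"): the GPT-2 byte-level BPE
    // alphabet remaps non-printable bytes into the range starting at U+0100, so the
    // raw byte 0x0A ('\n') becomes U+010A and tokenizing it recovers the newline id.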
    // special tokens
    {
        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
            { LLM_KV_TOKENIZER_BOS_ID,     vocab.special_bos_id     },
            { LLM_KV_TOKENIZER_EOS_ID,     vocab.special_eos_id     },
            { LLM_KV_TOKENIZER_EOT_ID,     vocab.special_eot_id     },
            { LLM_KV_TOKENIZER_EOM_ID,     vocab.special_eom_id     },
            { LLM_KV_TOKENIZER_UNK_ID,     vocab.special_unk_id     },
            { LLM_KV_TOKENIZER_SEP_ID,     vocab.special_sep_id     },
            { LLM_KV_TOKENIZER_PAD_ID,     vocab.special_pad_id     },
            { LLM_KV_TOKENIZER_CLS_ID,     vocab.special_cls_id     },
            { LLM_KV_TOKENIZER_MASK_ID,    vocab.special_mask_id    },
            { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
            { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
            { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
            { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
            { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
            { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },

            // deprecated
            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
        };

        for (const auto & it : special_token_types) {
            const std::string & key = kv(std::get<0>(it));
            int32_t & id = std::get<1>(it);

            uint32_t new_id;
            if (!ml.get_key(std::get<0>(it), new_id, false)) {
                continue;
            }
            if (new_id >= vocab.id_to_token.size()) {
                LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
                        __func__, key.c_str(), new_id, id);
            } else {
                id = new_id;
            }
        }

        // Handle add_bos_token and add_eos_token
        {
            bool temp = true;

            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
                vocab.tokenizer_add_bos = temp;
            }
            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
                vocab.tokenizer_add_eos = temp;
            }
        }

        // auto-detect special tokens by text
        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
        //       for now, we apply this workaround to find the tokens based on their text
        for (const auto & t : vocab.token_to_id) {
            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
            if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|eot_id|>"
                        || t.first == "<|im_end|>"
                        || t.first == "<|end|>"
                        || t.first == "<end_of_turn>"
                        || t.first == "<|endoftext|>"
                        || t.first == "<EOT>"
                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
                   ) {
                    vocab.special_eot_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find EOM token: "<|eom_id|>"
            if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|eom_id|>"
                   ) {
                    vocab.special_eom_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
            if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_prefix|>" // Qwen
                        || t.first == "<fim-prefix>"
                        || t.first == "<|fim▁begin|>" // DeepSeek
                        || t.first == "<PRE>"
                   ) {
                    vocab.special_fim_pre_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
            if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_suffix|>" // Qwen
                        || t.first == "<fim-suffix>"
                        || t.first == "<|fim▁hole|>" // DeepSeek
                        || t.first == "<SUF>"
                   ) {
                    vocab.special_fim_suf_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
            if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_middle|>" // Qwen
                        || t.first == "<fim-middle>"
                        || t.first == "<|fim▁end|>" // DeepSeek
                        || t.first == "<MID>"
                   ) {
                    vocab.special_fim_mid_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
            if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_pad|>" // Qwen
                        || t.first == "<fim-pad>"
                        || t.first == "<PAD>"
                   ) {
                    vocab.special_fim_pad_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
            if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|fim_repo|>" // Qwen
                        || t.first == "<|repo_name|>"
                        || t.first == "<fim-repo>"
                        || t.first == "<REPO>"
                   ) {
                    vocab.special_fim_rep_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }

            // find FIM_SEP token: "<|file_sep|>"
            if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
                if (false
                        || t.first == "<|file_sep|>" // Qwen
                   ) {
                    vocab.special_fim_sep_id = t.second;
                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                                __func__, t.second, t.first.c_str());
                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                    }
                }
            }
        }
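
        // Each detection block above ends with the same tail: record the id, then
        // force the CONTROL attribute if the model left it unset. A hypothetical
        // helper that would factor this out (sketch only, not used in this file):
        //
        //     auto mark_control = [&](llama_token id) {
        //         if ((vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
        //             vocab.id_to_token[id].attr = LLAMA_TOKEN_ATTR_CONTROL;
        //         }
        //     };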
        // maintain a list of tokens that cause end-of-generation
        // this is currently determined based on the token text, which is obviously not ideal
        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
        vocab.special_eog_ids.clear();

        if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
        }
        if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
        }
        if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
        }

        for (const auto & t : vocab.token_to_id) {
            if (false
                    || t.first == "<|eot_id|>"
                    || t.first == "<|im_end|>"
                    || t.first == "<|end|>"
                    || t.first == "<end_of_turn>"
                    || t.first == "<|endoftext|>"
                    || t.first == "<|eom_id|>"
                    || t.first == "<EOT>"
               ) {
                vocab.special_eog_ids.insert(t.second);
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
                            __func__, t.second, t.first.c_str());
                    vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                }
            } else {
                // token is control, but not marked as EOG -> print a debug log
                if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) && vocab.special_eog_ids.count(t.second) == 0) {
                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
                            __func__, t.second, t.first.c_str());
                }
            }
        }

        // sanity checks
        if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eos_id);
            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }
        if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eot_id);
            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }
        if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
            vocab.special_eog_ids.insert(vocab.special_eom_id);
            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
        }
    }
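
    // Downstream generation loops can stop on any member of special_eog_ids, e.g.
    // (illustrative check, assuming a freshly sampled token `tok`):
    //
    //     const bool is_eog = vocab.special_eog_ids.count(tok) > 0;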
    // build special tokens cache
    {
        for (llama_vocab::id id = 0; id < (llama_vocab::id) n_vocab; ++id) {
            if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
                vocab.cache_special_tokens.push_back(id);
            }
        }

        std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
            [&] (const llama_vocab::id a, const llama_vocab::id b) {
                return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
            }
        );

        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) vocab.cache_special_tokens.size());
    }
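
    // The cache is sorted by descending text length so that special-token matching
    // can scan candidates greedily and always prefer the longest match, e.g.
    // "<|endoftext|>" must be tried before the shorter "<|end|>".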
    // build token to piece cache
    {
        size_t size_cache = 0;

        std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);

        for (uint32_t id = 0; id < n_vocab; ++id) {
            cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
            size_cache += cache_token_to_piece[id].size();
        }

        std::swap(vocab.cache_token_to_piece, cache_token_to_piece);

        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
    }
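
    // With the cache built, detokenizing a single id becomes a plain lookup, roughly
    // (sketch): const std::string & piece = vocab.cache_token_to_piece.at(id);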
    // Handle per token attributes
    // NOTE: Each model customizes per token attributes.
    // NOTE: Per token attributes are missing from the GGUF file.
    // TODO: Extract attributes from GGUF file.
    {
        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
            for (const auto & substr : substrs) {
                if (str.find(substr) != std::string::npos) {
                    return true;
                }
            }
            return false;
        };

        auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
            uint32_t current = vocab.id_to_token.at(id).attr;
            current = value ? (current | attr) : (current & ~attr);
            vocab.id_to_token[id].attr = (llama_token_attr) current;
        };

        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
            _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
        };

        std::string model_name;
        std::string tokenizer_pre;

        ml.get_key(LLM_KV_GENERAL_NAME,  model_name,    false);
        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

        // model name to lowercase
        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
            [] (const std::string::value_type x) {
                // cast to unsigned char to avoid undefined behavior for non-ASCII bytes
                return std::tolower(static_cast<unsigned char>(x));
            }
        );

        // set attributes by model/tokenizer name
        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
            for (auto id : vocab.cache_special_tokens) {
                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
            }
            for (auto token : {"</s>"}) {
                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
            }
            for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
            }
        }
    }
}

void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    const auto & hparams = model.hparams;
    const auto & vocab   = model.vocab;

    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);

    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
        bool is_var = false;

        std::vector<uint32_t> v;
        for (uint32_t i = 0; i < n; ++i) {
            v.push_back(f(i));
            if (v[i] != v[0]) {
                is_var = true;
            }
        }

        std::stringstream ss;

        if (is_var) {
            ss << "[";
            for (uint32_t i = 0; i < n; ++i) {
                ss << v[i];
                if (i < n - 1) {
                    ss << ", ";
                }
            }
            ss << "]";
        } else {
            ss << v[0];
        }

        return ss.str();
    };
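
    // print_f collapses a per-layer function into either a single value or a full
    // per-layer list, e.g. a uniform head count prints as "32" while a varying one
    // prints as "[32, 32, 16, 16]" (illustrative values).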
    // hparams
    LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llm_arch_name(model.arch));
    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) vocab.bpe_ranks.size());
    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);

    if (!hparams.vocab_only) {
        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
        LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
        LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
        LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
        LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
        LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
    }

    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model).c_str());
    LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model).c_str());
    if (ml.n_elements >= 1e12) {
        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
    } else if (ml.n_elements >= 1e9) {
        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
    } else if (ml.n_elements >= 1e6) {
        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
    } else {
        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
    }
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    } else {
        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    }

    // general kv
    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());

    // special tokens
    if (vocab.special_bos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: BOS token = %d '%s'\n",  __func__, vocab.special_bos_id,  vocab.id_to_token[vocab.special_bos_id].text.c_str());  }
    if (vocab.special_eos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: EOS token = %d '%s'\n",  __func__, vocab.special_eos_id,  vocab.id_to_token[vocab.special_eos_id].text.c_str());  }
    if (vocab.special_eot_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: EOT token = %d '%s'\n",  __func__, vocab.special_eot_id,  vocab.id_to_token[vocab.special_eot_id].text.c_str());  }
    if (vocab.special_eom_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: EOM token = %d '%s'\n",  __func__, vocab.special_eom_id,  vocab.id_to_token[vocab.special_eom_id].text.c_str());  }
    if (vocab.special_unk_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: UNK token = %d '%s'\n",  __func__, vocab.special_unk_id,  vocab.id_to_token[vocab.special_unk_id].text.c_str());  }
    if (vocab.special_sep_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: SEP token = %d '%s'\n",  __func__, vocab.special_sep_id,  vocab.id_to_token[vocab.special_sep_id].text.c_str());  }
    if (vocab.special_pad_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: PAD token = %d '%s'\n",  __func__, vocab.special_pad_id,  vocab.id_to_token[vocab.special_pad_id].text.c_str());  }
    if (vocab.special_cls_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: CLS token = %d '%s'\n",  __func__, vocab.special_cls_id,  vocab.id_to_token[vocab.special_cls_id].text.c_str());  }
    if (vocab.special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str()); }

    if (vocab.linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str()); }

    if (vocab.special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str()); }
    if (vocab.special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str()); }
    if (vocab.special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str()); }
    if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str()); }
    if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str()); }
    if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO("%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str()); }

    for (const auto & id : vocab.special_eog_ids) {
        LLAMA_LOG_INFO("%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str());
    }

    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);

    if (model.arch == LLM_ARCH_DEEPSEEK) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
    }

    if (model.arch == LLM_ARCH_DEEPSEEK2) {
        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
    }

    if (model.arch == LLM_ARCH_QWEN2MOE) {
        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
    }

    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
    }
}

//
// interface implementation
//

struct llama_model_params llama_model_default_params() {
    struct llama_model_params result = {
        /*.devices                     =*/ nullptr,
        /*.n_gpu_layers                =*/ 0,
        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ nullptr,
        /*.rpc_servers                 =*/ nullptr,
        /*.progress_callback           =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
        /*.vocab_only                  =*/ false,
        /*.use_mmap                    =*/ true,
        /*.use_mlock                   =*/ false,
        /*.check_tensors               =*/ false,
    };

#ifdef GGML_USE_METAL
    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
    result.n_gpu_layers = 999;
#endif

    return result;
}
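
// Typical usage (sketch; "model.gguf" is a placeholder path, the layer count is
// arbitrary, and llama_load_model_from_file is the loader entry point declared
// in llama.h):
//
//     llama_model_params params = llama_model_default_params();
//     params.n_gpu_layers = 32; // offload 32 layers when a GPU backend is available
//     llama_model * model = llama_load_model_from_file("model.gguf", params);
//     // ... use the model ...
//     llama_model_free(model);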

void llama_free_model(struct llama_model * model) {
    llama_model_free(model);
}

void llama_model_free(struct llama_model * model) {
    delete model;
}

enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
    return model->vocab.type;
}

int32_t llama_n_vocab(const struct llama_model * model) {
    return model->hparams.n_vocab;
}

int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

int32_t llama_n_layer(const struct llama_model * model) {
    return model->hparams.n_layer;
}

int32_t llama_n_head(const struct llama_model * model) {
    return model->hparams.n_head();
}

enum llama_rope_type llama_rope_type(const struct llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_WAVTOKENIZER_DEC:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_PHIMOE:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    return LLAMA_ROPE_TYPE_NONE;
}
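
// In other words: NORM rotates pairs of adjacent head dimensions (x[2*i], x[2*i + 1]),
// NEOX rotates dimensions that sit half the rotary width apart (x[i], x[i + n_rot/2]),
// and MROPE is the multi-section rotary variant used by Qwen2-VL for multimodal
// position encoding.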

float llama_rope_freq_scale_train(const struct llama_model * model) {
    return model->hparams.rope_freq_scale_train;
}

int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
    const auto & it = model->gguf_kv.find(key);
    if (it == model->gguf_kv.end()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}

int32_t llama_model_meta_count(const struct llama_model * model) {
    return (int) model->gguf_kv.size();
}

int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int) model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->first.c_str());
}

int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
    if (i < 0 || i >= (int) model->gguf_kv.size()) {
        if (buf_size > 0) {
            buf[0] = '\0';
        }
        return -1;
    }
    auto it = model->gguf_kv.begin();
    std::advance(it, i);
    return snprintf(buf, buf_size, "%s", it->second.c_str());
}
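
// Example: dumping all GGUF metadata of a loaded model (sketch; fixed-size buffers,
// truncation ignored for brevity):
//
//     char key[256];
//     char val[256];
//     for (int32_t i = 0; i < llama_model_meta_count(model); ++i) {
//         llama_model_meta_key_by_index    (model, i, key, sizeof(key));
//         llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
//         printf("%s = %s\n", key, val);
//     }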

int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
    return snprintf(buf, buf_size, "%s %s %s",
            llama_model_arch_name (*model).c_str(),
            llama_model_type_name (*model).c_str(),
            llama_model_ftype_name(*model).c_str());
}

uint64_t llama_model_size(const struct llama_model * model) {
    return model->n_bytes;
}

uint64_t llama_model_n_params(const struct llama_model * model) {
    return model->n_elements;
}

bool llama_model_has_encoder(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5:        return true;
        case LLM_ARCH_T5ENCODER: return true;
        default:                 return false;
    }
}

bool llama_model_has_decoder(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_T5ENCODER: return false;
        default:                 return true;
    }
}

llama_token llama_model_decoder_start_token(const struct llama_model * model) {
    return model->hparams.dec_start_token_id;
}
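
// For encoder-decoder architectures such as T5, a caller first encodes the prompt
// and then seeds decoding with the decoder start token (sketch, assuming a
// llama_context `ctx` and a prepared `batch`):
//
//     if (llama_model_has_encoder(model)) {
//         llama_encode(ctx, batch);
//         const llama_token dec_start = llama_model_decoder_start_token(model);
//         // begin decoding from dec_start ...
//     }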

bool llama_model_is_recurrent(const struct llama_model * model) {
    switch (model->arch) {
        case LLM_ARCH_MAMBA: return true;
        case LLM_ARCH_RWKV6: return true;
        default:             return false;
    }
}