llama-model.cpp 101 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198
  1. #include "llama-model.h"
  2. #include "llama-impl.h"
  3. #include "llama-model-loader.h"
  4. #include "unicode.h" // TODO: remove
  5. #include <algorithm>
  6. #include <cassert>
  7. #include <functional>
  8. #include <sstream>
  9. #include <stdexcept>
  10. static const size_t kiB = 1024;
  11. static const size_t MiB = 1024*kiB;
  12. static const size_t GiB = 1024*MiB;
// Human-readable size label for a model type enum (e.g. MODEL_7B -> "7B").
// "A..B" entries denote MoE active-parameter counts, "NxM" entries denote
// expert-count x expert-size MoE layouts. Returns "?B" for unknown types.
const char * llm_type_name(llm_type type) {
    switch (type) {
        case MODEL_14M: return "14M";
        case MODEL_17M: return "17M";
        case MODEL_22M: return "22M";
        case MODEL_33M: return "33M";
        case MODEL_60M: return "60M";
        case MODEL_70M: return "70M";
        case MODEL_80M: return "80M";
        case MODEL_109M: return "109M";
        case MODEL_137M: return "137M";
        case MODEL_160M: return "160M";
        case MODEL_220M: return "220M";
        case MODEL_250M: return "250M";
        case MODEL_270M: return "270M";
        case MODEL_335M: return "335M";
        case MODEL_410M: return "410M";
        case MODEL_450M: return "450M";
        case MODEL_770M: return "770M";
        case MODEL_780M: return "780M";
        case MODEL_0_5B: return "0.5B";
        case MODEL_1B: return "1B";
        case MODEL_1_3B: return "1.3B";
        case MODEL_1_4B: return "1.4B";
        case MODEL_1_5B: return "1.5B";
        case MODEL_1_6B: return "1.6B";
        case MODEL_2B: return "2B";
        case MODEL_2_8B: return "2.8B";
        case MODEL_3B: return "3B";
        case MODEL_4B: return "4B";
        case MODEL_6B: return "6B";
        case MODEL_6_9B: return "6.9B";
        case MODEL_7B: return "7B";
        case MODEL_8B: return "8B";
        case MODEL_9B: return "9B";
        case MODEL_11B: return "11B";
        case MODEL_12B: return "12B";
        case MODEL_13B: return "13B";
        case MODEL_14B: return "14B";
        case MODEL_15B: return "15B";
        case MODEL_16B: return "16B";
        case MODEL_20B: return "20B";
        case MODEL_30B: return "30B";
        case MODEL_32B: return "32B";
        case MODEL_34B: return "34B";
        case MODEL_35B: return "35B";
        case MODEL_40B: return "40B";
        case MODEL_65B: return "65B";
        case MODEL_70B: return "70B";
        case MODEL_236B: return "236B";
        case MODEL_314B: return "314B";
        case MODEL_671B: return "671B";
        // SMALL/MEDIUM/LARGE/XL are legacy GPT-2-style size classes
        case MODEL_SMALL: return "0.1B";
        case MODEL_MEDIUM: return "0.4B";
        case MODEL_LARGE: return "0.8B";
        case MODEL_XL: return "1.5B";
        case MODEL_A1_7B: return "A1.7B";
        case MODEL_A2_7B: return "A2.7B";
        case MODEL_8x7B: return "8x7B";
        case MODEL_8x22B: return "8x22B";
        case MODEL_16x12B: return "16x12B";
        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
        case MODEL_57B_A14B: return "57B.A14B";
        case MODEL_27B: return "27B";
        default: return "?B";
    }
}
// Human-readable name for a file/quantization type.
// If the LLAMA_FTYPE_GUESSED bit is set, the function recurses on the
// underlying ftype (bit stripped) and appends " (guessed)" to the result.
// bpw = bits per weight.
static std::string llama_model_ftype_name(llama_ftype ftype) {
    if (ftype & LLAMA_FTYPE_GUESSED) {
        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
    }
    switch (ftype) {
        case LLAMA_FTYPE_ALL_F32: return "all F32";
        case LLAMA_FTYPE_MOSTLY_F16: return "F16";
        case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
        case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0";
        case LLAMA_FTYPE_MOSTLY_Q5_1: return "Q5_1";
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
        case LLAMA_FTYPE_MOSTLY_Q4_K_S: return "Q4_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "Q4_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
        case LLAMA_FTYPE_MOSTLY_TQ1_0: return "TQ1_0 - 1.69 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_TQ2_0: return "TQ2_0 - 2.06 bpw ternary";
        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_S: return "IQ1_S - 1.5625 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ1_M: return "IQ1_M - 1.75 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
        // IQ3_M is a mix built on IQ3_S blocks, hence the "IQ3_S mix" label
        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
        default: return "unknown, may not work";
    }
}
  120. static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
  121. switch (type) {
  122. case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
  123. case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
  124. default: return "unknown";
  125. }
  126. }
  127. std::string llama_model_arch_name (const llama_model & model) {
  128. return llm_arch_name(model.arch);
  129. }
  130. std::string llama_model_type_name (const llama_model & model) {
  131. return llm_type_name(model.type);
  132. }
  133. std::string llama_model_ftype_name(const llama_model & model) {
  134. return llama_model_ftype_name(model.ftype);
  135. }
  136. template<typename F>
  137. static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
  138. ggml_init_params params = {
  139. /*.mem_size =*/ ggml_tensor_overhead()*8,
  140. /*.mem_buffer =*/ NULL,
  141. /*.no_alloc =*/ true,
  142. };
  143. ggml_context_ptr ctx { ggml_init(params) };
  144. if (!ctx) {
  145. throw std::runtime_error(format("failed to create ggml context"));
  146. }
  147. ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
  148. ggml_tensor * op_tensor = fn(ctx.get());
  149. for (int i = 0; i < GGML_MAX_SRC; i++) {
  150. if (op_tensor->src[i] != nullptr) {
  151. assert(op_tensor->src[i]->buffer == nullptr);
  152. op_tensor->src[i]->buffer = buf.get();
  153. }
  154. }
  155. bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
  156. return op_supported;
  157. }
  158. template<typename F>
  159. static ggml_backend_buffer_type_t select_buft(const llama_model::buft_list_t & buft_list, const F & fn) {
  160. for (const auto & cur : buft_list) {
  161. ggml_backend_dev_t cur_dev = cur.first;
  162. ggml_backend_buffer_type_t cur_buft = cur.second;
  163. if (buft_supported(cur_buft, cur_dev, fn)) {
  164. return cur_buft;
  165. }
  166. }
  167. throw std::runtime_error(format("no suitable buffer type found"));
  168. }
  169. ggml_backend_buffer_type_t llama_model_select_buft(const llama_model & model, int il) {
  170. return select_buft(
  171. *model.dev_layer.at(il).buft_list,
  172. [&](ggml_context * ctx) {
  173. ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
  174. ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
  175. return ggml_add(ctx, cur, layer_dir);
  176. });
  177. }
  178. struct ggml_tensor * llama_model_get_tensor(const struct llama_model & model, const char * name) {
  179. auto it = std::find_if(model.tensors_by_name.begin(), model.tensors_by_name.end(),
  180. [name](const std::pair<std::string, struct ggml_tensor *> & it) {
  181. return it.first == name;
  182. });
  183. if (it == model.tensors_by_name.end()) {
  184. return nullptr;
  185. }
  186. return it->second;
  187. }
  188. size_t llama_model_max_nodes(const llama_model & model) {
  189. return std::max<size_t>(8192, model.tensors_by_name.size()*5);
  190. }
// Mapping from rope scaling type enum values to the string names used in
// GGUF metadata (see llama_rope_scaling_type_from_string for the reverse).
static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
};
  197. static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
  198. for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
  199. if (kv.second == name) {
  200. return (llama_rope_scaling_type) kv.first;
  201. }
  202. }
  203. return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
  204. }
// NOTE: avoid ever using this except for building the token_to_piece caches
// Convert a single token to its text piece, allocating a std::string.
// Protocol of the underlying C-style llama_token_to_piece overload:
// a non-negative return is the number of chars written; a negative return
// means the buffer was too small and its magnitude is the required size.
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
    std::string piece;
    piece.resize(piece.capacity()); // using string internal cache
    // first attempt: reuse whatever SSO/heap capacity the string already has
    const int n_chars = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
    if (n_chars < 0) {
        // buffer too small: grow to the exact required size and retry
        piece.resize(-n_chars);
        int check = llama_token_to_piece(model, token, &piece[0], piece.size(), 0, special);
        GGML_ASSERT(check == -n_chars);
    }
    else {
        // trim to the number of chars actually written
        piece.resize(n_chars);
    }
    return piece;
}
  220. void llm_load_stats(llama_model_loader & ml, llama_model & model) {
  221. model.n_elements = ml.n_elements;
  222. model.n_bytes = ml.n_bytes;
  223. }
  224. void llm_load_arch(llama_model_loader & ml, llama_model & model) {
  225. model.arch = ml.get_arch();
  226. if (model.arch == LLM_ARCH_UNKNOWN) {
  227. throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
  228. }
  229. }
  230. void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
  231. auto & hparams = model.hparams;
  232. const gguf_context * ctx = ml.meta.get();
  233. // get metadata as string
  234. for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
  235. enum gguf_type type = gguf_get_kv_type(ctx, i);
  236. if (type == GGUF_TYPE_ARRAY) {
  237. continue;
  238. }
  239. const char * name = gguf_get_key(ctx, i);
  240. const std::string value = gguf_kv_to_str(ctx, i);
  241. model.gguf_kv.emplace(name, value);
  242. }
  243. // get general kv
  244. ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
  245. // get hparams kv
  246. ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
  247. // everything past this point is not vocab-related
  248. if (hparams.vocab_only) {
  249. return;
  250. }
  251. ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
  252. ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
  253. ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
  254. ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
  255. ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
  256. if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
  257. ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
  258. ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
  259. ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
  260. ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
  261. ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
  262. }
  263. GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
  264. GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
  265. if (hparams.n_expert > 0) {
  266. GGML_ASSERT(hparams.n_expert_used > 0);
  267. } else {
  268. GGML_ASSERT(hparams.n_expert_used == 0);
  269. }
  270. // zero-out the array hparams
  271. std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
  272. std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
  273. std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
  274. ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
  275. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
  276. // n_head_kv is optional, default to n_head
  277. hparams.n_head_kv_arr = hparams.n_head_arr;
  278. ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
  279. bool rope_finetuned = false;
  280. ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
  281. hparams.rope_finetuned = rope_finetuned;
  282. hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
  283. ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
  284. // rope_freq_base (optional)
  285. hparams.rope_freq_base_train = 10000.0f;
  286. ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
  287. std::string rope_scaling("linear");
  288. ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
  289. hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
  290. GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
  291. // rope_freq_scale (inverse of the kv) is optional
  292. float ropescale = 0.0f;
  293. if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
  294. // try the old key name
  295. ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
  296. }
  297. hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
  298. ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
  299. // non-transformer models do not have attention heads
  300. if (hparams.n_head() > 0) {
  301. // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
  302. // gpt-j n_rot = rotary_dim
  303. hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
  304. ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
  305. hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
  306. ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
  307. // sanity check for n_rot (optional)
  308. hparams.n_rot = hparams.n_embd_head_k;
  309. ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
  310. if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) {
  311. if (hparams.n_rot != hparams.n_embd_head_k) {
  312. throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
  313. }
  314. }
  315. } else {
  316. hparams.n_rot = 0;
  317. hparams.n_embd_head_k = 0;
  318. hparams.n_embd_head_v = 0;
  319. }
  320. using e_model = llm_type; // TMP
  321. // arch-specific KVs
  322. switch (model.arch) {
  323. case LLM_ARCH_LLAMA:
  324. {
  325. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  326. if (hparams.n_expert == 8) {
  327. switch (hparams.n_layer) {
  328. case 32: model.type = e_model::MODEL_8x7B; break;
  329. case 56: model.type = e_model::MODEL_8x22B; break;
  330. default: model.type = e_model::MODEL_UNKNOWN;
  331. }
  332. } else {
  333. switch (hparams.n_layer) {
  334. case 16: model.type = e_model::MODEL_1B; break; // Llama 3.2 1B
  335. case 22: model.type = e_model::MODEL_1B; break;
  336. case 26: model.type = e_model::MODEL_3B; break;
  337. case 28: model.type = e_model::MODEL_3B; break; // Llama 3.2 3B
  338. // granite uses a vocab with len 49152
  339. case 32: model.type = hparams.n_vocab == 49152 ? e_model::MODEL_3B : (hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B); break;
  340. case 36: model.type = e_model::MODEL_8B; break; // granite
  341. case 40: model.type = e_model::MODEL_13B; break;
  342. case 48: model.type = e_model::MODEL_34B; break;
  343. case 60: model.type = e_model::MODEL_30B; break;
  344. case 80: model.type = hparams.n_head() == hparams.n_head_kv() ? e_model::MODEL_65B : e_model::MODEL_70B; break;
  345. default: model.type = e_model::MODEL_UNKNOWN;
  346. }
  347. }
  348. } break;
  349. case LLM_ARCH_DECI:
  350. {
  351. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  352. switch (hparams.n_layer) {
  353. case 32: model.type = e_model::MODEL_7B; break;
  354. case 80: model.type = e_model::MODEL_70B; break;
  355. default: model.type = e_model::MODEL_UNKNOWN;
  356. }
  357. } break;
  358. case LLM_ARCH_MINICPM:
  359. {
  360. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  361. ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
  362. ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
  363. ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
  364. switch (hparams.n_layer) {
  365. case 52: model.type = e_model::MODEL_1B; break;
  366. case 40: model.type = e_model::MODEL_2B; break;
  367. default: model.type = e_model::MODEL_UNKNOWN;
  368. }
  369. } break;
  370. case LLM_ARCH_MINICPM3:
  371. {
  372. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  373. ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
  374. ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
  375. switch (hparams.n_layer) {
  376. case 62: model.type = e_model::MODEL_4B; break;
  377. default: model.type = e_model::MODEL_UNKNOWN;
  378. }
  379. } break;
  380. case LLM_ARCH_GROK:
  381. {
  382. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  383. switch (hparams.n_layer) {
  384. case 64: model.type = e_model::MODEL_314B; break;
  385. default: model.type = e_model::MODEL_UNKNOWN;
  386. }
  387. } break;
  388. case LLM_ARCH_FALCON:
  389. {
  390. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  391. switch (hparams.n_layer) {
  392. case 32: model.type = e_model::MODEL_7B; break;
  393. case 60: model.type = e_model::MODEL_40B; break;
  394. default: model.type = e_model::MODEL_UNKNOWN;
  395. }
  396. } break;
  397. case LLM_ARCH_BAICHUAN:
  398. {
  399. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  400. switch (hparams.n_layer) {
  401. case 32: model.type = e_model::MODEL_7B; break;
  402. case 40: model.type = e_model::MODEL_13B; break;
  403. default: model.type = e_model::MODEL_UNKNOWN;
  404. }
  405. if (model.type == e_model::MODEL_13B) {
  406. // TODO: become GGUF KV parameter
  407. hparams.f_max_alibi_bias = 8.0f;
  408. }
  409. } break;
  410. case LLM_ARCH_STARCODER:
  411. {
  412. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  413. switch (hparams.n_layer) {
  414. case 24: model.type = e_model::MODEL_1B; break;
  415. case 36: model.type = e_model::MODEL_3B; break;
  416. case 42: model.type = e_model::MODEL_7B; break;
  417. case 40: model.type = e_model::MODEL_15B; break;
  418. default: model.type = e_model::MODEL_UNKNOWN;
  419. }
  420. } break;
  421. case LLM_ARCH_REFACT:
  422. {
  423. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  424. switch (hparams.n_layer) {
  425. case 32: model.type = e_model::MODEL_1B; break;
  426. default: model.type = e_model::MODEL_UNKNOWN;
  427. }
  428. // TODO: become GGUF KV parameter
  429. hparams.f_max_alibi_bias = 8.0f;
  430. } break;
  431. case LLM_ARCH_BERT:
  432. {
  433. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  434. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  435. ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
  436. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  437. switch (hparams.n_layer) {
  438. case 3:
  439. model.type = e_model::MODEL_17M; break; // bge-micro
  440. case 6:
  441. model.type = e_model::MODEL_22M; break; // MiniLM-L6
  442. case 12:
  443. switch (hparams.n_embd) {
  444. case 384: model.type = e_model::MODEL_33M; break; // MiniLM-L12, bge-small
  445. case 768: model.type = e_model::MODEL_109M; break; // bge-base
  446. default: model.type = e_model::MODEL_UNKNOWN;
  447. } break;
  448. case 24:
  449. model.type = e_model::MODEL_335M; break; // bge-large
  450. default: model.type = e_model::MODEL_UNKNOWN;
  451. }
  452. } break;
  453. case LLM_ARCH_JINA_BERT_V2:
  454. {
  455. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  456. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  457. ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
  458. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
  459. hparams.f_max_alibi_bias = 8.0f;
  460. switch (hparams.n_layer) {
  461. case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
  462. case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
  463. default: model.type = e_model::MODEL_UNKNOWN;
  464. }
  465. } break;
  466. case LLM_ARCH_NOMIC_BERT:
  467. {
  468. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  469. ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
  470. ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
  471. ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
  472. if (hparams.n_layer == 12 && hparams.n_embd == 768) {
  473. model.type = e_model::MODEL_137M;
  474. }
  475. } break;
  476. case LLM_ARCH_BLOOM:
  477. {
  478. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  479. switch (hparams.n_layer) {
  480. case 24: model.type = e_model::MODEL_1B; break;
  481. case 30:
  482. switch (hparams.n_embd) {
  483. case 2560: model.type = e_model::MODEL_3B; break;
  484. case 4096: model.type = e_model::MODEL_7B; break;
  485. default: model.type = e_model::MODEL_UNKNOWN;
  486. } break;
  487. default: model.type = e_model::MODEL_UNKNOWN;
  488. }
  489. // TODO: become GGUF KV parameter
  490. hparams.f_max_alibi_bias = 8.0f;
  491. } break;
  492. case LLM_ARCH_MPT:
  493. {
  494. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  495. ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
  496. ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
  497. switch (hparams.n_layer) {
  498. case 32: model.type = e_model::MODEL_7B; break;
  499. case 48: model.type = e_model::MODEL_30B; break;
  500. default: model.type = e_model::MODEL_UNKNOWN;
  501. }
  502. } break;
  503. case LLM_ARCH_STABLELM:
  504. {
  505. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
  506. switch (hparams.n_layer) {
  507. case 24: model.type = e_model::MODEL_1B; break;
  508. case 32: model.type = e_model::MODEL_3B; break;
  509. case 40: model.type = e_model::MODEL_12B; break;
  510. default: model.type = e_model::MODEL_UNKNOWN;
  511. }
  512. } break;
  513. case LLM_ARCH_QWEN:
  514. {
  515. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  516. switch (hparams.n_layer) {
  517. case 32: model.type = e_model::MODEL_7B; break;
  518. case 40: model.type = e_model::MODEL_13B; break;
  519. default: model.type = e_model::MODEL_UNKNOWN;
  520. }
  521. } break;
  522. case LLM_ARCH_QWEN2VL:
  523. {
  524. ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
  525. }
  526. // fall through
  527. case LLM_ARCH_QWEN2:
  528. {
  529. ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
  530. switch (hparams.n_layer) {
  531. case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
  532. case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
  533. case 32: model.type = e_model::MODEL_7B; break;
  534. case 36: model.type = e_model::MODEL_3B; break;
  535. case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
  536. case 48: model.type = e_model::MODEL_14B; break;
  537. case 64: model.type = e_model::MODEL_32B; break;
  538. case 80: model.type = e_model::MODEL_70B; break;
  539. default: model.type = e_model::MODEL_UNKNOWN;
  540. }
  541. } break;
        case LLM_ARCH_QWEN2MOE:
            {
                // MoE feed-forward sizes are optional keys (older GGUFs may omit them)
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_A2_7B; break;
                    case 28: model.type = e_model::MODEL_57B_A14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_PHI3:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
                // sliding-window defaults for GGUFs converted before the key existed;
                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
                    hparams.n_swa = 2047;
                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-mini-128k-instruct
                    hparams.n_swa = 262144;
                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
                    // default value for Phi-3-medium-128k-instruct
                    hparams.n_swa = 131072;
                }
                // a value read from the GGUF (if present) overrides the defaults above
                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                if (!found_swa && hparams.n_swa == 0) {
                    throw std::runtime_error("invalid value for sliding_window");
                }
            } break;
        case LLM_ARCH_PLAMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPT2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 12: model.type = e_model::MODEL_SMALL; break;
                    case 24: model.type = e_model::MODEL_MEDIUM; break;
                    case 36: model.type = e_model::MODEL_LARGE; break;
                    case 48: model.type = e_model::MODEL_XL; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CODESHELL:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 42: model.type = e_model::MODEL_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ORION:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_INTERNLM2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_20B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 18: model.type = e_model::MODEL_2B; break;
                    case 28: model.type = e_model::MODEL_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GEMMA2:
            {
                hparams.n_swa = 4096; // default value of gemma 2
                // optional keys: sliding window and logit soft-capping factors
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
                hparams.attn_soft_cap = true;
                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_2B; break;
                    case 42: model.type = e_model::MODEL_9B; break;
                    case 46: model.type = e_model::MODEL_27B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_STARCODER2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 30: model.type = e_model::MODEL_3B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_15B; break;
                    case 52: model.type = e_model::MODEL_20B; break; // granite
                    case 88: model.type = e_model::MODEL_34B; break; // granite
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_MAMBA:
            {
                // state-space model dimensions (conv kernel, inner/state size, dt rank)
                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // Mamba sizes are distinguished by layer count AND embedding width
                switch (hparams.n_layer) {
                    case 24:
                        switch (hparams.n_embd) {
                            case 768: model.type = e_model::MODEL_SMALL; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 48:
                        switch (hparams.n_embd) {
                            case 1024: model.type = e_model::MODEL_MEDIUM; break;
                            case 1536: model.type = e_model::MODEL_LARGE; break;
                            case 2048: model.type = e_model::MODEL_XL; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 64:
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_XVERSE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    case 80: model.type = e_model::MODEL_65B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COMMAND_R:
            {
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_35B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_COHERE2:
            {
                // sliding window is mandatory for this architecture
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DBRX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
                switch (hparams.n_layer) {
                    case 40: model.type = e_model::MODEL_16x12B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
                switch (hparams.n_layer) {
                    case 22: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 80: model.type = e_model::MODEL_70B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMO2:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OLMOE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_A1_7B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_OPENELM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 16: model.type = e_model::MODEL_270M; break;
                    case 20: model.type = e_model::MODEL_450M; break;
                    case 28: model.type = e_model::MODEL_1B; break;
                    case 36: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GPTNEOX:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
                // Pythia-style sizes share layer counts, so disambiguate by FFN width
                switch (hparams.n_layer) {
                    case 6:
                        switch (hparams.n_ff()) {
                            case 512: model.type = e_model::MODEL_14M; break;
                            case 2048: model.type = e_model::MODEL_70M; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: model.type = e_model::MODEL_160M; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 16:
                        switch (hparams.n_ff()) {
                            case 8192: model.type = e_model::MODEL_1B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: model.type = e_model::MODEL_410M; break;
                            case 8192: model.type = e_model::MODEL_1_4B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 32:
                        switch (hparams.n_ff()) {
                            case 10240: model.type = e_model::MODEL_2_8B; break;
                            case 16384: model.type = e_model::MODEL_6_9B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 36:
                        switch (hparams.n_ff()) {
                            case 20480: model.type = e_model::MODEL_12B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 44:
                        switch (hparams.n_ff()) {
                            case 24576: model.type = e_model::MODEL_20B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_ARCTIC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                // only the 128-expert configuration is recognized
                if (hparams.n_expert == 128) {
                    switch (hparams.n_layer) {
                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
                        default: model.type = e_model::MODEL_UNKNOWN;
                    }
                } else {
                    model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                switch (hparams.n_layer) {
                    case 28: model.type = e_model::MODEL_20B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_DEEPSEEK2:
            {
                // the "lite" variant (27 layers) has no Q LoRA projection
                bool is_lite = (hparams.n_layer == 27);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                if (!is_lite) {
                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                }
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
                    // that have no expert_gating_func model parameter set
                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
                }
                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
                switch (hparams.n_layer) {
                    case 27: model.type = e_model::MODEL_16B; break;
                    case 60: model.type = e_model::MODEL_236B; break;
                    case 61: model.type = e_model::MODEL_671B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHATGLM:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 28: model.type = e_model::MODEL_6B; break;
                    case 40: model.type = e_model::MODEL_9B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_BITNET:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 26: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
                // optional decoder start token (encoder-decoder models)
                uint32_t dec_start_token_id;
                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
                    hparams.dec_start_token_id = dec_start_token_id;
                }
                // T5 and FLAN-T5 variants are distinguished by layer count + FFN width
                switch (hparams.n_layer) {
                    case 6: model.type = e_model::MODEL_60M; break; // t5-small
                    case 8: model.type = e_model::MODEL_80M; break; // flan-t5-small
                    case 12:
                        switch (hparams.n_ff()) {
                            case 3072: model.type = e_model::MODEL_220M; break; // t5-base
                            case 2048: model.type = e_model::MODEL_250M; break; // flan-t5-base
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 24:
                        switch (hparams.n_ff()) {
                            case 4096: model.type = e_model::MODEL_770M; break; // t5-large
                            case 2816: model.type = e_model::MODEL_780M; break; // flan-t5-large
                            case 16384: model.type = e_model::MODEL_3B; break; // t5-3b
                            case 5120: model.type = e_model::MODEL_3B; break; // flan-t5-xl
                            case 65536: model.type = e_model::MODEL_11B; break; // t5-11b
                            case 10240: model.type = e_model::MODEL_11B; break; // flan-t5-xxl
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_T5ENCODER:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
                // no size classification for encoder-only T5
                model.type = e_model::MODEL_UNKNOWN;
            } break;
        case LLM_ARCH_JAIS:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1_3B; break;
                    case 40: model.type = e_model::MODEL_13B; break;
                    /* TODO: add variants */
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_NEMOTRON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_4B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_EXAONE:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_8B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_RWKV6:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
                ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
                ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
                ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
                switch (hparams.n_layer) {
                    case 24: model.type = e_model::MODEL_1_6B; break;
                    case 32:
                        // 32 layers is shared by the 3B and 7B variants
                        switch (hparams.n_embd) {
                            case 2560: model.type = e_model::MODEL_3B; break;
                            case 4096: model.type = e_model::MODEL_7B; break;
                            default: model.type = e_model::MODEL_UNKNOWN;
                        } break;
                    case 61: model.type = e_model::MODEL_14B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
            {
                // Granite applies extra multipliers on logits/residual/embeddings/attention
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
                ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_3B; break;
                    case 40: model.type = e_model::MODEL_3B; break;
                    // Add additional layer/vocab/etc checks here for other model sizes
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_CHAMELEON:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
                ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_7B; break;
                    case 48: model.type = e_model::MODEL_34B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
            } break;
        case LLM_ARCH_WAVTOKENIZER_DEC:
            {
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
            } break;
        default: throw std::runtime_error("unsupported model architecture");
    }

    model.ftype = ml.ftype;

    // any positive max ALiBi bias implies the model uses ALiBi positional bias
    if (hparams.f_max_alibi_bias > 0.0f) {
        hparams.use_alibi = true;
    }

    hparams.rope_type = llama_rope_type(&model);
}
// Load the tokenizer/vocabulary from GGUF metadata into model.vocab.
// NOTE: definition continues past the visible chunk; this span covers the
// vocab-type detection based on the "tokenizer.ggml.model" key.
void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
    auto & vocab = model.vocab;

    struct gguf_context * ctx = ml.meta.get();

    const auto kv = LLM_KV(model.arch);

    // determine vocab type
    {
        std::string tokenizer_model;
        std::string tokenizer_pre;

        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);

        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
            // model ships without a tokenizer — clear all special tokens
            vocab.type = LLAMA_VOCAB_TYPE_NONE;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = LLAMA_TOKEN_NULL;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
            vocab.linefeed_id = LLAMA_TOKEN_NULL;

            // read vocab size from metadata
            if (!ml.get_key(LLM_KV_VOCAB_SIZE, vocab.n_vocab, false)) {
                vocab.n_vocab = 0;
                LLAMA_LOG_WARN("%s: there is no vocab_size in metadata, vocab.n_vocab will be set to %u\n", __func__, vocab.n_vocab);
            }
            return;
        }

        if (tokenizer_model == "llama") {
            // SentencePiece (SPM) tokenizer
            vocab.type = LLAMA_VOCAB_TYPE_SPM;

            // default special tokens
            vocab.special_bos_id = 1;
            vocab.special_eos_id = 2;
            vocab.special_unk_id = 0;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "bert") {
            // WordPiece (WPM) tokenizer — BERT-style default token ids
            vocab.type = LLAMA_VOCAB_TYPE_WPM;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = 100;
            vocab.special_sep_id = 102;
            vocab.special_pad_id = 0;
            vocab.special_cls_id = 101;
            vocab.special_mask_id = 103;
        } else if (tokenizer_model == "gpt2") {
            vocab.type = LLAMA_VOCAB_TYPE_BPE;

            // read bpe merges and populate bpe ranks
            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
            if (merges_keyidx == -1) {
                throw std::runtime_error("cannot find tokenizer merges in model file\n");
            }

            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
            for (int i = 0; i < n_merges; i++) {
                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
                GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);

                std::string first;
                std::string second;

                // a merge entry is "first second"; search from index 1 so a
                // leading space character in `first` is not treated as the separator
                const size_t pos = word.find(' ', 1);

                // NOTE(review): if no space is found, first/second stay empty and an
                // empty pair is inserted — presumably malformed merges never occur
                // in valid GGUFs; confirm against the GGUF writer
                if (pos != std::string::npos) {
                    first = word.substr(0, pos);
                    second = word.substr(pos + 1);
                }

                vocab.bpe_ranks.emplace(std::make_pair(first, second), i);
            }

            // default special tokens
            vocab.special_bos_id = 11;
            vocab.special_eos_id = 11;
            vocab.special_unk_id = LLAMA_TOKEN_NULL;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;
        } else if (tokenizer_model == "t5") {
            // Unigram (UGM) tokenizer
            vocab.type = LLAMA_VOCAB_TYPE_UGM;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = 1;
            vocab.special_unk_id = 2;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = 0;
            vocab.special_cls_id = LLAMA_TOKEN_NULL;
            vocab.special_mask_id = LLAMA_TOKEN_NULL;

            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
            if (precompiled_charsmap_keyidx != -1) {
                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
                const char * precompiled_charsmap = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                vocab.precompiled_charsmap.assign(precompiled_charsmap, precompiled_charsmap + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
                // correct endiannes of data in precompiled_charsmap binary blob
                uint32_t * xcda_blob_size = (uint32_t *) &vocab.precompiled_charsmap[0];
                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
                uint32_t * xcda_array = (uint32_t *) &vocab.precompiled_charsmap[sizeof(uint32_t)];
                for (size_t i = 0; i < xcda_array_size; ++i) {
                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
                }
#endif
            }
        } else if (tokenizer_model == "rwkv") {
            vocab.type = LLAMA_VOCAB_TYPE_RWKV;

            // default special tokens
            vocab.special_bos_id = LLAMA_TOKEN_NULL;
            vocab.special_eos_id = LLAMA_TOKEN_NULL;
            vocab.special_unk_id = LLAMA_TOKEN_NULL;
            vocab.special_sep_id = LLAMA_TOKEN_NULL;
            vocab.special_pad_id = LLAMA_TOKEN_NULL;
        } else {
            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
        }
        // for now, only BPE models have pre-tokenizers
        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
            if (tokenizer_pre.empty()) {
                // missing pre-tokenizer id: fall back to 'default', which may
                // tokenize differently than the model was trained with
                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
                LLAMA_LOG_WARN("%s: \n", __func__);
                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
                LLAMA_LOG_WARN("%s: \n", __func__);
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (tokenizer_pre == "default") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            } else if (
                    tokenizer_pre == "llama3" ||
                    tokenizer_pre == "llama-v3" ||
                    tokenizer_pre == "llama-bpe"||
                    tokenizer_pre == "falcon3") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (
                    tokenizer_pre == "deepseek-llm") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-coder") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "deepseek-v3") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "falcon") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
            } else if (
                    tokenizer_pre == "mpt") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
            } else if (
                    tokenizer_pre == "starcoder") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
            } else if (
                    // several models share the GPT-2 pre-tokenizer regexes
                    tokenizer_pre == "gpt-2" ||
                    tokenizer_pre == "phi-2" ||
                    tokenizer_pre == "jina-es" ||
                    tokenizer_pre == "jina-de" ||
                    tokenizer_pre == "gigachat" ||
                    tokenizer_pre == "jina-v1-en" ||
                    tokenizer_pre == "jina-v2-es" ||
                    tokenizer_pre == "jina-v2-de" ||
                    tokenizer_pre == "jina-v2-code" ||
                    tokenizer_pre == "roberta-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
            } else if (
                    tokenizer_pre == "refact") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
            } else if (
                    tokenizer_pre == "command-r") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "qwen2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "stablelm2") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
            } else if (
                    tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
            } else if (
                    tokenizer_pre == "dbrx") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else if (
                    tokenizer_pre == "smaug-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
            } else if (
                    tokenizer_pre == "poro-chat") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_PORO;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "chatglm-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
                vocab.special_bos_id = LLAMA_TOKEN_NULL;
            } else if (
                    tokenizer_pre == "viking") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "jais") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
            } else if (
                    tokenizer_pre == "tekken") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
                vocab.tokenizer_clean_spaces = false;
                vocab.tokenizer_ignore_merges = true;
                vocab.tokenizer_add_bos = true;
            } else if (
                    tokenizer_pre == "smollm") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "codeshell") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
            } else if (
                    tokenizer_pre == "bloom") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_BLOOM;
            } else if (
                    tokenizer_pre == "gpt3-finnish") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
            } else if (
                    tokenizer_pre == "exaone") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_EXAONE;
            } else if (
                    tokenizer_pre == "chameleon") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
                vocab.tokenizer_add_bos = true;
                vocab.tokenizer_clean_spaces = false;
            } else if (
                    tokenizer_pre == "minerva-7b") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
            } else if (
                    tokenizer_pre == "megrez") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
        } else if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
            // non-BPE vocab types get fixed pre-tokenizer behavior flags
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = true;
            vocab.tokenizer_clean_spaces = false;
            vocab.tokenizer_add_bos = true;
            vocab.tokenizer_add_eos = false;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = true;
            vocab.tokenizer_add_bos = true;
            vocab.tokenizer_add_eos = false;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_UGM) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_bos = false;
            vocab.tokenizer_add_eos = true;
        } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
            vocab.tokenizer_add_space_prefix = false;
            vocab.tokenizer_clean_spaces = false;
            vocab.tokenizer_add_bos = false;
            vocab.tokenizer_add_eos = false;
        } else {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }

        // GGUF metadata (if present) overrides the per-type defaults above
        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.tokenizer_add_space_prefix, false);
        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.tokenizer_remove_extra_whitespaces, false);
    }
    // load the token list: text is mandatory; per-token scores and types are optional
    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
    if (token_idx == -1) {
        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
    }

    const float * scores = nullptr;
    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
    if (score_idx != -1) {
        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
    }

    const int * toktypes = nullptr;
    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
    if (toktype_idx != -1) {
        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
    }

    const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);

    vocab.n_vocab = n_vocab;
    vocab.id_to_token.resize(n_vocab);

    for (uint32_t i = 0; i < n_vocab; i++) {
        std::string word = gguf_get_arr_str(ctx, token_idx, i);
        if (word.empty()) {
            // empty token text would break the token_to_id map; substitute a placeholder
            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
            word = "[EMPTY_" + std::to_string(i) + "]";
        }

        vocab.token_to_id[word] = i;
        vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());

        auto & token_data = vocab.id_to_token[i];
        token_data.text = std::move(word);
        token_data.score = scores ? scores[i] : 0.0f;
        token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;

        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
            // translate legacy GGUF token types to attribute flags
            switch(toktypes[i]) {
                case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
                case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
                case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
                case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
                case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
                case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
                default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
            }
        }
    }
    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

    vocab.init_tokenizer();
  1327. // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
  1328. if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
  1329. try {
  1330. vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
  1331. } catch (const std::exception & e) {
  1332. LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
  1333. vocab.linefeed_id = vocab.special_pad_id;
  1334. }
  1335. } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
  1336. vocab.linefeed_id = vocab.special_pad_id;
  1337. } else if (vocab.type == LLAMA_VOCAB_TYPE_RWKV) {
  1338. const std::vector<int> ids = llama_tokenize_internal(vocab, "\n", false);
  1339. GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  1340. vocab.linefeed_id = ids[0];
  1341. } else {
  1342. const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
  1343. //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
  1344. if (ids.empty()) {
  1345. LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
  1346. vocab.linefeed_id = vocab.special_pad_id;
  1347. } else {
  1348. vocab.linefeed_id = ids[0];
  1349. }
  1350. }
  1351. // special tokens
  1352. {
  1353. const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
  1354. { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
  1355. { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
  1356. { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
  1357. { LLM_KV_TOKENIZER_EOM_ID, vocab.special_eom_id },
  1358. { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
  1359. { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
  1360. { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
  1361. { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id },
  1362. { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id },
  1363. { LLM_KV_TOKENIZER_FIM_PRE_ID, vocab.special_fim_pre_id },
  1364. { LLM_KV_TOKENIZER_FIM_SUF_ID, vocab.special_fim_suf_id },
  1365. { LLM_KV_TOKENIZER_FIM_MID_ID, vocab.special_fim_mid_id },
  1366. { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
  1367. { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
  1368. { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
  1369. // deprecated
  1370. { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
  1371. { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
  1372. { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
  1373. };
  1374. for (const auto & it : special_token_types) {
  1375. const std::string & key = kv(std::get<0>(it));
  1376. int32_t & id = std::get<1>(it);
  1377. uint32_t new_id;
  1378. if (!ml.get_key(std::get<0>(it), new_id, false)) {
  1379. continue;
  1380. }
  1381. if (new_id >= vocab.id_to_token.size()) {
  1382. LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
  1383. __func__, key.c_str(), new_id, id);
  1384. } else {
  1385. id = new_id;
  1386. }
  1387. }
  1388. // Handle add_bos_token and add_eos_token
  1389. {
  1390. bool temp = true;
  1391. if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
  1392. vocab.tokenizer_add_bos = temp;
  1393. }
  1394. if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
  1395. vocab.tokenizer_add_eos = temp;
  1396. }
  1397. }
  1398. // auto-detect special tokens by text
  1399. // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
  1400. // for now, we apply this workaround to find the tokens based on their text
  1401. for (const auto & t : vocab.token_to_id) {
  1402. // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
  1403. if (vocab.special_eot_id == LLAMA_TOKEN_NULL) {
  1404. if (false
  1405. || t.first == "<|eot_id|>"
  1406. || t.first == "<|im_end|>"
  1407. || t.first == "<|end|>"
  1408. || t.first == "<end_of_turn>"
  1409. || t.first == "<|endoftext|>"
  1410. || t.first == "<EOT>"
  1411. || t.first == "<|end▁of▁sentence|>" // DeepSeek
  1412. ) {
  1413. vocab.special_eot_id = t.second;
  1414. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1415. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1416. __func__, t.second, t.first.c_str());
  1417. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1418. }
  1419. }
  1420. }
  1421. // find EOM token: "<|eom_id|>"
  1422. if (vocab.special_eom_id == LLAMA_TOKEN_NULL) {
  1423. if (false
  1424. || t.first == "<|eom_id|>"
  1425. ) {
  1426. vocab.special_eom_id = t.second;
  1427. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1428. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1429. __func__, t.second, t.first.c_str());
  1430. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1431. }
  1432. }
  1433. }
  1434. // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
  1435. if (vocab.special_fim_pre_id == LLAMA_TOKEN_NULL) {
  1436. if (false
  1437. || t.first == "<|fim_prefix|>" // Qwen
  1438. || t.first == "<fim-prefix>"
  1439. || t.first == "<|fim▁begin|>" // DeepSeek
  1440. || t.first == "<PRE>"
  1441. ) {
  1442. vocab.special_fim_pre_id = t.second;
  1443. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1444. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1445. __func__, t.second, t.first.c_str());
  1446. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1447. }
  1448. }
  1449. }
  1450. // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
  1451. if (vocab.special_fim_suf_id == LLAMA_TOKEN_NULL) {
  1452. if (false
  1453. || t.first == "<|fim_suffix|>" // Qwen
  1454. || t.first == "<fim-suffix>"
  1455. || t.first == "<|fim▁hole|>" // DeepSeek
  1456. || t.first == "<SUF>"
  1457. ) {
  1458. vocab.special_fim_suf_id = t.second;
  1459. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1460. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1461. __func__, t.second, t.first.c_str());
  1462. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1463. }
  1464. }
  1465. }
  1466. // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
  1467. if (vocab.special_fim_mid_id == LLAMA_TOKEN_NULL) {
  1468. if (false
  1469. || t.first == "<|fim_middle|>" // Qwen
  1470. || t.first == "<fim-middle>"
  1471. || t.first == "<|fim▁end|>" // DeepSeek
  1472. || t.first == "<MID>"
  1473. ) {
  1474. vocab.special_fim_mid_id = t.second;
  1475. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1476. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1477. __func__, t.second, t.first.c_str());
  1478. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1479. }
  1480. }
  1481. }
  1482. // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
  1483. if (vocab.special_fim_pad_id == LLAMA_TOKEN_NULL) {
  1484. if (false
  1485. || t.first == "<|fim_pad|>" // Qwen
  1486. || t.first == "<fim-pad>"
  1487. || t.first == "<PAD>"
  1488. ) {
  1489. vocab.special_fim_pad_id = t.second;
  1490. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1491. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1492. __func__, t.second, t.first.c_str());
  1493. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1494. }
  1495. }
  1496. }
  1497. // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
  1498. if (vocab.special_fim_rep_id == LLAMA_TOKEN_NULL) {
  1499. if (false
  1500. || t.first == "<|fim_repo|>" // Qwen
  1501. || t.first == "<|repo_name|>"
  1502. || t.first == "<fim-repo>"
  1503. || t.first == "<REPO>"
  1504. ) {
  1505. vocab.special_fim_rep_id = t.second;
  1506. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1507. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1508. __func__, t.second, t.first.c_str());
  1509. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1510. }
  1511. }
  1512. }
  1513. // find FIM_SEP token: "<|file_sep|>"
  1514. if (vocab.special_fim_sep_id == LLAMA_TOKEN_NULL) {
  1515. if (false
  1516. || t.first == "<|file_sep|>" // Qwen
  1517. ) {
  1518. vocab.special_fim_sep_id = t.second;
  1519. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1520. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1521. __func__, t.second, t.first.c_str());
  1522. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1523. }
  1524. }
  1525. }
  1526. }
  1527. // maintain a list of tokens that cause end-of-generation
  1528. // this is currently determined based on the token text, which is obviously not ideal
  1529. // ref: https://github.com/ggerganov/llama.cpp/issues/9606
  1530. vocab.special_eog_ids.clear();
  1531. if (vocab.special_fim_pad_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_pad_id) == 0) {
  1532. vocab.special_eog_ids.insert(vocab.special_fim_pad_id);
  1533. }
  1534. if (vocab.special_fim_rep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_rep_id) == 0) {
  1535. vocab.special_eog_ids.insert(vocab.special_fim_rep_id);
  1536. }
  1537. if (vocab.special_fim_sep_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_fim_sep_id) == 0) {
  1538. vocab.special_eog_ids.insert(vocab.special_fim_sep_id);
  1539. }
  1540. for (const auto & t : vocab.token_to_id) {
  1541. if (false
  1542. || t.first == "<|eot_id|>"
  1543. || t.first == "<|im_end|>"
  1544. || t.first == "<|end|>"
  1545. || t.first == "<end_of_turn>"
  1546. || t.first == "<|endoftext|>"
  1547. || t.first == "<|eom_id|>"
  1548. || t.first == "<EOT>"
  1549. ) {
  1550. vocab.special_eog_ids.insert(t.second);
  1551. if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
  1552. LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
  1553. __func__, t.second, t.first.c_str());
  1554. vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  1555. }
  1556. } else {
  1557. // token is control, but not marked as EOG -> print a debug log
  1558. if (vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && vocab.special_eog_ids.count(t.second) == 0) {
  1559. LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
  1560. __func__, t.second, t.first.c_str());
  1561. }
  1562. }
  1563. }
  1564. // sanity checks
  1565. if (vocab.special_eos_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eos_id) == 0) {
  1566. vocab.special_eog_ids.insert(vocab.special_eos_id);
  1567. LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  1568. }
  1569. if (vocab.special_eot_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eot_id) == 0) {
  1570. vocab.special_eog_ids.insert(vocab.special_eot_id);
  1571. LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  1572. }
  1573. if (vocab.special_eom_id != LLAMA_TOKEN_NULL && vocab.special_eog_ids.count(vocab.special_eom_id) == 0) {
  1574. vocab.special_eog_ids.insert(vocab.special_eom_id);
  1575. LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
  1576. }
  1577. }
  1578. // build special tokens cache
  1579. {
  1580. for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
  1581. if (vocab.id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
  1582. vocab.cache_special_tokens.push_back(id);
  1583. }
  1584. }
  1585. std::sort(vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
  1586. [&] (const llama_vocab::id a, const llama_vocab::id b) {
  1587. return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
  1588. }
  1589. );
  1590. LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
  1591. }
  1592. // build token to piece cache
  1593. {
  1594. size_t size_cache = 0;
  1595. std::vector<llama_vocab::token> cache_token_to_piece(n_vocab);
  1596. for (uint32_t id = 0; id < n_vocab; ++id) {
  1597. cache_token_to_piece[id] = llama_token_to_piece(&model, id, true);
  1598. size_cache += cache_token_to_piece[id].size();
  1599. }
  1600. std::swap(vocab.cache_token_to_piece, cache_token_to_piece);
  1601. LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
  1602. }
  1603. // Handle per token attributes
  1604. //NOTE: Each model customizes per token attributes.
  1605. //NOTE: Per token attributes are missing from the GGUF file.
  1606. //TODO: Extract attributes from GGUF file.
  1607. {
  1608. auto _contains_any = [] (const std::string &str, const std::vector<std::string> &substrs) -> bool {
  1609. for (auto substr : substrs) {
  1610. if (str.find(substr) < std::string::npos) {
  1611. return true;
  1612. }
  1613. }
  1614. return false;
  1615. };
  1616. auto _set_tokenid_attr = [&] (const llama_vocab::id id, llama_token_attr attr, bool value) {
  1617. uint32_t current = vocab.id_to_token.at(id).attr;
  1618. current = value ? (current | attr) : (current & ~attr);
  1619. vocab.id_to_token[id].attr = (llama_token_attr) current;
  1620. };
  1621. auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
  1622. _set_tokenid_attr(vocab.token_to_id.at(token), attr, value);
  1623. };
  1624. std::string model_name;
  1625. std::string tokenizer_pre;
  1626. ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
  1627. ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
  1628. // model name to lowercase
  1629. std::transform(model_name.begin(), model_name.end(), model_name.begin(),
  1630. [] (const std::string::value_type x) {
  1631. return std::tolower(x);
  1632. }
  1633. );
  1634. // set attributes by model/tokenizer name
  1635. if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
  1636. _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
  1637. } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
  1638. for (auto id : vocab.cache_special_tokens) {
  1639. _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
  1640. }
  1641. for (auto token : {"</s>"}) {
  1642. _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
  1643. }
  1644. for (auto token : {"<unk>", "<s>", "<|endoftext|>"}) {
  1645. _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
  1646. }
  1647. }
  1648. }
  1649. }
  1650. void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
  1651. const auto & hparams = model.hparams;
  1652. const auto & vocab = model.vocab;
  1653. const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
  1654. auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
  1655. bool is_var = false;
  1656. std::vector<uint32_t> v;
  1657. for (uint32_t i = 0; i < n; ++i) {
  1658. v.push_back(f(i));
  1659. if (v[i] != v[0]) {
  1660. is_var = true;
  1661. }
  1662. }
  1663. std::stringstream ss;
  1664. if (is_var) {
  1665. ss << "[";
  1666. for (uint32_t i = 0; i < n; ++i) {
  1667. ss << v[i];
  1668. if (i < n - 1) {
  1669. ss << ", ";
  1670. }
  1671. }
  1672. ss << "]";
  1673. } else {
  1674. ss << v[0];
  1675. }
  1676. return ss.str();
  1677. };
  1678. // hparams
  1679. LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver));
  1680. LLAMA_LOG_INFO("%s: arch = %s\n", __func__, llm_arch_name(model.arch));
  1681. LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type));
  1682. LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
  1683. LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size());
  1684. LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
  1685. if (!hparams.vocab_only) {
  1686. LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
  1687. LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
  1688. LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
  1689. LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
  1690. LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
  1691. LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
  1692. LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
  1693. LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
  1694. LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
  1695. LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
  1696. LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
  1697. LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
  1698. LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
  1699. LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
  1700. LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
  1701. LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
  1702. LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
  1703. LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
  1704. LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
  1705. LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
  1706. LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
  1707. LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
  1708. LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
  1709. LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
  1710. LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
  1711. LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
  1712. LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
  1713. LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
  1714. LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
  1715. LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
  1716. LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
  1717. LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
  1718. LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
  1719. }
  1720. LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model).c_str());
  1721. LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model).c_str());
  1722. if (ml.n_elements >= 1e12) {
  1723. LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
  1724. } else if (ml.n_elements >= 1e9) {
  1725. LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
  1726. } else if (ml.n_elements >= 1e6) {
  1727. LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
  1728. } else {
  1729. LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
  1730. }
  1731. if (ml.n_bytes < GiB) {
  1732. LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  1733. } else {
  1734. LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
  1735. }
  1736. // general kv
  1737. LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
  1738. // special tokens
  1739. if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
  1740. if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
  1741. if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
  1742. if (vocab.special_eom_id != -1) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, vocab.special_eom_id, vocab.id_to_token[vocab.special_eom_id].text.c_str() ); }
  1743. if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
  1744. if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
  1745. if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
  1746. if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
  1747. if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
  1748. if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
  1749. if (vocab.special_fim_pre_id != -1) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, vocab.special_fim_pre_id, vocab.id_to_token[vocab.special_fim_pre_id].text.c_str() ); }
  1750. if (vocab.special_fim_suf_id != -1) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, vocab.special_fim_suf_id, vocab.id_to_token[vocab.special_fim_suf_id].text.c_str() ); }
  1751. if (vocab.special_fim_mid_id != -1) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, vocab.special_fim_mid_id, vocab.id_to_token[vocab.special_fim_mid_id].text.c_str() ); }
  1752. if (vocab.special_fim_pad_id != -1) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, vocab.special_fim_pad_id, vocab.id_to_token[vocab.special_fim_pad_id].text.c_str() ); }
  1753. if (vocab.special_fim_rep_id != -1) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, vocab.special_fim_rep_id, vocab.id_to_token[vocab.special_fim_rep_id].text.c_str() ); }
  1754. if (vocab.special_fim_sep_id != -1) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, vocab.special_fim_sep_id, vocab.id_to_token[vocab.special_fim_sep_id].text.c_str() ); }
  1755. for (const auto & id : vocab.special_eog_ids) {
  1756. LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, vocab.id_to_token[id].text.c_str() );
  1757. }
  1758. LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
  1759. if (model.arch == LLM_ARCH_DEEPSEEK) {
  1760. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  1761. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  1762. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  1763. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  1764. }
  1765. if (model.arch == LLM_ARCH_DEEPSEEK2) {
  1766. LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
  1767. LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
  1768. LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
  1769. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  1770. LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
  1771. LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
  1772. LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
  1773. LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
  1774. LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
  1775. }
  1776. if (model.arch == LLM_ARCH_QWEN2MOE) {
  1777. LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
  1778. LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
  1779. }
  1780. if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
  1781. LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
  1782. LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
  1783. LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
  1784. }
  1785. }
  1786. //
  1787. // interface implementation
  1788. //
  1789. struct llama_model_params llama_model_default_params() {
  1790. struct llama_model_params result = {
  1791. /*.devices =*/ nullptr,
  1792. /*.n_gpu_layers =*/ 0,
  1793. /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  1794. /*.main_gpu =*/ 0,
  1795. /*.tensor_split =*/ nullptr,
  1796. /*.rpc_servers =*/ nullptr,
  1797. /*.progress_callback =*/ nullptr,
  1798. /*.progress_callback_user_data =*/ nullptr,
  1799. /*.kv_overrides =*/ nullptr,
  1800. /*.vocab_only =*/ false,
  1801. /*.use_mmap =*/ true,
  1802. /*.use_mlock =*/ false,
  1803. /*.check_tensors =*/ false,
  1804. };
  1805. #ifdef GGML_USE_METAL
  1806. // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
  1807. result.n_gpu_layers = 999;
  1808. #endif
  1809. return result;
  1810. }
// Destroy a model instance. `delete nullptr` is a no-op, so passing a
// null pointer is safe.
void llama_free_model(struct llama_model * model) {
    delete model;
}

// Tokenizer/vocabulary type of the model (SPM, BPE, WPM, UGM, RWKV, ...).
enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
    return model->vocab.type;
}

// Vocabulary size (number of tokens), as recorded in the hyperparameters.
int32_t llama_n_vocab(const struct llama_model * model) {
    return model->hparams.n_vocab;
}

// Context length the model was trained with.
int32_t llama_n_ctx_train(const struct llama_model * model) {
    return model->hparams.n_ctx_train;
}

// Embedding dimension (n_embd).
int32_t llama_n_embd(const struct llama_model * model) {
    return model->hparams.n_embd;
}

// Number of transformer layers.
int32_t llama_n_layer(const struct llama_model * model) {
    return model->hparams.n_layer;
}

// Number of attention heads (hparams.n_head() with its default layer
// argument; heads can vary per layer - see llm_load_print_meta's per-layer dump).
int32_t llama_n_head(const struct llama_model * model) {
    return model->hparams.n_head();
}
// Map a model architecture to the RoPE (rotary position embedding) variant
// it uses: NONE (no RoPE), NORM (pairs of consecutive head values), NEOX
// (pairs offset by n_rot/2) or MROPE (multimodal rotary, Qwen2-VL).
// The switch is intentionally exhaustive so that adding a new architecture
// without classifying it here triggers a compiler warning.
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
    switch (model->arch) {
        // these models do not use RoPE
        case LLM_ARCH_GPT2:
        case LLM_ARCH_GPTJ:
        case LLM_ARCH_MPT:
        case LLM_ARCH_REFACT:
        case LLM_ARCH_BLOOM:
        case LLM_ARCH_MAMBA:
        case LLM_ARCH_JINA_BERT_V2:
        case LLM_ARCH_T5:
        case LLM_ARCH_T5ENCODER:
        case LLM_ARCH_JAIS:
        case LLM_ARCH_RWKV6:
        case LLM_ARCH_WAVTOKENIZER_DEC:
            return LLAMA_ROPE_TYPE_NONE;

        // use what we call a normal RoPE, operating on pairs of consecutive head values
        case LLM_ARCH_LLAMA:
        case LLM_ARCH_DECI:
        case LLM_ARCH_BAICHUAN:
        case LLM_ARCH_STARCODER:
        case LLM_ARCH_PLAMO:
        case LLM_ARCH_ORION:
        case LLM_ARCH_INTERNLM2:
        case LLM_ARCH_MINICPM:
        case LLM_ARCH_XVERSE:
        case LLM_ARCH_COMMAND_R:
        case LLM_ARCH_COHERE2:
        case LLM_ARCH_OLMO:
        case LLM_ARCH_ARCTIC:
        case LLM_ARCH_DEEPSEEK:
        case LLM_ARCH_DEEPSEEK2:
        case LLM_ARCH_CHATGLM:
        case LLM_ARCH_GRANITE:
        case LLM_ARCH_GRANITE_MOE:
        case LLM_ARCH_CHAMELEON:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
        case LLM_ARCH_DBRX:
        case LLM_ARCH_BERT:
        case LLM_ARCH_NOMIC_BERT:
        case LLM_ARCH_STABLELM:
        case LLM_ARCH_BITNET:
        case LLM_ARCH_QWEN:
        case LLM_ARCH_QWEN2:
        case LLM_ARCH_QWEN2MOE:
        case LLM_ARCH_OLMO2:
        case LLM_ARCH_OLMOE:
        case LLM_ARCH_PHI2:
        case LLM_ARCH_PHI3:
        case LLM_ARCH_GEMMA:
        case LLM_ARCH_GEMMA2:
        case LLM_ARCH_STARCODER2:
        case LLM_ARCH_OPENELM:
        case LLM_ARCH_GPTNEOX:
        case LLM_ARCH_CODESHELL:
        case LLM_ARCH_NEMOTRON:
        case LLM_ARCH_EXAONE:
        case LLM_ARCH_MINICPM3:
            return LLAMA_ROPE_TYPE_NEOX;

        case LLM_ARCH_QWEN2VL:
            return LLAMA_ROPE_TYPE_MROPE;

        // all model arches should be listed explicitly here
        case LLM_ARCH_UNKNOWN:
            GGML_ABORT("unknown architecture");
    }

    // unreachable when the switch above is exhaustive; keeps the compiler happy
    return LLAMA_ROPE_TYPE_NONE;
}
  1902. float llama_rope_freq_scale_train(const struct llama_model * model) {
  1903. return model->hparams.rope_freq_scale_train;
  1904. }
  1905. int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
  1906. const auto & it = model->gguf_kv.find(key);
  1907. if (it == model->gguf_kv.end()) {
  1908. if (buf_size > 0) {
  1909. buf[0] = '\0';
  1910. }
  1911. return -1;
  1912. }
  1913. return snprintf(buf, buf_size, "%s", it->second.c_str());
  1914. }
  1915. int32_t llama_model_meta_count(const struct llama_model * model) {
  1916. return (int)model->gguf_kv.size();
  1917. }
  1918. int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
  1919. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  1920. if (buf_size > 0) {
  1921. buf[0] = '\0';
  1922. }
  1923. return -1;
  1924. }
  1925. auto it = model->gguf_kv.begin();
  1926. std::advance(it, i);
  1927. return snprintf(buf, buf_size, "%s", it->first.c_str());
  1928. }
  1929. int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
  1930. if (i < 0 || i >= (int)model->gguf_kv.size()) {
  1931. if (buf_size > 0) {
  1932. buf[0] = '\0';
  1933. }
  1934. return -1;
  1935. }
  1936. auto it = model->gguf_kv.begin();
  1937. std::advance(it, i);
  1938. return snprintf(buf, buf_size, "%s", it->second.c_str());
  1939. }
  1940. int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
  1941. return snprintf(buf, buf_size, "%s %s %s",
  1942. llama_model_arch_name (*model).c_str(),
  1943. llama_model_type_name (*model).c_str(),
  1944. llama_model_ftype_name(*model).c_str());
  1945. }
  1946. uint64_t llama_model_size(const struct llama_model * model) {
  1947. return model->n_bytes;
  1948. }
  1949. uint64_t llama_model_n_params(const struct llama_model * model) {
  1950. return model->n_elements;
  1951. }
  1952. bool llama_model_has_encoder(const struct llama_model * model) {
  1953. switch (model->arch) {
  1954. case LLM_ARCH_T5: return true;
  1955. case LLM_ARCH_T5ENCODER: return true;
  1956. default: return false;
  1957. }
  1958. }
  1959. bool llama_model_has_decoder(const struct llama_model * model) {
  1960. switch (model->arch) {
  1961. case LLM_ARCH_T5ENCODER: return false;
  1962. default: return true;
  1963. }
  1964. }
  1965. llama_token llama_model_decoder_start_token(const struct llama_model * model) {
  1966. return model->hparams.dec_start_token_id;
  1967. }
  1968. bool llama_model_is_recurrent(const struct llama_model * model) {
  1969. switch (model->arch) {
  1970. case LLM_ARCH_MAMBA: return true;
  1971. case LLM_ARCH_RWKV6: return true;
  1972. default: return false;
  1973. }
  1974. }