// llama.cpp (58 KB)
  1. #include "llama.h"
  2. #include "ggml.h"
  3. #include <cinttypes>
  4. #include <fstream>
  5. #include <random>
  6. #include <map>
  7. #include <unordered_map>
  8. #include <queue>
  9. #include <regex>
  10. #include <cassert>
  11. #include <cstring>
  12. #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
  13. #define WIN32_LEAN_AND_MEAN
  14. #include <Windows.h>
  15. #else
  16. #include <sys/types.h>
  17. #include <sys/mman.h>
  18. #include <unistd.h>
  19. #include <fcntl.h>
  20. #endif
  21. #define Min(X, Y) ((Y) > (X) ? (X) : (Y))
  22. #define Max(X, Y) ((Y) < (X) ? (X) : (Y))
  23. #define LLAMA_USE_SCRATCH
  24. #define LLAMA_MAX_SCRATCH_BUFFERS 16
  25. #define LLAMA_ASSERT(x) \
  26. do { \
  27. if (!(x)) { \
  28. fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
  29. abort(); \
  30. } \
  31. } while (0)
  32. // determine number of model parts based on the dimension
  33. static const std::unordered_map<int, int> LLAMA_N_PARTS = {
  34. { 4096, 1 },
  35. { 5120, 2 },
  36. { 6656, 4 },
  37. { 8192, 8 },
  38. };
  39. // available llama models
  40. enum e_model {
  41. MODEL_UNKNOWN,
  42. MODEL_7B,
  43. MODEL_13B,
  44. MODEL_30B,
  45. MODEL_65B,
  46. };
  47. static const size_t MB = 1024*1024;
  48. // computed for n_ctx == 2048
  49. // TODO: dynamically determine these sizes
  50. // needs modifications in ggml
  51. static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
  52. { MODEL_7B, 512ull*MB },
  53. { MODEL_13B, 512ull*MB },
  54. { MODEL_30B, 512ull*MB },
  55. { MODEL_65B, 512ull*MB },
  56. };
  57. static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
  58. { MODEL_7B, 512ull*MB },
  59. { MODEL_13B, 512ull*MB },
  60. { MODEL_30B, 512ull*MB },
  61. { MODEL_65B, 512ull*MB },
  62. };
  63. // 2*n_embd*n_ctx*n_layer*sizeof(float16)
  64. static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
  65. { MODEL_7B, 1026ull*MB },
  66. { MODEL_13B, 1608ull*MB },
  67. { MODEL_30B, 3124ull*MB },
  68. { MODEL_65B, 5120ull*MB },
  69. };
  70. // this is mostly needed for temporary mul_mat buffers to dequantize the data
  71. // not actually needed if BLAS is disabled
  72. static const std::map<e_model, size_t> MEM_REQ_EVAL = {
  73. { MODEL_7B, 768ull*MB },
  74. { MODEL_13B, 1024ull*MB },
  75. { MODEL_30B, 1280ull*MB },
  76. { MODEL_65B, 1536ull*MB },
  77. };
  78. // default hparams (LLaMA 7B)
  79. struct llama_hparams {
  80. int32_t n_vocab = 32000;
  81. int32_t n_ctx = 512; // this is provided as user input?
  82. int32_t n_embd = 4096;
  83. int32_t n_mult = 256;
  84. int32_t n_head = 32;
  85. int32_t n_layer = 32;
  86. int32_t n_rot = 64;
  87. int32_t f16 = 1;
  88. };
  89. struct llama_layer {
  90. // normalization
  91. struct ggml_tensor * attention_norm;
  92. // attention
  93. struct ggml_tensor * wq;
  94. struct ggml_tensor * wk;
  95. struct ggml_tensor * wv;
  96. struct ggml_tensor * wo;
  97. // normalization
  98. struct ggml_tensor * ffn_norm;
  99. // ff
  100. struct ggml_tensor * w1;
  101. struct ggml_tensor * w2;
  102. struct ggml_tensor * w3;
  103. };
  104. struct llama_kv_cache {
  105. struct ggml_tensor * k;
  106. struct ggml_tensor * v;
  107. struct ggml_context * ctx;
  108. std::vector<uint8_t> buf;
  109. int n; // number of tokens currently in the cache
  110. };
  111. struct llama_model {
  112. e_model type = MODEL_UNKNOWN;
  113. llama_hparams hparams;
  114. struct ggml_tensor * tok_embeddings;
  115. struct ggml_tensor * norm;
  116. struct ggml_tensor * output;
  117. std::vector<llama_layer> layers;
  118. // context
  119. struct ggml_context * ctx;
  120. // key + value cache for the self attention
  121. // TODO: move to llama_state
  122. struct llama_kv_cache kv_self;
  123. // the model memory buffer
  124. std::vector<uint8_t> buf;
  125. // model memory mapped file
  126. void * mm_addr = NULL;
  127. uint64_t mm_length = 0;
  128. // tensors
  129. int n_loaded;
  130. std::unordered_map<std::string, struct ggml_tensor *> tensors;
  131. };
  132. struct llama_vocab {
  133. using id = int32_t;
  134. using token = std::string;
  135. struct token_score {
  136. token tok;
  137. float score;
  138. };
  139. std::unordered_map<token, id> token_to_id;
  140. std::vector<token_score> id_to_token;
  141. };
  142. struct llama_context {
  143. std::mt19937 rng;
  144. int64_t t_load_us = 0;
  145. int64_t t_start_us = 0;
  146. bool has_evaluated_once = false;
  147. int64_t t_sample_us = 0;
  148. int64_t t_eval_us = 0;
  149. int64_t t_p_eval_us = 0;
  150. int32_t n_sample = 0; // number of tokens sampled
  151. int32_t n_eval = 0; // number of eval calls
  152. int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
  153. llama_model model;
  154. llama_vocab vocab;
  155. size_t mem_per_token = 0;
  156. // decode output (2-dimensional array: [n_tokens][n_vocab])
  157. std::vector<float> logits;
  158. bool logits_all = false;
  159. // input embedding (1-dimensional array: [n_embd])
  160. std::vector<float> embedding;
  161. // memory buffers used to evaluate the model
  162. // TODO: move in llama_state
  163. std::vector<uint8_t> buf_compute;
  164. std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
  165. int buf_last = 0;
  166. size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
  167. void use_buf(struct ggml_context * ctx, int i) {
  168. #if defined(LLAMA_USE_SCRATCH)
  169. size_t last_size = 0;
  170. if (i == -1) {
  171. last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
  172. } else {
  173. auto & buf = buf_scratch[i];
  174. last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
  175. }
  176. if (buf_last >= 0) {
  177. buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
  178. }
  179. buf_last = i;
  180. #else
  181. (void) i;
  182. (void) ctx;
  183. #endif
  184. }
  185. size_t get_buf_max_mem(int i) const {
  186. #if defined(LLAMA_USE_SCRATCH)
  187. return buf_max_size[i];
  188. #else
  189. (void) i;
  190. return 0;
  191. #endif
  192. }
  193. };
  194. //
  195. // kv cache
  196. //
  197. static bool kv_cache_init(
  198. const struct llama_hparams & hparams,
  199. struct llama_kv_cache & cache,
  200. ggml_type wtype,
  201. int n_ctx) {
  202. const int n_embd = hparams.n_embd;
  203. const int n_layer = hparams.n_layer;
  204. const int n_mem = n_layer*n_ctx;
  205. const int n_elements = n_embd*n_mem;
  206. cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
  207. struct ggml_init_params params;
  208. params.mem_size = cache.buf.size();
  209. params.mem_buffer = cache.buf.data();
  210. params.no_alloc = false;
  211. cache.ctx = ggml_init(params);
  212. if (!cache.ctx) {
  213. fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
  214. return false;
  215. }
  216. cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  217. cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
  218. return true;
  219. }
  220. static void kv_cache_free(struct llama_kv_cache & cache) {
  221. if (cache.ctx) {
  222. ggml_free(cache.ctx);
  223. cache.ctx = nullptr;
  224. }
  225. }
  226. struct llama_context_params llama_context_default_params() {
  227. struct llama_context_params result = {
  228. /*.n_ctx =*/ 512,
  229. /*.n_parts =*/ -1,
  230. /*.seed =*/ 0,
  231. /*.f16_kv =*/ false,
  232. /*.logits_all =*/ false,
  233. /*.vocab_only =*/ false,
  234. /*.use_mlock =*/ false,
  235. /*.embedding =*/ false,
  236. /*.progress_callback =*/ nullptr,
  237. /*.progress_callback_user_data =*/ nullptr,
  238. };
  239. return result;
  240. }
  241. //
  242. // model loading
  243. //
  244. static void *mmap_file(const char *fname, uint64_t *mm_length) {
  245. #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
  246. HANDLE hFile = CreateFileA(fname,
  247. GENERIC_READ,
  248. FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
  249. NULL,
  250. OPEN_EXISTING,
  251. FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
  252. NULL);
  253. if (hFile == INVALID_HANDLE_VALUE) return 0;
  254. LARGE_INTEGER fileSize;
  255. fileSize.QuadPart = -1;
  256. GetFileSizeEx(hFile, &fileSize);
  257. int64_t length = fileSize.QuadPart;
  258. HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
  259. CloseHandle(hFile);
  260. if (!hMapping) return 0;
  261. void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
  262. CloseHandle(hMapping);
  263. if (!addr) return 0;
  264. #else
  265. int fd = open(fname, O_RDONLY);
  266. if (fd == -1) return 0;
  267. int64_t length = lseek(fd, 0, SEEK_END);
  268. void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
  269. close(fd);
  270. if (addr == MAP_FAILED) return 0;
  271. #endif
  272. *mm_length = length;
  273. return addr;
  274. }
  275. static void munmap_file(void * addr, size_t length) {
  276. #if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
  277. UnmapViewOfFile(addr);
  278. #else
  279. munmap(addr, length);
  280. #endif
  281. }
  282. static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
  283. fprintf(stderr,
  284. "%s: invalid model file (bad magic [got %#x want %#x])\n"
  285. "\tyou most likely need to regenerate your ggml files\n"
  286. "\tthe benefit is you'll get 10-100x faster load times\n"
  287. "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
  288. "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
  289. "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
  290. path, got, want);
  291. return false;
  292. }
  293. static bool llama_model_load(
  294. const std::string & fname,
  295. llama_context & lctx,
  296. int n_ctx,
  297. int n_parts,
  298. ggml_type memory_type,
  299. bool vocab_only,
  300. llama_progress_callback progress_callback,
  301. void *progress_callback_user_data) {
  302. fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
  303. lctx.t_start_us = ggml_time_us();
  304. auto & model = lctx.model;
  305. auto & vocab = lctx.vocab;
  306. auto fin = std::ifstream(fname, std::ios::binary);
  307. if (!fin) {
  308. fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
  309. return false;
  310. }
  311. std::vector<char> f_buf(1024*1024);
  312. fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
  313. fin.seekg(0, fin.end);
  314. const size_t file_size = fin.tellg();
  315. fin.seekg(0);
  316. // verify magic
  317. {
  318. uint32_t magic;
  319. fin.read((char *) &magic, sizeof(magic));
  320. if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
  321. fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
  322. __func__, fname.c_str());
  323. return false;
  324. }
  325. if (magic != LLAMA_FILE_MAGIC) {
  326. return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
  327. }
  328. uint32_t format_version;
  329. fin.read((char *) &format_version, sizeof(format_version));
  330. if (format_version != LLAMA_FILE_VERSION) {
  331. fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
  332. __func__, fname.c_str(), format_version, LLAMA_FILE_VERSION);
  333. return false;
  334. }
  335. }
  336. int n_ff = 0;
  337. // load hparams
  338. {
  339. auto & hparams = model.hparams;
  340. fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
  341. //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
  342. fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
  343. fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
  344. fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
  345. fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
  346. fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
  347. fin.read((char *) &hparams.f16, sizeof(hparams.f16));
  348. hparams.n_ctx = n_ctx;
  349. n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
  350. if (n_parts < 1) {
  351. n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
  352. }
  353. // temp warning to tell the user to use "--n_parts"
  354. if (hparams.f16 == 4 && n_parts != 1) {
  355. fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
  356. fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
  357. }
  358. if (hparams.n_layer == 32) {
  359. model.type = e_model::MODEL_7B;
  360. }
  361. if (hparams.n_layer == 40) {
  362. model.type = e_model::MODEL_13B;
  363. }
  364. if (hparams.n_layer == 60) {
  365. model.type = e_model::MODEL_30B;
  366. }
  367. if (hparams.n_layer == 80) {
  368. model.type = e_model::MODEL_65B;
  369. }
  370. fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
  371. fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
  372. fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
  373. fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
  374. fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
  375. fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
  376. fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
  377. fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
  378. fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
  379. fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
  380. fprintf(stderr, "%s: type = %d\n", __func__, model.type);
  381. }
  382. // load vocab
  383. {
  384. std::string word;
  385. vocab.id_to_token.resize(model.hparams.n_vocab);
  386. std::vector<char> tmp(64);
  387. for (int i = 0; i < model.hparams.n_vocab; i++) {
  388. uint32_t len;
  389. fin.read((char *) &len, sizeof(len));
  390. word.resize(len);
  391. if (len > 0) {
  392. tmp.resize(len);
  393. fin.read(tmp.data(), len);
  394. word.assign(tmp.data(), len);
  395. } else {
  396. word.clear();
  397. }
  398. float score;
  399. fin.read((char *) &score, sizeof(score));
  400. vocab.token_to_id[word] = i;
  401. auto &tok_score = vocab.id_to_token[i];
  402. tok_score.tok = word;
  403. tok_score.score = score;
  404. }
  405. }
  406. if (vocab_only) {
  407. return true;
  408. }
  409. // for the big tensors, we have the option to store the data in 16-bit floats or quantized
  410. // in order to save memory and also to speed up the computation
  411. // wtype is for per-layer weights, while vtype is for other weights
  412. ggml_type wtype, vtype;
  413. switch (model.hparams.f16) {
  414. case 0: wtype = vtype = GGML_TYPE_F32; break;
  415. case 1: wtype = vtype = GGML_TYPE_F16; break;
  416. case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
  417. case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
  418. case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
  419. default:
  420. {
  421. fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
  422. __func__, fname.c_str(), model.hparams.f16);
  423. return false;
  424. }
  425. }
  426. // map model into memory
  427. char *mm_addr = NULL;
  428. model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
  429. if (model.mm_addr == NULL) {
  430. fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
  431. return false;
  432. }
  433. mm_addr = (char *)model.mm_addr;
  434. fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
  435. auto & ctx = model.ctx;
  436. size_t ctx_size = 0;
  437. {
  438. const auto &hparams = model.hparams;
  439. const int n_layer = hparams.n_layer;
  440. ctx_size += (5 + 10*n_layer)*256; // object overhead
  441. fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
  442. }
  443. // print memory requirements
  444. {
  445. const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
  446. // this is the total memory required to run the inference
  447. const size_t mem_required =
  448. ctx_size +
  449. model.mm_length +
  450. MEM_REQ_SCRATCH0.at(model.type) +
  451. MEM_REQ_SCRATCH1.at(model.type) +
  452. MEM_REQ_EVAL.at (model.type);
  453. // this is the memory required by one llama_state
  454. const size_t mem_required_state =
  455. scale*MEM_REQ_KV_SELF.at(model.type);
  456. fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
  457. mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
  458. }
  459. // create the ggml context
  460. {
  461. lctx.model.buf.resize(ctx_size);
  462. struct ggml_init_params params = {
  463. /*.mem_size =*/ lctx.model.buf.size(),
  464. /*.mem_buffer =*/ lctx.model.buf.data(),
  465. /*.no_alloc =*/ true,
  466. };
  467. model.ctx = ggml_init(params);
  468. if (!model.ctx) {
  469. fprintf(stderr, "%s: ggml_init() failed\n", __func__);
  470. return false;
  471. }
  472. }
  473. // prepare memory for the weights
  474. {
  475. const auto & hparams = model.hparams;
  476. const int n_embd = hparams.n_embd;
  477. const int n_layer = hparams.n_layer;
  478. const int n_vocab = hparams.n_vocab;
  479. model.layers.resize(n_layer);
  480. model.tok_embeddings = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
  481. model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
  482. model.output = ggml_new_tensor_2d(ctx, vtype, n_embd, n_vocab);
  483. // map by name
  484. model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
  485. model.tensors["norm.weight"] = model.norm;
  486. model.tensors["output.weight"] = model.output;
  487. for (int i = 0; i < n_layer; ++i) {
  488. auto & layer = model.layers[i];
  489. layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
  490. layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
  491. layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
  492. layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
  493. layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
  494. layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
  495. layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
  496. layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
  497. layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
  498. // map by name
  499. model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
  500. model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
  501. model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
  502. model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
  503. model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
  504. model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
  505. model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
  506. model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
  507. model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
  508. }
  509. }
  510. std::vector<uint8_t> tmp;
  511. if (progress_callback) {
  512. progress_callback(0.0, progress_callback_user_data);
  513. }
  514. fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
  515. // load weights
  516. {
  517. size_t total_size = 0;
  518. model.n_loaded = 0;
  519. while (true) {
  520. int32_t n_dims;
  521. int32_t length;
  522. int32_t ftype;
  523. fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
  524. fin.read(reinterpret_cast<char *>(&length), sizeof(length));
  525. fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
  526. if (fin.eof()) {
  527. break;
  528. }
  529. int32_t nelements = 1;
  530. int32_t ne[2] = { 1, 1 };
  531. for (int i = 0; i < n_dims; ++i) {
  532. fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  533. nelements *= ne[i];
  534. }
  535. std::string name(length, 0);
  536. fin.read(&name[0], length);
  537. if (model.tensors.find(name.data()) == model.tensors.end()) {
  538. fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
  539. return false;
  540. }
  541. auto tensor = model.tensors[name.data()];
  542. if (ggml_nelements(tensor) != nelements) {
  543. fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
  544. return false;
  545. }
  546. if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
  547. fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
  548. __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
  549. return false;
  550. }
  551. if (0) {
  552. static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
  553. fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
  554. }
  555. switch (ftype) {
  556. case 0: // f32
  557. case 1: // f16
  558. break;
  559. case 2: // q4_0
  560. case 3: // q4_1
  561. assert(ne[0] % 64 == 0);
  562. break;
  563. default:
  564. fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
  565. return false;
  566. };
  567. // load the tensor data into memory without copying or reading it
  568. size_t offset = fin.tellg();
  569. size_t tensor_data_size = ggml_nbytes(tensor);
  570. offset = (offset + 31) & -32;
  571. tensor->data = mm_addr + offset;
  572. fin.seekg(offset + tensor_data_size);
  573. total_size += tensor_data_size;
  574. model.n_loaded++;
  575. // progress
  576. if (progress_callback) {
  577. double current_progress = size_t(fin.tellg()) / double(file_size);
  578. progress_callback(current_progress, progress_callback_user_data);
  579. }
  580. }
  581. fin.close();
  582. fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
  583. if (model.n_loaded == 0) {
  584. fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
  585. } else if (model.n_loaded != (int) model.tensors.size()) {
  586. fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
  587. return false;
  588. }
  589. }
  590. // loading time will be recalculate after the first eval, so
  591. // we take page faults deferred by mmap() into consideration
  592. lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
  593. if (progress_callback) {
  594. progress_callback(1.0, progress_callback_user_data);
  595. }
  596. return true;
  597. }
  598. // evaluate the transformer
  599. //
  600. // - lctx: llama context
  601. // - tokens: new batch of tokens to process
  602. // - n_past: the context size so far
  603. // - n_threads: number of threads to use
  604. //
  605. static bool llama_eval_internal(
  606. llama_context & lctx,
  607. const llama_token * tokens,
  608. const int n_tokens,
  609. const int n_past,
  610. const int n_threads) {
  611. const int64_t t_start_us = ggml_time_us();
  612. const int N = n_tokens;
  613. const auto & model = lctx.model;
  614. const auto & hparams = model.hparams;
  615. auto & kv_self = model.kv_self;
  616. LLAMA_ASSERT(!!kv_self.ctx);
  617. const int n_embd = hparams.n_embd;
  618. const int n_layer = hparams.n_layer;
  619. const int n_ctx = hparams.n_ctx;
  620. const int n_head = hparams.n_head;
  621. const int n_vocab = hparams.n_vocab;
  622. const int n_rot = hparams.n_embd/hparams.n_head;
  623. auto & mem_per_token = lctx.mem_per_token;
  624. auto & buf_compute = lctx.buf_compute;
  625. struct ggml_init_params params = {
  626. /*.mem_size =*/ buf_compute.size(),
  627. /*.mem_buffer =*/ buf_compute.data(),
  628. /*.no_alloc =*/ false,
  629. };
  630. struct ggml_context * ctx0 = ggml_init(params);
  631. // for big prompts, if BLAS is enabled, it is better to use only one thread
  632. // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
  633. ggml_cgraph gf = {};
  634. gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
  635. struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
  636. memcpy(embd->data, tokens, N*ggml_element_size(embd));
  637. struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
  638. for (int il = 0; il < n_layer; ++il) {
  639. struct ggml_tensor * inpSA = inpL;
  640. struct ggml_tensor * cur;
  641. lctx.use_buf(ctx0, 0);
  642. // norm
  643. {
  644. cur = ggml_rms_norm(ctx0, inpL);
  645. // cur = attention_norm*cur
  646. cur = ggml_mul(ctx0,
  647. ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
  648. cur);
  649. }
  650. // self-attention
  651. {
  652. struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
  653. struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
  654. struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
  655. // store key and value to memory
  656. if (N >= 1) {
  657. struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
  658. struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
  659. ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
  660. ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
  661. }
  662. // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
  663. struct ggml_tensor * Q =
  664. ggml_permute(ctx0,
  665. ggml_rope(ctx0,
  666. ggml_cpy(ctx0,
  667. Qcur,
  668. ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
  669. n_past, n_rot, 0),
  670. 0, 2, 1, 3);
  671. // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
  672. struct ggml_tensor * K =
  673. ggml_permute(ctx0,
  674. ggml_rope(ctx0,
  675. ggml_reshape_3d(ctx0,
  676. ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
  677. n_embd/n_head, n_head, n_past + N),
  678. n_past, n_rot, 1),
  679. 0, 2, 1, 3);
  680. // K * Q
  681. struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
  682. // KQ_scaled = KQ / sqrt(n_embd/n_head)
  683. struct ggml_tensor * KQ_scaled =
  684. ggml_scale(ctx0,
  685. KQ,
  686. ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
  687. // KQ_masked = mask_past(KQ_scaled)
  688. struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
  689. // KQ = soft_max(KQ_masked)
  690. struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
  691. // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
  692. struct ggml_tensor * V_trans =
  693. ggml_cpy(ctx0,
  694. ggml_permute(ctx0,
  695. ggml_reshape_3d(ctx0,
  696. ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
  697. n_embd/n_head, n_head, n_past + N),
  698. 1, 2, 0, 3),
  699. ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head));
  700. // KQV = transpose(V) * KQ_soft_max
  701. struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
  702. // KQV_merged = KQV.permute(0, 2, 1, 3)
  703. struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
  704. // cur = KQV_merged.contiguous().view(n_embd, N)
  705. cur = ggml_cpy(ctx0,
  706. KQV_merged,
  707. ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
  708. // projection (no bias)
  709. cur = ggml_mul_mat(ctx0,
  710. model.layers[il].wo,
  711. cur);
  712. }
  713. lctx.use_buf(ctx0, 1);
  714. struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
  715. // feed-forward network
  716. {
  717. // norm
  718. {
  719. cur = ggml_rms_norm(ctx0, inpFF);
  720. // cur = ffn_norm*cur
  721. cur = ggml_mul(ctx0,
  722. ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
  723. cur);
  724. }
  725. struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
  726. model.layers[il].w3,
  727. cur);
  728. cur = ggml_mul_mat(ctx0,
  729. model.layers[il].w1,
  730. cur);
  731. // SILU activation
  732. cur = ggml_silu(ctx0, cur);
  733. cur = ggml_mul(ctx0, cur, tmp);
  734. cur = ggml_mul_mat(ctx0,
  735. model.layers[il].w2,
  736. cur);
  737. }
  738. cur = ggml_add(ctx0, cur, inpFF);
  739. // input for next layer
  740. inpL = cur;
  741. }
  742. lctx.use_buf(ctx0, 0);
  743. // used at the end to optionally extract the embeddings
  744. struct ggml_tensor * embeddings = NULL;
  745. // norm
  746. {
  747. inpL = ggml_rms_norm(ctx0, inpL);
  748. // inpL = norm*inpL
  749. inpL = ggml_mul(ctx0,
  750. ggml_repeat(ctx0, model.norm, inpL),
  751. inpL);
  752. embeddings = inpL;
  753. }
  754. // lm_head
  755. inpL = ggml_mul_mat(ctx0, model.output, inpL);
  756. lctx.use_buf(ctx0, -1);
  757. // logits -> probs
  758. //inpL = ggml_soft_max(ctx0, inpL);
  759. // run the computation
  760. ggml_build_forward_expand(&gf, inpL);
  761. ggml_graph_compute (ctx0, &gf);
  762. //if (n_past%100 == 0) {
  763. // ggml_graph_print (&gf);
  764. // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
  765. //}
  766. //embd_w.resize(n_vocab*N);
  767. //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
  768. // extract logits
  769. {
  770. auto & logits_out = lctx.logits;
  771. if (lctx.logits_all) {
  772. logits_out.resize(n_vocab * N);
  773. memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
  774. } else {
  775. // return result for just the last token
  776. logits_out.resize(n_vocab);
  777. memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
  778. }
  779. }
  780. // extract embeddings
  781. if (lctx.embedding.size()) {
  782. auto & embedding_out = lctx.embedding;
  783. embedding_out.resize(n_embd);
  784. memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
  785. }
  786. if (mem_per_token == 0) {
  787. mem_per_token = ggml_used_mem(ctx0)/N;
  788. }
  789. #if 0
  790. printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
  791. ggml_used_mem(ctx0)/1024.0/1024.0,
  792. lctx.get_buf_max_mem(0)/1024.0/1024.0,
  793. lctx.get_buf_max_mem(1)/1024.0/1024.0);
  794. #endif
  795. ggml_free(ctx0);
  796. // measure the performance only for the single-token evals
  797. if (N == 1) {
  798. lctx.t_eval_us += ggml_time_us() - t_start_us;
  799. lctx.n_eval++;
  800. }
  801. else if (N > 1) {
  802. lctx.t_p_eval_us += ggml_time_us() - t_start_us;
  803. lctx.n_p_eval += N;
  804. }
  805. return true;
  806. }
  807. //
  808. // tokenizer
  809. //
  // Returns the byte length of the UTF-8 sequence that starts with `src`,
  // decided solely by the upper four bits of that byte:
  //   0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4
  // Bytes whose high nibble is 0x8-0xB therefore count as 1-byte symbols.
  static size_t utf8_len(char src) {
      static const size_t k_len_by_high_nibble[16] = {
          1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 2, 2, 3, 4,
      };
      const uint8_t lead = static_cast<uint8_t>(src);
      return k_len_by_high_nibble[lead >> 4];
  }
  // One node of the doubly linked symbol chain the tokenizer works on.
  // Every member has a default so a freshly declared symbol is never read
  // uninitialized: it is unlinked (-1 / -1) and empty (n == 0).
  struct llama_sp_symbol {
      using index = int;
      index prev = -1;             // index of the previous symbol in the chain, -1 if none
      index next = -1;             // index of the next symbol in the chain, -1 if none
      const char * text = nullptr; // start of this symbol's bytes inside the tokenized string (not owned)
      size_t n = 0;                // number of bytes; the tokenizer sets 0 on symbols merged into their left neighbour
  };
  822. struct llama_sp_bigram {
  823. struct comparator {
  824. bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
  825. return (l.score < r.score) || (l.score == r.score && l.left > r.left);
  826. }
  827. };
  828. using queue_storage = std::vector<llama_sp_bigram>;
  829. using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
  830. llama_sp_symbol::index left;
  831. llama_sp_symbol::index right;
  832. float score;
  833. size_t size;
  834. };
  835. // original implementation:
  836. // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
  837. struct llama_tokenizer {
  838. llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
  839. void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
  840. // split string into utf8 chars
  841. int index = 0;
  842. size_t offs = 0;
  843. while (offs < text.size()) {
  844. llama_sp_symbol sym;
  845. size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
  846. sym.text = text.c_str() + offs;
  847. sym.n = char_len;
  848. offs += char_len;
  849. sym.prev = index - 1;
  850. sym.next = offs == text.size() ? -1 : index + 1;
  851. index++;
  852. symbols_.emplace_back(std::move(sym));
  853. }
  854. // seed the work queue with all possible 2-character tokens.
  855. for (size_t i = 1; i < symbols_.size(); ++i) {
  856. try_add_bigram(i - 1, i);
  857. }
  858. // keep substituting the highest frequency pairs for as long as we can.
  859. while (!work_queue_.empty()) {
  860. auto bigram = work_queue_.top();
  861. work_queue_.pop();
  862. auto & left_sym = symbols_[bigram.left];
  863. auto & right_sym = symbols_[bigram.right];
  864. // if one of the symbols already got merged, skip it.
  865. if (left_sym.n == 0 || right_sym.n == 0 ||
  866. left_sym.n + right_sym.n != bigram.size) {
  867. continue;
  868. }
  869. // merge the right sym into the left one
  870. left_sym.n += right_sym.n;
  871. right_sym.n = 0;
  872. //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
  873. // remove the right sym from the chain
  874. left_sym.next = right_sym.next;
  875. if (right_sym.next >= 0) {
  876. symbols_[right_sym.next].prev = bigram.left;
  877. }
  878. // find more substitutions
  879. try_add_bigram(left_sym.prev, bigram.left);
  880. try_add_bigram(bigram.left, left_sym.next);
  881. }
  882. for (int i = 0; i != -1; i = symbols_[i].next) {
  883. auto & symbol = symbols_[i];
  884. auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
  885. if (token == vocab_.token_to_id.end()) {
  886. // output any symbols that did not form tokens as bytes.
  887. for (int j = 0; j < (int) symbol.n; ++j) {
  888. llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
  889. output.push_back(token_id);
  890. }
  891. } else {
  892. output.push_back((*token).second);
  893. }
  894. }
  895. }
  896. private:
  897. void try_add_bigram(int left, int right) {
  898. if (left == -1 || right == -1) {
  899. return;
  900. }
  901. const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
  902. auto token = vocab_.token_to_id.find(text);
  903. if (token == vocab_.token_to_id.end()) {
  904. return;
  905. }
  906. if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
  907. return;
  908. }
  909. const auto &tok_score = vocab_.id_to_token[(*token).second];
  910. llama_sp_bigram bigram;
  911. bigram.left = left;
  912. bigram.right = right;
  913. bigram.score = tok_score.score;
  914. bigram.size = text.size();
  915. work_queue_.push(bigram);
  916. }
  917. const llama_vocab & vocab_;
  918. std::vector<llama_sp_symbol> symbols_;
  919. llama_sp_bigram::queue work_queue_;
  920. };
  921. static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
  922. llama_tokenizer tokenizer(vocab);
  923. std::vector<llama_vocab::id> output;
  924. if (text.size() == 0) {
  925. return output;
  926. }
  927. if (bos) {
  928. output.push_back(1);
  929. }
  930. tokenizer.tokenize(text, output);
  931. return output;
  932. }
  933. //
  934. // sampling
  935. //
  936. static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
  937. // find the top k tokens
  938. std::partial_sort(
  939. logits_id.begin(),
  940. logits_id.begin() + top_k, logits_id.end(),
  941. [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
  942. return a.first > b.first;
  943. });
  944. logits_id.resize(top_k);
  945. }
  946. static llama_vocab::id llama_sample_top_p_top_k(
  947. llama_context & lctx,
  948. const std::vector<llama_vocab::id> & last_n_tokens,
  949. int top_k,
  950. float top_p,
  951. float temp,
  952. float repeat_penalty) {
  953. auto & rng = lctx.rng;
  954. const int n_logits = lctx.model.hparams.n_vocab;
  955. const auto & logits = lctx.logits;
  956. const auto * plogits = logits.data() + logits.size() - n_logits;
  957. std::vector<std::pair<float, llama_vocab::id>> logits_id;
  958. logits_id.reserve(n_logits);
  959. {
  960. const float scale = 1.0f/temp;
  961. for (int i = 0; i < n_logits; ++i) {
  962. // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
  963. // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
  964. if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
  965. // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
  966. if (plogits[i] < 0.0f) {
  967. logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
  968. } else {
  969. logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
  970. }
  971. } else {
  972. logits_id.push_back(std::make_pair(plogits[i]*scale, i));
  973. }
  974. }
  975. }
  976. sample_top_k(logits_id, top_k);
  977. float maxl = -std::numeric_limits<float>::infinity();
  978. for (const auto & kv : logits_id) {
  979. maxl = Max(maxl, kv.first);
  980. }
  981. // compute probs for the top k tokens
  982. std::vector<float> probs;
  983. probs.reserve(logits_id.size());
  984. double sum = 0.0;
  985. for (const auto & kv : logits_id) {
  986. const float p = expf(kv.first - maxl);
  987. probs.push_back(p);
  988. sum += p;
  989. }
  990. // normalize the probs
  991. for (auto & p : probs) {
  992. p /= sum;
  993. }
  994. if (top_p < 1.0) {
  995. double cumsum = 0.0;
  996. for (int i = 0; i < (int) probs.size(); i++) {
  997. cumsum += probs[i];
  998. if (cumsum >= top_p) {
  999. probs.resize(i + 1);
  1000. logits_id.resize(i + 1);
  1001. break;
  1002. }
  1003. }
  1004. cumsum = 1.0/cumsum;
  1005. for (int i = 0; i < (int) probs.size(); i++) {
  1006. probs[i] *= cumsum;
  1007. }
  1008. }
  1009. //printf("\n");
  1010. //for (int i = 0; i < (int) 10; i++) {
  1011. // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
  1012. //}
  1013. //printf("\n\n");
  1014. //exit(0);
  1015. std::discrete_distribution<> dist(probs.begin(), probs.end());
  1016. int idx = dist(rng);
  1017. return logits_id[idx].second;
  1018. }
  1019. //
  1020. // quantization
  1021. //
  1022. // TODO: reuse code from the llama_model_load() somehow
  1023. static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
  1024. ggml_type type = GGML_TYPE_Q4_1;
  1025. switch (itype) {
  1026. case 2: type = GGML_TYPE_Q4_0; break;
  1027. case 3: type = GGML_TYPE_Q4_1; break;
  1028. default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
  1029. };
  1030. if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
  1031. fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
  1032. return false;
  1033. }
  1034. llama_vocab vocab;
  1035. printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
  1036. auto finp = std::ifstream(fname_inp, std::ios::binary);
  1037. if (!finp) {
  1038. fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
  1039. return false;
  1040. }
  1041. auto fout = std::ofstream(fname_out, std::ios::binary);
  1042. if (!fout) {
  1043. fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
  1044. return false;
  1045. }
  1046. // verify magic
  1047. {
  1048. uint32_t magic;
  1049. finp.read((char *) &magic, sizeof(magic));
  1050. if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
  1051. fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
  1052. __func__, fname_inp.c_str());
  1053. return false;
  1054. }
  1055. if (magic != LLAMA_FILE_MAGIC) {
  1056. return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
  1057. }
  1058. fout.write((char *) &magic, sizeof(magic));
  1059. uint32_t format_version;
  1060. finp.read((char *) &format_version, sizeof(format_version));
  1061. if (format_version != LLAMA_FILE_VERSION) {
  1062. fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
  1063. __func__, fname_inp.c_str(), format_version, LLAMA_FILE_VERSION);
  1064. return false;
  1065. }
  1066. fout.write((char *) &format_version, sizeof(format_version));
  1067. }
  1068. llama_hparams hparams;
  1069. // load hparams
  1070. {
  1071. finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
  1072. //finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
  1073. finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
  1074. finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
  1075. finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
  1076. finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
  1077. finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
  1078. finp.read((char *) &hparams.f16, sizeof(hparams.f16));
  1079. printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
  1080. printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
  1081. printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
  1082. printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
  1083. printf("%s: n_head = %d\n", __func__, hparams.n_head);
  1084. printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
  1085. printf("%s: f16 = %d\n", __func__, hparams.f16);
  1086. fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
  1087. //fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
  1088. fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
  1089. fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
  1090. fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
  1091. fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
  1092. fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
  1093. fout.write((char *) &itype, sizeof(hparams.f16));
  1094. }
  1095. // load vocab
  1096. {
  1097. const int32_t n_vocab = hparams.n_vocab;
  1098. if (n_vocab != hparams.n_vocab) {
  1099. fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
  1100. __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
  1101. return false;
  1102. }
  1103. std::vector<char> word(32);
  1104. vocab.id_to_token.resize(n_vocab);
  1105. for (int i = 0; i < n_vocab; i++) {
  1106. uint32_t len;
  1107. finp.read ((char *) &len, sizeof(len));
  1108. fout.write((char *) &len, sizeof(len));
  1109. word.resize(len);
  1110. finp.read ((char *) &word[0], len);
  1111. fout.write((char *) &word[0], len);
  1112. float score;
  1113. finp.read ((char *) &score, sizeof(score));
  1114. fout.write((char *) &score, sizeof(score));
  1115. vocab.token_to_id[word.data()] = i;
  1116. auto &tok_score = vocab.id_to_token[i];
  1117. tok_score.tok = word.data();
  1118. tok_score.score = score;
  1119. }
  1120. }
  1121. // load weights
  1122. {
  1123. size_t total_size_org = 0;
  1124. size_t total_size_new = 0;
  1125. std::vector<float> work;
  1126. std::vector<uint8_t> data_u8;
  1127. std::vector<ggml_fp16_t> data_f16;
  1128. std::vector<float> data_f32;
  1129. std::vector<int64_t> hist_all(1 << 4, 0);
  1130. while (true) {
  1131. int32_t n_dims;
  1132. int32_t length;
  1133. int32_t ftype;
  1134. finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
  1135. finp.read(reinterpret_cast<char *>(&length), sizeof(length));
  1136. finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
  1137. if (finp.eof()) {
  1138. break;
  1139. }
  1140. int32_t nelements = 1;
  1141. int32_t ne[2] = { 1, 1 };
  1142. for (int i = 0; i < n_dims; ++i) {
  1143. finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  1144. nelements *= ne[i];
  1145. }
  1146. std::string name(length, 0);
  1147. finp.read (&name[0], length);
  1148. {
  1149. // ensure tensor data is aligned
  1150. uint64_t offset = finp.tellg();
  1151. offset = (offset + 31) & -32;
  1152. finp.seekg(offset);
  1153. }
  1154. {
  1155. static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
  1156. printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
  1157. }
  1158. // regexes of tensor names to be quantized
  1159. const std::vector<std::string> k_names = {
  1160. ".*weight",
  1161. };
  1162. bool quantize = false;
  1163. for (const auto & s : k_names) {
  1164. if (std::regex_match(name, std::regex(s))) {
  1165. quantize = true;
  1166. break;
  1167. }
  1168. }
  1169. // quantize only 2D tensors
  1170. quantize &= (n_dims == 2);
  1171. if (quantize) {
  1172. if (ftype != 0 && ftype != 1) {
  1173. fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
  1174. return false;
  1175. }
  1176. if (ftype == 1) {
  1177. data_f16.resize(nelements);
  1178. finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
  1179. data_f32.resize(nelements);
  1180. for (int i = 0; i < nelements; ++i) {
  1181. data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
  1182. }
  1183. } else {
  1184. data_f32.resize(nelements);
  1185. finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
  1186. }
  1187. ftype = itype;
  1188. } else {
  1189. const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
  1190. data_u8.resize(nelements*bpe);
  1191. finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
  1192. }
  1193. fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
  1194. fout.write(reinterpret_cast<char *>(&length), sizeof(length));
  1195. fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
  1196. for (int i = 0; i < n_dims; ++i) {
  1197. fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
  1198. }
  1199. fout.write(&name[0], length);
  1200. {
  1201. // ensure tensor data is aligned
  1202. uint64_t offset = fout.tellp();
  1203. offset = (offset + 31) & -32;
  1204. fout.seekp(offset);
  1205. }
  1206. if (quantize) {
  1207. printf("quantizing .. ");
  1208. work.resize(nelements); // for quantization
  1209. size_t cur_size = 0;
  1210. std::vector<int64_t> hist_cur(1 << 4, 0);
  1211. switch (type) {
  1212. case GGML_TYPE_Q4_0:
  1213. {
  1214. cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  1215. } break;
  1216. case GGML_TYPE_Q4_1:
  1217. {
  1218. cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
  1219. } break;
  1220. default:
  1221. {
  1222. fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
  1223. return false;
  1224. }
  1225. }
  1226. fout.write(reinterpret_cast<char *>(work.data()), cur_size);
  1227. total_size_new += cur_size;
  1228. printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
  1229. for (int i = 0; i < (int) hist_cur.size(); ++i) {
  1230. hist_all[i] += hist_cur[i];
  1231. }
  1232. for (int i = 0; i < (int) hist_cur.size(); ++i) {
  1233. printf("%5.3f ", hist_cur[i] / float(nelements));
  1234. }
  1235. printf("\n");
  1236. } else {
  1237. printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
  1238. fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
  1239. total_size_new += data_u8.size();
  1240. }
  1241. total_size_org += nelements * sizeof(float);
  1242. }
  1243. printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
  1244. printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
  1245. {
  1246. int64_t sum_all = 0;
  1247. for (int i = 0; i < (int) hist_all.size(); ++i) {
  1248. sum_all += hist_all[i];
  1249. }
  1250. printf("%s: hist: ", __func__);
  1251. for (int i = 0; i < (int) hist_all.size(); ++i) {
  1252. printf("%5.3f ", hist_all[i] / float(sum_all));
  1253. }
  1254. printf("\n");
  1255. }
  1256. }
  1257. finp.close();
  1258. fout.close();
  1259. return true;
  1260. }
  1261. //
  1262. // interface implementation
  1263. //
  1264. struct llama_context * llama_init_from_file(
  1265. const char * path_model,
  1266. struct llama_context_params params) {
  1267. ggml_time_init();
  1268. llama_context * ctx = new llama_context;
  1269. if (params.seed <= 0) {
  1270. params.seed = time(NULL);
  1271. }
  1272. ctx->rng = std::mt19937(params.seed);
  1273. ctx->logits_all = params.logits_all;
  1274. ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
  1275. if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
  1276. params.vocab_only, params.progress_callback,
  1277. params.progress_callback_user_data)) {
  1278. fprintf(stderr, "%s: failed to load model\n", __func__);
  1279. llama_free(ctx);
  1280. return nullptr;
  1281. }
  1282. if (params.use_mlock) {
  1283. char *err;
  1284. if (!ggml_mlock(ctx->model.ctx,
  1285. ctx->model.mm_addr,
  1286. ctx->model.mm_length,
  1287. &err)) {
  1288. fprintf(stderr, "%s\n", err);
  1289. free(err);
  1290. llama_free(ctx);
  1291. return nullptr;
  1292. }
  1293. }
  1294. // reserve memory for context buffers
  1295. {
  1296. if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
  1297. fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
  1298. llama_free(ctx);
  1299. return nullptr;
  1300. }
  1301. {
  1302. const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
  1303. fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
  1304. }
  1305. const auto & hparams = ctx->model.hparams;
  1306. // resized during inference
  1307. if (params.logits_all) {
  1308. ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
  1309. } else {
  1310. ctx->logits.reserve(hparams.n_ctx);
  1311. }
  1312. if (params.embedding){
  1313. ctx->embedding.resize(hparams.n_embd);
  1314. }
  1315. ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
  1316. ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
  1317. ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
  1318. }
  1319. return ctx;
  1320. }
  1321. void llama_free(struct llama_context * ctx) {
  1322. kv_cache_free(ctx->model.kv_self);
  1323. if (ctx->model.ctx) {
  1324. ggml_free(ctx->model.ctx);
  1325. }
  1326. if (ctx->model.mm_addr) {
  1327. munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
  1328. }
  1329. delete ctx;
  1330. }
  1331. int llama_model_quantize(
  1332. const char * fname_inp,
  1333. const char * fname_out,
  1334. int itype) {
  1335. if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
  1336. fprintf(stderr, "%s: failed to quantize\n", __func__);
  1337. return 1;
  1338. }
  1339. return 0;
  1340. }
  1341. int llama_eval(
  1342. struct llama_context * ctx,
  1343. const llama_token * tokens,
  1344. int n_tokens,
  1345. int n_past,
  1346. int n_threads) {
  1347. if (!llama_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
  1348. fprintf(stderr, "%s: failed to eval\n", __func__);
  1349. return 1;
  1350. }
  1351. // get a more accurate load time, upon first eval
  1352. if (!ctx->has_evaluated_once) {
  1353. ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
  1354. ctx->has_evaluated_once = true;
  1355. }
  1356. return 0;
  1357. }
  1358. int llama_tokenize(
  1359. struct llama_context * ctx,
  1360. const char * text,
  1361. llama_token * tokens,
  1362. int n_max_tokens,
  1363. bool add_bos) {
  1364. auto res = llama_tokenize(ctx->vocab, text, add_bos);
  1365. if (n_max_tokens < (int) res.size()) {
  1366. fprintf(stderr, "%s: too many tokens\n", __func__);
  1367. return -((int) res.size());
  1368. }
  1369. for (size_t i = 0; i < res.size(); i++) {
  1370. tokens[i] = res[i];
  1371. }
  1372. return res.size();
  1373. }
  1374. int llama_n_vocab(struct llama_context * ctx) {
  1375. return ctx->vocab.id_to_token.size();
  1376. }
  1377. int llama_n_ctx(struct llama_context * ctx) {
  1378. return ctx->model.hparams.n_ctx;
  1379. }
  1380. int llama_n_embd(struct llama_context * ctx) {
  1381. return ctx->model.hparams.n_embd;
  1382. }
  1383. float * llama_get_logits(struct llama_context * ctx) {
  1384. return ctx->logits.data();
  1385. }
  1386. float * llama_get_embeddings(struct llama_context * ctx) {
  1387. return ctx->embedding.data();
  1388. }
  1389. const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
  1390. if (token >= llama_n_vocab(ctx)) {
  1391. return nullptr;
  1392. }
  1393. return ctx->vocab.id_to_token[token].tok.c_str();
  1394. }
  1395. llama_token llama_token_bos() {
  1396. return 1;
  1397. }
  1398. llama_token llama_token_eos() {
  1399. return 2;
  1400. }
  1401. llama_token llama_sample_top_p_top_k(
  1402. llama_context * ctx,
  1403. const llama_token * last_n_tokens_data,
  1404. int last_n_tokens_size,
  1405. int top_k,
  1406. float top_p,
  1407. float temp,
  1408. float repeat_penalty) {
  1409. const int64_t t_start_sample_us = ggml_time_us();
  1410. llama_token result = 0;
  1411. // TODO: avoid this ...
  1412. const auto last_n_tokens = std::vector<llama_token>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
  1413. result = llama_sample_top_p_top_k(
  1414. *ctx,
  1415. last_n_tokens,
  1416. top_k,
  1417. top_p,
  1418. temp,
  1419. repeat_penalty);
  1420. ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
  1421. ctx->n_sample++;
  1422. return result;
  1423. }
  1424. void llama_print_timings(struct llama_context * ctx) {
  1425. const int64_t t_end_us = ggml_time_us();
  1426. const int32_t n_sample = Max(1, ctx->n_sample);
  1427. const int32_t n_eval = Max(1, ctx->n_eval);
  1428. const int32_t n_p_eval = Max(1, ctx->n_p_eval);
  1429. fprintf(stderr, "\n");
  1430. fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
  1431. fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
  1432. fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
  1433. fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
  1434. fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
  1435. }
  1436. void llama_reset_timings(struct llama_context * ctx) {
  1437. ctx->t_start_us = ggml_time_us();
  1438. ctx->t_sample_us = ctx->n_sample = 0;
  1439. ctx->t_eval_us = ctx->n_eval = 0;
  1440. ctx->t_p_eval_us = ctx->n_p_eval = 0;
  1441. }
  1442. const char * llama_print_system_info(void) {
  1443. static std::string s;
  1444. s = "";
  1445. s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
  1446. s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
  1447. s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
  1448. s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
  1449. s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
  1450. s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
  1451. s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
  1452. s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
  1453. s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
  1454. s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
  1455. s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
  1456. s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
  1457. return s.c_str();
  1458. }