llama-context.cpp

  1. #include "llama-context.h"
  2. #include "llama-impl.h"
  3. #include "llama-io.h"
  4. #include "llama-mmap.h"
  5. #include "llama-model.h"
  6. #include "llama-kv-cache.h"
  7. #include <cstring>
  8. #include <stdexcept>
  9. #include <cinttypes>
  10. //
  11. // llama_context
  12. //
  13. llama_context::llama_context(
  14. const llama_model & model,
  15. llama_context_params params) :
  16. model(model) {
  17. LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
  18. t_start_us = model.t_start_us;
  19. t_load_us = model.t_load_us;
  20. const auto & hparams = model.hparams;
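// derive the effective context parameters (cparams) from the user-provided params,
// falling back to the model's training-time hyperparameters when a value is 0 or unspecified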
  21. cparams.n_seq_max = std::max(1u, params.n_seq_max);
  22. cparams.n_threads = params.n_threads;
  23. cparams.n_threads_batch = params.n_threads_batch;
  24. cparams.yarn_ext_factor = params.yarn_ext_factor;
  25. cparams.yarn_attn_factor = params.yarn_attn_factor;
  26. cparams.yarn_beta_fast = params.yarn_beta_fast;
  27. cparams.yarn_beta_slow = params.yarn_beta_slow;
  28. cparams.defrag_thold = params.defrag_thold;
  29. cparams.embeddings = params.embeddings;
  30. cparams.offload_kqv = params.offload_kqv;
  31. cparams.flash_attn = params.flash_attn;
  32. cparams.no_perf = params.no_perf;
  33. cparams.pooling_type = params.pooling_type;
  34. cparams.warmup = false;
  35. cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
  36. cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
  37. cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;
  38. cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx :
  39. hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn :
  40. hparams.n_ctx_train;
  41. cparams.cb_eval = params.cb_eval;
  42. cparams.cb_eval_user_data = params.cb_eval_user_data;
  43. auto rope_scaling_type = params.rope_scaling_type;
  44. if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
  45. rope_scaling_type = hparams.rope_scaling_type_train;
  46. }
  47. if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
  48. cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
  49. }
  50. if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
  51. cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
  52. }
  53. cparams.yarn_attn_factor *= hparams.rope_attn_factor;
  54. if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
  55. if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
  56. cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
  57. } else {
  58. cparams.pooling_type = hparams.pooling_type;
  59. }
  60. }
  61. if (params.attention_type == LLAMA_ATTENTION_TYPE_UNSPECIFIED) {
  62. cparams.causal_attn = hparams.causal_attn;
  63. } else {
  64. cparams.causal_attn = params.attention_type == LLAMA_ATTENTION_TYPE_CAUSAL;
  65. }
  66. // with causal attention, the batch size is limited by the context size
  67. cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
  68. // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
  69. // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
  70. // ref: https://github.com/ggerganov/llama.cpp/pull/5021
  71. // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
  72. if (cparams.n_batch < GGML_KQ_MASK_PAD) {
  73. LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
  74. cparams.n_batch = GGML_KQ_MASK_PAD;
  75. }
  76. cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
  77. const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
  78. LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
  79. LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
  80. LLAMA_LOG_INFO("%s: n_ctx_per_seq = %u\n", __func__, n_ctx_per_seq);
  81. LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch);
  82. LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
  83. LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
  84. LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
  85. LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
  86. LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);
  87. if (n_ctx_per_seq < hparams.n_ctx_train) {
  88. LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n",
  89. __func__, n_ctx_per_seq, hparams.n_ctx_train);
  90. }
  91. if (n_ctx_per_seq > hparams.n_ctx_train) {
  92. LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
  93. __func__, n_ctx_per_seq, hparams.n_ctx_train);
  94. }
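// initialize the compute backends: one backend per GPU device, then any additional ACCEL backends (such as BLAS), and finally the CPU backend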
  95. if (!hparams.vocab_only) {
  96. // GPU backends
  97. for (auto * dev : model.devices) {
  98. ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
  99. if (backend == nullptr) {
  100. throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev)));
  101. }
  102. backends.emplace_back(backend);
  103. }
  104. // add ACCEL backends (such as BLAS)
  105. for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
  106. ggml_backend_dev_t dev = ggml_backend_dev_get(i);
  107. if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
  108. ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
  109. if (backend == nullptr) {
  110. throw std::runtime_error(format("failed to initialize %s backend", ggml_backend_dev_name(dev)));
  111. }
  112. backends.emplace_back(backend);
  113. }
  114. }
  115. // add CPU backend
  116. backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
  117. if (backend_cpu == nullptr) {
  118. throw std::runtime_error("failed to initialize CPU backend");
  119. }
  120. backends.emplace_back(backend_cpu);
  121. // create a list of the set_n_threads functions in the backends
  122. for (auto & backend : backends) {
  123. ggml_backend_dev_t dev = ggml_backend_get_device(backend.get());
  124. ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
  125. if (reg) {
  126. auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
  127. if (ggml_backend_set_n_threads_fn) {
  128. set_n_threads_fns.emplace_back(backend.get(), ggml_backend_set_n_threads_fn);
  129. }
  130. }
  131. }
  132. llama_set_abort_callback(this, params.abort_callback, params.abort_callback_data);
  133. // graph outputs buffer
  134. {
  135. // resized during inference when a batch uses more outputs
  136. if ((uint32_t) output_reserve(params.n_seq_max) < params.n_seq_max) {
  137. throw std::runtime_error("failed to reserve initial output buffer");
  138. }
  139. LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__,
  140. ggml_backend_buffer_name (buf_output.get()),
  141. ggml_backend_buffer_get_size(buf_output.get()) / 1024.0 / 1024.0);
  142. }
  143. }
  144. // init the memory module
  145. if (!hparams.vocab_only) {
  146. llama_memory_params params_mem = {
  147. /*.type_k =*/ params.type_k,
  148. /*.type_v =*/ params.type_v,
  149. };
  150. memory.reset(model.create_memory(params_mem, cparams));
  151. }
  152. // init backends
  153. if (!hparams.vocab_only) {
  154. LLAMA_LOG_DEBUG("%s: enumerating backends\n", __func__);
  155. backend_buft.clear();
  156. backend_ptrs.clear();
  157. for (auto & backend : backends) {
  158. auto * buft = ggml_backend_get_default_buffer_type(backend.get());
  159. auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
  160. if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model.devices.empty()) {
161. // for the CPU backend, use the host buffer of the first device for faster transfer of the intermediate state
  162. auto * dev = model.devices[0];
  163. auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
  164. if (host_buft) {
  165. buft = host_buft;
  166. }
  167. }
  168. backend_buft.push_back(buft);
  169. backend_ptrs.push_back(backend.get());
  170. }
  171. LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size());
  172. const size_t max_nodes = this->graph_max_nodes();
  173. LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);
  174. // buffer used to store the computation graph and the tensor meta data
  175. buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
  176. // TODO: move these checks to ggml_backend_sched
  177. // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
  178. bool pipeline_parallel =
  179. model.n_devices() > 1 &&
  180. model.params.n_gpu_layers > (int) model.hparams.n_layer &&
  181. model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
  182. cparams.offload_kqv &&
  183. !model.has_tensor_overrides();
  184. // pipeline parallelism requires support for async compute and events in all devices
  185. if (pipeline_parallel) {
  186. for (auto & backend : backends) {
  187. auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
  188. if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
  189. // ignore CPU backend
  190. continue;
  191. }
  192. auto * dev = ggml_backend_get_device(backend.get());
  193. ggml_backend_dev_props props;
  194. ggml_backend_dev_get_props(dev, &props);
  195. if (!props.caps.async || !props.caps.events) {
  196. // device does not support async compute or events
  197. pipeline_parallel = false;
  198. break;
  199. }
  200. }
  201. }
  202. sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
  203. if (pipeline_parallel) {
  204. LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
  205. }
  206. }
  207. // reserve worst-case graph
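// reserving both the prompt-processing (pp) and token-generation (tg) graphs up front lets the scheduler
// allocate its compute buffers once, so no reallocation is needed during inference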
  208. if (!hparams.vocab_only && memory) {
  209. const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  210. const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
211. llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between the token and embedding input graphs
  212. // restore later
  213. // TODO: something cleaner
  214. const auto n_outputs_save = n_outputs;
  215. LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
  216. int n_splits_pp = -1;
  217. int n_nodes_pp = -1;
  218. int n_splits_tg = -1;
  219. int n_nodes_tg = -1;
  220. // simulate full KV cache
  221. llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
  222. kv_self->set_full();
  223. cross.v_embd.clear();
  224. // reserve pp graph first so that buffers are only allocated once
  225. {
  226. llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  227. // max number of outputs
  228. n_outputs = ubatch_pp.n_tokens;
  229. LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
  230. auto * gf = graph_init();
  231. graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
  232. if (!ggml_backend_sched_reserve(sched.get(), gf)) {
  233. throw std::runtime_error("failed to allocate compute pp buffers");
  234. }
  235. n_splits_pp = ggml_backend_sched_get_n_splits(sched.get());
  236. n_nodes_pp = ggml_graph_n_nodes(gf);
  237. }
  238. // reserve with tg graph to get the number of splits and nodes
  239. {
  240. llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  241. n_outputs = ubatch_tg.n_tokens;
  242. LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_tg.n_tokens, ubatch_tg.n_seqs);
  243. auto * gf = graph_init();
  244. graph_build(ctx_compute.get(), gf, ubatch_tg, LLM_GRAPH_TYPE_DEFAULT);
  245. if (!ggml_backend_sched_reserve(sched.get(), gf)) {
  246. throw std::runtime_error("failed to allocate compute tg buffers");
  247. }
  248. n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
  249. n_nodes_tg = ggml_graph_n_nodes(gf);
  250. }
  251. // reserve again with pp graph to avoid ggml-alloc reallocations during inference
  252. {
  253. llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  254. n_outputs = ubatch_pp.n_tokens;
  255. LLAMA_LOG_DEBUG("%s: reserving graph for n_tokens = %d, n_seqs = %d\n", __func__, ubatch_pp.n_tokens, ubatch_pp.n_seqs);
  256. auto * gf = graph_init();
  257. graph_build(ctx_compute.get(), gf, ubatch_pp, LLM_GRAPH_TYPE_DEFAULT);
  258. if (!ggml_backend_sched_reserve(sched.get(), gf)) {
  259. throw std::runtime_error("failed to allocate compute pp buffers");
  260. }
  261. }
  262. n_outputs = n_outputs_save;
  263. for (size_t i = 0; i < backend_ptrs.size(); ++i) {
  264. ggml_backend_t backend = backend_ptrs[i];
  265. ggml_backend_buffer_type_t buft = backend_buft[i];
  266. size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend);
  267. if (size > 1) {
  268. LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
  269. ggml_backend_buft_name(buft),
  270. size / 1024.0 / 1024.0);
  271. }
  272. }
  273. if (n_nodes_pp == n_nodes_tg) {
  274. LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, n_nodes_pp);
  275. } else {
  276. LLAMA_LOG_INFO("%s: graph nodes = %d (with bs=%d), %d (with bs=1)\n", __func__, n_nodes_pp, n_tokens, n_nodes_tg);
  277. }
  278. if (n_splits_pp == n_splits_tg) {
  279. LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits_pp);
  280. } else {
  281. LLAMA_LOG_INFO("%s: graph splits = %d (with bs=%d), %d (with bs=1)\n", __func__, n_splits_pp, n_tokens, n_splits_tg);
  282. }
  283. }
  284. }
  285. llama_context::~llama_context() = default;
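// block until all scheduled backend work has finished and attribute the queued tokens
// to the eval / prompt-eval timing statistics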
  286. void llama_context::synchronize() {
  287. ggml_backend_sched_synchronize(sched.get());
  288. // FIXME: if multiple single tokens are evaluated without a synchronization,
  289. // the stats will be added to the prompt evaluation stats
  290. // this should only happen when using batch size 1 to evaluate a batch
  291. // add the evaluation to the stats
  292. if (n_queued_tokens == 1) {
  293. if (!cparams.no_perf) {
  294. t_eval_us += ggml_time_us() - t_compute_start_us;
  295. }
  296. n_eval++;
  297. } else if (n_queued_tokens > 1) {
  298. if (!cparams.no_perf) {
  299. t_p_eval_us += ggml_time_us() - t_compute_start_us;
  300. }
  301. n_p_eval += n_queued_tokens;
  302. }
  303. // get a more accurate load time, upon first eval
  304. if (n_queued_tokens > 0 && !has_evaluated_once) {
  305. t_load_us = ggml_time_us() - t_start_us;
  306. has_evaluated_once = true;
  307. }
  308. n_queued_tokens = 0;
  309. t_compute_start_us = 0;
  310. }
  311. const llama_model & llama_context::get_model() const {
  312. return model;
  313. }
  314. const llama_cparams & llama_context::get_cparams() const {
  315. return cparams;
  316. }
  317. ggml_backend_sched_t llama_context::get_sched() const {
  318. return sched.get();
  319. }
  320. ggml_context * llama_context::get_ctx_compute() const {
  321. return ctx_compute.get();
  322. }
  323. uint32_t llama_context::n_ctx() const {
  324. return cparams.n_ctx;
  325. }
  326. uint32_t llama_context::n_ctx_per_seq() const {
  327. return cparams.n_ctx / cparams.n_seq_max;
  328. }
  329. uint32_t llama_context::n_batch() const {
  330. return cparams.n_batch;
  331. }
  332. uint32_t llama_context::n_ubatch() const {
  333. return cparams.n_ubatch;
  334. }
  335. uint32_t llama_context::n_seq_max() const {
  336. return cparams.n_seq_max;
  337. }
  338. uint32_t llama_context::n_threads() const {
  339. return cparams.n_threads;
  340. }
  341. uint32_t llama_context::n_threads_batch() const {
  342. return cparams.n_threads_batch;
  343. }
  344. llama_kv_cache * llama_context::get_kv_self() {
  345. llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
  346. return kv_self;
  347. }
  348. const llama_kv_cache * llama_context::get_kv_self() const {
  349. llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
  350. return kv_self;
  351. }
  352. void llama_context::kv_self_update() {
  353. bool need_reserve = false;
  354. llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
  355. need_reserve = kv_self->update(*this);
  356. // reserve a worst case graph if needed
  357. if (need_reserve) {
  358. LLAMA_LOG_DEBUG("%s: reserving a worst case graph\n", __func__);
  359. // build worst-case graph
  360. uint32_t n_seqs = 1; // TODO: worst-case number of sequences
  361. uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
  362. // simulate full KV cache
  363. kv_self->set_full();
364. llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between the token and embedding input graphs
  365. llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
  366. auto * gf = graph_init();
  367. graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
  368. // initialize scheduler with the worst-case graph
  369. ggml_backend_sched_reset(sched.get());
  370. if (!ggml_backend_sched_reserve(sched.get(), gf)) {
  371. LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
  372. }
  373. }
  374. }
  375. enum llama_pooling_type llama_context::pooling_type() const {
  376. return cparams.pooling_type;
  377. }
  378. float * llama_context::get_logits() {
  379. return logits;
  380. }
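// return the logits of the i-th token of the last batch: a negative i counts back from the end of the
// outputs, otherwise i is a batch position mapped through output_ids to its row in the logits buffer;
// an invalid index returns nullptr (or aborts in debug builds)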
  381. float * llama_context::get_logits_ith(int32_t i) {
  382. int32_t j = -1;
  383. try {
  384. if (logits == nullptr) {
  385. throw std::runtime_error("no logits");
  386. }
  387. if (i < 0) {
  388. j = n_outputs + i;
  389. if (j < 0) {
  390. throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
  391. }
  392. } else if ((size_t) i >= output_ids.size()) {
  393. throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
  394. } else {
  395. j = output_ids[i];
  396. }
  397. if (j < 0) {
  398. throw std::runtime_error(format("batch.logits[%d] != true", i));
  399. }
  400. if (j >= n_outputs) {
  401. // This should not happen
  402. throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
  403. }
  404. return logits + j*model.vocab.n_tokens();
  405. } catch (const std::exception & err) {
  406. LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
  407. #ifndef NDEBUG
  408. GGML_ABORT("fatal error");
  409. #else
  410. return nullptr;
  411. #endif
  412. }
  413. }
  414. float * llama_context::get_embeddings() {
  415. return embd;
  416. }
  417. float * llama_context::get_embeddings_ith(int32_t i) {
  418. int32_t j = -1;
  419. try {
  420. if (embd == nullptr) {
  421. throw std::runtime_error("no embeddings");
  422. }
  423. if (i < 0) {
  424. j = n_outputs + i;
  425. if (j < 0) {
  426. throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs));
  427. }
  428. } else if ((size_t) i >= output_ids.size()) {
  429. throw std::runtime_error(format("out of range [0, %zu)", output_ids.size()));
  430. } else {
  431. j = output_ids[i];
  432. }
  433. if (j < 0) {
  434. throw std::runtime_error(format("batch.logits[%d] != true", i));
  435. }
  436. if (j >= n_outputs) {
  437. // This should not happen
  438. throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, n_outputs));
  439. }
  440. return embd + j*model.hparams.n_embd;
  441. } catch (const std::exception & err) {
  442. LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
  443. #ifndef NDEBUG
  444. GGML_ABORT("fatal error");
  445. #else
  446. return nullptr;
  447. #endif
  448. }
  449. }
  450. float * llama_context::get_embeddings_seq(llama_seq_id seq_id) {
  451. auto it = embd_seq.find(seq_id);
  452. if (it == embd_seq.end()) {
  453. return nullptr;
  454. }
  455. return it->second.data();
  456. }
  457. void llama_context::attach_threadpool(
  458. ggml_threadpool_t threadpool,
  459. ggml_threadpool_t threadpool_batch) {
  460. LLAMA_LOG_DEBUG("%s: call\n", __func__);
  461. this->threadpool = threadpool;
  462. this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
  463. }
  464. void llama_context::detach_threadpool() {
  465. LLAMA_LOG_DEBUG("%s: call\n", __func__);
  466. this->threadpool = nullptr;
  467. this->threadpool_batch = nullptr;
  468. }
  469. void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) {
  470. LLAMA_LOG_DEBUG("%s: n_threads = %d, n_threads_batch = %d\n", __func__, n_threads, n_threads_batch);
  471. cparams.n_threads = n_threads;
  472. cparams.n_threads_batch = n_threads_batch;
  473. }
  474. void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) {
  475. LLAMA_LOG_DEBUG("%s: call\n", __func__);
  476. this->abort_callback = abort_callback;
  477. this->abort_callback_data = abort_callback_data;
  478. for (auto & backend : backends) {
  479. auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
  480. auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
  481. if (set_abort_callback_fn) {
  482. set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data);
  483. }
  484. }
  485. }
  486. void llama_context::set_embeddings(bool value) {
  487. LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
  488. cparams.embeddings = value;
  489. }
  490. void llama_context::set_causal_attn(bool value) {
  491. LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
  492. cparams.causal_attn = value;
  493. }
  494. void llama_context::set_warmup(bool value) {
  495. LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
  496. cparams.warmup = value;
  497. }
  498. void llama_context::set_adapter_lora(
  499. llama_adapter_lora * adapter,
  500. float scale) {
  501. LLAMA_LOG_DEBUG("%s: adapter = %p, scale = %f\n", __func__, (void *) adapter, scale);
  502. loras[adapter] = scale;
  503. }
  504. bool llama_context::rm_adapter_lora(
  505. llama_adapter_lora * adapter) {
  506. LLAMA_LOG_DEBUG("%s: adapter = %p\n", __func__, (void *) adapter);
  507. auto pos = loras.find(adapter);
  508. if (pos != loras.end()) {
  509. loras.erase(pos);
  510. return true;
  511. }
  512. return false;
  513. }
  514. void llama_context::clear_adapter_lora() {
  515. LLAMA_LOG_DEBUG("%s: call\n", __func__);
  516. loras.clear();
  517. }
  518. bool llama_context::apply_adapter_cvec(
  519. const float * data,
  520. size_t len,
  521. int32_t n_embd,
  522. int32_t il_start,
  523. int32_t il_end) {
  524. LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
  525. return cvec.apply(model, data, len, n_embd, il_start, il_end);
  526. }
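// encode the batch with non-causal attention (the KV cache is not used)
// returns 0 on success, 2 if the computation was aborted, -1 on invalid input,
// -2 on allocation failure, -3 on compute failure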
  527. int llama_context::encode(llama_batch & inp_batch) {
  528. if (inp_batch.n_tokens == 0) {
  529. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  530. return -1;
  531. }
  532. // temporary allocate memory for the input batch if needed
  533. // note: during encode, we always pass the full sequence starting from pos = 0
  534. llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : 0);
  535. const llama_batch & batch = batch_allocr.batch;
  536. const int32_t n_tokens = batch.n_tokens;
  537. const auto & hparams = model.hparams;
  538. GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  539. if (batch.token) {
  540. for (int32_t i = 0; i < n_tokens; ++i) {
  541. if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
  542. LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
  543. return -1;
  544. }
  545. }
  546. }
  547. // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
  548. GGML_ASSERT(cparams.n_ubatch >= (uint32_t) n_tokens && "encoder requires n_ubatch >= n_tokens");
  549. if (t_compute_start_us == 0) {
  550. t_compute_start_us = ggml_time_us();
  551. }
  552. embd_seq.clear();
  553. n_queued_tokens += n_tokens;
  554. const int64_t n_embd = hparams.n_embd;
  555. llama_sbatch sbatch = llama_sbatch(batch, n_embd, /* simple_split */ true, /* logits_all */ true);
  556. const llama_ubatch ubatch = sbatch.split_simple(n_tokens);
  557. // reserve output buffer
  558. if (output_reserve(n_tokens) < n_tokens) {
  559. LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_tokens);
  560. return -2;
  561. };
  562. for (int32_t i = 0; i < n_tokens; ++i) {
  563. output_ids[i] = i;
  564. }
  565. n_outputs = n_tokens;
  566. //batch_manager->prepare(ubatch);
  567. ggml_backend_sched_reset(sched.get());
  568. ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
  569. const auto causal_attn_org = cparams.causal_attn;
  570. // always use non-causal attention for encoder graphs
  571. // TODO: this is a tmp solution until we have a proper way to support enc-dec models
  572. // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
  573. cparams.causal_attn = false;
  574. auto * gf = graph_init();
  575. auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);
  576. ggml_backend_sched_alloc_graph(sched.get(), gf);
  577. res->set_inputs(&ubatch);
  578. cparams.causal_attn = causal_attn_org;
  579. const auto compute_status = graph_compute(gf, n_tokens > 1);
  580. switch (compute_status) {
  581. case GGML_STATUS_SUCCESS:
  582. break;
  583. case GGML_STATUS_ABORTED:
  584. return 2;
  585. case GGML_STATUS_ALLOC_FAILED:
  586. return -2;
  587. case GGML_STATUS_FAILED:
  588. default:
  589. return -3;
  590. }
  591. auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();
  592. // extract embeddings
  593. if (t_embd) {
  594. ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
  595. GGML_ASSERT(backend_embd != nullptr);
  596. switch (cparams.pooling_type) {
  597. case LLAMA_POOLING_TYPE_NONE:
  598. {
  599. // extract token embeddings
  600. GGML_ASSERT(embd != nullptr);
  601. GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_size);
  602. ggml_backend_tensor_get_async(backend_embd, t_embd, embd, 0, n_tokens*n_embd*sizeof(float));
  603. } break;
  604. case LLAMA_POOLING_TYPE_MEAN:
  605. case LLAMA_POOLING_TYPE_CLS:
  606. case LLAMA_POOLING_TYPE_LAST:
  607. {
  608. // extract sequence embeddings
  609. auto & embd_seq_out = embd_seq;
  610. embd_seq_out.clear();
  611. GGML_ASSERT(!ubatch.equal_seqs); // TODO: handle equal splits
  612. for (int32_t i = 0; i < n_tokens; i++) {
  613. const llama_seq_id seq_id = ubatch.seq_id[i][0];
  614. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  615. continue;
  616. }
  617. embd_seq_out[seq_id].resize(n_embd);
  618. ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
  619. }
  620. } break;
  621. case LLAMA_POOLING_TYPE_RANK:
  622. {
  623. // extract the rerank score - a single float per sequence
  624. auto & embd_seq_out = embd_seq;
  625. for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
  626. const llama_seq_id seq_id = ubatch.seq_id[s][0];
  627. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  628. continue;
  629. }
  630. embd_seq_out[seq_id].resize(1);
  631. ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
  632. }
  633. } break;
  634. case LLAMA_POOLING_TYPE_UNSPECIFIED:
  635. {
  636. GGML_ABORT("unknown pooling type");
  637. }
  638. }
  639. }
  640. // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
  641. // overlap with device computation.
  642. ggml_backend_sched_reset(sched.get());
  643. // TODO: hacky solution
  644. if (model.arch == LLM_ARCH_T5 && t_embd) {
  645. //cross.t_embd = t_embd;
  646. synchronize();
  647. cross.n_embd = t_embd->ne[0];
  648. cross.n_enc = t_embd->ne[1];
  649. cross.v_embd.resize(cross.n_embd*cross.n_enc);
  650. memcpy(cross.v_embd.data(), embd, ggml_nbytes(t_embd));
  651. // remember the sequence ids used during the encoding - needed for cross attention later
  652. cross.seq_ids_enc.resize(n_tokens);
  653. for (int32_t i = 0; i < n_tokens; i++) {
  654. cross.seq_ids_enc[i].clear();
  655. for (int s = 0; s < ubatch.n_seq_id[i]; s++) {
  656. llama_seq_id seq_id = ubatch.seq_id[i][s];
  657. cross.seq_ids_enc[i].insert(seq_id);
  658. }
  659. }
  660. }
  661. return 0;
  662. }
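// decode the batch in ubatch-sized chunks using the KV cache (falls back to encode() when the context has no memory module)
// returns 0 on success, 1 if no KV cache slot was found, 2 if the computation was aborted, negative values on errors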
  663. int llama_context::decode(llama_batch & inp_batch) {
  664. if (!memory) {
  665. LLAMA_LOG_WARN("%s: cannot decode batches with this context (use llama_encode() instead)\n", __func__);
  666. return encode(inp_batch);
  667. }
  668. if (inp_batch.n_tokens == 0) {
  669. LLAMA_LOG_ERROR("%s: n_tokens == 0\n", __func__);
  670. return -1;
  671. }
  672. llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
  673. // temporary allocate memory for the input batch if needed
  674. // TODO: this is incorrect for multiple sequences because get_pos_max() is the maximum across all sequences
  675. llama_batch_allocr batch_allocr(inp_batch, inp_batch.pos ? -1 : kv_self->get_pos_max() + 1);
  676. const llama_batch & batch = batch_allocr.batch;
  677. const auto & vocab = model.vocab;
  678. const auto & hparams = model.hparams;
  679. const int32_t n_vocab = vocab.n_tokens();
  680. const int64_t n_tokens_all = batch.n_tokens;
  681. const int64_t n_embd = hparams.n_embd;
  682. llama_kv_cache_guard kv_guard(kv_self);
  683. GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
  684. if (batch.token) {
  685. for (int64_t i = 0; i < n_tokens_all; ++i) {
  686. if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
  687. LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
  688. throw std::runtime_error("invalid token");
  689. }
  690. }
  691. }
  692. GGML_ASSERT(n_tokens_all <= cparams.n_batch);
  693. GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
  694. if (t_compute_start_us == 0) {
  695. t_compute_start_us = ggml_time_us();
  696. }
  697. n_queued_tokens += n_tokens_all;
  698. // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
  699. const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
  700. embd_seq.clear();
  701. int64_t n_outputs_all = 0;
  702. // count outputs
  703. if (batch.logits && !embd_pooled) {
  704. for (uint32_t i = 0; i < n_tokens_all; ++i) {
  705. n_outputs_all += batch.logits[i] != 0;
  706. }
  707. } else if (embd_pooled) {
  708. n_outputs_all = n_tokens_all;
  709. } else {
  710. // keep last output only
  711. n_outputs_all = 1;
  712. }
  713. llama_sbatch sbatch = kv_self->sbatch_init(batch, /* logits_all */ n_outputs_all == n_tokens_all);
  714. // reserve output buffer
  715. if (output_reserve(n_outputs_all) < n_outputs_all) {
  716. LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
  717. return -2;
  718. };
  719. // handle any pending defrags/shifts
  720. kv_self_update();
  721. int64_t n_outputs_prev = 0;
  722. while (sbatch.n_tokens > 0) {
  723. llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
  724. // count the outputs in this u_batch
  725. {
  726. int32_t n_outputs_new = 0;
  727. if (n_outputs_all == n_tokens_all) {
  728. n_outputs_new = ubatch.n_tokens;
  729. } else {
  730. GGML_ASSERT(ubatch.output);
  731. for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
  732. n_outputs_new += (int32_t) (ubatch.output[i] != 0);
  733. }
  734. }
  735. // needs to happen before the graph is built
  736. n_outputs = n_outputs_new;
  737. }
  738. // find KV slot
  739. if (!kv_self->find_slot(ubatch)) {
  740. LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
  741. return 1;
  742. }
  743. ggml_backend_sched_reset(sched.get());
  744. ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
  745. auto * gf = graph_init();
  746. auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DECODER);
  747. // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
  748. ggml_backend_sched_alloc_graph(sched.get(), gf);
  749. res->set_inputs(&ubatch);
  750. const auto compute_status = graph_compute(gf, ubatch.n_tokens > 1);
  751. if (compute_status != GGML_STATUS_SUCCESS) {
  752. switch (compute_status) {
  753. case GGML_STATUS_ABORTED:
  754. return 2;
  755. case GGML_STATUS_ALLOC_FAILED:
  756. return -2;
  757. case GGML_STATUS_FAILED:
  758. default:
  759. return -3;
  760. }
  761. }
  762. // plot the computation graph in dot format (for debugging purposes)
  763. //if (n_past%100 == 0) {
  764. // ggml_graph_dump_dot(gf, NULL, "llama.dot");
  765. //}
  766. auto * t_logits = cparams.embeddings ? nullptr : res->get_logits();
  767. auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr;
  768. if (t_embd && res->get_embd_pooled()) {
  769. t_embd = res->get_embd_pooled();
  770. }
  771. // extract logits
  772. if (t_logits && n_outputs > 0) {
  773. ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
  774. GGML_ASSERT(backend_res != nullptr);
  775. GGML_ASSERT(logits != nullptr);
  776. float * logits_out = logits + n_outputs_prev*n_vocab;
  777. if (n_outputs) {
  778. GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
  779. GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits_size);
  780. ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
  781. }
  782. }
  783. // extract embeddings
  784. if (t_embd && n_outputs > 0) {
  785. ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
  786. GGML_ASSERT(backend_embd != nullptr);
  787. switch (cparams.pooling_type) {
  788. case LLAMA_POOLING_TYPE_NONE:
  789. {
  790. // extract token embeddings
  791. GGML_ASSERT(embd != nullptr);
  792. float * embd_out = embd + n_outputs_prev*n_embd;
  793. if (n_outputs) {
  794. GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
  795. GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_size);
  796. ggml_backend_tensor_get_async(backend_embd, t_embd, embd_out, 0, n_outputs*n_embd*sizeof(float));
  797. }
  798. } break;
  799. case LLAMA_POOLING_TYPE_MEAN:
  800. case LLAMA_POOLING_TYPE_CLS:
  801. case LLAMA_POOLING_TYPE_LAST:
  802. {
  803. // extract sequence embeddings (cleared before processing each batch)
  804. auto & embd_seq_out = embd_seq;
  805. for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
  806. const llama_seq_id seq_id = ubatch.seq_id[s][0];
  807. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  808. continue;
  809. }
  810. embd_seq_out[seq_id].resize(n_embd);
  811. ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (n_embd*seq_id)*sizeof(float), n_embd*sizeof(float));
  812. }
  813. } break;
  814. case LLAMA_POOLING_TYPE_RANK:
  815. {
  816. // extract the rerank score - a single float per sequence
  817. auto & embd_seq_out = embd_seq;
  818. for (uint32_t s = 0; s < ubatch.n_seqs; ++s) {
  819. const llama_seq_id seq_id = ubatch.seq_id[s][0];
  820. if (embd_seq_out.find(seq_id) != embd_seq_out.end()) {
  821. continue;
  822. }
  823. embd_seq_out[seq_id].resize(1);
  824. ggml_backend_tensor_get_async(backend_embd, t_embd, embd_seq_out[seq_id].data(), (seq_id)*sizeof(float), sizeof(float));
  825. }
  826. } break;
  827. case LLAMA_POOLING_TYPE_UNSPECIFIED:
  828. {
  829. GGML_ABORT("unknown pooling type");
  830. }
  831. }
  832. }
  833. n_outputs_prev += n_outputs;
  834. }
  835. // finalize the batch processing
  836. kv_guard.commit();
  837. // set to total number of outputs in the batch, for use in llama_get_logits_ith
  838. n_outputs = n_outputs_all;
  839. // set output mappings
  840. {
  841. bool sorted_output = true;
  842. auto & out_ids = sbatch.out_ids;
  843. GGML_ASSERT(out_ids.size() == (size_t) n_outputs_all);
  844. for (int64_t i = 0; i < n_outputs_all; ++i) {
  845. int64_t out_id = out_ids[i];
  846. output_ids[out_id] = i;
  847. if (out_id != i) {
  848. sorted_output = false;
  849. }
  850. }
  851. // make the outputs have the same order they had in the user-provided batch
  852. // note: this is mostly relevant for recurrent models atm
  853. if (!sorted_output) {
  854. const uint32_t n_vocab = model.vocab.n_tokens();
  855. const uint32_t n_embd = model.hparams.n_embd;
  856. GGML_ASSERT((size_t) n_outputs == out_ids.size());
  857. // TODO: is there something more efficient which also minimizes swaps?
  858. // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
  859. for (int32_t i = 0; i < n_outputs - 1; ++i) {
  860. int32_t j_min = i;
  861. for (int32_t j = i + 1; j < n_outputs; ++j) {
  862. if (out_ids[j] < out_ids[j_min]) {
  863. j_min = j;
  864. }
  865. }
  866. if (j_min == i) { continue; }
  867. std::swap(out_ids[i], out_ids[j_min]);
  868. if (logits_size > 0) {
  869. for (uint32_t k = 0; k < n_vocab; k++) {
  870. std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
  871. }
  872. }
  873. if (embd_size > 0) {
  874. for (uint32_t k = 0; k < n_embd; k++) {
  875. std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
  876. }
  877. }
  878. }
  879. std::fill(output_ids.begin(), output_ids.end(), -1);
  880. for (int32_t i = 0; i < n_outputs; ++i) {
  881. output_ids[out_ids[i]] = i;
  882. }
  883. }
  884. }
  885. // wait for the computation to finish (automatically done when obtaining the model output)
  886. //synchronize();
  887. // decide if we need to defrag the kv cache
  888. if (cparams.defrag_thold > 0.0f) {
  889. kv_self->defrag_sched(cparams.defrag_thold);
  890. }
  891. // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
  892. // overlap with device computation.
  893. ggml_backend_sched_reset(sched.get());
  894. return 0;
  895. }
  896. //
  897. // output
  898. //
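// ensure the output buffers (logits and/or embeddings) can hold at least n_outputs rows
// returns the reserved row capacity, or 0 if the buffer allocation failed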
  899. int32_t llama_context::output_reserve(int32_t n_outputs) {
  900. const auto & hparams = model.hparams;
  901. const auto & vocab = model.vocab;
  902. const int64_t n_outputs_max = std::max<int64_t>(n_outputs, n_seq_max());
  903. const auto n_batch = cparams.n_batch;
  904. const auto n_vocab = vocab.n_tokens();
  905. const auto n_embd = hparams.n_embd;
  906. // TODO: use a per-batch flag for logits presence instead
  907. bool has_logits = !cparams.embeddings;
  908. bool has_embd = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
  909. // TODO: hacky enc-dec support
  910. if (model.arch == LLM_ARCH_T5) {
  911. has_logits = true;
  912. has_embd = true;
  913. }
  914. logits_size = has_logits ? n_vocab*n_outputs_max : 0;
  915. embd_size = has_embd ? n_embd*n_outputs_max : 0;
  916. if (output_ids.empty()) {
  917. // init, never resized afterwards
  918. output_ids.resize(n_batch);
  919. }
  920. const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
  921. const size_t new_size = (logits_size + embd_size) * sizeof(float);
  922. // alloc only when more than the current capacity is required
  923. // TODO: also consider shrinking the buffer
  924. if (!buf_output || prev_size < new_size) {
  925. if (buf_output) {
  926. #ifndef NDEBUG
  927. // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
  928. LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  929. #endif
  930. buf_output = nullptr;
  931. logits = nullptr;
  932. embd = nullptr;
  933. }
  934. auto * buft = ggml_backend_cpu_buffer_type();
  935. // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
  936. auto * output_dev = model.dev_output();
  937. auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
  938. if (output_dev_host_buft) {
  939. buft = output_dev_host_buft;
  940. }
  941. buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
  942. if (buf_output == nullptr) {
  943. LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
  944. return 0;
  945. }
  946. }
  947. float * output_base = (float *) ggml_backend_buffer_get_base(buf_output.get());
  948. logits = has_logits ? output_base : nullptr;
  949. embd = has_embd ? output_base + logits_size : nullptr;
  950. // set all ids as invalid (negative)
  951. std::fill(output_ids.begin(), output_ids.end(), -1);
  952. this->n_outputs = 0;
  953. this->n_outputs_max = n_outputs_max;
  954. return n_outputs_max;
  955. }
  956. //
  957. // graph
  958. //
  959. int32_t llama_context::graph_max_nodes() const {
  960. return std::max<int32_t>(65536, 5*model.n_tensors());
  961. }
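// re-initialize ctx_compute over buf_compute_meta (metadata only, no_alloc) and create an empty graph with graph_max_nodes() capacity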
  962. ggml_cgraph * llama_context::graph_init() {
  963. ggml_init_params params = {
  964. /*.mem_size =*/ buf_compute_meta.size(),
  965. /*.mem_buffer =*/ buf_compute_meta.data(),
  966. /*.no_alloc =*/ true,
  967. };
  968. ctx_compute.reset(ggml_init(params));
  969. return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
  970. }
  971. llm_graph_result_ptr llama_context::graph_build(
  972. ggml_context * ctx,
  973. ggml_cgraph * gf,
  974. const llama_ubatch & ubatch,
  975. llm_graph_type gtype) {
  976. return model.build_graph(
  977. {
  978. /*.ctx =*/ ctx,
  979. /*.arch =*/ model.arch,
  980. /*.hparams =*/ model.hparams,
  981. /*.cparams =*/ cparams,
  982. /*.ubatch =*/ ubatch,
  983. /*.sched =*/ sched.get(),
  984. /*.backend_cpu =*/ backend_cpu,
  985. /*.cvec =*/ &cvec,
  986. /*.loras =*/ &loras,
  987. /*.memory =*/ memory.get(),
  988. /*.cross =*/ &cross,
  989. /*.n_outputs =*/ n_outputs,
  990. /*.cb =*/ graph_get_cb(),
  991. }, gf, gtype);
  992. }
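// run the graph through the scheduler: pick the thread count and threadpool based on whether the
// batch has more than one token, then launch the asynchronous computation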
  993. ggml_status llama_context::graph_compute(
  994. ggml_cgraph * gf,
  995. bool batched) {
  996. int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads;
  997. ggml_threadpool_t tp = batched ? threadpool_batch : threadpool;
  998. if (backend_cpu != nullptr) {
  999. auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend_cpu));
  1000. auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
  1001. set_threadpool_fn(backend_cpu, tp);
  1002. }
  1003. // set the number of threads for all the backends
  1004. for (const auto & set_n_threads_fn : set_n_threads_fns) {
  1005. set_n_threads_fn.second(set_n_threads_fn.first, n_threads);
  1006. }
  1007. auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf);
  1008. if (status != GGML_STATUS_SUCCESS) {
  1009. LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status);
  1010. }
  1011. // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(sched));
  1012. return status;
  1013. }
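// per-tensor callback used while building the graph: assigns tensor names and pins selected tensors to
// specific backends (kqv_merged_cont to the CPU when offload_kqv is disabled, norm to its layer's device
// for small batches or full offload) to reduce inter-backend transfers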
  1014. llm_graph_cb llama_context::graph_get_cb() const {
  1015. return [&](const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il) {
  1016. if (il >= 0) {
  1017. ggml_format_name(cur, "%s-%d", name, il);
  1018. } else {
  1019. ggml_set_name(cur, name);
  1020. }
  1021. if (!cparams.offload_kqv) {
  1022. if (strcmp(name, "kqv_merged_cont") == 0) {
  1023. // all nodes between the KV store and the attention output are run on the CPU
  1024. ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu);
  1025. }
  1026. }
  1027. // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
  1028. // FIXME: fix in ggml_backend_sched
  1029. const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
  1030. if (ubatch.n_tokens < 32 || full_offload) {
  1031. if (il != -1 && strcmp(name, "norm") == 0) {
  1032. const auto & dev_layer = model.dev_layer(il);
  1033. for (const auto & backend : backends) {
  1034. if (ggml_backend_get_device(backend.get()) == dev_layer) {
  1035. if (ggml_backend_supports_op(backend.get(), cur)) {
  1036. ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get());
  1037. }
  1038. }
  1039. }
  1040. }
  1041. }
  1042. };
  1043. }
  1044. //
  1045. // state save/load
  1046. //
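// llama_io helpers used below: a dummy writer that only counts bytes (used to compute the state size),
// plus buffer-backed and file-backed readers/writers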
  1047. class llama_io_write_dummy : public llama_io_write_i {
  1048. public:
  1049. llama_io_write_dummy() = default;
  1050. void write(const void * /* src */, size_t size) override {
  1051. size_written += size;
  1052. }
  1053. void write_tensor(const ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
  1054. size_written += size;
  1055. }
  1056. size_t n_bytes() override {
  1057. return size_written;
  1058. }
  1059. private:
  1060. size_t size_written = 0;
  1061. };
  1062. class llama_io_write_buffer : public llama_io_write_i {
  1063. public:
  1064. llama_io_write_buffer(
  1065. uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  1066. void write(const void * src, size_t size) override {
  1067. if (size > buf_size) {
  1068. throw std::runtime_error("unexpectedly reached end of buffer");
  1069. }
  1070. memcpy(ptr, src, size);
  1071. ptr += size;
  1072. size_written += size;
  1073. buf_size -= size;
  1074. }
  1075. void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override {
  1076. if (size > buf_size) {
  1077. throw std::runtime_error("unexpectedly reached end of buffer");
  1078. }
  1079. ggml_backend_tensor_get(tensor, ptr, offset, size);
  1080. ptr += size;
  1081. size_written += size;
  1082. buf_size -= size;
  1083. }
  1084. size_t n_bytes() override {
  1085. return size_written;
  1086. }
  1087. private:
  1088. uint8_t * ptr;
  1089. size_t buf_size = 0;
  1090. size_t size_written = 0;
  1091. };
class llama_io_read_buffer : public llama_io_read_i {
public:
    llama_io_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}

    const uint8_t * read(size_t size) override {
        const uint8_t * base_ptr = ptr;
        if (size > buf_size) {
            throw std::runtime_error("unexpectedly reached end of buffer");
        }
        ptr += size;
        size_read += size;
        buf_size -= size;
        return base_ptr;
    }

    void read_to(void * dst, size_t size) override {
        memcpy(dst, read(size), size);
    }

    size_t n_bytes() override {
        return size_read;
    }

private:
    const uint8_t * ptr;
    size_t buf_size = 0;
    size_t size_read = 0;
};
class llama_io_write_file : public llama_io_write_i {
public:
    llama_io_write_file(llama_file * f) : file(f) {}

    void write(const void * src, size_t size) override {
        file->write_raw(src, size);
        size_written += size;
    }

    void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) override {
        temp_buffer.resize(size);
        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
        write(temp_buffer.data(), temp_buffer.size());
    }

    size_t n_bytes() override {
        return size_written;
    }

private:
    llama_file * file;
    size_t size_written = 0;
    std::vector<uint8_t> temp_buffer;
};
class llama_io_read_file : public llama_io_read_i {
public:
    llama_io_read_file(llama_file * f) : file(f) {}

    void read_to(void * dst, size_t size) override {
        file->read_raw(dst, size);
        size_read += size;
    }

    const uint8_t * read(size_t size) override {
        temp_buffer.resize(size);
        read_to(temp_buffer.data(), size);
        return temp_buffer.data();
    }

    size_t n_bytes() override {
        return size_read;
    }

private:
    llama_file * file;
    size_t size_read = 0;
    std::vector<uint8_t> temp_buffer;
};
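
// the *_get_size() methods below run the serializers against llama_io_write_dummy
// to measure the required buffer size without copying any data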
size_t llama_context::state_get_size() {
    llama_io_write_dummy io;
    try {
        return state_write_data(io);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_context::state_get_data(uint8_t * dst, size_t size) {
    llama_io_write_buffer io(dst, size);
    try {
        return state_write_data(io);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_context::state_set_data(const uint8_t * src, size_t size) {
    llama_io_read_buffer io(src, size);
    try {
        return state_read_data(io);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_context::state_seq_get_size(llama_seq_id seq_id) {
    llama_io_write_dummy io;
    try {
        return state_seq_write_data(io, seq_id);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_context::state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) {
    llama_io_write_buffer io(dst, size);
    try {
        return state_seq_write_data(io, seq_id);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_context::state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) {
    llama_io_read_buffer io(src, size);
    try {
        return state_seq_read_data(io, seq_id);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
        return 0;
    }
}
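
// session file layout: magic (u32), version (u32), token count (u32), raw tokens, full context state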
bool llama_context::state_load_file(const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(filepath, "rb");

    // sanity checks
    {
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
            LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
            return false;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return false;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        const size_t n_state_size_cur = file.size() - file.tell();

        llama_io_read_file io(&file);
        const size_t n_read = state_read_data(io);

        if (n_read != n_state_size_cur) {
            LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
            return false;
        }
    }

    return true;
}
bool llama_context::state_save_file(const char * filepath, const llama_token * tokens, size_t n_token_count) {
    llama_file file(filepath, "wb");

    file.write_u32(LLAMA_SESSION_MAGIC);
    file.write_u32(LLAMA_SESSION_VERSION);

    // save the prompt
    file.write_u32((uint32_t) n_token_count);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // save the context state using stream saving
    llama_io_write_file io(&file);
    state_write_data(io);

    return true;
}
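
// sequence state file layout: magic (u32), version (u32), token count (u32), raw tokens, single-sequence state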
size_t llama_context::state_seq_load_file(llama_seq_id seq_id, const char * filepath, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(filepath, "rb");

    // version checks
    {
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
            LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
            return 0;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return 0;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        const size_t state_size = file.size() - file.tell();

        llama_io_read_file io(&file);
        const size_t nread = state_seq_read_data(io, seq_id);

        if (!nread) {
            LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
            return 0;
        }

        GGML_ASSERT(nread <= state_size);
        GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
    }

    return file.tell();
}
size_t llama_context::state_seq_save_file(llama_seq_id seq_id, const char * filepath, const llama_token * tokens, size_t n_token_count) {
    llama_file file(filepath, "wb");

    file.write_u32(LLAMA_STATE_SEQ_MAGIC);
    file.write_u32(LLAMA_STATE_SEQ_VERSION);

    // save the prompt
    file.write_u32((uint32_t) n_token_count);
    file.write_raw(tokens, sizeof(llama_token) * n_token_count);

    // save the context state using stream saving
    llama_io_write_file io(&file);
    state_seq_write_data(io, seq_id);

    const size_t res = file.tell();
    GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + io.n_bytes());

    return res;
}
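
// serializes the full context state in the following order:
// model arch name, output ids, logits, embeddings, KV cache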
size_t llama_context::state_write_data(llama_io_write_i & io) {
    LLAMA_LOG_DEBUG("%s: writing state\n", __func__);

    // write model info
    {
        LLAMA_LOG_DEBUG("%s: - writing model info\n", __func__);

        const std::string arch_str = llm_arch_name(model.arch);
        io.write_string(arch_str);
        // TODO: add more model-specific info which should prevent loading the session file if not identical
    }

    // write output ids
    {
        LLAMA_LOG_DEBUG("%s: - writing output ids\n", __func__);

        const auto n_outputs    = this->n_outputs;
        const auto & output_ids = this->output_ids;

        std::vector<int32_t> w_output_pos;

        GGML_ASSERT(n_outputs <= n_outputs_max);

        w_output_pos.resize(n_outputs);

        // build a more compact representation of the output ids
        for (size_t i = 0; i < n_batch(); ++i) {
            // map an output id to a position in the batch
            int32_t pos = output_ids[i];
            if (pos >= 0) {
                GGML_ASSERT(pos < n_outputs);
                w_output_pos[pos] = i;
            }
        }

        io.write(&n_outputs, sizeof(n_outputs));

        if (n_outputs) {
            io.write(w_output_pos.data(), n_outputs * sizeof(int32_t));
        }
    }

    // write logits
    {
        LLAMA_LOG_DEBUG("%s: - writing logits\n", __func__);

        const uint64_t logits_size = std::min((uint64_t) this->logits_size, (uint64_t) n_outputs * model.vocab.n_tokens());

        io.write(&logits_size, sizeof(logits_size));

        if (logits_size) {
            io.write(logits, logits_size * sizeof(float));
        }
    }

    // write embeddings
    {
        LLAMA_LOG_DEBUG("%s: - writing embeddings\n", __func__);

        const uint64_t embd_size = std::min((uint64_t) this->embd_size, (uint64_t) n_outputs * model.hparams.n_embd);

        io.write(&embd_size, sizeof(embd_size));

        if (embd_size) {
            io.write(embd, embd_size * sizeof(float));
        }
    }

    LLAMA_LOG_DEBUG("%s: - writing KV self\n", __func__);
    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
    kv_self->state_write(io);

    return io.n_bytes();
}
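
// reads the state back in the same order it was written by state_write_data,
// validating the model arch and the logits/embeddings buffer sizes along the way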
size_t llama_context::state_read_data(llama_io_read_i & io) {
    LLAMA_LOG_DEBUG("%s: reading state\n", __func__);

    // read model info
    {
        LLAMA_LOG_DEBUG("%s: - reading model info\n", __func__);

        const std::string cur_arch_str = llm_arch_name(model.arch);

        std::string arch_str;
        io.read_string(arch_str);
        if (cur_arch_str != arch_str) {
            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
        }
        // TODO: add more info which needs to be identical but which is not verified otherwise
    }

    // read output ids
    {
        LLAMA_LOG_DEBUG("%s: - reading output ids\n", __func__);

        auto n_outputs = this->n_outputs;
        io.read_to(&n_outputs, sizeof(n_outputs));

        if (n_outputs > output_reserve(n_outputs)) {
            throw std::runtime_error("could not reserve outputs");
        }

        std::vector<int32_t> output_pos;

        if (n_outputs) {
            output_pos.resize(n_outputs);
            io.read_to(output_pos.data(), n_outputs * sizeof(int32_t));

            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
                int32_t id = output_pos[i];
                if ((uint32_t) id >= n_batch()) {
                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, n_batch()));
                }
                this->output_ids[id] = i;
            }

            this->n_outputs = n_outputs;
        }
    }

    // read logits
    {
        LLAMA_LOG_DEBUG("%s: - reading logits\n", __func__);

        uint64_t logits_size;
        io.read_to(&logits_size, sizeof(logits_size));

        if (this->logits_size < logits_size) {
            throw std::runtime_error("logits buffer too small");
        }

        if (logits_size) {
            io.read_to(this->logits, logits_size * sizeof(float));
        }
    }

    // read embeddings
    {
        LLAMA_LOG_DEBUG("%s: - reading embeddings\n", __func__);

        uint64_t embd_size;
        io.read_to(&embd_size, sizeof(embd_size));

        if (this->embd_size < embd_size) {
            throw std::runtime_error("embeddings buffer too small");
        }

        if (embd_size) {
            io.read_to(this->embd, embd_size * sizeof(float));
        }
    }

    LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
    kv_self->state_read(io);

    return io.n_bytes();
}
size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
    GGML_UNUSED(seq_id);

    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
    kv_self->state_write(io, seq_id);

    return io.n_bytes();
}

size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
    GGML_UNUSED(seq_id);

    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
    kv_self->state_read(io, seq_id);

    return io.n_bytes();
}

//
// perf
//
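
// timings are accumulated internally in microseconds and reported here in milliseconds;
// the eval counters are clamped to at least 1 to avoid division by zero when printing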
llama_perf_context_data llama_context::perf_get_data() const {
    llama_perf_context_data data = {};

    data.t_start_ms  = 1e-3 * t_start_us;
    data.t_load_ms   = 1e-3 * t_load_us;
    data.t_p_eval_ms = 1e-3 * t_p_eval_us;
    data.t_eval_ms   = 1e-3 * t_eval_us;
    data.n_p_eval    = std::max(1, n_p_eval);
    data.n_eval      = std::max(1, n_eval);

    return data;
}

void llama_context::perf_reset() {
    t_start_us  = ggml_time_us();
    t_eval_us   = n_eval = 0;
    t_p_eval_us = n_p_eval = 0;
}

//
// interface implementation
//
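
// the public C API below consists mostly of thin wrappers that forward to the corresponding llama_context methods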
llama_context_params llama_context_default_params() {
    llama_context_params result = {
        /*.n_ctx                =*/ 512,
        /*.n_batch              =*/ 2048,
        /*.n_ubatch             =*/ 512,
        /*.n_seq_max            =*/ 1,
        /*.n_threads            =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
        /*.n_threads_batch      =*/ GGML_DEFAULT_N_THREADS,
        /*.rope_scaling_type    =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
        /*.pooling_type         =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
        /*.attention_type       =*/ LLAMA_ATTENTION_TYPE_UNSPECIFIED,
        /*.rope_freq_base       =*/ 0.0f,
        /*.rope_freq_scale      =*/ 0.0f,
        /*.yarn_ext_factor      =*/ -1.0f,
        /*.yarn_attn_factor     =*/ 1.0f,
        /*.yarn_beta_fast       =*/ 32.0f,
        /*.yarn_beta_slow       =*/ 1.0f,
        /*.yarn_orig_ctx        =*/ 0,
        /*.defrag_thold         =*/ -1.0f,
        /*.cb_eval              =*/ nullptr,
        /*.cb_eval_user_data    =*/ nullptr,
        /*.type_k               =*/ GGML_TYPE_F16,
        /*.type_v               =*/ GGML_TYPE_F16,
        /*.abort_callback       =*/ nullptr,
        /*.abort_callback_data  =*/ nullptr,
        /*.embeddings           =*/ false,
        /*.offload_kqv          =*/ true,
        /*.flash_attn           =*/ false,
        /*.no_perf              =*/ true,
    };

    return result;
}
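
// validates the parameters and constructs a new context; returns nullptr on invalid
// parameters or if the context constructor throws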
llama_context * llama_init_from_model(
        llama_model * model,
        llama_context_params params) {
    if (!model) {
        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
        return nullptr;
    }

    if (params.n_batch == 0 && params.n_ubatch == 0) {
        LLAMA_LOG_ERROR("%s: n_batch and n_ubatch cannot both be zero\n", __func__);
        return nullptr;
    }

    if (params.n_ctx == 0 && model->hparams.n_ctx_train == 0) {
        LLAMA_LOG_ERROR("%s: n_ctx and model->hparams.n_ctx_train cannot both be zero\n", __func__);
        return nullptr;
    }

    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
        params.flash_attn = false;
    }

    if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
        LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
        return nullptr;
    }

    try {
        auto * ctx = new llama_context(*model, params);
        return ctx;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what());
    }

    return nullptr;
}
// deprecated
llama_context * llama_new_context_with_model(
        llama_model * model,
        llama_context_params params) {
    return llama_init_from_model(model, params);
}

void llama_free(llama_context * ctx) {
    delete ctx;
}

uint32_t llama_n_ctx(const llama_context * ctx) {
    return ctx->n_ctx();
}

uint32_t llama_n_batch(const llama_context * ctx) {
    return ctx->n_batch();
}

uint32_t llama_n_ubatch(const llama_context * ctx) {
    return ctx->n_ubatch();
}

uint32_t llama_n_seq_max(const llama_context * ctx) {
    return ctx->n_seq_max();
}

const llama_model * llama_get_model(const llama_context * ctx) {
    return &ctx->get_model();
}

llama_kv_cache * llama_get_kv_self(llama_context * ctx) {
    return ctx->get_kv_self();
}

void llama_kv_self_update(llama_context * ctx) {
    ctx->kv_self_update();
}

enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
    return ctx->pooling_type();
}

void llama_attach_threadpool(
        llama_context * ctx,
        ggml_threadpool_t threadpool,
        ggml_threadpool_t threadpool_batch) {
    ctx->attach_threadpool(threadpool, threadpool_batch);
}

void llama_detach_threadpool(llama_context * ctx) {
    ctx->detach_threadpool();
}

void llama_set_n_threads(llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
    ctx->set_n_threads(n_threads, n_threads_batch);
}

int32_t llama_n_threads(llama_context * ctx) {
    return ctx->n_threads();
}

int32_t llama_n_threads_batch(llama_context * ctx) {
    return ctx->n_threads_batch();
}

void llama_set_abort_callback(llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
    ctx->set_abort_callback(abort_callback, abort_callback_data);
}

void llama_set_embeddings(llama_context * ctx, bool embeddings) {
    ctx->set_embeddings(embeddings);
}

void llama_set_causal_attn(llama_context * ctx, bool causal_attn) {
    ctx->set_causal_attn(causal_attn);
}

void llama_set_warmup(llama_context * ctx, bool warmup) {
    ctx->set_warmup(warmup);
}

void llama_synchronize(llama_context * ctx) {
    ctx->synchronize();
}

float * llama_get_logits(llama_context * ctx) {
    ctx->synchronize();

    return ctx->get_logits();
}

float * llama_get_logits_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return ctx->get_logits_ith(i);
}

float * llama_get_embeddings(llama_context * ctx) {
    ctx->synchronize();

    return ctx->get_embeddings();
}

float * llama_get_embeddings_ith(llama_context * ctx, int32_t i) {
    ctx->synchronize();

    return ctx->get_embeddings_ith(i);
}

float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) {
    ctx->synchronize();

    return ctx->get_embeddings_seq(seq_id);
}
// llama adapter API

int32_t llama_set_adapter_lora(
        llama_context * ctx,
        llama_adapter_lora * adapter,
        float scale) {
    ctx->set_adapter_lora(adapter, scale);

    return 0;
}

int32_t llama_rm_adapter_lora(
        llama_context * ctx,
        llama_adapter_lora * adapter) {
    bool res = ctx->rm_adapter_lora(adapter);

    return res ? 0 : -1;
}

void llama_clear_adapter_lora(llama_context * ctx) {
    ctx->clear_adapter_lora();
}

int32_t llama_apply_adapter_cvec(
        llama_context * ctx,
        const float * data,
        size_t len,
        int32_t n_embd,
        int32_t il_start,
        int32_t il_end) {
    bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end);

    return res ? 0 : -1;
}

//
// kv cache view
//
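
// the view helpers log a warning and return an empty view / no-op when the context has no KV cache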
llama_kv_cache_view llama_kv_cache_view_init(const llama_context * ctx, int32_t n_seq_max) {
    const auto * kv = ctx->get_kv_self();
    if (kv == nullptr) {
        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
        return {};
    }

    return llama_kv_cache_view_init(*kv, n_seq_max);
}

void llama_kv_cache_view_update(const llama_context * ctx, llama_kv_cache_view * view) {
    const auto * kv = ctx->get_kv_self();
    if (kv == nullptr) {
        LLAMA_LOG_WARN("%s: the context does not have a KV cache\n", __func__);
        return;
    }

    llama_kv_cache_view_update(view, kv);
}

//
// kv cache
//
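
// the deprecated llama_kv_cache_* entry points forward to the llama_kv_self_* equivalents below,
// each of which no-ops (or returns a neutral value) when the context has no KV cache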
// deprecated
int32_t llama_get_kv_cache_token_count(const llama_context * ctx) {
    return llama_kv_self_n_tokens(ctx);
}

int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
        return 0;
    }

    return kv->get_n_tokens();
}

// deprecated
int32_t llama_get_kv_cache_used_cells(const llama_context * ctx) {
    return llama_kv_self_used_cells(ctx);
}

int32_t llama_kv_self_used_cells(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
        return 0;
    }

    return kv->get_used_cells();
}

// deprecated
void llama_kv_cache_clear(llama_context * ctx) {
    llama_kv_self_clear(ctx);
}

void llama_kv_self_clear(llama_context * ctx) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return;
    }

    kv->clear();
}

// deprecated
bool llama_kv_cache_seq_rm(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_pos p0,
        llama_pos p1) {
    return llama_kv_self_seq_rm(ctx, seq_id, p0, p1);
}

bool llama_kv_self_seq_rm(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_pos p0,
        llama_pos p1) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return true;
    }

    return kv->seq_rm(seq_id, p0, p1);
}

// deprecated
void llama_kv_cache_seq_cp(
        llama_context * ctx,
        llama_seq_id seq_id_src,
        llama_seq_id seq_id_dst,
        llama_pos p0,
        llama_pos p1) {
    llama_kv_self_seq_cp(ctx, seq_id_src, seq_id_dst, p0, p1);
}

void llama_kv_self_seq_cp(
        llama_context * ctx,
        llama_seq_id seq_id_src,
        llama_seq_id seq_id_dst,
        llama_pos p0,
        llama_pos p1) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return;
    }

    kv->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}

// deprecated
void llama_kv_cache_seq_keep(
        llama_context * ctx,
        llama_seq_id seq_id) {
    llama_kv_self_seq_keep(ctx, seq_id);
}

void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return;
    }

    kv->seq_keep(seq_id);
}

// deprecated
void llama_kv_cache_seq_add(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_pos p0,
        llama_pos p1,
        llama_pos delta) {
    llama_kv_self_seq_add(ctx, seq_id, p0, p1, delta);
}

void llama_kv_self_seq_add(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_pos p0,
        llama_pos p1,
        llama_pos delta) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return;
    }

    kv->seq_add(seq_id, p0, p1, delta);
}

// deprecated
void llama_kv_cache_seq_div(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_pos p0,
        llama_pos p1,
        int d) {
    llama_kv_self_seq_div(ctx, seq_id, p0, p1, d);
}

void llama_kv_self_seq_div(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_pos p0,
        llama_pos p1,
        int d) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return;
    }

    kv->seq_div(seq_id, p0, p1, d);
}

// deprecated
llama_pos llama_kv_cache_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
    return llama_kv_self_seq_pos_max(ctx, seq_id);
}

llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
        return 0;
    }

    return kv->seq_pos_max(seq_id);
}

// deprecated
void llama_kv_cache_defrag(llama_context * ctx) {
    llama_kv_self_defrag(ctx);
}

void llama_kv_self_defrag(llama_context * ctx) {
    auto * kv = ctx->get_kv_self();
    if (!kv) {
        return;
    }

    // force defrag
    kv->defrag_sched(-1.0f);
}

// deprecated
bool llama_kv_cache_can_shift(const llama_context * ctx) {
    return llama_kv_self_can_shift(ctx);
}

bool llama_kv_self_can_shift(const llama_context * ctx) {
    const auto * kv = ctx->get_kv_self();
    if (!kv) {
        return false;
    }

    return kv->get_can_shift();
}

// deprecated
void llama_kv_cache_update(llama_context * ctx) {
    llama_kv_self_update(ctx);
}
// llama state API

// deprecated
size_t llama_get_state_size(llama_context * ctx) {
    return llama_state_get_size(ctx);
}

// deprecated
size_t llama_copy_state_data(llama_context * ctx, uint8_t * dst) {
    return llama_state_get_data(ctx, dst, -1);
}

// deprecated
size_t llama_set_state_data(llama_context * ctx, const uint8_t * src) {
    return llama_state_set_data(ctx, src, -1);
}

// deprecated
bool llama_load_session_file(llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
}

// deprecated
bool llama_save_session_file(llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
}
// Returns the *actual* size of the state.
// Intended to be used when saving the state to a buffer.
size_t llama_state_get_size(llama_context * ctx) {
    return ctx->state_get_size();
}

size_t llama_state_get_data(llama_context * ctx, uint8_t * dst, size_t size) {
    ctx->synchronize();

    return ctx->state_get_data(dst, size);
}

// Sets the state reading from the specified source address
size_t llama_state_set_data(llama_context * ctx, const uint8_t * src, size_t size) {
    ctx->synchronize();

    return ctx->state_set_data(src, size);
}

bool llama_state_load_file(llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    ctx->synchronize();

    try {
        return ctx->state_load_file(path_session, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
        return false;
    }
}

bool llama_state_save_file(llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    ctx->synchronize();

    try {
        return ctx->state_save_file(path_session, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
        return false;
    }
}

size_t llama_state_seq_get_size(llama_context * ctx, llama_seq_id seq_id) {
    return ctx->state_seq_get_size(seq_id);
}

size_t llama_state_seq_get_data(llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
    ctx->synchronize();

    return ctx->state_seq_get_data(seq_id, dst, size);
}

size_t llama_state_seq_set_data(llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id seq_id) {
    ctx->synchronize();

    return ctx->state_seq_set_data(seq_id, src, size);
}

size_t llama_state_seq_save_file(llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
    ctx->synchronize();

    try {
        return ctx->state_seq_save_file(seq_id, filepath, tokens, n_token_count);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
        return 0;
    }
}

size_t llama_state_seq_load_file(llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    ctx->synchronize();

    try {
        return ctx->state_seq_load_file(dest_seq_id, filepath, tokens_out, n_token_capacity, n_token_count_out);
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
        return 0;
    }
}
///

int32_t llama_encode(
        llama_context * ctx,
        llama_batch batch) {
    const int ret = ctx->encode(batch);
    if (ret != 0) {
        LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
    }

    return ret;
}

int32_t llama_decode(
        llama_context * ctx,
        llama_batch batch) {
    const int ret = ctx->decode(batch);
    if (ret != 0) {
        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
    }

    return ret;
}

//
// perf
//

llama_perf_context_data llama_perf_context(const llama_context * ctx) {
    llama_perf_context_data data = {};

    if (ctx == nullptr) {
        return data;
    }

    data = ctx->perf_get_data();

    return data;
}

void llama_perf_context_print(const llama_context * ctx) {
    const auto data = llama_perf_context(ctx);

    const double t_end_ms = 1e-3 * ggml_time_us();

    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
}

void llama_perf_context_reset(llama_context * ctx) {
    ctx->perf_reset();
}