llama-context.cpp 64 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775
  1. #include "llama-context.h"
  2. #include "llama-impl.h"
  3. #include "llama-mmap.h"
  4. #include <cassert>
  5. #include <cmath>
  6. #include <cstring>
  7. #include <stdexcept>
  8. void llama_set_k_shift(struct llama_context & lctx) {
  9. const int64_t kv_size = lctx.kv_self.size;
  10. assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
  11. int32_t * data = (int32_t *) lctx.inp_K_shift->data;
  12. for (int i = 0; i < kv_size; ++i) {
  13. data[i] = lctx.kv_self.cells[i].delta;
  14. }
  15. }
  16. void llama_set_s_copy(struct llama_context & lctx) {
  17. const int64_t kv_size = lctx.kv_self.size;
  18. assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
  19. int32_t * data = (int32_t *) lctx.inp_s_copy->data;
  20. for (int i = 0; i < kv_size; ++i) {
  21. data[i] = lctx.kv_self.cells[i].src;
  22. }
  23. }
  24. // llama input
  25. static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
  26. // TODO move to hparams if a T5 variant appears that uses a different value
  27. const int64_t max_distance = 128;
  28. if (bidirectional) {
  29. n_buckets >>= 1;
  30. }
  31. const int64_t max_exact = n_buckets >> 1;
  32. int32_t relative_position = x - y;
  33. int32_t relative_bucket = 0;
  34. if (bidirectional) {
  35. relative_bucket += (relative_position > 0) * n_buckets;
  36. relative_position = abs(relative_position);
  37. } else {
  38. relative_position = -std::min<int32_t>(relative_position, 0);
  39. }
  40. int32_t relative_position_if_large = floorf(max_exact + logf(1.0 * relative_position / max_exact) * (n_buckets - max_exact) / log(1.0 * max_distance / max_exact));
  41. relative_position_if_large = std::min<int32_t>(relative_position_if_large, n_buckets - 1);
  42. relative_bucket += (relative_position < max_exact ? relative_position : relative_position_if_large);
  43. return relative_bucket;
  44. }
// Copy all host-side data of the current micro-batch into the graph's input
// tensors: token ids / embeddings, positions, output-row selection, attention
// masks (causal, SWA, non-causal, cross), pooling helpers (mean/cls/last),
// recurrent-state inputs, and T5 relative-position buckets.
// Each section is guarded by the presence of the corresponding input tensor,
// so only the inputs the current graph actually uses are written.
void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) {
    //
    // set input data
    //

    const auto & hparams = lctx.model.hparams;
    const auto & cparams = lctx.cparams;
    const auto & kv_self = lctx.kv_self;

    // token ids (a ubatch carries either token ids or raw embeddings)
    if (ubatch.token) {
        const int64_t n_tokens = ubatch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_tokens, ubatch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
    }

    // raw input embeddings (n_tokens x n_embd floats)
    if (ubatch.embd) {
        const int64_t n_embd   = hparams.n_embd;
        const int64_t n_tokens = ubatch.n_tokens;

        ggml_backend_tensor_set(lctx.inp_embd, ubatch.embd, 0, n_tokens*n_embd*ggml_element_size(lctx.inp_embd));
    }

    // token positions; n_pos_per_token > 1 presumably covers multi-positional
    // encodings — confirm against the graph that creates inp_pos
    if (ubatch.pos && lctx.inp_pos) {
        const int64_t n_tokens = ubatch.n_tokens;
        auto n_pos = lctx.n_pos_per_token;
        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
    }

    // select which token rows produce outputs (logits/embeddings)
    if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
        if (!lctx.inp_out_ids) {
            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
        } else {
            const int64_t n_tokens = ubatch.n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
            int32_t * data = (int32_t *) lctx.inp_out_ids->data;

            if (lctx.n_outputs == n_tokens) {
                // every token is an output — identity mapping
                for (int i = 0; i < n_tokens; ++i) {
                    data[i] = i;
                }
            } else if (ubatch.output) {
                // per-token output flags provided by the batch
                int32_t n_outputs = 0;
                for (int i = 0; i < n_tokens; ++i) {
                    if (ubatch.output[i]) {
                        data[n_outputs++] = i;
                    }
                }
                // the graph needs to have been passed the correct number of outputs
                GGML_ASSERT(lctx.n_outputs == n_outputs);
            } else if (lctx.n_outputs == 1) {
                // only keep last output
                data[0] = n_tokens - 1;
            } else {
                GGML_ASSERT(lctx.n_outputs == 0);
            }
        }
    }

    GGML_ASSERT(
        // (!a || b) is a logical implication (a -> b)
        // !hparams.causal_attn -> !cparams.causal_attn
        (hparams.causal_attn || !cparams.causal_attn) &&
        "causal attention is not supported by this model"
    );

    // KQ attention mask(s)
    if (lctx.inp_KQ_mask || lctx.inp_KQ_mask_swa) {
        // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache.
        if (cparams.causal_attn && !lctx.is_encoding) {
            const int64_t n_kv         = kv_self.n;
            const int64_t n_tokens     = ubatch.n_tokens;
            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
            const int64_t n_seqs       = ubatch.n_seqs;

            float * data     = nullptr;
            float * data_swa = nullptr;

            if (lctx.inp_KQ_mask) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));
                data = (float *) lctx.inp_KQ_mask->data;
            }

            if (lctx.inp_KQ_mask_swa) {
                GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_swa->buffer));
                data_swa = (float *) lctx.inp_KQ_mask_swa->data;
            }

            // For causal attention, use only the previous KV cells
            // of the correct sequence for each token of the ubatch.
            // It's assumed that if a token in the batch has multiple sequences, they are equivalent.
            for (int h = 0; h < 1; ++h) {
                for (int s = 0; s < n_seqs; ++s) {
                    const llama_seq_id seq_id = ubatch.seq_id[s][0];

                    for (int j = 0; j < n_seq_tokens; ++j) {
                        const llama_pos pos = ubatch.pos[s*n_seq_tokens + j];

                        for (int i = 0; i < n_kv; ++i) {
                            float f;
                            // mask out cells of other sequences and future positions
                            if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
                                f = -INFINITY;
                            } else {
                                if (hparams.use_alibi) {
                                    // ALiBi: bias by negative distance instead of 0
                                    f = -std::abs(kv_self.cells[i].pos - pos);
                                } else {
                                    f = 0.0f;
                                }
                            }

                            if (data) {
                                data[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                            }

                            // may need to cut off old tokens for sliding window
                            if (data_swa) {
                                if (pos - kv_self.cells[i].pos >= (int32_t)hparams.n_swa) {
                                    f = -INFINITY;
                                }
                                data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                            }
                        }
                    }
                }

                // mask the rows added by padding n_tokens up to GGML_KQ_MASK_PAD
                if (data) {
                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                        for (int j = 0; j < n_kv; ++j) {
                            data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
                        }
                    }
                }

                if (data_swa) {
                    for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                        for (int j = 0; j < n_kv; ++j) {
                            data_swa[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
                        }
                    }
                }
            }
        } else {
            // non-causal (encoder / embedding) mask: tokens attend to all tokens
            // that share a sequence id with them
            const int64_t n_tokens     = ubatch.n_tokens;
            const int64_t n_seq_tokens = ubatch.n_seq_tokens;
            const int64_t n_seqs       = ubatch.n_seqs;
            // when using kv cache, the mask needs to match the kv cache size
            const int64_t n_stride = hparams.causal_attn && !lctx.is_encoding ? kv_self.n : n_tokens;

            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer));

            float * data = (float *) lctx.inp_KQ_mask->data;

            for (int h = 0; h < 1; ++h) {
                for (int s1 = 0; s1 < n_seqs; ++s1) {
                    const llama_seq_id seq_id = ubatch.seq_id[s1][0];

                    for (int j = 0; j < n_seq_tokens; ++j) {
                        const int32_t tj = s1*n_seq_tokens + j;

                        for (int s0 = 0; s0 < n_seqs; ++s0) {
                            for (int i = 0; i < n_seq_tokens; ++i) {
                                const int32_t ti = s0*n_seq_tokens + i;
                                float f = -INFINITY;

                                // unmask only if the two tokens share a sequence
                                for (int s = 0; s < ubatch.n_seq_id[s0]; ++s) {
                                    if (ubatch.seq_id[s0][s] == seq_id) {
                                        if (hparams.use_alibi) {
                                            f = -std::abs(ubatch.pos[ti] - ubatch.pos[tj]);
                                        } else {
                                            f = 0.0f;
                                        }
                                        break;
                                    }
                                }

                                data[h*(n_tokens*n_tokens) + tj*n_stride + ti] = f;
                            }
                        }

                        // mask the tail of the stride beyond the actual tokens
                        for (int i = n_tokens; i < n_stride; ++i) {
                            data[h*(n_tokens*n_tokens) + tj*n_stride + i] = -INFINITY;
                        }
                    }
                }
            }
        }
    }

    // mean pooling: build a (n_tokens x n_tokens) averaging matrix, one row per sequence
    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
        const int64_t n_tokens     = ubatch.n_tokens;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        const int64_t n_seqs       = ubatch.n_seqs;

        GGML_ASSERT(lctx.inp_mean);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));

        float * data = (float *) lctx.inp_mean->data;
        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

        // count tokens per sequence id
        std::vector<uint64_t> sum(n_tokens, 0);

        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == MEAN");

            sum[seq_id] += ubatch.n_seq_tokens;
        }

        // precompute 1/count per sequence (0 for unused rows)
        std::vector<float> div(n_tokens, 0.0f);
        for (int i = 0; i < n_tokens; ++i) {
            const uint64_t s = sum[i];
            if (s > 0) {
                div[i] = 1.0f/float(s);
            }
        }

        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            for (int i = 0; i < n_seq_tokens; ++i) {
                data[seq_id*n_tokens + s*n_seq_tokens + i] = div[seq_id];
            }
        }
    }

    // CLS/RANK pooling: record, per sequence, the row of the token at position 0
    if (cparams.embeddings && (
                cparams.pooling_type == LLAMA_POOLING_TYPE_CLS ||
                cparams.pooling_type == LLAMA_POOLING_TYPE_RANK)) {
        const int64_t n_tokens     = ubatch.n_tokens;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        const int64_t n_seqs       = ubatch.n_seqs;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == CLS or RANK");

            for (int i = 0; i < n_seq_tokens; ++i) {
                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];

                if (pos == 0) {
                    data[seq_id] = s*n_seq_tokens + i;
                }
            }
        }
    }

    // LAST pooling: record, per sequence, the row of the token with the highest position
    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_LAST) {
        const int64_t n_tokens     = ubatch.n_tokens;
        const int64_t n_seq_tokens = ubatch.n_seq_tokens;
        const int64_t n_seqs       = ubatch.n_seqs;

        GGML_ASSERT(lctx.inp_cls);
        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));

        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
        memset(lctx.inp_cls->data, 0, n_tokens * ggml_element_size(lctx.inp_cls));

        std::vector<int> last_pos(n_tokens, -1);
        std::vector<int> last_row(n_tokens, -1);

        for (int s = 0; s < n_seqs; ++s) {
            const llama_seq_id seq_id = ubatch.seq_id[s][0];

            // TODO: adapt limits to n_seqs when ubatch.equal_seqs is true
            GGML_ASSERT(seq_id < n_tokens && "seq_id cannot be larger than n_tokens with pooling_type == LAST");

            for (int i = 0; i < n_seq_tokens; ++i) {
                const llama_pos pos = ubatch.pos[s*n_seq_tokens + i];

                if (pos >= last_pos[seq_id]) {
                    last_pos[seq_id] = pos;
                    last_row[seq_id] = s*n_seq_tokens + i;
                }
            }
        }

        for (int i = 0; i < n_tokens; ++i) {
            if (last_row[i] >= 0) {
                data[i] = last_row[i];
            }
        }
    }

    // recurrent models (e.g. state-space): state mask and state copy inputs
    if (kv_self.recurrent) {
        const int64_t n_kv = kv_self.n;

        if (lctx.inp_s_mask) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer));
            float * data = (float *) lctx.inp_s_mask->data;

            // clear unused states
            for (int i = 0; i < n_kv; ++i) {
                const uint32_t  cell_id = i + kv_self.head;
                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];

                data[i] = (float) (kv_cell.src >= 0);

                // only clear once
                if (kv_cell.src < 0) {
                    kv_cell.src = cell_id;
                }
            }
        }

        if (lctx.inp_s_copy) {
            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer));
            int32_t * data = (int32_t *) lctx.inp_s_copy->data;

            // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
            for (uint32_t i = 0; i < n_kv; ++i) {
                const uint32_t  cell_id = i + kv_self.head;
                llama_kv_cell & kv_cell = lctx.kv_self.cells[cell_id];

                // prevent out-of-bound sources
                if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self.size) {
                    kv_cell.src = cell_id;
                }

                data[i] = kv_cell.src;

                // ensure copy only happens once
                if (kv_cell.src != (int32_t) cell_id) {
                    kv_cell.src = cell_id;
                }
            }
        }
    }

    // T5-style relative position buckets for every (query, key) pair
    if (lctx.inp_pos_bucket) {
        const int64_t n_tokens = ubatch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_pos_bucket->buffer));
        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing

        int32_t * data = (int32_t *) lctx.inp_pos_bucket->data;

        if (!lctx.is_encoding) {
            // decoding: keys come from the kv cache
            const int64_t n_kv = kv_self.n;
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_kv; ++i) {
                        data[h*(n_kv*n_tokens) + j*n_kv + i] = llama_relative_position_bucket(lctx.kv_self.cells[i].pos, ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        } else {
            // encoding: keys are the batch tokens themselves (bidirectional buckets)
            for (int h = 0; h < 1; ++h) {
                for (int j = 0; j < n_tokens; ++j) {
                    for (int i = 0; i < n_tokens; ++i) {
                        data[h*(n_tokens*n_tokens) + j*n_tokens + i] = llama_relative_position_bucket(ubatch.pos[i], ubatch.pos[j], hparams.n_rel_attn_bkts, lctx.is_encoding);
                    }
                }
            }
        }
    }

    // cross-attention: feed the stored encoder output to the decoder
    if (!lctx.is_encoding && lctx.inp_embd_enc) {
        assert(lctx.inp_embd_enc->type == GGML_TYPE_F32);
        assert((size_t) ggml_nelements(lctx.inp_embd_enc) == lctx.embd_enc.size());

        ggml_backend_tensor_set(lctx.inp_embd_enc, lctx.embd_enc.data(), 0, ggml_nbytes(lctx.inp_embd_enc));
    }

    // cross-attention mask: decoder tokens may attend only to encoder outputs
    // that share a sequence id with them
    if (!lctx.is_encoding && lctx.inp_KQ_mask_cross) {
        const int64_t n_output_enc = lctx.embd_enc.size() / hparams.n_embd;
        const int64_t n_tokens = ubatch.n_tokens;

        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_mask_cross->buffer));
        GGML_ASSERT(!ubatch.equal_seqs); // TODO: use ubatch.n_seqs instead of failing

        float * data = (float *) lctx.inp_KQ_mask_cross->data;

        for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                for (int i = 0; i < n_output_enc; ++i) {
                    float f = -INFINITY;
                    for (int s = 0; s < ubatch.n_seq_id[j]; ++s) {
                        const llama_seq_id seq_id = ubatch.seq_id[j][s];
                        if (lctx.seq_ids_enc[i].find(seq_id) != lctx.seq_ids_enc[i].end()) {
                            f = 0.0f;
                        }
                    }
                    data[h*(n_output_enc*n_tokens) + j*n_output_enc + i] = f;
                }
            }

            // mask the rows added by padding n_tokens up to GGML_KQ_MASK_PAD
            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
                for (int j = 0; j < n_output_enc; ++j) {
                    data[h*(n_output_enc*n_tokens) + i*n_output_enc + j] = -INFINITY;
                }
            }
        }
    }
}
  373. // llama output
// Make sure the context's output buffer can hold up to n_outputs rows of
// logits and/or embeddings, (re)allocating only when more capacity is needed.
// Returns the reserved capacity in outputs, or 0 on allocation failure.
// Side effects: resets output_ids to -1, clears the buffer, sets n_outputs = 0.
size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
    const auto & cparams = lctx.cparams;
    const auto & hparams = lctx.model.hparams;
    const auto & vocab   = lctx.model.vocab;

    // always reserve at least one output per sequence
    const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max);

    const auto n_batch = cparams.n_batch;
    const auto n_vocab = vocab.n_tokens();
    const auto n_embd  = hparams.n_embd;

    // TODO: use a per-batch flag for logits presence instead
    const bool has_logits = !cparams.embeddings;
    const bool has_embd   = cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);

    const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0;
    const size_t embd_size   = has_embd   ? n_embd*n_outputs_max  : 0;

    if (lctx.output_ids.empty()) {
        // init, never resized afterwards
        lctx.output_ids.resize(n_batch);
    }

    const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output.get()) : 0;
    const size_t new_size  = (logits_size + embd_size) * sizeof(float);

    // alloc only when more than the current capacity is required
    // TODO: also consider shrinking the buffer
    if (!lctx.buf_output || prev_size < new_size) {
        if (lctx.buf_output) {
#ifndef NDEBUG
            // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark)
            LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
            // release the old buffer and invalidate the pointers into it
            lctx.buf_output = nullptr;
            lctx.logits = nullptr;
            lctx.embd = nullptr;
        }

        auto * buft = ggml_backend_cpu_buffer_type();
        // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
        auto * output_dev = lctx.model.dev_output();
        auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
        if (output_dev_host_buft) {
            buft = output_dev_host_buft;
        }
        lctx.buf_output.reset(ggml_backend_buft_alloc_buffer(buft, new_size));
        if (lctx.buf_output == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0));
            return 0;
        }
    }

    // logits first, embeddings directly after them in the same buffer
    float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output.get());

    lctx.logits = has_logits ? output_base               : nullptr;
    lctx.embd   = has_embd   ? output_base + logits_size : nullptr;

    lctx.output_size = n_outputs_max;
    lctx.logits_size = logits_size;
    lctx.embd_size   = embd_size;

    // set all ids as invalid (negative)
    std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1);

    ggml_backend_buffer_clear(lctx.buf_output.get(), 0);

    lctx.n_outputs = 0;

    return n_outputs_max;
}
// Restore logits/embeddings rows to batch order after they were produced in
// sbatch order; out_ids maps output row -> original batch index. No-op when
// out_ids is empty (i.e. no reordering pending). Also rebuilds output_ids
// (batch index -> output row) and clears out_ids when done.
void llama_output_reorder(struct llama_context & ctx) {
    std::vector<size_t> & out_ids = ctx.sbatch.out_ids;
    if (!out_ids.empty()) {
        const uint32_t n_vocab = ctx.model.vocab.n_tokens();
        const uint32_t n_embd  = ctx.model.hparams.n_embd;

        const int32_t n_outputs = ctx.n_outputs;
        GGML_ASSERT((size_t) n_outputs == out_ids.size());

        // TODO: is there something more efficient which also minimizes swaps?
        // selection sort, to minimize swaps (from https://en.wikipedia.org/wiki/Selection_sort)
        for (int32_t i = 0; i < n_outputs - 1; ++i) {
            int32_t j_min = i;
            for (int32_t j = i + 1; j < n_outputs; ++j) {
                if (out_ids[j] < out_ids[j_min]) {
                    j_min = j;
                }
            }
            if (j_min == i) { continue; }
            std::swap(out_ids[i], out_ids[j_min]);
            // swap the corresponding logits rows in lockstep ...
            if (ctx.logits_size > 0) {
                for (uint32_t k = 0; k < n_vocab; k++) {
                    std::swap(ctx.logits[i*n_vocab + k], ctx.logits[j_min*n_vocab + k]);
                }
            }
            // ... and the embedding rows
            if (ctx.embd_size > 0) {
                for (uint32_t k = 0; k < n_embd; k++) {
                    std::swap(ctx.embd[i*n_embd + k], ctx.embd[j_min*n_embd + k]);
                }
            }
        }
        // rebuild the batch-index -> output-row mapping; -1 marks "no output"
        std::fill(ctx.output_ids.begin(), ctx.output_ids.end(), -1);
        for (int32_t i = 0; i < n_outputs; ++i) {
            ctx.output_ids[out_ids[i]] = i;
        }
        out_ids.clear();
    }
}
  466. //
  467. // interface implementation
  468. //
// Destroy a context previously created by the library; safe on nullptr
// (deleting a null pointer is a no-op).
void llama_free(struct llama_context * ctx) {
    delete ctx;
}
// Trivial read-only accessors over the context parameters.

// context size (number of positions)
uint32_t llama_n_ctx(const struct llama_context * ctx) {
    return ctx->cparams.n_ctx;
}

// logical maximum batch size
uint32_t llama_n_batch(const struct llama_context * ctx) {
    return ctx->cparams.n_batch;
}

// physical maximum micro-batch size
uint32_t llama_n_ubatch(const struct llama_context * ctx) {
    return ctx->cparams.n_ubatch;
}

// max number of distinct sequence ids; currently tied to the KV cache size
uint32_t llama_n_seq_max(const struct llama_context * ctx) {
    return ctx->kv_self.size;
}

const struct llama_model * llama_get_model(const struct llama_context * ctx) {
    return &ctx->model;
}

enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
    return ctx->cparams.pooling_type;
}
// Attach caller-provided ggml threadpools to the context. If no separate
// batch threadpool is given, the regular one is reused for batch processing.
void llama_attach_threadpool(
        struct llama_context * ctx,
        ggml_threadpool_t      threadpool,
        ggml_threadpool_t      threadpool_batch) {
    ctx->threadpool       = threadpool;
    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
}

// Detach both threadpools; the context does not own them, so nothing is freed.
void llama_detach_threadpool(struct llama_context * ctx) {
    ctx->threadpool       = nullptr;
    ctx->threadpool_batch = nullptr;
}
// Set the thread counts used for single-token generation (n_threads) and for
// batch/prompt processing (n_threads_batch).
void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
    ctx->cparams.n_threads       = n_threads;
    ctx->cparams.n_threads_batch = n_threads_batch;
}

int32_t llama_n_threads(struct llama_context * ctx) {
    return ctx->cparams.n_threads;
}

int32_t llama_n_threads_batch(struct llama_context * ctx) {
    return ctx->cparams.n_threads_batch;
}
// Register an abort callback on the context and propagate it to every backend
// that exposes the optional "ggml_backend_set_abort_callback" entry point
// (looked up dynamically via the backend registry; backends without it are skipped).
void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
    ctx->abort_callback      = abort_callback;
    ctx->abort_callback_data = abort_callback_data;

    for (auto & backend : ctx->backends) {
        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
        if (set_abort_callback_fn) {
            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
        }
    }
}
// Toggle embeddings output mode for subsequent evaluations.
void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
    ctx->cparams.embeddings = embeddings;
}

// Toggle causal attention for subsequent evaluations.
void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) {
    ctx->cparams.causal_attn = causal_attn;
}
// Wait for all queued backend computation to finish, then fold the elapsed
// compute time into the perf counters (single-token -> eval stats, multi-token
// -> prompt-eval stats) and reset the queued-token bookkeeping.
void llama_synchronize(struct llama_context * ctx) {
    ggml_backend_sched_synchronize(ctx->sched.get());

    // FIXME: if multiple single tokens are evaluated without a synchronization,
    // the stats will be added to the prompt evaluation stats
    // this should only happen when using batch size 1 to evaluate a batch

    // add the evaluation to the stats
    if (ctx->n_queued_tokens == 1) {
        if (!ctx->cparams.no_perf) {
            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        }
        ctx->n_eval++;
    } else if (ctx->n_queued_tokens > 1) {
        if (!ctx->cparams.no_perf) {
            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
        }
        ctx->n_p_eval += ctx->n_queued_tokens;
    }

    // get a more accurate load time, upon first eval
    if (ctx->n_queued_tokens > 0 && !ctx->has_evaluated_once) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }

    ctx->n_queued_tokens = 0;
    ctx->t_compute_start_us = 0;
}
// Return the full logits buffer, synchronizing first and reordering the rows
// back to batch order. May be nullptr if the context produces no logits.
float * llama_get_logits(struct llama_context * ctx) {
    llama_synchronize(ctx);

    // reorder logits for backward compatibility
    // TODO: maybe deprecate this
    llama_output_reorder(*ctx);

    return ctx->logits;
}
// Return a pointer to the logits row for batch token i. Negative i indexes
// from the end of the outputs (Python-style). On any invalid index the error
// is logged and nullptr is returned (or the process aborts in debug builds).
float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
    int32_t j = -1;

    llama_synchronize(ctx);

    try {
        if (ctx->logits == nullptr) {
            throw std::runtime_error("no logits");
        }

        if (i < 0) {
            // negative index: count back from the last output
            j = ctx->n_outputs + i;
            if (j < 0) {
                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
            }
        } else if ((size_t) i >= ctx->output_ids.size()) {
            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
        } else {
            // map batch index to output row
            j = ctx->output_ids[i];
        }

        if (j < 0) {
            // token i was not flagged to produce an output
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
        if (j >= ctx->n_outputs) {
            // This should not happen
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
        }

        return ctx->logits + j*ctx->model.vocab.n_tokens();
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#else
        return nullptr;
#endif
    }
}
// Return the full embeddings buffer, synchronizing first and reordering the
// rows back to batch order. May be nullptr if the context produces no embeddings.
float * llama_get_embeddings(struct llama_context * ctx) {
    llama_synchronize(ctx);

    // reorder embeddings for backward compatibility
    // TODO: maybe deprecate this
    llama_output_reorder(*ctx);

    return ctx->embd;
}
// Return a pointer to the embedding row for batch token i; mirrors
// llama_get_logits_ith (including the "batch.logits" wording in the error,
// since the same output flag gates both). Negative i indexes from the end.
// On any invalid index the error is logged and nullptr is returned
// (or the process aborts in debug builds).
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
    int32_t j = -1;

    llama_synchronize(ctx);

    try {
        if (ctx->embd == nullptr) {
            throw std::runtime_error("no embeddings");
        }

        if (i < 0) {
            // negative index: count back from the last output
            j = ctx->n_outputs + i;
            if (j < 0) {
                throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
            }
        } else if ((size_t) i >= ctx->output_ids.size()) {
            throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
        } else {
            // map batch index to output row
            j = ctx->output_ids[i];
        }

        if (j < 0) {
            // token i was not flagged to produce an output
            throw std::runtime_error(format("batch.logits[%d] != true", i));
        }
        if (j >= ctx->n_outputs) {
            // This should not happen
            throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs));
        }

        return ctx->embd + j*ctx->model.hparams.n_embd;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what());
#ifndef NDEBUG
        GGML_ABORT("fatal error");
#else
        return nullptr;
#endif
    }
}
  635. float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) {
  636. llama_synchronize(ctx);
  637. auto it = ctx->embd_seq.find(seq_id);
  638. if (it == ctx->embd_seq.end()) {
  639. return nullptr;
  640. }
  641. return it->second.data();
  642. }
  643. // llama state API
// deprecated
// thin alias kept for API backward compatibility - use llama_state_get_size instead
size_t llama_get_state_size(struct llama_context * ctx) {
    return llama_state_get_size(ctx);
}
// deprecated
// thin alias kept for API backward compatibility - use llama_state_get_data instead
// note: passes size = -1 (SIZE_MAX), i.e. the destination is assumed large enough
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
    return llama_state_get_data(ctx, dst, -1);
}
// deprecated
// thin alias kept for API backward compatibility - use llama_state_set_data instead
// note: passes size = -1 (SIZE_MAX), i.e. the source is assumed large enough
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
    return llama_state_set_data(ctx, src, -1);
}
// deprecated
// thin alias kept for API backward compatibility - use llama_state_load_file instead
bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
}
// deprecated
// thin alias kept for API backward compatibility - use llama_state_save_file instead
bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
    return llama_state_save_file(ctx, path_session, tokens, n_token_count);
}
  664. // TODO: replace all non-fatal assertions with returned errors or exceptions
  665. struct llama_data_write {
  666. virtual void write(const void * src, size_t size) = 0;
  667. virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
  668. virtual size_t get_size_written() = 0;
  669. virtual ~llama_data_write() = default;
  670. void write_string(const std::string & str) {
  671. uint32_t str_size = str.size();
  672. write(&str_size, sizeof(str_size));
  673. write(str.data(), str_size);
  674. }
  675. void write_model_info(const struct llama_context * ctx) {
  676. const std::string arch_str = llm_arch_name(ctx->model.arch);
  677. write_string(arch_str);
  678. // TODO: add more model-specific info which should prevent loading the session file if not identical
  679. }
  680. //void write_rng(const std::mt19937 & rng) {
  681. // std::ostringstream rng_ss;
  682. // rng_ss << rng;
  683. // const std::string & rng_str = rng_ss.str();
  684. // write_string(rng_str);
  685. //}
  686. void write_output_ids(struct llama_context * ctx) {
  687. llama_output_reorder(*ctx);
  688. const uint32_t n_outputs = ctx->n_outputs;
  689. std::vector<int32_t> output_pos;
  690. const size_t n_batch = ctx->cparams.n_batch;
  691. const auto & output_ids = ctx->output_ids;
  692. GGML_ASSERT(n_outputs <= ctx->output_size);
  693. output_pos.resize(n_outputs);
  694. // build a more compact representation of the output ids
  695. for (size_t i = 0; i < n_batch; ++i) {
  696. // map an output id to a position in the batch
  697. int32_t pos = output_ids[i];
  698. if (pos >= 0) {
  699. GGML_ASSERT((uint32_t) pos < n_outputs);
  700. output_pos[pos] = i;
  701. }
  702. }
  703. write(&n_outputs, sizeof(n_outputs));
  704. if (n_outputs) {
  705. write(output_pos.data(), n_outputs * sizeof(int32_t));
  706. }
  707. }
  708. void write_logits(const struct llama_context * ctx) {
  709. const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens());
  710. write(&logits_size, sizeof(logits_size));
  711. if (logits_size) {
  712. write(ctx->logits, logits_size * sizeof(float));
  713. }
  714. }
  715. void write_embeddings(const struct llama_context * ctx) {
  716. const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd);
  717. write(&embeddings_size, sizeof(embeddings_size));
  718. if (embeddings_size) {
  719. write(ctx->embd, embeddings_size * sizeof(float));
  720. }
  721. }
  722. void write_kv_cache_meta(const llama_kv_cache & kv_self, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) {
  723. for (const auto & range : cell_ranges) {
  724. for (uint32_t i = range.first; i < range.second; ++i) {
  725. const auto & cell = kv_self.cells[i];
  726. const llama_pos pos = cell.pos;
  727. const uint32_t n_seq_id = seq_id == -1 ? cell.seq_id.size() : 0;
  728. write(&pos, sizeof(pos));
  729. write(&n_seq_id, sizeof(n_seq_id));
  730. if (n_seq_id) {
  731. for (auto seq_id : cell.seq_id) {
  732. write(&seq_id, sizeof(seq_id));
  733. }
  734. }
  735. }
  736. }
  737. }
  738. void write_kv_cache_data(const struct llama_context * ctx, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) {
  739. const struct llama_kv_cache & kv_self = ctx->kv_self;
  740. const struct llama_hparams & hparams = ctx->model.hparams;
  741. const uint32_t v_trans = kv_self.v_trans ? 1 : 0;
  742. const uint32_t n_layer = hparams.n_layer;
  743. write(&v_trans, sizeof(v_trans));
  744. write(&n_layer, sizeof(n_layer));
  745. std::vector<uint8_t> tmp_buf;
  746. // Iterate and write all the keys first, each row is a cell
  747. // Get whole range at a time
  748. for (uint32_t il = 0; il < n_layer; ++il) {
  749. const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();
  750. // Write key type
  751. const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
  752. write(&k_type_i, sizeof(k_type_i));
  753. // Write row size of key
  754. const uint64_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
  755. write(&k_size_row, sizeof(k_size_row));
  756. // Read each range of cells of k_size length each into tmp_buf and write out
  757. for (const auto & range : cell_ranges) {
  758. const size_t range_size = range.second - range.first;
  759. const size_t buf_size = range_size * k_size_row;
  760. write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
  761. }
  762. }
  763. if (!kv_self.v_trans) {
  764. for (uint32_t il = 0; il < n_layer; ++il) {
  765. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  766. // Write value type
  767. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  768. write(&v_type_i, sizeof(v_type_i));
  769. // Write row size of value
  770. const uint64_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
  771. write(&v_size_row, sizeof(v_size_row));
  772. // Read each range of cells of v_size length each into tmp_buf and write out
  773. for (const auto & range : cell_ranges) {
  774. const size_t range_size = range.second - range.first;
  775. const size_t buf_size = range_size * v_size_row;
  776. write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
  777. }
  778. }
  779. } else {
  780. // When v is transposed, we also need the element size and get the element ranges from each row
  781. const uint32_t kv_size = kv_self.size;
  782. for (uint32_t il = 0; il < n_layer; ++il) {
  783. const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();
  784. // Write value type
  785. const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
  786. write(&v_type_i, sizeof(v_type_i));
  787. // Write element size
  788. const uint32_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
  789. write(&v_size_el, sizeof(v_size_el));
  790. // Write GQA embedding size
  791. write(&n_embd_v_gqa, sizeof(n_embd_v_gqa));
  792. // For each row, we get the element values of each cell
  793. for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
  794. // Read each range of cells of v_size_el length each into tmp_buf and write out
  795. for (const auto & range : cell_ranges) {
  796. const size_t range_size = range.second - range.first;
  797. const size_t src_offset = (range.first + j * kv_size) * v_size_el;
  798. const size_t buf_size = range_size * v_size_el;
  799. write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
  800. }
  801. }
  802. }
  803. }
  804. }
  805. void write_kv_cache(const struct llama_context * ctx, llama_seq_id seq_id = -1) {
  806. const struct llama_kv_cache & kv_self = ctx->kv_self;
  807. std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
  808. uint32_t cell_count = 0;
  809. // Count the number of cells with the specified seq_id
  810. // Find all the ranges of cells with this seq id (or all, when -1)
  811. uint32_t cell_range_begin = kv_self.size;
  812. for (uint32_t i = 0; i < kv_self.size; ++i) {
  813. const auto & cell = kv_self.cells[i];
  814. if ((seq_id == -1 && !cell.is_empty()) || cell.has_seq_id(seq_id)) {
  815. ++cell_count;
  816. if (cell_range_begin == kv_self.size) {
  817. cell_range_begin = i;
  818. }
  819. } else {
  820. if (cell_range_begin != kv_self.size) {
  821. cell_ranges.emplace_back(cell_range_begin, i);
  822. cell_range_begin = kv_self.size;
  823. }
  824. }
  825. }
  826. if (cell_range_begin != kv_self.size) {
  827. cell_ranges.emplace_back(cell_range_begin, kv_self.size);
  828. }
  829. // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
  830. uint32_t cell_count_check = 0;
  831. for (const auto & range : cell_ranges) {
  832. cell_count_check += range.second - range.first;
  833. }
  834. GGML_ASSERT(cell_count == cell_count_check);
  835. write(&cell_count, sizeof(cell_count));
  836. write_kv_cache_meta(kv_self, cell_ranges, seq_id);
  837. write_kv_cache_data(ctx, cell_ranges);
  838. }
  839. };
// abstract source for deserializing the context state; the mirror image of
// llama_data_write - concrete implementations read from a buffer or a file
struct llama_data_read {
    virtual const uint8_t * read(size_t size) = 0;
    virtual void read_to(void * dst, size_t size) = 0;
    virtual size_t get_size_read() = 0;
    virtual ~llama_data_read() = default;

    // read a string serialized as a 32-bit length followed by the raw bytes
    void read_string(std::string & str) {
        uint32_t str_size;
        read_to(&str_size, sizeof(str_size));

        str.assign((const char *) read(str_size), str_size);
    }

    // validate model information
    // throws if the stored arch does not match the loaded model
    void read_model_info(const struct llama_context * ctx) {
        const std::string cur_arch_str = llm_arch_name(ctx->model.arch);

        std::string arch_str;
        read_string(arch_str);
        if (cur_arch_str != arch_str) {
            throw std::runtime_error(format("wrong model arch: '%s' instead of '%s'", arch_str.c_str(), cur_arch_str.c_str()));
        }
        // TODO: add more info which needs to be identical but which is not verified otherwise
    }

    //void read_rng(std::mt19937 & rng) {
    //    std::string rng_str;
    //    read_string(rng_str);

    //    std::istringstream rng_ss(rng_str);
    //    rng_ss >> rng;

    //    if (rng_ss.fail()) {
    //        throw std::runtime_error("failed to load RNG state");
    //    }
    //}

    // restore ctx->output_ids from the serialized output->batch-position map
    // (the inverse of llama_data_write::write_output_ids)
    void read_output_ids(struct llama_context * ctx) {
        std::vector<int32_t> output_pos;

        uint32_t n_outputs;
        read_to(&n_outputs, sizeof(n_outputs));

        // make sure the output buffers can hold this many rows
        if (n_outputs > llama_output_reserve(*ctx, n_outputs)) {
            throw std::runtime_error("could not reserve outputs");
        }

        if (n_outputs) {
            output_pos.resize(n_outputs);
            read_to(output_pos.data(), n_outputs * sizeof(int32_t));

            for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) {
                // output_pos[i] is the batch position that produced output row i
                int32_t id = output_pos[i];
                if ((uint32_t) id >= ctx->cparams.n_batch) {
                    throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch));
                }
                ctx->output_ids[id] = i;
            }

            ctx->n_outputs = n_outputs;
        }
    }

    // restore the logits buffer (size prefix in floats, then data)
    void read_logits(struct llama_context * ctx) {
        uint64_t logits_size;
        read_to(&logits_size, sizeof(logits_size));

        if (ctx->logits_size < logits_size) {
            throw std::runtime_error("logits buffer too small");
        }

        if (logits_size) {
            read_to(ctx->logits, logits_size * sizeof(float));
        }
    }

    // restore the embeddings buffer (size prefix in floats, then data)
    void read_embeddings(struct llama_context * ctx) {
        uint64_t embeddings_size;
        read_to(&embeddings_size, sizeof(embeddings_size));

        if (ctx->embd_size < embeddings_size) {
            throw std::runtime_error("embeddings buffer too small");
        }

        if (embeddings_size) {
            read_to(ctx->embd, embeddings_size * sizeof(float));
        }
    }

    // restore per-cell metadata written by write_kv_cache_meta
    // dest_seq_id == -1 restores the whole cache; otherwise the cells are
    // re-inserted as a single contiguous block belonging to dest_seq_id
    // returns false (without throwing) on malformed input
    bool read_kv_cache_meta(struct llama_context * ctx, uint32_t cell_count, llama_seq_id dest_seq_id = -1) {
        struct llama_kv_cache & kv_self = ctx->kv_self;
        if (dest_seq_id != -1) {
            // single sequence

            // drop whatever the destination sequence currently holds
            llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1);

            // build a pseudo-batch of the restored positions so the regular
            // slot-finding logic can allocate one contiguous block of cells
            llama_ubatch batch = ctx->sbatch.reserve_ubatch(cell_count, /* has_embd */ false);
            batch.n_tokens = cell_count;
            batch.n_seq_tokens = cell_count;
            batch.n_seqs = 1;

            for (uint32_t i = 0; i < cell_count; ++i) {
                llama_pos pos;
                uint32_t n_seq_id;

                read_to(&pos, sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));

                // single-sequence snapshots store n_seq_id == 0 (see write_kv_cache_meta)
                if (n_seq_id != 0) {
                    LLAMA_LOG_ERROR("%s: invalid seq_id-agnostic kv cell\n", __func__);
                    return false;
                }

                batch.pos[i] = pos;
            }
            batch.n_seq_id[0] = 1;
            // points at the dest_seq_id parameter - only valid for the duration of this call
            batch.seq_id[0] = &dest_seq_id;
            if (!llama_kv_cache_find_slot(kv_self, batch)) {
                LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__);
                return false;
            }

            // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values)
            // Assume that this is one contiguous block of cells
            GGML_ASSERT(kv_self.head + cell_count <= kv_self.size);
            GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]);
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]);
            GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id));
            GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id));
        } else {
            // whole KV cache restore

            if (cell_count > kv_self.size) {
                LLAMA_LOG_ERROR("%s: not enough cells in kv cache\n", __func__);
                return false;
            }

            llama_kv_cache_clear(kv_self);

            for (uint32_t i = 0; i < cell_count; ++i) {
                // cells are restored into positions [0, cell_count)
                llama_kv_cell & cell = kv_self.cells[i];

                llama_pos pos;
                uint32_t  n_seq_id;

                read_to(&pos,      sizeof(pos));
                read_to(&n_seq_id, sizeof(n_seq_id));

                cell.pos = pos;

                for (uint32_t j = 0; j < n_seq_id; ++j) {
                    llama_seq_id seq_id;
                    read_to(&seq_id, sizeof(seq_id));

                    if (seq_id < 0 || (uint32_t) seq_id >= llama_n_seq_max(ctx)) {
                        LLAMA_LOG_ERROR("%s: invalid seq_id, %d is out of range [0, %u)\n", __func__, seq_id, llama_n_seq_max(ctx));
                        return false;
                    }

                    cell.seq_id.insert(seq_id);

                    if (kv_self.recurrent) {
                        // for recurrent models, cell index seq_id tracks the tail cell of that sequence
                        int32_t & tail = kv_self.cells[seq_id].tail;
                        if (tail != -1) {
                            LLAMA_LOG_ERROR("%s: duplicate tail for seq_id %d in cell %d and %d\n", __func__, seq_id, i, tail);
                            return false;
                        }
                        tail = i;
                    }
                }
            }

            kv_self.head = 0;
            kv_self.used = cell_count;
        }

        if (kv_self.recurrent) {
            for (uint32_t i = 0; i < cell_count; ++i) {
                uint32_t cell_id = kv_self.head + i;
                // make sure the recurrent states will keep their restored state
                kv_self.cells[cell_id].src = cell_id;
            }
        }

        return true;
    }

    // restore the K/V tensor contents written by write_kv_cache_data into
    // the contiguous block starting at kv_self.head
    // returns false (without throwing) on any header mismatch
    bool read_kv_cache_data(struct llama_context * ctx, uint32_t cell_count) {
        const struct llama_hparams & hparams = ctx->model.hparams;
        struct llama_kv_cache & kv_self = ctx->kv_self;
        uint32_t v_trans;
        uint32_t n_layer;
        read_to(&v_trans, sizeof(v_trans));
        read_to(&n_layer, sizeof(n_layer));

        if (n_layer != hparams.n_layer) {
            LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer);
            return false;
        }
        if (cell_count > kv_self.size) {
            LLAMA_LOG_ERROR("%s: not enough cells in kv cache to restore state (%u > %u)\n", __func__, cell_count, kv_self.size);
            return false;
        }
        if (kv_self.v_trans != (bool) v_trans) {
            LLAMA_LOG_ERROR("%s: incompatible V transposition\n", __func__);
            return false;
        }

        // For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
        for (uint32_t il = 0; il < n_layer; ++il) {
            const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(il) + hparams.n_embd_k_s();

            // Read type of key
            int32_t k_type_i_ref;
            read_to(&k_type_i_ref, sizeof(k_type_i_ref));
            const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type;
            if (k_type_i != k_type_i_ref) {
                LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il);
                return false;
            }

            // Read row size of key
            uint64_t k_size_row_ref;
            read_to(&k_size_row_ref, sizeof(k_size_row_ref));
            const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
            if (k_size_row != k_size_row_ref) {
                LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, (size_t) k_size_row_ref, il);
                return false;
            }

            if (cell_count) {
                // Read and set the keys for the whole cell range
                ggml_backend_tensor_set(kv_self.k_l[il], read(cell_count * k_size_row), kv_self.head * k_size_row, cell_count * k_size_row);
            }
        }

        if (!kv_self.v_trans) {
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

                // Read type of value
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
                if (v_type_i != v_type_i_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                    return false;
                }

                // Read row size of value
                uint64_t v_size_row_ref;
                read_to(&v_size_row_ref, sizeof(v_size_row_ref));
                const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa);
                if (v_size_row != v_size_row_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, (size_t) v_size_row_ref, il);
                    return false;
                }

                if (cell_count) {
                    // Read and set the values for the whole cell range
                    ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_row), kv_self.head * v_size_row, cell_count * v_size_row);
                }
            }
        } else {
            // For each layer, read the values for each cell (transposed)
            for (uint32_t il = 0; il < n_layer; ++il) {
                const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(il) + hparams.n_embd_v_s();

                // Read type of value
                int32_t v_type_i_ref;
                read_to(&v_type_i_ref, sizeof(v_type_i_ref));
                const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type;
                if (v_type_i != v_type_i_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il);
                    return false;
                }

                // Read element size of value
                uint32_t v_size_el_ref;
                read_to(&v_size_el_ref, sizeof(v_size_el_ref));
                const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
                if (v_size_el != v_size_el_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, (size_t) v_size_el_ref, il);
                    return false;
                }

                // Read GQA embedding size
                uint32_t n_embd_v_gqa_ref;
                read_to(&n_embd_v_gqa_ref, sizeof(n_embd_v_gqa_ref));
                if (n_embd_v_gqa != n_embd_v_gqa_ref) {
                    LLAMA_LOG_ERROR("%s: mismatched GQA embedding size (%u != %u, layer %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref, il);
                    return false;
                }

                if (cell_count) {
                    // For each row in the transposed matrix, read the values for the whole cell range
                    for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
                        const size_t dst_offset = (kv_self.head + j * kv_self.size) * v_size_el;
                        ggml_backend_tensor_set(kv_self.v_l[il], read(cell_count * v_size_el), dst_offset, cell_count * v_size_el);
                    }
                }
            }
        }

        return true;
    }

    // restore the KV cache (metadata then tensor data); on failure the
    // affected cells are cleared so the cache is left in a consistent state,
    // and an exception is thrown
    void read_kv_cache(struct llama_context * ctx, llama_seq_id seq_id = -1) {
        uint32_t cell_count;
        read_to(&cell_count, sizeof(cell_count));

        bool res = read_kv_cache_meta(ctx, cell_count, seq_id) && read_kv_cache_data(ctx, cell_count);

        if (!res) {
            // undo any partial restore before reporting the error
            if (seq_id == -1) {
                llama_kv_cache_clear(ctx);
            } else {
                llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
            }
            throw std::runtime_error("failed to restore kv cache");
        }
    }
};
  1105. struct llama_data_write_dummy : llama_data_write {
  1106. size_t size_written = 0;
  1107. llama_data_write_dummy() {}
  1108. void write(const void * /* src */, size_t size) override {
  1109. size_written += size;
  1110. }
  1111. void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
  1112. size_written += size;
  1113. }
  1114. size_t get_size_written() override {
  1115. return size_written;
  1116. }
  1117. };
  1118. struct llama_data_write_buffer : llama_data_write {
  1119. uint8_t * ptr;
  1120. size_t buf_size = 0;
  1121. size_t size_written = 0;
  1122. llama_data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  1123. void write(const void * src, size_t size) override {
  1124. if (size > buf_size) {
  1125. throw std::runtime_error("unexpectedly reached end of buffer");
  1126. }
  1127. memcpy(ptr, src, size);
  1128. ptr += size;
  1129. size_written += size;
  1130. buf_size -= size;
  1131. }
  1132. void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
  1133. if (size > buf_size) {
  1134. throw std::runtime_error("unexpectedly reached end of buffer");
  1135. }
  1136. ggml_backend_tensor_get(tensor, ptr, offset, size);
  1137. ptr += size;
  1138. size_written += size;
  1139. buf_size -= size;
  1140. }
  1141. size_t get_size_written() override {
  1142. return size_written;
  1143. }
  1144. };
  1145. struct llama_data_read_buffer : llama_data_read {
  1146. const uint8_t * ptr;
  1147. size_t buf_size = 0;
  1148. size_t size_read = 0;
  1149. llama_data_read_buffer(const uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
  1150. const uint8_t * read(size_t size) override {
  1151. const uint8_t * base_ptr = ptr;
  1152. if (size > buf_size) {
  1153. throw std::runtime_error("unexpectedly reached end of buffer");
  1154. }
  1155. ptr += size;
  1156. size_read += size;
  1157. buf_size -= size;
  1158. return base_ptr;
  1159. }
  1160. void read_to(void * dst, size_t size) override {
  1161. memcpy(dst, read(size), size);
  1162. }
  1163. size_t get_size_read() override {
  1164. return size_read;
  1165. }
  1166. };
  1167. struct llama_data_write_file : llama_data_write {
  1168. llama_file * file;
  1169. size_t size_written = 0;
  1170. std::vector<uint8_t> temp_buffer;
  1171. llama_data_write_file(llama_file * f) : file(f) {}
  1172. void write(const void * src, size_t size) override {
  1173. file->write_raw(src, size);
  1174. size_written += size;
  1175. }
  1176. void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
  1177. temp_buffer.resize(size);
  1178. ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
  1179. write(temp_buffer.data(), temp_buffer.size());
  1180. }
  1181. size_t get_size_written() override {
  1182. return size_written;
  1183. }
  1184. };
  1185. struct llama_data_read_file : llama_data_read {
  1186. llama_file * file;
  1187. size_t size_read = 0;
  1188. std::vector<uint8_t> temp_buffer;
  1189. llama_data_read_file(llama_file * f) : file(f) {}
  1190. void read_to(void * dst, size_t size) override {
  1191. file->read_raw(dst, size);
  1192. size_read += size;
  1193. }
  1194. const uint8_t * read(size_t size) override {
  1195. temp_buffer.resize(size);
  1196. read_to(temp_buffer.data(), size);
  1197. return temp_buffer.data();
  1198. }
  1199. size_t get_size_read() override {
  1200. return size_read;
  1201. }
  1202. };
  1203. /** copy state data into either a buffer or file depending on the passed in context
  1204. *
  1205. * file context:
  1206. * llama_file file("/path", "wb");
  1207. * llama_data_write_file data_ctx(&file);
  1208. * llama_state_get_data_internal(ctx, data_ctx);
  1209. *
  1210. * buffer context:
  1211. * std::vector<uint8_t> buf(max_size, 0);
  1212. * llama_data_write_buffer data_ctx(buf.data(), max_size);
  1213. * llama_state_get_data_internal(ctx, data_ctx);
  1214. *
  1215. */
// serializes the full context state into data_ctx
// NOTE: the write order below defines the on-disk/state-buffer format and
// must stay in sync with llama_state_set_data_internal
static size_t llama_state_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx) {
    llama_synchronize(ctx);

    data_ctx.write_model_info(ctx);

    // copy outputs
    data_ctx.write_output_ids(ctx);
    data_ctx.write_logits(ctx);
    data_ctx.write_embeddings(ctx);

    data_ctx.write_kv_cache(ctx);

    return data_ctx.get_size_written();
}
  1226. size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst, size_t size) {
  1227. llama_data_write_buffer data_ctx(dst, size);
  1228. try {
  1229. return llama_state_get_data_internal(ctx, data_ctx);
  1230. } catch (const std::exception & err) {
  1231. LLAMA_LOG_ERROR("%s: error saving state: %s\n", __func__, err.what());
  1232. return 0;
  1233. }
  1234. }
  1235. // Returns the *actual* size of the state.
  1236. // Intended to be used when saving to state to a buffer.
  1237. size_t llama_state_get_size(struct llama_context * ctx) {
  1238. llama_data_write_dummy data_ctx;
  1239. try {
  1240. return llama_state_get_data_internal(ctx, data_ctx);
  1241. } catch (const std::exception & err) {
  1242. LLAMA_LOG_ERROR("%s: error getting state size: %s\n", __func__, err.what());
  1243. return 0;
  1244. }
  1245. }
// restores the full context state from data_ctx
// NOTE: the read order below must mirror llama_state_get_data_internal
static size_t llama_state_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx) {
    llama_synchronize(ctx);

    data_ctx.read_model_info(ctx);

    // set outputs
    data_ctx.read_output_ids(ctx);
    data_ctx.read_logits(ctx);
    data_ctx.read_embeddings(ctx);

    data_ctx.read_kv_cache(ctx);

    return data_ctx.get_size_read();
}
  1256. // Sets the state reading from the specified source address
  1257. size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src, size_t size) {
  1258. llama_data_read_buffer data_ctx(src, size);
  1259. try {
  1260. return llama_state_set_data_internal(ctx, data_ctx);
  1261. } catch (const std::exception & err) {
  1262. LLAMA_LOG_ERROR("%s: error loading state: %s\n", __func__, err.what());
  1263. return 0;
  1264. }
  1265. }
// loads a session file: header (magic + version), prompt tokens, then the
// serialized context state; returns false and logs on any mismatch
static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
    llama_file file(path_session, "rb");

    // sanity checks
    {
        const uint32_t magic   = file.read_u32();
        const uint32_t version = file.read_u32();

        if (magic != LLAMA_SESSION_MAGIC || version != LLAMA_SESSION_VERSION) {
            LLAMA_LOG_ERROR("%s: unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
            return false;
        }
    }

    // load the prompt
    {
        const uint32_t n_token_count = file.read_u32();

        if (n_token_count > n_token_capacity) {
            LLAMA_LOG_ERROR("%s: token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
            return false;
        }

        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
        *n_token_count_out = n_token_count;
    }

    // restore the context state
    {
        // the remainder of the file is the serialized state - it must be consumed exactly
        const size_t n_state_size_cur = file.size() - file.tell();

        llama_data_read_file data_ctx(&file);
        const size_t n_read = llama_state_set_data_internal(ctx, data_ctx);

        if (n_read != n_state_size_cur) {
            LLAMA_LOG_ERROR("%s: did not read all of the session file data! size %zu, got %zu\n", __func__, n_state_size_cur, n_read);
            return false;
        }
    }

    return true;
}
  1299. bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1300. try {
  1301. return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out);
  1302. } catch (const std::exception & err) {
  1303. LLAMA_LOG_ERROR("%s: error loading session file: %s\n", __func__, err.what());
  1304. return false;
  1305. }
  1306. }
  1307. static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  1308. llama_file file(path_session, "wb");
  1309. file.write_u32(LLAMA_SESSION_MAGIC);
  1310. file.write_u32(LLAMA_SESSION_VERSION);
  1311. // save the prompt
  1312. file.write_u32((uint32_t) n_token_count);
  1313. file.write_raw(tokens, sizeof(llama_token) * n_token_count);
  1314. // save the context state using stream saving
  1315. llama_data_write_file data_ctx(&file);
  1316. llama_state_get_data_internal(ctx, data_ctx);
  1317. return true;
  1318. }
  1319. bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
  1320. try {
  1321. return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count);
  1322. } catch (const std::exception & err) {
  1323. LLAMA_LOG_ERROR("%s: error saving session file: %s\n", __func__, err.what());
  1324. return false;
  1325. }
  1326. }
// serializes the state of a single sequence (its KV cache cells only)
// into data_ctx; returns the number of bytes written
static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_write & data_ctx, llama_seq_id seq_id) {
    llama_synchronize(ctx);

    data_ctx.write_kv_cache(ctx, seq_id);

    return data_ctx.get_size_written();
}
  1332. size_t llama_state_seq_get_size(struct llama_context * ctx, llama_seq_id seq_id) {
  1333. llama_data_write_dummy data_ctx;
  1334. return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  1335. }
  1336. size_t llama_state_seq_get_data(struct llama_context * ctx, uint8_t * dst, size_t size, llama_seq_id seq_id) {
  1337. llama_data_write_buffer data_ctx(dst, size);
  1338. try {
  1339. return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  1340. } catch (const std::exception & err) {
  1341. LLAMA_LOG_ERROR("%s: error saving sequence state: %s\n", __func__, err.what());
  1342. return 0;
  1343. }
  1344. }
// restores a previously serialized sequence state from data_ctx into
// dest_seq_id; returns the number of bytes consumed
static size_t llama_state_seq_set_data_internal(struct llama_context * ctx, llama_data_read & data_ctx, llama_seq_id dest_seq_id) {
    llama_synchronize(ctx);

    data_ctx.read_kv_cache(ctx, dest_seq_id);

    return data_ctx.get_size_read();
}
  1350. size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, size_t size, llama_seq_id dest_seq_id) {
  1351. llama_data_read_buffer data_ctx(src, size);
  1352. try {
  1353. return llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
  1354. } catch (const std::exception & err) {
  1355. LLAMA_LOG_ERROR("%s: error loading sequence state: %s\n", __func__, err.what());
  1356. return 0;
  1357. }
  1358. }
  1359. static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
  1360. llama_file file(filepath, "wb");
  1361. file.write_u32(LLAMA_STATE_SEQ_MAGIC);
  1362. file.write_u32(LLAMA_STATE_SEQ_VERSION);
  1363. // save the prompt
  1364. file.write_u32((uint32_t) n_token_count);
  1365. file.write_raw(tokens, sizeof(llama_token) * n_token_count);
  1366. // save the context state using stream saving
  1367. llama_data_write_file data_ctx(&file);
  1368. llama_state_seq_get_data_internal(ctx, data_ctx, seq_id);
  1369. const size_t res = file.tell();
  1370. GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written());
  1371. return res;
  1372. }
  1373. static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1374. llama_file file(filepath, "rb");
  1375. // version checks
  1376. {
  1377. const uint32_t magic = file.read_u32();
  1378. const uint32_t version = file.read_u32();
  1379. if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) {
  1380. LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version);
  1381. return 0;
  1382. }
  1383. }
  1384. // load the prompt
  1385. {
  1386. const uint32_t n_token_count = file.read_u32();
  1387. if (n_token_count > n_token_capacity) {
  1388. LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
  1389. return 0;
  1390. }
  1391. file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
  1392. *n_token_count_out = n_token_count;
  1393. }
  1394. // restore the context state
  1395. {
  1396. const size_t state_size = file.size() - file.tell();
  1397. llama_data_read_file data_ctx(&file);
  1398. const size_t nread = llama_state_seq_set_data_internal(ctx, data_ctx, dest_seq_id);
  1399. if (!nread) {
  1400. LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__);
  1401. return 0;
  1402. }
  1403. GGML_ASSERT(nread <= state_size);
  1404. GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell());
  1405. }
  1406. return file.tell();
  1407. }
  1408. size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) {
  1409. try {
  1410. return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count);
  1411. } catch (const std::exception & err) {
  1412. LLAMA_LOG_ERROR("%s: error saving sequence state file: %s\n", __func__, err.what());
  1413. return 0;
  1414. }
  1415. }
  1416. size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
  1417. try {
  1418. return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out);
  1419. } catch (const std::exception & err) {
  1420. LLAMA_LOG_ERROR("%s: error loading sequence state file: %s\n", __func__, err.what());
  1421. return 0;
  1422. }
  1423. }
  1424. const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
  1425. struct llama_context * ctx
  1426. ) {
  1427. return ctx->model.tensors_by_name;
  1428. }