llama-kv-cells.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. #pragma once
  2. #include "llama.h"
  3. #include "llama-cparams.h"
  4. #include <bitset>
  5. #include <cassert>
  6. #include <vector>
  7. #include <set>
  8. #include <map>
  9. // meta information about KV cells that can be part of multiple sequences at the same time
  10. // TODO: add unit tests
  11. class llama_kv_cells_unified {
  12. public:
  13. void reset() {
  14. for (uint32_t i = 0; i < pos.size(); ++i) {
  15. pos[i] = -1;
  16. shift[i] = 0;
  17. seq[i].reset();
  18. }
  19. has_shift = false;
  20. used.clear();
  21. for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
  22. seq_pos[s].clear();
  23. }
  24. }
  25. void reset_shift() {
  26. has_shift = false;
  27. for (uint32_t i = 0; i < shift.size(); ++i) {
  28. shift[i] = 0;
  29. }
  30. }
  31. uint32_t size() const {
  32. return pos.size();
  33. }
  34. void resize(uint32_t n) {
  35. pos.resize(n);
  36. shift.resize(n);
  37. seq.resize(n);
  38. reset();
  39. }
  40. bool is_empty(uint32_t i) const {
  41. assert(i < pos.size());
  42. assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
  43. return pos[i] == -1;
  44. }
  45. uint32_t get_used() const {
  46. return used.size();
  47. }
  48. // the index of the first cell that is used
  49. // return 0 if no cells are used
  50. uint32_t used_min() const {
  51. return used.empty() ? 0 : *used.begin();
  52. }
  53. // the index of the last cell that is used + 1
  54. // return 0 if no cells are used
  55. uint32_t used_max_p1() const {
  56. return used.empty() ? 0 : *used.rbegin() + 1;
  57. }
  58. bool get_has_shift() const {
  59. return has_shift;
  60. }
  61. // move cell isrc to idst (used during defrag)
  62. void mv(uint32_t isrc, uint32_t idst) {
  63. assert(isrc < pos.size());
  64. assert(idst < pos.size());
  65. assert(pos[idst] == -1);
  66. assert(pos[isrc] != -1);
  67. pos [idst] = pos [isrc];
  68. shift[idst] = shift[isrc];
  69. seq [idst] = seq [isrc];
  70. pos [isrc] = -1;
  71. shift[isrc] = 0;
  72. seq [isrc].reset();
  73. used.erase (isrc);
  74. used.insert(idst);
  75. }
  76. // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
  77. llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
  78. assert(i + n <= pos.size());
  79. llama_kv_cells_unified res;
  80. res.resize(n);
  81. for (uint32_t j = 0; j < n; ++j) {
  82. const auto idx = i + j;
  83. res.pos[j] = pos[idx];
  84. res.seq[j] = seq[idx];
  85. assert(shift[idx] == 0);
  86. }
  87. return res;
  88. }
  89. // copy the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
  90. llama_kv_cells_unified cp(const std::vector<uint32_t> & idxs) const {
  91. llama_kv_cells_unified res;
  92. res.resize(idxs.size());
  93. for (uint32_t j = 0; j < idxs.size(); ++j) {
  94. const auto idx = idxs[j];
  95. res.pos[j] = pos[idx];
  96. res.seq[j] = seq[idx];
  97. assert(shift[idx] == 0);
  98. }
  99. return res;
  100. }
  101. // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
  102. void set(uint32_t i, const llama_kv_cells_unified & other) {
  103. assert(i + other.pos.size() <= pos.size());
  104. for (uint32_t j = 0; j < other.pos.size(); ++j) {
  105. const auto idx = i + j;
  106. if (pos[idx] == -1 && other.pos[j] != -1) {
  107. used.insert(i + j);
  108. }
  109. if (pos[idx] != -1 && other.pos[j] == -1) {
  110. used.erase(i + j);
  111. }
  112. if (pos[idx] != -1) {
  113. seq_pos_rm(i + j);
  114. }
  115. pos[idx] = other.pos[j];
  116. seq[idx] = other.seq[j];
  117. if (pos[idx] != -1) {
  118. seq_pos_add(i + j);
  119. }
  120. assert(shift[idx] == 0);
  121. }
  122. }
  123. // set the state of cells [idxs[0], idxs[1], ..., idxs[idxs.size() - 1])
  124. void set(const std::vector<uint32_t> & idxs, const llama_kv_cells_unified & other) {
  125. assert(idxs.size() == other.pos.size());
  126. for (uint32_t j = 0; j < other.pos.size(); ++j) {
  127. const auto idx = idxs[j];
  128. if (pos[idx] == -1 && other.pos[j] != -1) {
  129. used.insert(idx);
  130. }
  131. if (pos[idx] != -1 && other.pos[j] == -1) {
  132. used.erase(idx);
  133. }
  134. if (pos[idx] != -1) {
  135. seq_pos_rm(idx);
  136. }
  137. pos[idx] = other.pos[j];
  138. seq[idx] = other.seq[j];
  139. if (pos[idx] != -1) {
  140. seq_pos_add(idx);
  141. }
  142. assert(shift[idx] == 0);
  143. }
  144. }
  145. // clear a non-empty cell
  146. void rm(uint32_t i) {
  147. assert(i < pos.size());
  148. assert(pos[i] != -1);
  149. seq_pos_rm(i);
  150. seq[i].reset();
  151. pos[i] = -1;
  152. shift[i] = 0;
  153. used.erase(i);
  154. }
  155. // note: call only if the cell has seq_id
  156. // return true if the cell becomes empty
  157. bool seq_rm(uint32_t i, llama_seq_id seq_id) {
  158. assert(i < pos.size());
  159. assert(seq[i].test(seq_id));
  160. assert(pos[i] != -1);
  161. assert(seq_id >= 0);
  162. seq[i].reset(seq_id);
  163. seq_pos_dec(seq_id, pos[i]);
  164. if (seq[i].none()) {
  165. pos[i] = -1;
  166. shift[i] = 0;
  167. used.erase(i);
  168. return true;
  169. }
  170. return false;
  171. }
  172. // return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
  173. bool seq_keep(uint32_t i, llama_seq_id seq_id) {
  174. assert(i < pos.size());
  175. if (seq[i].test(seq_id)) {
  176. seq_pos_rm(i);
  177. seq[i].reset();
  178. seq[i].set(seq_id);
  179. seq_pos_inc(seq_id, pos[i]);
  180. return false;
  181. }
  182. if (seq[i].any()) {
  183. seq_pos_rm(i);
  184. seq[i].reset();
  185. pos[i] = -1;
  186. shift[i] = 0;
  187. used.erase(i);
  188. return true;
  189. }
  190. assert(pos[i] == -1);
  191. return false;
  192. }
  193. // number of different sequences in the cell
  194. int seq_count(uint32_t i) const {
  195. assert(i < pos.size());
  196. assert(pos[i] != -1);
  197. return seq[i].count();
  198. }
  199. // check if the cell contains seq_id
  200. bool seq_has(uint32_t i, llama_seq_id seq_id) const {
  201. assert(i < pos.size());
  202. assert(seq_id >= 0);
  203. return seq[i].test(seq_id);
  204. }
  205. // note: call only if the cell is not empty and the seq_id is not in the cell
  206. void seq_add(uint32_t i, llama_seq_id seq_id) {
  207. assert(i < pos.size());
  208. assert(pos[i] != -1);
  209. assert(!seq[i].test(seq_id));
  210. seq[i].set(seq_id);
  211. seq_pos_inc(seq_id, pos[i]);
  212. }
  213. // return the sequence id of this cell
  214. // note: call only for cells with exactly one sequence
  215. llama_seq_id seq_get(uint32_t i) const {
  216. assert(seq[i].count() == 1);
  217. for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
  218. if (seq[i].test(s)) {
  219. return s;
  220. }
  221. }
  222. return -1;
  223. }
  224. // the minimum position of sequence seq_id currently present in any of the cells
  225. // return -1 if the sequence is not present
  226. llama_pos seq_pos_min(llama_seq_id seq_id) const {
  227. assert(seq_id >= 0);
  228. assert(seq_id < LLAMA_MAX_SEQ);
  229. if (seq_pos[seq_id].empty()) {
  230. return -1;
  231. }
  232. assert(seq_pos[seq_id].begin()->second > 0);
  233. return seq_pos[seq_id].begin()->first;
  234. }
  235. // the maximum position of sequence seq_id currently present in any of the cells
  236. // return -1 if the sequence is not present
  237. llama_pos seq_pos_max(llama_seq_id seq_id) const {
  238. assert(seq_id >= 0);
  239. assert(seq_id < LLAMA_MAX_SEQ);
  240. if (seq_pos[seq_id].empty()) {
  241. return -1;
  242. }
  243. assert(seq_pos[seq_id].rbegin()->second > 0);
  244. return seq_pos[seq_id].rbegin()->first;
  245. }
  246. // note: call only if the cell is not empty
  247. llama_pos pos_get(uint32_t i) const {
  248. assert(i < pos.size());
  249. assert(pos[i] != -1);
  250. return pos[i];
  251. }
  252. // note: call only if the cell is not empty
  253. llama_pos get_shift(uint32_t i) const {
  254. assert(i < pos.size());
  255. assert(pos[i] != -1);
  256. return shift[i];
  257. }
  258. // check if a cell is not empty and its position is within [p0, p1)
  259. bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
  260. assert(i < pos.size());
  261. return pos[i] >= p0 && pos[i] < p1;
  262. }
  263. // set the position of an empty cell
  264. // does not modify "has_shift"
  265. // note: call only if the cell is empty
  266. void pos_set(uint32_t i, llama_pos p) {
  267. assert(i < pos.size());
  268. assert(pos[i] == -1);
  269. assert(seq[i].none());
  270. pos[i] = p;
  271. used.insert(i);
  272. }
  273. // pos[i] = pos[i] + d
  274. // sets "has_shift" to true
  275. // note: call only if the cell is not empty
  276. bool pos_add(uint32_t i, llama_pos d) {
  277. assert(i < pos.size());
  278. assert(pos[i] != -1);
  279. seq_pos_rm(i);
  280. pos[i] += d;
  281. shift[i] += d;
  282. has_shift = true;
  283. if (pos[i] < 0) {
  284. seq[i].reset();
  285. pos[i] = -1;
  286. shift[i] = 0;
  287. used.erase(i);
  288. return true;
  289. }
  290. seq_pos_add(i);
  291. return false;
  292. }
  293. // pos[i] = pos[i] / d
  294. // sets "has_shift" to true
  295. // note: call only if the cell is not empty
  296. void pos_div(uint32_t i, int d) {
  297. assert(i < pos.size());
  298. assert(pos[i] != -1);
  299. const llama_pos p_old = pos[i];
  300. seq_pos_rm(i);
  301. pos[i] /= d;
  302. shift[i] += p_old - pos[i];
  303. seq_pos_add(i);
  304. has_shift = true;
  305. }
  306. private:
  307. bool has_shift = false;
  308. // set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
  309. std::set<uint32_t> used;
  310. std::vector<llama_pos> pos;
  311. // this array accumulates any applied shifts to the pos array since the last reset_shift() call
  312. // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
  313. //
  314. // cells.pos_add(x, shift_x);
  315. // cells.pos_div(y, shift_y);
  316. // ...
  317. //
  318. // if (cells.has_shift()) {
  319. // for (int i = 0; i < n; ++i) {
  320. // auto shift_i = cells.get_shift(i);
  321. // ...
  322. // }
  323. // cells.reset_shift();
  324. // }
  325. //
  326. std::vector<llama_pos> shift;
  327. using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
  328. // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
  329. std::vector<seq_set_t> seq;
  330. // the set seq_pos[s][p] tells us how many times the position p is currently present for sequence s
  331. // if the position p is not present, seq_pos[s][p] is not set
  332. // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
  333. //
  334. // note that we cannot a use an std::set because in some cases a position can occur more than once for the same seq:
  335. // - during performing a cache reuse via (rm + add)
  336. // - some vision models have input embeddings with repeating positions
  337. //
  338. std::map<llama_pos, int> seq_pos[LLAMA_MAX_SEQ];
  339. // helper functions for updating `seq_pos`, once cell at a time:
  340. void seq_pos_dec(llama_seq_id s, llama_pos p) {
  341. auto it = seq_pos[s].find(p);
  342. assert(it != seq_pos[s].end());
  343. if (--it->second == 0) {
  344. seq_pos[s].erase(it);
  345. }
  346. }
  347. void seq_pos_inc(llama_seq_id s, llama_pos p) {
  348. seq_pos[s][p]++;
  349. }
  350. // remove cell i
  351. void seq_pos_rm(uint32_t i) {
  352. for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
  353. if (seq[i].test(s)) {
  354. seq_pos_dec(s, pos[i]);
  355. }
  356. }
  357. }
  358. // add cell i
  359. void seq_pos_add(uint32_t i) {
  360. for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
  361. if (seq[i].test(s)) {
  362. seq_pos_inc(s, pos[i]);
  363. }
  364. }
  365. }
  366. };