@@ -525,39 +525,11 @@ llama_memory_context_ptr llama_kv_cache::init_full() {
 }
 
 llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) {
-    bool do_shift = get_has_shift();
-
-    defrag_info dinfo;
-
-    // see if we need to defrag
-    if (n_stream == 1) {
-        // note : for now do not consider defrag for n_stream > 1
-        const auto & cells = v_cells[seq_to_stream[0]];
-
-        bool do_defrag = optimize;
-
-        const auto thold = lctx->get_cparams().defrag_thold;
-
-        if (!do_defrag && thold > 0.0f) {
-            const auto n_kv = cells.used_max_p1();
-
-            // - do not defrag small contexts (i.e. < 2048 tokens)
-            // - count the padding towards the number of used tokens
-            const float fragmentation = n_kv >= 2048 ? std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f;
-
-            if (fragmentation > thold) {
-                LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
-
-                do_defrag = true;
-            }
-        }
+    GGML_UNUSED(optimize);
 
-        if (do_defrag) {
-            dinfo = defrag_prepare(lctx->graph_max_nodes());
-        }
-    }
+    bool do_shift = get_has_shift();
 
-    return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(dinfo), std::move(sc_info));
+    return std::make_unique<llama_kv_cache_context>(this, lctx, do_shift, std::move(sc_info));
 }
 
 llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
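The branch removed above computed a simple fragmentation estimate before requesting a defrag; with it gone, `optimize` is accepted but unused, hence the added `GGML_UNUSED(optimize)`. For reference, a minimal standalone restatement of that heuristic is sketched below. `should_defrag` and its parameters are illustrative names that mirror the values the deleted code read from the cache and context params (`cells.get_used()`, `cells.used_max_p1()`, `n_pad`, `defrag_thold`); they are not an actual llama.cpp API.

```cpp
#include <algorithm>
#include <cstdint>

// Fragmentation heuristic as implemented by the removed branch in init_update():
// consider only windows of at least 2048 cells, count padding as used, and
// request a defrag when the unused fraction exceeds the configured threshold.
static bool should_defrag(uint32_t n_used, uint32_t n_kv, uint32_t n_pad, float defrag_thold) {
    if (defrag_thold <= 0.0f || n_kv < 2048) {
        return false;
    }

    const float fragmentation = std::max(0.0f, 1.0f - float(n_used + n_pad)/float(n_kv));

    return fragmentation > defrag_thold;
}
```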
@@ -629,7 +601,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
     return res;
 }
 
-bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info) {
+bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info) {
     bool updated = false;
 
     auto * sched = lctx->get_sched();
@@ -699,53 +671,6 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const defrag_in
         }
     }
 
-    if (!dinfo.empty()) {
-        LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);
-
-        // note: for now do not consider defrag for n_stream > 1
-        auto & cells = v_cells[seq_to_stream[0]];
-        auto & head = v_heads[seq_to_stream[0]];
-
-        // apply moves:
-        {
-            const auto n_kv = dinfo.ids.size();
-
-            for (uint32_t i = 0; i < n_kv; ++i) {
-                assert(dinfo.ids[i] <= n_kv);
-
-                if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) {
-                    continue;
-                }
-
-                cells.mv(i, dinfo.ids[i]);
-            }
-
-            // reset the head so we can find the first free slot during the next ubatch
-            head = 0;
-        }
-
-        ggml_backend_sched_reset(sched);
-
-        auto * res = lctx->get_gf_res_reserve();
-
-        res->reset();
-
-        auto * gf = build_graph_defrag(res, lctx, dinfo);
-        if (!ggml_backend_sched_alloc_graph(sched, gf)) {
-            LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__);
-            return updated;
-        }
-
-        res->set_inputs(nullptr);
-
-        if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) {
-            LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__);
-            return updated;
-        }
-
-        updated = true;
-    }
-
     return updated;
 }
 
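The block deleted above consumed the move map produced by `defrag_prepare`: `dinfo.ids[i] == ids.size()` marked cell `i` as untouched, `ids[i] == i` as already in place, and any other value as "move cell `i` to slot `ids[i]`", with the actual tensor data shuffled by the `build_graph_defrag` graph removed further below. A minimal sketch of applying such a map to a plain vector is shown here; `apply_moves` is a hypothetical helper standing in for `cells.mv()` on the cache's cell metadata, not llama.cpp code.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Apply a defrag-style move map to per-cell payloads. Convention from the
// removed code: ids[i] == ids.size() -> cell untouched, ids[i] == i -> already
// in place, otherwise cell i is moved into slot ids[i] (a hole by construction
// of the plan, so overwriting it is safe).
template <typename T>
void apply_moves(std::vector<T> & cells, const std::vector<uint32_t> & ids) {
    const uint32_t n = (uint32_t) ids.size();

    for (uint32_t i = 0; i < n; ++i) {
        if (ids[i] == n || ids[i] == i) {
            continue;
        }

        cells[ids[i]] = std::move(cells[i]);
    }
}
```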
@@ -1525,283 +1450,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
     return gf;
 }
 
-ggml_cgraph * llama_kv_cache::build_graph_defrag(
-        llm_graph_result * res,
-        llama_context * lctx,
-        const defrag_info & dinfo) const {
-    auto * ctx = res->get_ctx();
-    auto * gf = res->get_gf();
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
-    const auto & cells = v_cells[0];
-
-    const auto & ids = dinfo.ids;
-
-    const auto & cparams = lctx->get_cparams();
-
-#if 0
-    // CPU defrag
-    //
-    // TODO: optimizations are possible:
-    //   - multiple threads
-    //   - avoid copying to the host memory when already there
-    //
-    // likely not worth the effort, as we have ggml_graph based defrag
-    //
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    const uint32_t kv_size = size;
-
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
-        const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
-
-        const size_t v_size_el = ggml_type_size(v_l[il]->type);
-        const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
-
-        buf_k.resize(k_size);
-        buf_v.resize(v_size);
-
-        ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
-
-        // batch move [i, i+nm) to [id, id+nm)
-        // note: cells can move only to a lower index
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == n_kv) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < n_kv && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            // move keys
-            {
-                const int64_t os = i*k_size_row;
-                const int64_t od = id*k_size_row;
-
-                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
-            }
-
-            // move values (note: they are transposed)
-            {
-                const int64_t os = i;
-                const int64_t od = id;
-
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
-                }
-            }
-
-            i += nm - 1;
-        }
-
-        ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
-    }
-#else
-    for (uint32_t i = 0; i < ids.size(); ++i) {
-        const uint32_t id = ids[i];
-
-        if (i == id || id == ids.size()) {
-            continue;
-        }
-
-        uint32_t nm = 1;
-
-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-            nm++;
-        }
-
-        for (const auto & layer : layers) {
-            const uint32_t il = layer.il;
-
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-            ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k,
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(layer.k->type, n_embd_k_gqa),
-                    ggml_row_size(layer.k->type, n_embd_k_gqa*i));
-
-            ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k,
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(layer.k->type, n_embd_k_gqa),
-                    ggml_row_size(layer.k->type, n_embd_k_gqa*id));
-
-            ggml_tensor * view_v_src;
-            ggml_tensor * view_v_dst;
-
-            if (cparams.flash_attn) {
-                // NOTE: the V cache is not transposed when using flash attention
-                view_v_src = ggml_view_2d(ctx, layer.v,
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(layer.v->type, n_embd_v_gqa),
-                        ggml_row_size(layer.v->type, n_embd_v_gqa*i));
-
-                view_v_dst = ggml_view_2d(ctx, layer.v,
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(layer.v->type, n_embd_v_gqa),
-                        ggml_row_size(layer.v->type, n_embd_v_gqa*id));
-            } else {
-                view_v_src = ggml_view_2d(ctx, layer.v,
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(layer.v->type, cells.size()),
-                        ggml_row_size(layer.v->type, i));
-
-                view_v_dst = ggml_view_2d(ctx, layer.v,
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(layer.v->type, cells.size()),
-                        ggml_row_size(layer.v->type, id));
-            }
-
-            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst));
-            ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst));
-        }
-
-        i += nm - 1;
-    }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-#endif
-
-    return gf;
-}
-
-llama_kv_cache::defrag_info llama_kv_cache::defrag_prepare(int32_t n_max_nodes) const {
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag");
-
-    const auto & cells = v_cells[0];
-
-    const uint32_t n_layer = layers.size();
-
-    const uint32_t n_kv = cells.used_max_p1();
-    const uint32_t n_used = cells.get_used();
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag)
-    //   - source view, destination view, copy operation
-    //   - x2 for keys and values
-    //const uint32_t max_moves = max_nodes()/(6*n_layer);
-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer);
-
-    // determine which KV cells to move where
-    defrag_info res;
-    auto & ids = res.ids;
-
-    ids.resize(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        if (!cells.is_empty(i0)) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && cells.is_empty(i0 + nh)) {
-            nh++;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            if (cells.is_empty(is) || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // should we stop searching for the next move?
-        bool stop = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            if (cells.is_empty(i1) || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return {};
-    }
-
-    LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves);
-
-    LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer);
-
-    return res;
-}
-
 bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const {
     assert(p0 >= 0 && p1 >= 0);
 
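For readers of this hunk, the deleted `defrag_prepare` planned the moves by walking the used region front to back and filling each hole with still-unmoved occupied cells taken from the back, capped by `max_moves` so the resulting defrag graph stayed under the node limit. The sketch below is a simplified restatement over a plain occupancy vector that ignores the `max_moves` cap and the grouping of adjacent moves into blocks; `plan_moves` is a toy function, not llama.cpp code.

```cpp
#include <cstdint>
#include <vector>

// Toy version of the planning loop removed in defrag_prepare(): ids[i] gives
// the destination of cell i; the sentinel n_kv (== ids.size()) marks cells
// that do not move. Holes in [0, n_used) are filled with the last occupied,
// not-yet-moved cells, so data only ever moves to a lower index.
static std::vector<uint32_t> plan_moves(const std::vector<bool> & occupied, uint32_t n_used) {
    const uint32_t n_kv = (uint32_t) occupied.size();

    std::vector<uint32_t> ids(n_kv, n_kv);

    uint32_t is = n_kv;  // candidate source, scanned from the back

    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
        if (occupied[i0]) {
            ids[i0] = i0;  // already in place
            continue;
        }

        // found a hole - pull the last occupied, not-yet-moved cell into it
        while (is > i0 + 1) {
            --is;
            if (occupied[is] && ids[is] == n_kv) {
                ids[is] = i0;
                break;
            }
        }
    }

    return ids;
}
```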
@@ -2300,9 +1948,8 @@ llama_kv_cache_context::llama_kv_cache_context(
         llama_kv_cache * kv,
         llama_context * lctx,
         bool do_shift,
-        defrag_info dinfo,
-        stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) {
-    if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) {
+        stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) {
+    if (!do_shift && this->sc_info.empty()) {
         status = LLAMA_MEMORY_STATUS_NO_UPDATE;
     }
 }
@@ -2330,7 +1977,7 @@ bool llama_kv_cache_context::apply() {
 
     // no ubatches -> this is a KV cache update
    if (ubatches.empty()) {
-        kv->update(lctx, do_shift, dinfo, sc_info);
+        kv->update(lctx, do_shift, sc_info);
 
         return true;
     }