@@ -93,7 +93,7 @@ llama_context::llama_context(
     // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
+    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
     if (cparams.n_batch < GGML_KQ_MASK_PAD) {
         LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
         cparams.n_batch = GGML_KQ_MASK_PAD;
@@ -439,26 +439,12 @@ llama_memory_t llama_context::get_memory() const {
     return memory.get();
 }

-// deprecated
-void llama_context::kv_self_defrag_sched() {
-    if (!memory) {
-        return;
-    }
-
-    memory_force_optimize = true;
-}
-
-// deprecated
-bool llama_context::kv_self_update(bool optimize) {
+bool llama_context::memory_update(bool optimize) {
     if (!memory) {
         return false;
     }

     {
-        // TODO: remove in the future
-        optimize |= memory_force_optimize;
-        memory_force_optimize = false;
-
         const auto mctx = memory->init_update(this, optimize);
         switch (mctx->get_status()) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
@@ -993,7 +979,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     bool did_optimize = false;

     // handle any pending defrags/shifts
-    kv_self_update(false);
+    memory_update(false);

     llama_memory_context_ptr mctx;

@@ -1018,7 +1004,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
             if (!did_optimize) {
                 did_optimize = true;

-                if (kv_self_update(true)) {
+                if (memory_update(true)) {
                     LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());

                     continue;
@@ -2338,11 +2324,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->get_model();
 }

-// deprecated
-void llama_kv_self_update(llama_context * ctx) {
-    ctx->kv_self_update(false);
-}
-
 enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->pooling_type();
 }
@@ -2560,168 +2541,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
     return mem->get_can_shift();
 }

-//
-// kv cache
-//
-
-// deprecated
-int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-// note: this is the same as above - will be removed anyway, so it's ok
-int32_t llama_kv_self_used_cells(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-void llama_kv_self_clear(llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_clear(kv, true);
-}
-
-// deprecated
-bool llama_kv_self_seq_rm(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return true;
-    }
-
-    return llama_memory_seq_rm(kv, seq_id, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_cp(
-        llama_context * ctx,
-         llama_seq_id   seq_id_src,
-         llama_seq_id   seq_id_dst,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_keep(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_seq_add(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-            llama_pos   delta) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_add(kv, seq_id, p0, p1, delta);
-}
-
-// deprecated
-void llama_kv_self_seq_div(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-                  int   d) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_div(kv, seq_id, p0, p1, d);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_min(kv, seq_id);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_max(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_defrag(llama_context * ctx) {
-    // force defrag
-    ctx->kv_self_defrag_sched();
-}
-
-// deprecated
-bool llama_kv_self_can_shift(const llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return false;
-    }
-
-    return llama_memory_can_shift(kv);
-}
-
 // llama state API

 // deprecated
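
Migration note: the removed llama_kv_self_* wrappers were thin forwarders over
llama_get_memory(), so downstream callers can port mechanically by fetching the
memory handle once and calling the llama_memory_* API directly. A minimal
sketch, not part of this patch; the helper name clear_and_trim is hypothetical
and the ctx is assumed to come from the usual llama_init_from_model() setup:

    #include "llama.h"

    // replicates what the deleted wrappers did internally: resolve the
    // memory module from the context, bail out for cache-less contexts,
    // then operate on it through the llama_memory_* API
    static void clear_and_trim(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
        llama_memory_t mem = llama_get_memory(ctx);
        if (!mem) {
            return; // cache-less context: no memory module to operate on
        }

        llama_memory_seq_rm(mem, seq_id, p0, p1); // was llama_kv_self_seq_rm(ctx, ...)
        llama_memory_clear(mem, true);            // was llama_kv_self_clear(ctx)
    }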