@@ -93,7 +93,7 @@ llama_context::llama_context(
     // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
     // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
     // ref: https://github.com/ggerganov/llama.cpp/pull/5021
-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_context_kv_self
+    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
     if (cparams.n_batch < GGML_KQ_MASK_PAD) {
         LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
         cparams.n_batch = GGML_KQ_MASK_PAD;
@@ -439,26 +439,12 @@ llama_memory_t llama_context::get_memory() const {
     return memory.get();
 }

-// deprecated
-void llama_context::kv_self_defrag_sched() {
-    if (!memory) {
-        return;
-    }
-
-    memory_force_optimize = true;
-}
-
-// deprecated
-bool llama_context::kv_self_update(bool optimize) {
+bool llama_context::memory_update(bool optimize) {
     if (!memory) {
         return false;
     }

     {
-        // TODO: remove in the future
-        optimize |= memory_force_optimize;
-        memory_force_optimize = false;
-
         const auto mctx = memory->init_update(this, optimize);
         switch (mctx->get_status()) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
@@ -993,7 +979,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
     bool did_optimize = false;

     // handle any pending defrags/shifts
-    kv_self_update(false);
+    memory_update(false);

     llama_memory_context_ptr mctx;

@@ -1018,7 +1004,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
             if (!did_optimize) {
                 did_optimize = true;

-                if (kv_self_update(true)) {
+                if (memory_update(true)) {
                     LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, balloc->get_n_tokens());

                     continue;
@@ -2338,11 +2324,6 @@ const llama_model * llama_get_model(const llama_context * ctx) {
     return &ctx->get_model();
 }

-// deprecated
-void llama_kv_self_update(llama_context * ctx) {
-    ctx->kv_self_update(false);
-}
-
 enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
     return ctx->pooling_type();
 }
@@ -2560,168 +2541,6 @@ bool llama_memory_can_shift(llama_memory_t mem) {
     return mem->get_can_shift();
 }

-//
-// kv cache
-//
-
-// deprecated
-int32_t llama_kv_self_n_tokens(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-// note: this is the same as above - will be removed anyway, so it's ok
-int32_t llama_kv_self_used_cells(const llama_context * ctx) {
-    const auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return 0;
-    }
-
-    int32_t res = 0;
-
-    for (uint32_t s = 0; s < ctx->get_cparams().n_seq_max; s++) {
-        const llama_pos p0 = kv->seq_pos_min(s);
-        const llama_pos p1 = kv->seq_pos_max(s);
-
-        if (p0 >= 0) {
-            res += (p1 - p0) + 1;
-        }
-    }
-
-    return res;
-}
-
-// deprecated
-void llama_kv_self_clear(llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_clear(kv, true);
-}
-
-// deprecated
-bool llama_kv_self_seq_rm(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return true;
-    }
-
-    return llama_memory_seq_rm(kv, seq_id, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_cp(
-        llama_context * ctx,
-         llama_seq_id   seq_id_src,
-         llama_seq_id   seq_id_dst,
-            llama_pos   p0,
-            llama_pos   p1) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_cp(kv, seq_id_src, seq_id_dst, p0, p1);
-}
-
-// deprecated
-void llama_kv_self_seq_keep(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_keep(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_seq_add(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-            llama_pos   delta) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_add(kv, seq_id, p0, p1, delta);
-}
-
-// deprecated
-void llama_kv_self_seq_div(
-        llama_context * ctx,
-         llama_seq_id   seq_id,
-            llama_pos   p0,
-            llama_pos   p1,
-                  int   d) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return;
-    }
-
-    llama_memory_seq_div(kv, seq_id, p0, p1, d);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_min(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_min(kv, seq_id);
-}
-
-// deprecated
-llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return -1;
-    }
-
-    return llama_memory_seq_pos_max(kv, seq_id);
-}
-
-// deprecated
-void llama_kv_self_defrag(llama_context * ctx) {
-    // force defrag
-    ctx->kv_self_defrag_sched();
-}
-
-// deprecated
-bool llama_kv_self_can_shift(const llama_context * ctx) {
-    auto * kv = llama_get_memory(ctx);
-    if (!kv) {
-        return false;
-    }
-
-    return llama_memory_can_shift(kv);
-}
-
 // llama state API

 // deprecated
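
Migration note: the removed llama_kv_self_* wrappers were thin forwarders over
llama_get_memory(), so downstream callers can port mechanically by fetching the
memory handle once and calling the llama_memory_* API directly. A minimal
sketch, not part of this patch; the helper name clear_and_trim is hypothetical
and the ctx is assumed to come from the usual llama_init_from_model() setup:

    #include "llama.h"

    // replicates what the deleted wrappers did internally: resolve the
    // memory module from the context, bail out for cache-less contexts,
    // then operate on it through the llama_memory_* API
    static void clear_and_trim(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
        llama_memory_t mem = llama_get_memory(ctx);
        if (!mem) {
            return; // cache-less context: no memory module to operate on
        }

        llama_memory_seq_rm(mem, seq_id, p0, p1); // was llama_kv_self_seq_rm(ctx, ...)
        llama_memory_clear(mem, true);            // was llama_kv_self_clear(ctx)
    }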