|
@@ -405,6 +405,7 @@ struct llama_server_context
|
|
|
// compare the evaluated prompt with the new prompt
|
|
// compare the evaluated prompt with the new prompt
|
|
|
n_past = common_part(embd, prompt_tokens);
|
|
n_past = common_part(embd, prompt_tokens);
|
|
|
embd = prompt_tokens;
|
|
embd = prompt_tokens;
|
|
|
|
|
+
|
|
|
if (n_past == num_prompt_tokens)
|
|
if (n_past == num_prompt_tokens)
|
|
|
{
|
|
{
|
|
|
// we have to evaluate at least 1 token to generate logits.
|
|
// we have to evaluate at least 1 token to generate logits.
|
|
@@ -412,6 +413,9 @@ struct llama_server_context
|
|
|
n_past--;
|
|
n_past--;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ // since #3228 we now have to manually manage the KV cache
|
|
|
|
|
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
|
|
|
+
|
|
|
LOG_VERBOSE("prompt ingested", {
|
|
LOG_VERBOSE("prompt ingested", {
|
|
|
{"n_past", n_past},
|
|
{"n_past", n_past},
|
|
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
|
@@ -461,9 +465,6 @@ struct llama_server_context
|
|
|
// compare the evaluated prompt with the new prompt
|
|
// compare the evaluated prompt with the new prompt
|
|
|
n_past = common_part(embd, prompt_tokens);
|
|
n_past = common_part(embd, prompt_tokens);
|
|
|
|
|
|
|
|
- // since #3228 we now have to manually manage the KV cache
|
|
|
|
|
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
|
|
|
-
|
|
|
|
|
embd = prompt_tokens;
|
|
embd = prompt_tokens;
|
|
|
if (n_past == num_prompt_tokens)
|
|
if (n_past == num_prompt_tokens)
|
|
|
{
|
|
{
|
|
@@ -471,6 +472,9 @@ struct llama_server_context
|
|
|
n_past--;
|
|
n_past--;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+ // since #3228 we now have to manually manage the KV cache
|
|
|
|
|
+ llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
|
|
|
|
+
|
|
|
LOG_VERBOSE("prompt ingested", {
|
|
LOG_VERBOSE("prompt ingested", {
|
|
|
{"n_past", n_past},
|
|
{"n_past", n_past},
|
|
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
|
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|