2 года назад · 57dd55e2c7
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -405,6 +405,7 @@ struct llama_server_context
 
				         // compare the evaluated prompt with the new prompt
			
 
				         n_past = common_part(embd, prompt_tokens);
			
 
				         embd = prompt_tokens;
			
 
				+
			
 
				         if (n_past == num_prompt_tokens)
			
 
				         {
			
 
				             // we have to evaluate at least 1 token to generate logits.
			
@@ -412,6 +413,9 @@ struct llama_server_context
 
				             n_past--;
			
 
				         }
			
 
				 
			
 
				+        // since #3228 we now have to manually manage the KV cache
			
 
				+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
			
 
				+
			
 
				         LOG_VERBOSE("prompt ingested", {
			
 
				                                            {"n_past", n_past},
			
 
				                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
			
@@ -461,9 +465,6 @@ struct llama_server_context
 
				         // compare the evaluated prompt with the new prompt
			
 
				         n_past = common_part(embd, prompt_tokens);
			
 
				 
			
 
				-        // since #3228 we now have to manually manage the KV cache
			
 
				-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
			
 
				-
			
 
				         embd = prompt_tokens;
			
 
				         if (n_past == num_prompt_tokens)
			
 
				         {
			
@@ -471,6 +472,9 @@ struct llama_server_context
 
				             n_past--;
			
 
				         }
			
 
				 
			
 
				+        // since #3228 we now have to manually manage the KV cache
			
 
				+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
			
 
				+
			
 
				         LOG_VERBOSE("prompt ingested", {
			
 
				                                            {"n_past", n_past},
			
 
				                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},