3 месяцев назад · 554fd578a5
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -3812,7 +3812,7 @@ struct server_context {
 
				                             if (slot.n_past > 0 && slot.n_past < (int) slot.prompt.tokens.size()) {
			
 
				                                 const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
			
 
				                                 if (pos_min == -1) {
			
 
				-                                    SLT_ERR(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
			
 
				+                                    SLT_ERR(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min);
			
 
				                                     GGML_ABORT("pos_min == -1, but n_past > 0 - should not happen: https://github.com/ggml-org/llama.cpp/pull/13833#discussion_r2116181237");
			
 
				                                 }
			
 
				 
			
@@ -3860,7 +3860,7 @@ struct server_context {
 
				                                 }
			
 
				 
			
 
				                                 if (pos_min > pos_min_thold) {
			
 
				-                                    SLT_WRN(slot, "n_past = %d, cache_tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
			
 
				+                                    SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", slot.n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
			
 
				 
			
 
				                                     // search for a context checkpoint
			
 
				                                     const auto it = std::find_if(
			
@@ -4028,7 +4028,7 @@ struct server_context {
 
				                         }
			
 
				                     }
			
 
				 
			
 
				-                    // SLT_INF(slot, "new cache_tokens: %s\n", slot.cache_tokens.str().c_str());
			
 
				+                    // SLT_INF(slot, "new slot.prompt.tokens: %s\n", slot.prompt.tokens.str().c_str());
			
 
				 
			
 
				                     SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_past / slot.n_prompt_tokens());
			
 
				 
			
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -1237,9 +1237,10 @@ public:
 
				             // allowed to resize      ^                    ^
			
 
				             // disallowed to resize          ^      ^             ^
			
 
				             if (n > 0) {
			
 
				-                llama_token last_token = tokens[n - 1];
			
 
				                 // make sure we never remove tokens in the middle of an image
			
 
				-                if (last_token == LLAMA_TOKEN_NULL) {
			
 
				+                // note that the case where we keep a full image at the end is allowed:
			
 
				+                //   tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] != LLAMA_TOKEN_NULL
			
 
				+                if (tokens[n - 1] == LLAMA_TOKEN_NULL && tokens[n] == LLAMA_TOKEN_NULL) {
			
 
				                     find_chunk(n - 1); // will throw an error if the token is not begin-of-chunk
			
 
				                 }
			
 
				             }