@@ -147,6 +147,7 @@ struct templates_params {
     json extra_context;
     bool add_bos;
     bool add_eos;
+    bool is_inference = true;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -1336,6 +1337,17 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
     common_chat_params data;
     auto prompt = apply(tmpl, inputs);
 
+    // Check whether the return token needs to be replaced with the end token
+    // during inference without a generation prompt. For more details see:
+    // https://github.com/ggml-org/llama.cpp/issues/15417
+    if (inputs.is_inference && !inputs.add_generation_prompt) {
+        static constexpr std::string_view return_token = "<|return|>";
+        static constexpr std::string_view end_token = "<|end|>";
+        if (size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
+            prompt.replace(pos, return_token.length(), end_token);
+        }
+    }
+
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_GPT_OSS;
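For reference, a minimal standalone sketch of the same token swap outside the diff context. The helper name replace_trailing_return_token, the main() driver, and the sample prompt string are illustrative additions, not part of the patch, which inlines this logic in common_chat_params_init_gpt_oss().

// Minimal sketch (not part of the patch): the same rfind/replace logic in isolation.
#include <cstddef>
#include <iostream>
#include <string>
#include <string_view>

// Hypothetical helper for illustration only.
static void replace_trailing_return_token(std::string & prompt) {
    static constexpr std::string_view return_token = "<|return|>";
    static constexpr std::string_view end_token = "<|end|>";
    // Only the last occurrence is rewritten, matching the rfind-based patch logic.
    if (std::size_t pos = prompt.rfind(return_token); pos != std::string::npos) {
        prompt.replace(pos, return_token.length(), end_token);
    }
}

int main() {
    // Illustrative prompt ending: a rendered assistant turn terminated by <|return|>.
    std::string prompt = "<|start|>assistant<|channel|>final<|message|>Hello!<|return|>";
    replace_trailing_return_token(prompt);
    std::cout << prompt << '\n'; // prints ...<|message|>Hello!<|end|>
}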