@@ -534,8 +534,8 @@ public:
     server_queue queue_tasks;
     server_response queue_results;
 
-    common_chat_templates_ptr chat_templates;
-    oaicompat_parser_options oai_parser_opt;
+    // note: chat_params must not be refreshed upon existing sleeping state
+    server_chat_params chat_params;
 
     ~server_context_impl() {
         if (!sleeping) {
@@ -688,15 +688,6 @@ private:
             llama_init_dft->free_context();
         }
 
-        chat_templates = common_chat_templates_init(model, params_base.chat_template);
-        try {
-            common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
-        } catch (const std::exception & e) {
-            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
-            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates = common_chat_templates_init(model, "chatml");
-        }
-
         std::string & mmproj_path = params_base.mmproj.path;
         if (!mmproj_path.empty()) {
             if (!is_resume) {
@@ -845,30 +836,6 @@ private:
             model_name = model_path.filename().string();
         }
 
-        // thinking is enabled if:
-        // 1. It's not explicitly disabled (reasoning_budget == 0)
-        // 2. The chat template supports it
-        const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
-        SRV_INF("thinking = %d\n", enable_thinking);
-
-        oai_parser_opt = {
-            /* use_jinja */ params_base.use_jinja,
-            /* prefill_assistant */ params_base.prefill_assistant,
-            /* reasoning_format */ params_base.reasoning_format,
-            /* chat_template_kwargs */ params_base.default_template_kwargs,
-            /* common_chat_templates */ chat_templates.get(),
-            /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
-            /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking */ enable_thinking,
-            /* media_path */ params_base.media_path,
-        };
-
-        // print sample chat example to make it clear which template is used
-        // @ngxson modern templates are too long, spam the logs; printing the example is enough
-        LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
-            // common_chat_templates_source(chat_templates.get()),
-            common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
-
         if (!is_resume) {
             return init();
         }
@@ -907,6 +874,42 @@ private:
             }
         }
 
+        // populate chat template params
+        {
+            common_chat_templates_ptr chat_templates;
+
+            try {
+                chat_templates = common_chat_templates_init(model, params_base.chat_template);
+
+                LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
+                    common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
+
+            } catch (const std::exception & e) {
+                SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
+                SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
+                SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
+                return false;
+            }
+
+            // thinking is enabled if:
+            // 1. It's not explicitly disabled (reasoning_budget == 0)
+            // 2. The chat template supports it
+            const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+            SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
+
+            chat_params = {
+                /* use_jinja */ params_base.use_jinja,
+                /* prefill_assistant */ params_base.prefill_assistant,
+                /* reasoning_format */ params_base.reasoning_format,
+                /* chat_template_kwargs */ params_base.default_template_kwargs,
+                /* tmpls */ std::move(chat_templates),
+                /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
+                /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
+                /* enable_thinking */ enable_thinking,
+                /* media_path */ params_base.media_path,
+            };
+        }
+
         return true;
     }
 
@@ -1588,32 +1591,14 @@ private:
 
     // tokenize the input if it's set by CLI, return false on error
     bool tokenize_cli_input(server_task & task) {
-        GGML_ASSERT(task.cli_input != nullptr);
         try {
-            auto & opt = oai_parser_opt;
-            common_chat_templates_inputs inputs;
-            inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
-            inputs.tools = {}; // TODO
-            inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
-            inputs.json_schema = ""; // TODO
-            inputs.grammar = ""; // TODO
-            inputs.use_jinja = opt.use_jinja;
-            inputs.parallel_tool_calls = false;
-            inputs.add_generation_prompt = true;
-            inputs.reasoning_format = opt.reasoning_format;
-            inputs.enable_thinking = opt.enable_thinking;
-
-            // Apply chat template to the list of messages
-            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
-
-            // tokenize the resulting prompt
-            auto & prompt = chat_params.prompt;
+            auto & prompt = task.cli_prompt;
             if (mctx != nullptr) {
                 task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
             } else {
                 task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
             }
-            task.cli_input.clear();
+            task.cli_prompt.clear();
             task.cli_files.clear();
         } catch (const std::exception & e) {
             send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -1689,7 +1674,7 @@ private:
            {
                // special case: if input is provided via CLI, tokenize it first
                // otherwise, no need to tokenize as it's already done inside the HTTP thread
-               if (task.cli_input != nullptr) {
+               if (task.cli) {
                    if (!tokenize_cli_input(task)) {
                        break;
                    }
@@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
 }
 
 server_context_meta server_context::get_meta() const {
-    auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
-
     auto bos_id = llama_vocab_bos(impl->vocab);
     auto eos_id = llama_vocab_eos(impl->vocab);
     auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
@@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
        /* model_name */ impl->model_name,
        /* model_path */ impl->params_base.model.path,
        /* has_mtmd */ impl->mctx != nullptr,
-       /* has_inp_image */ impl->oai_parser_opt.allow_image,
-       /* has_inp_audio */ impl->oai_parser_opt.allow_audio,
+       /* has_inp_image */ impl->chat_params.allow_image,
+       /* has_inp_audio */ impl->chat_params.allow_audio,
        /* json_webui_settings */ impl->json_webui_settings,
        /* slot_n_ctx */ impl->get_slot_n_ctx(),
        /* pooling_type */ llama_pooling_type(impl->ctx),
 
-       /* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
-       /* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
+       /* chat_params */ impl->chat_params,
 
        /* bos_token_str */ bos_token_str,
        /* eos_token_str */ eos_token_str,
@@ -3202,8 +3184,8 @@ void server_routes::init_routes() {
 
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally use ctx_server
-       bool server_ctx; // do NOT delete this line
-       GGML_UNUSED(server_ctx);
+       bool ctx_server; // do NOT delete this line
+       GGML_UNUSED(ctx_server);
 
        res->ok({{"status", "ok"}});
        return res;
@@ -3393,8 +3375,8 @@ void server_routes::init_routes() {
 
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally use ctx_server
-       bool server_ctx; // do NOT delete this line
-       GGML_UNUSED(server_ctx);
+       bool ctx_server; // do NOT delete this line
+       GGML_UNUSED(ctx_server);
 
        task_params tparams;
        tparams.sampling = params.sampling;
@@ -3403,6 +3385,9 @@ void server_routes::init_routes() {
            { "n_ctx", meta->slot_n_ctx },
        };
 
+       std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
+       std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
+
        json props = {
            { "default_generation_settings", default_generation_settings_for_props },
            { "total_slots", params.n_parallel },
@@ -3417,15 +3402,15 @@ void server_routes::init_routes() {
            { "endpoint_metrics", params.endpoint_metrics },
            { "webui", params.webui },
            { "webui_settings", meta->json_webui_settings },
-           { "chat_template", meta->chat_template },
+           { "chat_template", tmpl_default },
            { "bos_token", meta->bos_token_str },
            { "eos_token", meta->eos_token_str },
            { "build_info", meta->build_info },
            { "is_sleeping", queue_tasks.is_sleeping() },
        };
        if (params.use_jinja) {
-           if (!meta->chat_template_tool_use.empty()) {
-               props["chat_template_tool_use"] = meta->chat_template_tool_use;
+           if (!tmpl_tools.empty()) {
+               props["chat_template_tool_use"] = tmpl_tools;
            }
        }
        res->ok(props);
@@ -3446,6 +3431,7 @@ void server_routes::init_routes() {
 
    this->get_api_show = [this](const server_http_req &) {
        auto res = create_response();
+       std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
        json data = {
            {
                "model_info", {
@@ -3454,7 +3440,7 @@ void server_routes::init_routes() {
            },
            {"modelfile", ""},
            {"parameters", ""},
-           {"template", meta->chat_template},
+           {"template", tmpl_default},
            {"details", {
                {"parent_model", ""},
                {"format", "gguf"},
@@ -3579,7 +3565,7 @@ void server_routes::init_routes() {
        json body = json::parse(req.body);
        json body_parsed = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
        return handle_completions_impl(
            req,
@@ -3595,7 +3581,7 @@ void server_routes::init_routes() {
        json body = convert_anthropic_to_oai(json::parse(req.body));
        json body_parsed = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
        return handle_completions_impl(
            req,
@@ -3611,7 +3597,7 @@ void server_routes::init_routes() {
        json body = convert_anthropic_to_oai(json::parse(req.body));
        json body_parsed = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
 
        json prompt = body_parsed.at("prompt");
@@ -3627,7 +3613,7 @@ void server_routes::init_routes() {
        json body = json::parse(req.body);
        json data = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
        res->ok({{ "prompt", std::move(data.at("prompt")) }});
        return res;
@@ -3638,8 +3624,8 @@ void server_routes::init_routes() {
 
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally use ctx_server
-       bool server_ctx; // do NOT delete this line
-       GGML_UNUSED(server_ctx);
+       bool ctx_server; // do NOT delete this line
+       GGML_UNUSED(ctx_server);
 
        json models = {
            {"models", {