@@ -534,8 +534,8 @@ public:
     server_queue queue_tasks;
     server_response queue_results;
 
-    common_chat_templates_ptr chat_templates;
-    oaicompat_parser_options oai_parser_opt;
+    // note: chat_params must not be refreshed upon existing sleeping state
+    server_chat_params chat_params;
 
     ~server_context_impl() {
         if (!sleeping) {
@@ -688,15 +688,6 @@ private:
             llama_init_dft->free_context();
         }
 
-        chat_templates = common_chat_templates_init(model, params_base.chat_template);
-        try {
-            common_chat_format_example(chat_templates.get(), params.use_jinja, params.default_template_kwargs);
-        } catch (const std::exception & e) {
-            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
-            SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
-            chat_templates = common_chat_templates_init(model, "chatml");
-        }
-
         std::string & mmproj_path = params_base.mmproj.path;
         if (!mmproj_path.empty()) {
             if (!is_resume) {
@@ -845,30 +836,6 @@ private:
             model_name = model_path.filename().string();
         }
 
-        // thinking is enabled if:
-        // 1. It's not explicitly disabled (reasoning_budget == 0)
-        // 2. The chat template supports it
-        const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
-        SRV_INF("thinking = %d\n", enable_thinking);
-
-        oai_parser_opt = {
-            /* use_jinja */ params_base.use_jinja,
-            /* prefill_assistant */ params_base.prefill_assistant,
-            /* reasoning_format */ params_base.reasoning_format,
-            /* chat_template_kwargs */ params_base.default_template_kwargs,
-            /* common_chat_templates */ chat_templates.get(),
-            /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
-            /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
-            /* enable_thinking */ enable_thinking,
-            /* media_path */ params_base.media_path,
-        };
-
-        // print sample chat example to make it clear which template is used
-        // @ngxson modern templates are too long, spam the logs; printing the example is enough
-        LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
-            // common_chat_templates_source(chat_templates.get()),
-            common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
-
         if (!is_resume) {
             return init();
         }
@@ -907,6 +874,42 @@ private:
             }
         }
 
+        // populate chat template params
+        {
+            common_chat_templates_ptr chat_templates;
+
+            try {
+                chat_templates = common_chat_templates_init(model, params_base.chat_template);
+
+                LOG_INF("%s: chat template, example_format: '%s'\n", __func__,
+                    common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
+
+            } catch (const std::exception & e) {
+                SRV_ERR("%s: chat template parsing error: %s\n", __func__, e.what());
+                SRV_ERR("%s: please consider disabling jinja via --no-jinja, or use a custom chat template via --chat-template\n", __func__);
+                SRV_ERR("%s: for example: --no-jinja --chat-template chatml\n", __func__);
+                return false;
+            }
+
+            // thinking is enabled if:
+            // 1. It's not explicitly disabled (reasoning_budget == 0)
+            // 2. The chat template supports it
+            const bool enable_thinking = params_base.use_jinja && params_base.reasoning_budget != 0 && common_chat_templates_support_enable_thinking(chat_templates.get());
+            SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking);
+
+            chat_params = {
+                /* use_jinja */ params_base.use_jinja,
+                /* prefill_assistant */ params_base.prefill_assistant,
+                /* reasoning_format */ params_base.reasoning_format,
+                /* chat_template_kwargs */ params_base.default_template_kwargs,
+                /* tmpls */ std::move(chat_templates),
+                /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
+                /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
+                /* enable_thinking */ enable_thinking,
+                /* media_path */ params_base.media_path,
+            };
+        }
+
         return true;
     }
 
@@ -1588,32 +1591,14 @@ private:
 
     // tokenize the input if it's set by CLI, return false on error
     bool tokenize_cli_input(server_task & task) {
-        GGML_ASSERT(task.cli_input != nullptr);
         try {
-            auto & opt = oai_parser_opt;
-            common_chat_templates_inputs inputs;
-            inputs.messages = common_chat_msgs_parse_oaicompat(task.cli_input);
-            inputs.tools = {}; // TODO
-            inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE;
-            inputs.json_schema = ""; // TODO
-            inputs.grammar = ""; // TODO
-            inputs.use_jinja = opt.use_jinja;
-            inputs.parallel_tool_calls = false;
-            inputs.add_generation_prompt = true;
-            inputs.reasoning_format = opt.reasoning_format;
-            inputs.enable_thinking = opt.enable_thinking;
-
-            // Apply chat template to the list of messages
-            auto chat_params = common_chat_templates_apply(opt.tmpls, inputs);
-
-            // tokenize the resulting prompt
-            auto & prompt = chat_params.prompt;
+            auto & prompt = task.cli_prompt;
             if (mctx != nullptr) {
                 task.tokens = process_mtmd_prompt(mctx, prompt, task.cli_files);
             } else {
                 task.tokens = std::move(tokenize_input_prompts(vocab, mctx, prompt, true, true)[0]);
             }
-            task.cli_input.clear();
+            task.cli_prompt.clear();
             task.cli_files.clear();
         } catch (const std::exception & e) {
             send_error(task, std::string("Failed to format input: ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
@@ -1689,7 +1674,7 @@ private:
            {
                // special case: if input is provided via CLI, tokenize it first
                // otherwise, no need to tokenize as it's already done inside the HTTP thread
-               if (task.cli_input != nullptr) {
+               if (task.cli) {
                    if (!tokenize_cli_input(task)) {
                        break;
                    }
@@ -2901,8 +2886,6 @@ server_response_reader server_context::get_response_reader() {
 }
 
 server_context_meta server_context::get_meta() const {
-    auto tool_use_src = common_chat_templates_source(impl->chat_templates.get(), "tool_use");
-
     auto bos_id = llama_vocab_bos(impl->vocab);
     auto eos_id = llama_vocab_eos(impl->vocab);
     auto bos_token_str = bos_id != LLAMA_TOKEN_NULL ? common_token_to_piece(impl->ctx, bos_id, true) : "";
@@ -2913,14 +2896,13 @@ server_context_meta server_context::get_meta() const {
        /* model_name */ impl->model_name,
        /* model_path */ impl->params_base.model.path,
        /* has_mtmd */ impl->mctx != nullptr,
-       /* has_inp_image */ impl->oai_parser_opt.allow_image,
-       /* has_inp_audio */ impl->oai_parser_opt.allow_audio,
+       /* has_inp_image */ impl->chat_params.allow_image,
+       /* has_inp_audio */ impl->chat_params.allow_audio,
        /* json_webui_settings */ impl->json_webui_settings,
        /* slot_n_ctx */ impl->get_slot_n_ctx(),
        /* pooling_type */ llama_pooling_type(impl->ctx),
 
-       /* chat_template */ common_chat_templates_source(impl->chat_templates.get()),
-       /* chat_template_tool_use */ tool_use_src ? tool_use_src : "",
+       /* chat_params */ impl->chat_params,
 
        /* bos_token_str */ bos_token_str,
        /* eos_token_str */ eos_token_str,
@@ -3202,8 +3184,8 @@ void server_routes::init_routes() {
 
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally use ctx_server
-       bool server_ctx; // do NOT delete this line
-       GGML_UNUSED(server_ctx);
+       bool ctx_server; // do NOT delete this line
+       GGML_UNUSED(ctx_server);
 
        res->ok({{"status", "ok"}});
        return res;
@@ -3393,8 +3375,8 @@ void server_routes::init_routes() {
 
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally use ctx_server
-       bool server_ctx; // do NOT delete this line
-       GGML_UNUSED(server_ctx);
+       bool ctx_server; // do NOT delete this line
+       GGML_UNUSED(ctx_server);
 
        task_params tparams;
        tparams.sampling = params.sampling;
@@ -3403,6 +3385,9 @@ void server_routes::init_routes() {
            { "n_ctx", meta->slot_n_ctx },
        };
 
+       std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
+       std::string tmpl_tools = common_chat_templates_source(meta->chat_params.tmpls.get(), "tool_use");
+
        json props = {
            { "default_generation_settings", default_generation_settings_for_props },
            { "total_slots", params.n_parallel },
@@ -3417,15 +3402,15 @@ void server_routes::init_routes() {
            { "endpoint_metrics", params.endpoint_metrics },
            { "webui", params.webui },
            { "webui_settings", meta->json_webui_settings },
-           { "chat_template", meta->chat_template },
+           { "chat_template", tmpl_default },
            { "bos_token", meta->bos_token_str },
            { "eos_token", meta->eos_token_str },
            { "build_info", meta->build_info },
            { "is_sleeping", queue_tasks.is_sleeping() },
        };
        if (params.use_jinja) {
-           if (!meta->chat_template_tool_use.empty()) {
-               props["chat_template_tool_use"] = meta->chat_template_tool_use;
+           if (!tmpl_tools.empty()) {
+               props["chat_template_tool_use"] = tmpl_tools;
            }
        }
        res->ok(props);
@@ -3446,6 +3431,7 @@ void server_routes::init_routes() {
 
    this->get_api_show = [this](const server_http_req &) {
        auto res = create_response();
+       std::string tmpl_default = common_chat_templates_source(meta->chat_params.tmpls.get(), "");
        json data = {
            {
                "model_info", {
@@ -3454,7 +3440,7 @@ void server_routes::init_routes() {
            },
            {"modelfile", ""},
            {"parameters", ""},
-           {"template", meta->chat_template},
+           {"template", tmpl_default},
            {"details", {
                {"parent_model", ""},
                {"format", "gguf"},
@@ -3579,7 +3565,7 @@ void server_routes::init_routes() {
        json body = json::parse(req.body);
        json body_parsed = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
        return handle_completions_impl(
            req,
@@ -3595,7 +3581,7 @@ void server_routes::init_routes() {
        json body = convert_anthropic_to_oai(json::parse(req.body));
        json body_parsed = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
        return handle_completions_impl(
            req,
@@ -3611,7 +3597,7 @@ void server_routes::init_routes() {
        json body = convert_anthropic_to_oai(json::parse(req.body));
        json body_parsed = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
 
        json prompt = body_parsed.at("prompt");
@@ -3627,7 +3613,7 @@ void server_routes::init_routes() {
        json body = json::parse(req.body);
        json data = oaicompat_chat_params_parse(
            body,
-           ctx_server.oai_parser_opt,
+           meta->chat_params,
            files);
        res->ok({{ "prompt", std::move(data.at("prompt")) }});
        return res;
@@ -3638,8 +3624,8 @@ void server_routes::init_routes() {
 
        // this endpoint can be accessed during sleeping
        // the next LOC is to avoid someone accidentally use ctx_server
-       bool server_ctx; // do NOT delete this line
-       GGML_UNUSED(server_ctx);
+       bool ctx_server; // do NOT delete this line
+       GGML_UNUSED(ctx_server);
 
        json models = {
            {"models", {