@@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) {
 
 bool common_arg::get_value_from_env(std::string & output) const {
     if (env == nullptr) return false;
+    if (!args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        char * neg_value = std::getenv(neg_env.c_str());
+        if (neg_value) {
+            output = "0"; // falsey
+            return true;
+        }
+    }
     char * value = std::getenv(env);
     if (value) {
         output = value;
@@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const {
 }
 
 bool common_arg::has_value_from_env() const {
+    if (env != nullptr && !args_neg.empty()) {
+        // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        if (std::getenv(neg_env.c_str())) {
+            return true;
+        }
+    }
     return env != nullptr && std::getenv(env);
 }
 
@@ -151,9 +169,10 @@ std::string common_arg::to_string() const {
     std::string leading_spaces(n_leading_spaces, ' ');
 
     std::ostringstream ss;
-    for (const auto arg : args) {
-        if (arg == args.front()) {
-            if (args.size() == 1) {
+    auto all_args = get_args(); // also contains args_neg
+    for (const auto & arg : all_args) {
+        if (arg == all_args.front()) {
+            if (all_args.size() == 1) {
                 ss << arg;
             } else {
                 // first arg is usually abbreviation, we need padding to make it more beautiful
@@ -162,7 +181,7 @@ std::string common_arg::to_string() const {
                 ss << tmp << spaces;
             }
         } else {
-            ss << arg << (arg != args.back() ? ", " : "");
+            ss << arg << (arg != all_args.back() ? ", " : "");
         }
     }
     if (value_hint) ss << " " << value_hint;
@@ -181,6 +200,31 @@ std::string common_arg::to_string() const {
     return ss.str();
 }
 
+std::vector<std::string> common_arg::get_args() const {
+    std::vector<std::string> result;
+    for (const auto & arg : args) {
+        result.push_back(std::string(arg));
+    }
+    for (const auto & arg : args_neg) {
+        result.push_back(std::string(arg));
+    }
+    return result;
+}
+
+std::vector<std::string> common_arg::get_env() const {
+    std::vector<std::string> result;
+    if (env) {
+        result.push_back(std::string(env));
+    }
+    if (!args_neg.empty() && env) {
+        // for compatibility, we need to add LLAMA_ARG_NO_ variant
+        std::string neg_env = env;
+        string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+        result.push_back(neg_env);
+    }
+    return result;
+}
+
 //
 // utils
 //
@@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() {
     return msg.str();
 }
 
+static bool parse_bool_value(const std::string & value) {
+    if (is_truthy(value)) {
+        return true;
+    } else if (is_falsey(value)) {
+        return false;
+    } else {
+        throw std::invalid_argument("invalid boolean value");
+    }
+}
+
 //
 // CLI argument parsing functions
 //
@@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() {
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
     common_params & params = ctx_arg.params;
 
-    std::unordered_map<std::string, common_arg *> arg_to_options;
+    std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
     for (auto & opt : ctx_arg.options) {
         for (const auto & arg : opt.args) {
-            arg_to_options[arg] = &opt;
+            arg_to_options[arg] = {&opt, /* is_positive */ true};
+        }
+        for (const auto & arg : opt.args_neg) {
+            arg_to_options[arg] = {&opt, /* is_positive */ false};
         }
     }
 
@@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         std::string value;
         if (opt.get_value_from_env(value)) {
             try {
-                if (opt.handler_void && (value == "1" || value == "true")) {
+                if (opt.handler_void && is_truthy(value)) {
                     opt.handler_void(params);
                 }
                 if (opt.handler_int) {
                     opt.handler_int(params, std::stoi(value));
                 }
+                if (opt.handler_bool) {
+                    opt.handler_bool(params, parse_bool_value(value));
+                }
                 if (opt.handler_string) {
                     opt.handler_string(params, value);
                     continue;
@@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         if (arg_to_options.find(arg) == arg_to_options.end()) {
             throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
         }
-        auto opt = *arg_to_options[arg];
+        auto & tmp = arg_to_options[arg];
+        auto opt = *tmp.first;
+        bool is_positive = tmp.second;
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
         }
@@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             opt.handler_void(params);
             continue;
         }
+        if (opt.handler_bool) {
+            opt.handler_bool(params, is_positive);
+            continue;
+        }
 
         // arg with single value
         check_arg(i);
@@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
             throw std::invalid_argument(string_format(
                 "error while handling argument \"%s\": %s\n\n"
                 "usage:\n%s\n\nto show complete usage, run with -h",
-                arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+                arg.c_str(), e.what(), opt.to_string().c_str()));
         }
     }
 
@@ -750,11 +816,11 @@ static std::string list_builtin_chat_templates() {
 }
 
 bool common_arg_utils::is_truthy(const std::string & value) {
-    return value == "on" || value == "enabled" || value == "1";
+    return value == "on" || value == "enabled" || value == "true" || value == "1";
 }
 
 bool common_arg_utils::is_falsey(const std::string & value) {
-    return value == "off" || value == "disabled" || value == "0";
+    return value == "off" || value == "disabled" || value == "false" || value == "0";
 }
 
 bool common_arg_utils::is_autoy(const std::string & value) {
@@ -839,10 +905,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--display-prompt"},
         {"--no-display-prompt"},
-        string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
-        [](common_params & params) {
-            params.display_prompt = false;
+        string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.display_prompt = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
|
|
|
params.kv_unified = true;
|
|
params.kv_unified = true;
|
|
|
}
|
|
}
|
|
|
).set_env("LLAMA_ARG_KV_UNIFIED"));
|
|
).set_env("LLAMA_ARG_KV_UNIFIED"));
|
|
|
- add_opt(common_arg(
|
|
|
|
|
- {"--no-context-shift"},
|
|
|
|
|
- string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
|
|
|
|
|
- [](common_params & params) {
|
|
|
|
|
- params.ctx_shift = false;
|
|
|
|
|
- }
|
|
|
|
|
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
|
|
|
|
|
add_opt(common_arg(
|
|
add_opt(common_arg(
|
|
|
{"--context-shift"},
|
|
{"--context-shift"},
|
|
|
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
|
|
|
|
|
- [](common_params & params) {
|
|
|
|
|
- params.ctx_shift = true;
|
|
|
|
|
|
|
+ {"--no-context-shift"},
|
|
|
|
|
+ string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
|
|
|
|
|
+ [](common_params & params, bool value) {
|
|
|
|
|
+ params.ctx_shift = value;
|
|
|
}
|
|
}
|
|
|
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
|
|
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
|
|
|
add_opt(common_arg(
|
|
add_opt(common_arg(
|
|
@@ -1106,20 +1167,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
+        {"--perf"},
         {"--no-perf"},
-        string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
-        [](common_params & params) {
-            params.no_perf = true;
-            params.sampling.no_perf = true;
+        string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_perf = !value;
+            params.sampling.no_perf = !value;
         }
-    ).set_env("LLAMA_ARG_NO_PERF"));
+    ).set_env("LLAMA_ARG_PERF"));
     add_opt(common_arg(
+        {"--show-timings"},
         {"--no-show-timings"},
-        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
-        [](common_params & params) {
-            params.show_timings = false;
+        string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.show_timings = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
@@ -1171,16 +1234,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_excludes({LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-e", "--escape"},
-        string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
-        [](common_params & params) {
-            params.escape = true;
-        }
-    ));
-    add_opt(common_arg(
         {"--no-escape"},
-        "do not process escape sequences",
-        [](common_params & params) {
-            params.escape = false;
+        string_format("whether to process escape sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.escape = value;
         }
     ));
     add_opt(common_arg(
@@ -1227,19 +1284,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        "run in conversation mode:\n"
+        {"-no-cnv", "--no-conversation"},
+        "whether to run in conversation mode:\n"
         "- does not print special tokens and suffix/prefix\n"
         "- interactive mode is also enabled\n"
        "(default: auto enabled if chat template is available)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
-        }
-    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
-    add_opt(common_arg(
-        {"-no-cnv", "--no-conversation"},
-        "force disable conversation mode (default: false)",
-        [](common_params & params) {
-            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        [](common_params & params, bool value) {
+            params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
@@ -1297,10 +1348,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"--warmup"},
         {"--no-warmup"},
-        "skip warming up the model with an empty run",
-        [](common_params & params) {
-            params.warmup = false;
+        string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.warmup = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
@@ -1702,19 +1754,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
+        {"-kvo", "--kv-offload"},
         {"-nkvo", "--no-kv-offload"},
-        "disable KV offload",
-        [](common_params & params) {
-            params.no_kv_offload = true;
+        string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_kv_offload = !value;
         }
-    ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    ).set_env("LLAMA_ARG_KV_OFFLOAD"));
     add_opt(common_arg(
+        {"--repack"},
         {"-nr", "--no-repack"},
-        "disable weight repacking",
-        [](common_params & params) {
-            params.no_extra_bufts = true;
+        string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_extra_bufts = !value;
         }
-    ).set_env("LLAMA_ARG_NO_REPACK"));
+    ).set_env("LLAMA_ARG_REPACK"));
     add_opt(common_arg(
         {"--no-host"},
         "bypass host buffer allowing extra buffers to be used",
|
|
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
|
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
|
|
|
add_opt(common_arg(
|
|
add_opt(common_arg(
|
|
|
{"-cb", "--cont-batching"},
|
|
{"-cb", "--cont-batching"},
|
|
|
- string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
|
|
|
- [](common_params & params) {
|
|
|
|
|
- params.cont_batching = true;
|
|
|
|
|
- }
|
|
|
|
|
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
|
|
|
- add_opt(common_arg(
|
|
|
|
|
{"-nocb", "--no-cont-batching"},
|
|
{"-nocb", "--no-cont-batching"},
|
|
|
- "disable continuous batching",
|
|
|
|
|
- [](common_params & params) {
|
|
|
|
|
- params.cont_batching = false;
|
|
|
|
|
|
|
+ string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
|
|
|
|
|
+ [](common_params & params, bool value) {
|
|
|
|
|
+ params.cont_batching = value;
|
|
|
}
|
|
}
|
|
|
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
|
|
|
|
|
|
|
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
|
|
|
add_opt(common_arg(
|
|
add_opt(common_arg(
|
|
|
{"-mm", "--mmproj"}, "FILE",
|
|
{"-mm", "--mmproj"}, "FILE",
|
|
|
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
|
"path to a multimodal projector file. see tools/mtmd/README.md\n"
|
|
@@ -1871,19 +1919,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
     add_opt(common_arg(
-        {"--no-mmproj"},
-        "explicitly disable multimodal projector, useful when using -hf",
-        [](common_params & params) {
-            params.no_mmproj = true;
+        {"--mmproj-auto"},
+        {"--no-mmproj", "--no-mmproj-auto"},
+        string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+        [](common_params & params, bool value) {
+            params.no_mmproj = !value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
     add_opt(common_arg(
+        {"--mmproj-offload"},
         {"--no-mmproj-offload"},
-        "do not offload multimodal projector to GPU",
-        [](common_params & params) {
-            params.mmproj_use_gpu = false;
+        string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.mmproj_use_gpu = value;
         }
-    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
     add_opt(common_arg(
         {"--image", "--audio"}, "FILE",
         "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
|
|
|
}
|
|
}
|
|
|
).set_env("LLAMA_ARG_MLOCK"));
|
|
).set_env("LLAMA_ARG_MLOCK"));
|
|
|
add_opt(common_arg(
|
|
add_opt(common_arg(
|
|
|
|
|
+ {"--mmap"},
|
|
|
{"--no-mmap"},
|
|
{"--no-mmap"},
|
|
|
- "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
|
|
|
|
|
- [](common_params & params) {
|
|
|
|
|
- params.use_mmap = false;
|
|
|
|
|
|
|
+ string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
|
|
|
|
|
+ [](common_params & params, bool value) {
|
|
|
|
|
+ params.use_mmap = value;
|
|
|
}
|
|
}
|
|
|
- ).set_env("LLAMA_ARG_NO_MMAP"));
|
|
|
|
|
|
|
+ ).set_env("LLAMA_ARG_MMAP"));
|
|
|
add_opt(common_arg(
|
|
add_opt(common_arg(
|
|
|
{"--numa"}, "TYPE",
|
|
{"--numa"}, "TYPE",
|
|
|
"attempt optimizations that help on some NUMA systems\n"
|
|
"attempt optimizations that help on some NUMA systems\n"
|
|
@@ -2116,10 +2167,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ));
     add_opt(common_arg(
+        {"--op-offload"},
         {"--no-op-offload"},
-        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
-        [](common_params & params) {
-            params.no_op_offload = true;
+        string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+        [](common_params & params, bool value) {
+            params.no_op_offload = !value;
         }
     ));
     add_opt(common_arg(
@@ -2315,10 +2367,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
+        {"--ppl"},
         {"--no-ppl"},
-        string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
-        [](common_params & params) {
-            params.compute_ppl = false;
+        string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+        [](common_params & params, bool value) {
+            params.compute_ppl = value;
         }
     ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
     add_opt(common_arg(
@@ -2437,12 +2490,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
     add_opt(common_arg(
+        {"--webui"},
         {"--no-webui"},
-        string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.webui = false;
+        string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.webui = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
     add_opt(common_arg(
         {"--embedding", "--embeddings"},
         string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
@@ -2547,18 +2601,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
     add_opt(common_arg(
         {"--slots"},
-        string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.endpoint_slots = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
-    add_opt(common_arg(
         {"--no-slots"},
-        "disables slots monitoring endpoint",
-        [](common_params & params) {
-            params.endpoint_slots = false;
+        string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.endpoint_slots = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
     add_opt(common_arg(
         {"--slot-save-path"}, "PATH",
         "path to save slot kv cache (default: disabled)",
@@ -2609,26 +2657,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
     add_opt(common_arg(
+        {"--models-autoload"},
         {"--no-models-autoload"},
-        "disables automatic loading of models (default: enabled)",
-        [](common_params & params) {
-            params.models_autoload = false;
+        string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.models_autoload = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
     add_opt(common_arg(
         {"--jinja"},
-        string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
-        [](common_params & params) {
-            params.use_jinja = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
-    add_opt(common_arg(
         {"--no-jinja"},
-        string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
-        [](common_params & params) {
-            params.use_jinja = false;
+        string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+        [](common_params & params, bool value) {
+            params.use_jinja = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2673,15 +2716,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
+        {"--prefill-assistant"},
         {"--no-prefill-assistant"},
         string_format(
             "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
             "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
         ),
-        [](common_params & params) {
-            params.prefill_assistant = false;
+        [](common_params & params, bool value) {
+            params.prefill_assistant = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
    add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
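
Below is a minimal, self-contained sketch (not part of the patch) of the boolean semantics introduced above. The free functions mirror common_arg_utils::is_truthy / is_falsey and the static parse_bool_value helper; the main() is illustrative only. Both input paths funnel into handler_bool: a positive flag or a truthy env value yields true, a negative flag yields false, and a set LLAMA_ARG_NO_* variable injects the falsey string "0" so it resolves to false through the same code path.

#include <cassert>
#include <stdexcept>
#include <string>

// mirrors common_arg_utils::is_truthy / is_falsey from the patch
static bool is_truthy(const std::string & v) {
    return v == "on" || v == "enabled" || v == "true" || v == "1";
}
static bool is_falsey(const std::string & v) {
    return v == "off" || v == "disabled" || v == "false" || v == "0";
}

// mirrors the static parse_bool_value helper added above
static bool parse_bool_value(const std::string & v) {
    if (is_truthy(v)) {
        return true;
    }
    if (is_falsey(v)) {
        return false;
    }
    throw std::invalid_argument("invalid boolean value");
}

int main() {
    assert(parse_bool_value("enabled") == true);  // e.g. LLAMA_ARG_WEBUI=enabled
    assert(parse_bool_value("0") == false);       // what a set LLAMA_ARG_NO_* env injects
    bool threw = false;
    try {
        parse_bool_value("maybe");                // neither truthy nor falsey
    } catch (const std::invalid_argument &) {
        threw = true;
    }
    assert(threw);
    return 0;
}

Injecting "0" for the negative env variable keeps the legacy LLAMA_ARG_NO_* names working without a second lookup path in the parser: the value flows through parse_bool_value and handler_bool exactly like an explicit --no-* flag.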