@@ -51,6 +51,7 @@ using json = nlohmann::ordered_json;
 static std::initializer_list<enum llama_example> mmproj_examples = {
     LLAMA_EXAMPLE_MTMD,
     LLAMA_EXAMPLE_SERVER,
+    LLAMA_EXAMPLE_CLI,
 };
 
 static std::string read_file(const std::string & fname) {
@@ -468,6 +469,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         ));
     }
 
+    common_log_set_verbosity_thold(params.verbosity);
+
     return true;
 }
 
@@ -790,7 +793,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.display_prompt = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-co", "--color"}, "[on|off|auto]",
         "Colorize output to distinguish prompt and user input from generations ('on', 'off', or 'auto', default: 'auto')\n"
@@ -807,7 +810,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     string_format("error: unknown value for --color: '%s'\n", value.c_str()));
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(common_arg(
         {"-t", "--threads"}, "N",
         string_format("number of CPU threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -940,7 +943,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     add_opt(common_arg(
         {"-n", "--predict", "--n-predict"}, "N",
         string_format(
-            ex == LLAMA_EXAMPLE_MAIN
+            ex == LLAMA_EXAMPLE_COMPLETION
                 ? "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)"
                 : "number of tokens to predict (default: %d, -1 = infinity)",
             params.n_predict),
@@ -984,7 +987,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_ctx_checkpoints = value;
         }
-    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CTX_CHECKPOINTS").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cache-ram", "-cram"}, "N",
         string_format("set the maximum cache size in MiB (default: %d, -1 - no limit, 0 - disable)\n"
@@ -992,7 +995,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.cache_ram_mib = value;
         }
-    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_env("LLAMA_ARG_CACHE_RAM").set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--kv-unified", "-kvu"},
         string_format("use single unified KV buffer for the KV cache of all sequences (default: %s)\n"
@@ -1007,14 +1010,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.ctx_shift = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--context-shift"},
         string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
         [](common_params & params) {
             params.ctx_shift = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
     add_opt(common_arg(
         {"--chunks"}, "N",
         string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks),
@@ -1050,7 +1053,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.system_prompt = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--no-perf"},
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -1059,6 +1062,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
+    add_opt(common_arg(
+        {"--no-show-timings"},
+        string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+        [](common_params & params) {
+            params.show_timings = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
     add_opt(common_arg(
         {"-f", "--file"}, "FNAME",
         "a file containing the prompt (default: none)",
@@ -1080,7 +1090,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.system_prompt.pop_back();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
     add_opt(common_arg(
         {"--in-file"}, "FNAME",
         "an input file (repeat to specify multiple files)",
@@ -1128,42 +1138,42 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.n_print = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache"}, "FNAME",
         "file to cache prompt state for faster startup (default: none)",
         [](common_params & params, const std::string & value) {
             params.path_prompt_cache = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache-all"},
         "if specified, saves user input and generations to cache as well\n",
         [](common_params & params) {
             params.prompt_cache_all = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--prompt-cache-ro"},
         "if specified, uses the prompt cache but does not update it",
         [](common_params & params) {
             params.prompt_cache_ro = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-r", "--reverse-prompt"}, "PROMPT",
         "halt generation at PROMPT, return control in interactive mode\n",
         [](common_params & params, const std::string & value) {
             params.antiprompt.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-sp", "--special"},
         string_format("special tokens output enabled (default: %s)", params.special ? "true" : "false"),
         [](common_params & params) {
             params.special = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
         "run in conversation mode:\n"
@@ -1173,14 +1183,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-no-cnv", "--no-conversation"},
         "force disable conversation mode (default: false)",
         [](common_params & params) {
             params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-st", "--single-turn"},
         "run conversation for a single turn only, then exit when done\n"
@@ -1189,28 +1199,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.single_turn = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-i", "--interactive"},
         string_format("run in interactive mode (default: %s)", params.interactive ? "true" : "false"),
         [](common_params & params) {
             params.interactive = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-if", "--interactive-first"},
         string_format("run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false"),
         [](common_params & params) {
             params.interactive_first = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-mli", "--multiline-input"},
         "allows you to write or paste multiple lines without ending each in '\\'",
         [](common_params & params) {
             params.multiline_input = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--in-prefix-bos"},
         "prefix BOS to user inputs, preceding the `--in-prefix` string",
@@ -1218,7 +1228,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix_bos = true;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--in-prefix"}, "STRING",
         "string to prefix user inputs with (default: empty)",
@@ -1226,7 +1236,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
@@ -1234,14 +1244,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(
@@ -1632,14 +1642,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.grp_attn_n = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_PASSKEY}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_N").set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_PASSKEY}));
     add_opt(common_arg(
         {"-gaw", "--grp-attn-w"}, "N",
         string_format("group-attention width (default: %d)", params.grp_attn_w),
         [](common_params & params, int value) {
             params.grp_attn_w = value;
         }
-    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
     add_opt(common_arg(
         {"-nkvo", "--no-kv-offload"},
         "disable KV offload",
@@ -1829,7 +1839,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MTMD}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--image-min-tokens"}, "N",
         "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
@@ -1922,7 +1932,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "override tensor buffer type for draft model", [](common_params & params, const std::string & value) {
             parse_tensor_buffer_overrides(value, params.speculative.tensor_buft_overrides);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--cpu-moe", "-cmoe"},
         "keep all Mixture of Experts (MoE) weights in the CPU",
@@ -1951,7 +1961,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"--n-cpu-moe-draft", "-ncmoed"}, "N",
         "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model",
@@ -1965,7 +1975,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
@@ -2444,7 +2454,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 params.default_template_kwargs[item.key()] = item.value().dump();
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),
@@ -2553,14 +2563,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--no-jinja"},
         string_format("disable jinja template for chat (default: %s)\n", params.use_jinja ? "enabled" : "disabled"),
         [](common_params & params) {
             params.use_jinja = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
         "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
@@ -2571,7 +2581,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK"));
     add_opt(common_arg(
         {"--reasoning-budget"}, "N",
         "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
@@ -2579,7 +2589,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
             params.reasoning_budget = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2591,7 +2601,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
     add_opt(common_arg(
         {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
         string_format(
@@ -2603,7 +2613,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.chat_template = read_file(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
     add_opt(common_arg(
         {"--no-prefill-assistant"},
         string_format(
@@ -2634,7 +2644,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.simple_io = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2717,7 +2727,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "Set verbosity level to infinity (i.e. log all messages, useful for debugging)",
         [](common_params & params) {
             params.verbosity = INT_MAX;
-            common_log_set_verbosity_thold(INT_MAX);
         }
     ));
     add_opt(common_arg(
@@ -2738,7 +2747,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "(default: %d)\n", params.verbosity),
         [](common_params & params, int value) {
             params.verbosity = value;
-            common_log_set_verbosity_thold(value);
         }
     ).set_env("LLAMA_LOG_VERBOSITY"));
     add_opt(common_arg(
@@ -2871,14 +2879,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
             params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
             params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
@@ -2892,14 +2900,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
             params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2907,7 +2915,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.speculative.devices = parse_device_list(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
         "number of layers to store in VRAM for the draft model",
@@ -2919,21 +2927,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
             params.speculative.model.path = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_MODEL_DRAFT"));
     add_opt(common_arg(
         {"--spec-replace"}, "TARGET", "DRAFT",
         "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
         [](common_params & params, const std::string & tgt, const std::string & dft) {
             params.speculative.replacements.push_back({ tgt, dft });
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
     add_opt(common_arg(
         {"-ctkd", "--cache-type-k-draft"}, "TYPE",
         string_format(
@@ -3197,7 +3205,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--gpt-oss-120b-default"},
@@ -3216,7 +3224,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_jinja = true;
             //params.default_template_kwargs["reasoning_effort"] = "\"high\"";
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--vision-gemma-4b-default"},
@@ -3227,7 +3235,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     add_opt(common_arg(
         {"--vision-gemma-12b-default"},
@@ -3238,7 +3246,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.n_ctx = 0;
             params.use_jinja = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
 
     return ctx_arg;
 }