@@ -75,9 +75,9 @@ For the full list of features, please refer to [server's changelog](https://githu
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
-| `--override-tensor, -ot <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
-| `--cpu-moe, -cmoe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
-| `--n-cpu-moe, -ncmoe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
+| `-ot, --override-tensor <tensor name pattern>=<buffer type>,...` | override tensor buffer type |
+| `-cmoe, --cpu-moe` | keep all Mixture of Experts (MoE) weights in the CPU<br/>(env: LLAMA_ARG_CPU_MOE) |
+| `-ncmoe, --n-cpu-moe N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU<br/>(env: LLAMA_ARG_N_CPU_MOE) |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | max. number of layers to store in VRAM (default: -1)<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs<br/>(env: LLAMA_ARG_SPLIT_MODE) |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1<br/>(env: LLAMA_ARG_TENSOR_SPLIT) |
@@ -120,7 +120,7 @@ For the full list of features, please refer to [server's changelog](https://githu
 | -------- | ----------- |
 | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: penalties;dry;top_n_sigma;top_k;typ_p;top_p;min_p;xtc;temperature) |
 | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
-| `--sampling-seq, --sampler-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
+| `--sampler-seq, --sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: edskypmxt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--temp N` | temperature (default: 0.8) |
 | `--top-k N` | top-k sampling (default: 40, 0 = disabled)<br/>(env: LLAMA_ARG_TOP_K) |
@@ -156,8 +156,8 @@ For the full list of features, please refer to [server's changelog](https://githu
 | Argument | Explanation |
 | -------- | ----------- |
 | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
-| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `--kv-unified, -kvu` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
+| `-cram, --cache-ram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
+| `-kvu, --kv-unified` | use single unified KV buffer shared across all sequences (default: enabled if number of slots is auto)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
 | `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
 | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
 | `-sp, --special` | special tokens output enabled (default: false) |
@@ -172,9 +172,9 @@ For the full list of features, please refer to [server's changelog](https://githu
 | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
 | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
 | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
-| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
-| `--cpu-moe-draft, -cmoed` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
-| `--n-cpu-moe-draft, -ncmoed N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
+| `-otd, --override-tensor-draft <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
+| `-cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model<br/>(env: LLAMA_ARG_CPU_MOE_DRAFT) |
+| `-ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model<br/>(env: LLAMA_ARG_N_CPU_MOE_DRAFT) |
 | `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
 | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
 | `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
@@ -184,7 +184,7 @@ For the full list of features, please refer to [server's changelog](https://githu
 | `--webui-config-file PATH` | JSON file that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG_FILE) |
 | `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
 | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
-| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
+| `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
 | `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
@@ -212,7 +212,7 @@ For the full list of features, please refer to [server's changelog](https://githu
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
 | `-tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) |
-| `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
+| `--draft, --draft-n, --draft-max N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
 | `--draft-min, --draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)<br/>(env: LLAMA_ARG_DRAFT_MIN) |
 | `--draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.8)<br/>(env: LLAMA_ARG_DRAFT_P_MIN) |
 | `-cd, --ctx-size-draft N` | size of the prompt context for the draft model (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE_DRAFT) |