8 месяцев назад · cf0a43bb64
--- a/include/llama.h
+++ b/include/llama.h
@@ -345,7 +345,7 @@ extern "C" {
 
				         float    yarn_beta_fast;   // YaRN low correction dim
			
 
				         float    yarn_beta_slow;   // YaRN high correction dim
			
 
				         uint32_t yarn_orig_ctx;    // YaRN original context size
			
 
				-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
			
 
				+        float    defrag_thold;     // defragment the KV cache if holes/size > thold; thold <= 0 disables defragmentation (default)
			
 
				 
			
 
				         ggml_backend_sched_eval_callback cb_eval;
			
 
				         void * cb_eval_user_data;
			
--- a/tools/llama-bench/README.md
+++ b/tools/llama-bench/README.md
@@ -43,12 +43,13 @@ test parameters:
 
				   -ub, --ubatch-size <n>                    (default: 512)
			
 
				   -ctk, --cache-type-k <t>                  (default: f16)
			
 
				   -ctv, --cache-type-v <t>                  (default: f16)
			
 
				-  -t, --threads <n>                         (default: 16)
			
 
				+  -dt, --defrag-thold <f>                   (default: -1)
			
 
				+  -t, --threads <n>                         (default: system dependent)
			
 
				   -C, --cpu-mask <hex,hex>                  (default: 0x0)
			
 
				   --cpu-strict <0|1>                        (default: 0)
			
 
				   --poll <0...100>                          (default: 50)
			
 
				   -ngl, --n-gpu-layers <n>                  (default: 99)
			
 
				-  -rpc, --rpc <rpc_servers>                 (default: )
			
 
				+  -rpc, --rpc <rpc_servers>                 (default: none)
			
 
				   -sm, --split-mode <none|layer|row>        (default: layer)
			
 
				   -mg, --main-gpu <i>                       (default: 0)
			
 
				   -nkvo, --no-kv-offload <0|1>              (default: 0)
			
@@ -62,7 +63,7 @@ test parameters:
 
				 
			
 
				 Multiple values can be given for each parameter by separating them with ','
			
 
				 or by specifying the parameter multiple times. Ranges can be given as
			
 
				-'start-end' or 'start-end+step' or 'start-end*mult'.
			
 
				+'first-last' or 'first-last+step' or 'first-last*mult'.
			
 
				 ```
			
 
				 
			
 
				 llama-bench can perform three types of tests:
			
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -211,6 +211,8 @@ static std::vector<int> parse_int_range(const std::string & s) {
 
				         for (int i = first; i <= last;) {
			
 
				             result.push_back(i);
			
 
				 
			
 
				+            int prev_i = i;
			
 
				+
			
 
				             if (op == '+') {
			
 
				                 i += step;
			
 
				             } else if (op == '*') {
			
@@ -218,6 +220,10 @@ static std::vector<int> parse_int_range(const std::string & s) {
 
				             } else {
			
 
				                 throw std::invalid_argument("invalid range format");
			
 
				             }
			
 
				+
			
 
				+            if (i <= prev_i) {
			
 
				+                throw std::invalid_argument("invalid range");
			
 
				+            }
			
 
				         }
			
 
				         search_start = match.suffix().first;
			
 
				     }
			
@@ -239,6 +245,7 @@ struct cmd_params {
 
				     std::vector<int>                 n_ubatch;
			
 
				     std::vector<ggml_type>           type_k;
			
 
				     std::vector<ggml_type>           type_v;
			
 
				+    std::vector<float>               defrag_thold;
			
 
				     std::vector<int>                 n_threads;
			
 
				     std::vector<std::string>         cpu_mask;
			
 
				     std::vector<bool>                cpu_strict;
			
@@ -274,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
 
				     /* n_ubatch             */ { 512 },
			
 
				     /* type_k               */ { GGML_TYPE_F16 },
			
 
				     /* type_v               */ { GGML_TYPE_F16 },
			
 
				+    /* defrag_thold         */ { -1.0f },
			
 
				     /* n_threads            */ { cpu_get_num_math() },
			
 
				     /* cpu_mask             */ { "0x0" },
			
 
				     /* cpu_strict           */ { false },
			
@@ -335,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
 
				            join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
			
 
				     printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
			
 
				            join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
			
 
				+    printf("  -dt, --defrag-thold <f>                   (default: %s)\n",
			
 
				+           join(cmd_params_defaults.defrag_thold, ",").c_str());
			
 
				     printf("  -t, --threads <n>                         (default: %s)\n",
			
 
				            join(cmd_params_defaults.n_threads, ",").c_str());
			
 
				     printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
			
@@ -368,7 +378,7 @@ static void print_usage(int /* argc */, char ** argv) {
 
				     printf(
			
 
				         "Multiple values can be given for each parameter by separating them with ','\n"
			
 
				         "or by specifying the parameter multiple times. Ranges can be given as\n"
			
 
				-        "'start-end' or 'start-end+step' or 'start-end*mult'.\n");
			
 
				+        "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
			
 
				 }
			
 
				 
			
 
				 static ggml_type ggml_type_from_name(const std::string & s) {
			
@@ -519,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				                     break;
			
 
				                 }
			
 
				                 params.type_v.insert(params.type_v.end(), types.begin(), types.end());
			
 
				+            } else if (arg == "-dt" || arg == "--defrag-thold") {
			
 
				+                if (++i >= argc) {
			
 
				+                    invalid_param = true;
			
 
				+                    break;
			
 
				+                }
			
 
				+                auto p = string_split<float>(argv[i], split_delim);
			
 
				+                params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
			
 
				             } else if (arg == "-t" || arg == "--threads") {
			
 
				                 if (++i >= argc) {
			
 
				                     invalid_param = true;
			
@@ -825,6 +842,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
				     if (params.type_v.empty()) {
			
 
				         params.type_v = cmd_params_defaults.type_v;
			
 
				     }
			
 
				+    if (params.defrag_thold.empty()) {
			
 
				+        params.defrag_thold = cmd_params_defaults.defrag_thold;
			
 
				+    }
			
 
				     if (params.n_gpu_layers.empty()) {
			
 
				         params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
			
 
				     }
			
@@ -883,6 +903,7 @@ struct cmd_params_instance {
 
				     int                n_ubatch;
			
 
				     ggml_type          type_k;
			
 
				     ggml_type          type_v;
			
 
				+    float              defrag_thold;
			
 
				     int                n_threads;
			
 
				     std::string        cpu_mask;
			
 
				     bool               cpu_strict;
			
@@ -959,15 +980,16 @@ struct cmd_params_instance {
 
				     llama_context_params to_llama_cparams() const {
			
 
				         llama_context_params cparams = llama_context_default_params();
			
 
				 
			
 
				-        cparams.n_ctx       = n_prompt + n_gen + n_depth;
			
 
				-        cparams.n_batch     = n_batch;
			
 
				-        cparams.n_ubatch    = n_ubatch;
			
 
				-        cparams.type_k      = type_k;
			
 
				-        cparams.type_v      = type_v;
			
 
				-        cparams.offload_kqv = !no_kv_offload;
			
 
				-        cparams.flash_attn  = flash_attn;
			
 
				-        cparams.embeddings  = embeddings;
			
 
				-        cparams.op_offload  = !no_op_offload;
			
 
				+        cparams.n_ctx        = n_prompt + n_gen + n_depth;
			
 
				+        cparams.n_batch      = n_batch;
			
 
				+        cparams.n_ubatch     = n_ubatch;
			
 
				+        cparams.type_k       = type_k;
			
 
				+        cparams.type_v       = type_v;
			
 
				+        cparams.defrag_thold = defrag_thold;
			
 
				+        cparams.offload_kqv  = !no_kv_offload;
			
 
				+        cparams.flash_attn   = flash_attn;
			
 
				+        cparams.embeddings   = embeddings;
			
 
				+        cparams.op_offload   = !no_op_offload;
			
 
				 
			
 
				         return cparams;
			
 
				     }
			
@@ -992,6 +1014,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
				     for (const auto & nub : params.n_ubatch)
			
 
				     for (const auto & tk : params.type_k)
			
 
				     for (const auto & tv : params.type_v)
			
 
				+    for (const auto & defrag_thold : params.defrag_thold)
			
 
				     for (const auto & nkvo : params.no_kv_offload)
			
 
				     for (const auto & fa : params.flash_attn)
			
 
				     for (const auto & nt : params.n_threads)
			
@@ -1012,6 +1035,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
				                 /* .n_ubatch     = */ nub,
			
 
				                 /* .type_k       = */ tk,
			
 
				                 /* .type_v       = */ tv,
			
 
				+                /* .defrag_thold = */ defrag_thold,
			
 
				                 /* .n_threads    = */ nt,
			
 
				                 /* .cpu_mask     = */ cm,
			
 
				                 /* .cpu_strict   = */ cs,
			
@@ -1044,6 +1068,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
				                 /* .n_ubatch     = */ nub,
			
 
				                 /* .type_k       = */ tk,
			
 
				                 /* .type_v       = */ tv,
			
 
				+                /* .defrag_thold = */ defrag_thold,
			
 
				                 /* .n_threads    = */ nt,
			
 
				                 /* .cpu_mask     = */ cm,
			
 
				                 /* .cpu_strict   = */ cs,
			
@@ -1076,6 +1101,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
				                 /* .n_ubatch     = */ nub,
			
 
				                 /* .type_k       = */ tk,
			
 
				                 /* .type_v       = */ tv,
			
 
				+                /* .defrag_thold = */ defrag_thold,
			
 
				                 /* .n_threads    = */ nt,
			
 
				                 /* .cpu_mask     = */ cm,
			
 
				                 /* .cpu_strict   = */ cs,
			
@@ -1117,6 +1143,7 @@ struct test {
 
				     int                      poll;
			
 
				     ggml_type                type_k;
			
 
				     ggml_type                type_v;
			
 
				+    float                    defrag_thold;
			
 
				     int                      n_gpu_layers;
			
 
				     llama_split_mode         split_mode;
			
 
				     int                      main_gpu;
			
@@ -1151,6 +1178,7 @@ struct test {
 
				         poll           = inst.poll;
			
 
				         type_k         = inst.type_k;
			
 
				         type_v         = inst.type_v;
			
 
				+        defrag_thold   = inst.defrag_thold;
			
 
				         n_gpu_layers   = inst.n_gpu_layers;
			
 
				         split_mode     = inst.split_mode;
			
 
				         main_gpu       = inst.main_gpu;
			
@@ -1206,6 +1234,7 @@ struct test {
 
				             "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
			
 
				             "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
			
 
				             "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
			
 
				+            "defrag_thold",
			
 
				             "use_mmap",     "embeddings",   "no_op_offload",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
			
 
				             "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
			
 
				         };
			
@@ -1225,7 +1254,7 @@ struct test {
 
				             field == "use_mmap" || field == "embeddings") {
			
 
				             return BOOL;
			
 
				         }
			
 
				-        if (field == "avg_ts" || field == "stddev_ts") {
			
 
				+        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
			
 
				             return FLOAT;
			
 
				         }
			
 
				         return STRING;
			
@@ -1292,6 +1321,7 @@ struct test {
 
				                                             std::to_string(flash_attn),
			
 
				                                             tensor_split_str,
			
 
				                                             tensor_buft_overrides_str,
			
 
				+                                            std::to_string(defrag_thold),
			
 
				                                             std::to_string(use_mmap),
			
 
				                                             std::to_string(embeddings),
			
 
				                                             std::to_string(no_op_offload),
			
@@ -1558,6 +1588,9 @@ struct markdown_printer : public printer {
 
				         if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
			
 
				             fields.emplace_back("type_v");
			
 
				         }
			
 
				+        if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
			
 
				+            fields.emplace_back("defrag_thold");
			
 
				+        }
			
 
				         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
			
 
				             fields.emplace_back("main_gpu");
			
 
				         }