Просмотр исходного кода

llama-bench : add defrag-thold, check for invalid ranges (#13487)

Diego Devesa 8 месяцев назад
Родитель
Commit
cf0a43bb64
3 измененных файлов с 49 добавлено и 15 удалено
  1. 1 1
      include/llama.h
  2. 4 3
      tools/llama-bench/README.md
  3. 44 11
      tools/llama-bench/llama-bench.cpp

+ 1 - 1
include/llama.h

@@ -345,7 +345,7 @@ extern "C" {
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_slow;   // YaRN high correction dim
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
         uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        float    defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
 
         ggml_backend_sched_eval_callback cb_eval;
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
         void * cb_eval_user_data;

+ 4 - 3
tools/llama-bench/README.md

@@ -43,12 +43,13 @@ test parameters:
   -ub, --ubatch-size <n>                    (default: 512)
   -ub, --ubatch-size <n>                    (default: 512)
   -ctk, --cache-type-k <t>                  (default: f16)
   -ctk, --cache-type-k <t>                  (default: f16)
   -ctv, --cache-type-v <t>                  (default: f16)
   -ctv, --cache-type-v <t>                  (default: f16)
-  -t, --threads <n>                         (default: 16)
+  -dt, --defrag-thold <f>                   (default: -1)
+  -t, --threads <n>                         (default: system dependent)
   -C, --cpu-mask <hex,hex>                  (default: 0x0)
   -C, --cpu-mask <hex,hex>                  (default: 0x0)
   --cpu-strict <0|1>                        (default: 0)
   --cpu-strict <0|1>                        (default: 0)
   --poll <0...100>                          (default: 50)
   --poll <0...100>                          (default: 50)
   -ngl, --n-gpu-layers <n>                  (default: 99)
   -ngl, --n-gpu-layers <n>                  (default: 99)
-  -rpc, --rpc <rpc_servers>                 (default: )
+  -rpc, --rpc <rpc_servers>                 (default: none)
   -sm, --split-mode <none|layer|row>        (default: layer)
   -sm, --split-mode <none|layer|row>        (default: layer)
   -mg, --main-gpu <i>                       (default: 0)
   -mg, --main-gpu <i>                       (default: 0)
   -nkvo, --no-kv-offload <0|1>              (default: 0)
   -nkvo, --no-kv-offload <0|1>              (default: 0)
@@ -62,7 +63,7 @@ test parameters:
 
 
 Multiple values can be given for each parameter by separating them with ','
 Multiple values can be given for each parameter by separating them with ','
 or by specifying the parameter multiple times. Ranges can be given as
 or by specifying the parameter multiple times. Ranges can be given as
-'start-end' or 'start-end+step' or 'start-end*mult'.
+'first-last' or 'first-last+step' or 'first-last*mult'.
 ```
 ```
 
 
 llama-bench can perform three types of tests:
 llama-bench can perform three types of tests:

+ 44 - 11
tools/llama-bench/llama-bench.cpp

@@ -211,6 +211,8 @@ static std::vector<int> parse_int_range(const std::string & s) {
         for (int i = first; i <= last;) {
         for (int i = first; i <= last;) {
             result.push_back(i);
             result.push_back(i);
 
 
+            int prev_i = i;
+
             if (op == '+') {
             if (op == '+') {
                 i += step;
                 i += step;
             } else if (op == '*') {
             } else if (op == '*') {
@@ -218,6 +220,10 @@ static std::vector<int> parse_int_range(const std::string & s) {
             } else {
             } else {
                 throw std::invalid_argument("invalid range format");
                 throw std::invalid_argument("invalid range format");
             }
             }
+
+            if (i <= prev_i) {
+                throw std::invalid_argument("invalid range");
+            }
         }
         }
         search_start = match.suffix().first;
         search_start = match.suffix().first;
     }
     }
@@ -239,6 +245,7 @@ struct cmd_params {
     std::vector<int>                 n_ubatch;
     std::vector<int>                 n_ubatch;
     std::vector<ggml_type>           type_k;
     std::vector<ggml_type>           type_k;
     std::vector<ggml_type>           type_v;
     std::vector<ggml_type>           type_v;
+    std::vector<float>               defrag_thold;
     std::vector<int>                 n_threads;
     std::vector<int>                 n_threads;
     std::vector<std::string>         cpu_mask;
     std::vector<std::string>         cpu_mask;
     std::vector<bool>                cpu_strict;
     std::vector<bool>                cpu_strict;
@@ -274,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_ubatch             */ { 512 },
     /* n_ubatch             */ { 512 },
     /* type_k               */ { GGML_TYPE_F16 },
     /* type_k               */ { GGML_TYPE_F16 },
     /* type_v               */ { GGML_TYPE_F16 },
     /* type_v               */ { GGML_TYPE_F16 },
+    /* defrag_thold         */ { -1.0f },
     /* n_threads            */ { cpu_get_num_math() },
     /* n_threads            */ { cpu_get_num_math() },
     /* cpu_mask             */ { "0x0" },
     /* cpu_mask             */ { "0x0" },
     /* cpu_strict           */ { false },
     /* cpu_strict           */ { false },
@@ -335,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
            join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
            join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
     printf("  -ctv, --cache-type-v <t>                  (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
            join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -dt, --defrag-thold <f>                   (default: %s)\n",
+           join(cmd_params_defaults.defrag_thold, ",").c_str());
     printf("  -t, --threads <n>                         (default: %s)\n",
     printf("  -t, --threads <n>                         (default: %s)\n",
            join(cmd_params_defaults.n_threads, ",").c_str());
            join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
     printf("  -C, --cpu-mask <hex,hex>                  (default: %s)\n",
@@ -368,7 +378,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(
     printf(
         "Multiple values can be given for each parameter by separating them with ','\n"
         "Multiple values can be given for each parameter by separating them with ','\n"
         "or by specifying the parameter multiple times. Ranges can be given as\n"
         "or by specifying the parameter multiple times. Ranges can be given as\n"
-        "'start-end' or 'start-end+step' or 'start-end*mult'.\n");
+        "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
 }
 }
 
 
 static ggml_type ggml_type_from_name(const std::string & s) {
 static ggml_type ggml_type_from_name(const std::string & s) {
@@ -519,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                     break;
                     break;
                 }
                 }
                 params.type_v.insert(params.type_v.end(), types.begin(), types.end());
                 params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+            } else if (arg == "-dt" || arg == "--defrag-thold") {
+                if (++i >= argc) {
+                    invalid_param = true;
+                    break;
+                }
+                auto p = string_split<float>(argv[i], split_delim);
+                params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
             } else if (arg == "-t" || arg == "--threads") {
             } else if (arg == "-t" || arg == "--threads") {
                 if (++i >= argc) {
                 if (++i >= argc) {
                     invalid_param = true;
                     invalid_param = true;
@@ -825,6 +842,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_v.empty()) {
     if (params.type_v.empty()) {
         params.type_v = cmd_params_defaults.type_v;
         params.type_v = cmd_params_defaults.type_v;
     }
     }
+    if (params.defrag_thold.empty()) {
+        params.defrag_thold = cmd_params_defaults.defrag_thold;
+    }
     if (params.n_gpu_layers.empty()) {
     if (params.n_gpu_layers.empty()) {
         params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
         params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
     }
     }
@@ -883,6 +903,7 @@ struct cmd_params_instance {
     int                n_ubatch;
     int                n_ubatch;
     ggml_type          type_k;
     ggml_type          type_k;
     ggml_type          type_v;
     ggml_type          type_v;
+    float              defrag_thold;
     int                n_threads;
     int                n_threads;
     std::string        cpu_mask;
     std::string        cpu_mask;
     bool               cpu_strict;
     bool               cpu_strict;
@@ -959,15 +980,16 @@ struct cmd_params_instance {
     llama_context_params to_llama_cparams() const {
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
         llama_context_params cparams = llama_context_default_params();
 
 
-        cparams.n_ctx       = n_prompt + n_gen + n_depth;
-        cparams.n_batch     = n_batch;
-        cparams.n_ubatch    = n_ubatch;
-        cparams.type_k      = type_k;
-        cparams.type_v      = type_v;
-        cparams.offload_kqv = !no_kv_offload;
-        cparams.flash_attn  = flash_attn;
-        cparams.embeddings  = embeddings;
-        cparams.op_offload  = !no_op_offload;
+        cparams.n_ctx        = n_prompt + n_gen + n_depth;
+        cparams.n_batch      = n_batch;
+        cparams.n_ubatch     = n_ubatch;
+        cparams.type_k       = type_k;
+        cparams.type_v       = type_v;
+        cparams.defrag_thold = defrag_thold;
+        cparams.offload_kqv  = !no_kv_offload;
+        cparams.flash_attn   = flash_attn;
+        cparams.embeddings   = embeddings;
+        cparams.op_offload   = !no_op_offload;
 
 
         return cparams;
         return cparams;
     }
     }
@@ -992,6 +1014,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & nub : params.n_ubatch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & tv : params.type_v)
+    for (const auto & defrag_thold : params.defrag_thold)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
     for (const auto & fa : params.flash_attn)
     for (const auto & nt : params.n_threads)
     for (const auto & nt : params.n_threads)
@@ -1012,6 +1035,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_ubatch     = */ nub,
                 /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .type_v       = */ tv,
+                /* .defrag_thold = */ defrag_thold,
                 /* .n_threads    = */ nt,
                 /* .n_threads    = */ nt,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_strict   = */ cs,
                 /* .cpu_strict   = */ cs,
@@ -1044,6 +1068,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_ubatch     = */ nub,
                 /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .type_v       = */ tv,
+                /* .defrag_thold = */ defrag_thold,
                 /* .n_threads    = */ nt,
                 /* .n_threads    = */ nt,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_strict   = */ cs,
                 /* .cpu_strict   = */ cs,
@@ -1076,6 +1101,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .n_ubatch     = */ nub,
                 /* .n_ubatch     = */ nub,
                 /* .type_k       = */ tk,
                 /* .type_k       = */ tk,
                 /* .type_v       = */ tv,
                 /* .type_v       = */ tv,
+                /* .defrag_thold = */ defrag_thold,
                 /* .n_threads    = */ nt,
                 /* .n_threads    = */ nt,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_mask     = */ cm,
                 /* .cpu_strict   = */ cs,
                 /* .cpu_strict   = */ cs,
@@ -1117,6 +1143,7 @@ struct test {
     int                      poll;
     int                      poll;
     ggml_type                type_k;
     ggml_type                type_k;
     ggml_type                type_v;
     ggml_type                type_v;
+    float                    defrag_thold;
     int                      n_gpu_layers;
     int                      n_gpu_layers;
     llama_split_mode         split_mode;
     llama_split_mode         split_mode;
     int                      main_gpu;
     int                      main_gpu;
@@ -1151,6 +1178,7 @@ struct test {
         poll           = inst.poll;
         poll           = inst.poll;
         type_k         = inst.type_k;
         type_k         = inst.type_k;
         type_v         = inst.type_v;
         type_v         = inst.type_v;
+        defrag_thold   = inst.defrag_thold;
         n_gpu_layers   = inst.n_gpu_layers;
         n_gpu_layers   = inst.n_gpu_layers;
         split_mode     = inst.split_mode;
         split_mode     = inst.split_mode;
         main_gpu       = inst.main_gpu;
         main_gpu       = inst.main_gpu;
@@ -1206,6 +1234,7 @@ struct test {
             "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
             "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
             "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
             "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
             "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
             "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
+            "defrag_thold",
             "use_mmap",     "embeddings",   "no_op_offload",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
             "use_mmap",     "embeddings",   "no_op_offload",   "n_prompt",       "n_gen",      "n_depth",      "test_time",
             "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
             "avg_ns",       "stddev_ns",    "avg_ts",         "stddev_ts",
         };
         };
@@ -1225,7 +1254,7 @@ struct test {
             field == "use_mmap" || field == "embeddings") {
             field == "use_mmap" || field == "embeddings") {
             return BOOL;
             return BOOL;
         }
         }
-        if (field == "avg_ts" || field == "stddev_ts") {
+        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
             return FLOAT;
             return FLOAT;
         }
         }
         return STRING;
         return STRING;
@@ -1292,6 +1321,7 @@ struct test {
                                             std::to_string(flash_attn),
                                             std::to_string(flash_attn),
                                             tensor_split_str,
                                             tensor_split_str,
                                             tensor_buft_overrides_str,
                                             tensor_buft_overrides_str,
+                                            std::to_string(defrag_thold),
                                             std::to_string(use_mmap),
                                             std::to_string(use_mmap),
                                             std::to_string(embeddings),
                                             std::to_string(embeddings),
                                             std::to_string(no_op_offload),
                                             std::to_string(no_op_offload),
@@ -1558,6 +1588,9 @@ struct markdown_printer : public printer {
         if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
         if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
             fields.emplace_back("type_v");
             fields.emplace_back("type_v");
         }
         }
+        if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
+            fields.emplace_back("defrag_thold");
+        }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.emplace_back("main_gpu");
             fields.emplace_back("main_gpu");
         }
         }