|
|
@@ -211,6 +211,8 @@ static std::vector<int> parse_int_range(const std::string & s) {
|
|
|
for (int i = first; i <= last;) {
|
|
|
result.push_back(i);
|
|
|
|
|
|
+ int prev_i = i;
|
|
|
+
|
|
|
if (op == '+') {
|
|
|
i += step;
|
|
|
} else if (op == '*') {
|
|
|
@@ -218,6 +220,10 @@ static std::vector<int> parse_int_range(const std::string & s) {
|
|
|
} else {
|
|
|
throw std::invalid_argument("invalid range format");
|
|
|
}
|
|
|
+
|
|
|
+ if (i <= prev_i) {
|
|
|
+ throw std::invalid_argument("invalid range");
|
|
|
+ }
|
|
|
}
|
|
|
search_start = match.suffix().first;
|
|
|
}
|
|
|
@@ -239,6 +245,7 @@ struct cmd_params {
|
|
|
std::vector<int> n_ubatch;
|
|
|
std::vector<ggml_type> type_k;
|
|
|
std::vector<ggml_type> type_v;
|
|
|
+ std::vector<float> defrag_thold;
|
|
|
std::vector<int> n_threads;
|
|
|
std::vector<std::string> cpu_mask;
|
|
|
std::vector<bool> cpu_strict;
|
|
|
@@ -274,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
|
|
|
/* n_ubatch */ { 512 },
|
|
|
/* type_k */ { GGML_TYPE_F16 },
|
|
|
/* type_v */ { GGML_TYPE_F16 },
|
|
|
+ /* defrag_thold */ { -1.0f },
|
|
|
/* n_threads */ { cpu_get_num_math() },
|
|
|
/* cpu_mask */ { "0x0" },
|
|
|
/* cpu_strict */ { false },
|
|
|
@@ -335,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
|
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
|
|
|
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
|
+ printf(" -dt, --defrag-thold <f> (default: %s)\n",
|
|
|
+ join(cmd_params_defaults.defrag_thold, ",").c_str());
|
|
|
printf(" -t, --threads <n> (default: %s)\n",
|
|
|
join(cmd_params_defaults.n_threads, ",").c_str());
|
|
|
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
|
|
|
@@ -368,7 +378,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
printf(
|
|
|
"Multiple values can be given for each parameter by separating them with ','\n"
|
|
|
"or by specifying the parameter multiple times. Ranges can be given as\n"
|
|
|
- "'start-end' or 'start-end+step' or 'start-end*mult'.\n");
|
|
|
+ "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
|
|
|
}
|
|
|
|
|
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
|
|
@@ -519,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
break;
|
|
|
}
|
|
|
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
|
|
+ } else if (arg == "-dt" || arg == "--defrag-thold") {
|
|
|
+ if (++i >= argc) {
|
|
|
+ invalid_param = true;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ auto p = string_split<float>(argv[i], split_delim);
|
|
|
+ params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
|
|
|
} else if (arg == "-t" || arg == "--threads") {
|
|
|
if (++i >= argc) {
|
|
|
invalid_param = true;
|
|
|
@@ -825,6 +842,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
if (params.type_v.empty()) {
|
|
|
params.type_v = cmd_params_defaults.type_v;
|
|
|
}
|
|
|
+ if (params.defrag_thold.empty()) {
|
|
|
+ params.defrag_thold = cmd_params_defaults.defrag_thold;
|
|
|
+ }
|
|
|
if (params.n_gpu_layers.empty()) {
|
|
|
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
|
|
|
}
|
|
|
@@ -883,6 +903,7 @@ struct cmd_params_instance {
|
|
|
int n_ubatch;
|
|
|
ggml_type type_k;
|
|
|
ggml_type type_v;
|
|
|
+ float defrag_thold;
|
|
|
int n_threads;
|
|
|
std::string cpu_mask;
|
|
|
bool cpu_strict;
|
|
|
@@ -959,15 +980,16 @@ struct cmd_params_instance {
|
|
|
llama_context_params to_llama_cparams() const {
|
|
|
llama_context_params cparams = llama_context_default_params();
|
|
|
|
|
|
- cparams.n_ctx = n_prompt + n_gen + n_depth;
|
|
|
- cparams.n_batch = n_batch;
|
|
|
- cparams.n_ubatch = n_ubatch;
|
|
|
- cparams.type_k = type_k;
|
|
|
- cparams.type_v = type_v;
|
|
|
- cparams.offload_kqv = !no_kv_offload;
|
|
|
- cparams.flash_attn = flash_attn;
|
|
|
- cparams.embeddings = embeddings;
|
|
|
- cparams.op_offload = !no_op_offload;
|
|
|
+ cparams.n_ctx = n_prompt + n_gen + n_depth;
|
|
|
+ cparams.n_batch = n_batch;
|
|
|
+ cparams.n_ubatch = n_ubatch;
|
|
|
+ cparams.type_k = type_k;
|
|
|
+ cparams.type_v = type_v;
|
|
|
+ cparams.defrag_thold = defrag_thold;
|
|
|
+ cparams.offload_kqv = !no_kv_offload;
|
|
|
+ cparams.flash_attn = flash_attn;
|
|
|
+ cparams.embeddings = embeddings;
|
|
|
+ cparams.op_offload = !no_op_offload;
|
|
|
|
|
|
return cparams;
|
|
|
}
|
|
|
@@ -992,6 +1014,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
for (const auto & nub : params.n_ubatch)
|
|
|
for (const auto & tk : params.type_k)
|
|
|
for (const auto & tv : params.type_v)
|
|
|
+ for (const auto & defrag_thold : params.defrag_thold)
|
|
|
for (const auto & nkvo : params.no_kv_offload)
|
|
|
for (const auto & fa : params.flash_attn)
|
|
|
for (const auto & nt : params.n_threads)
|
|
|
@@ -1012,6 +1035,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
/* .n_ubatch = */ nub,
|
|
|
/* .type_k = */ tk,
|
|
|
/* .type_v = */ tv,
|
|
|
+ /* .defrag_thold = */ defrag_thold,
|
|
|
/* .n_threads = */ nt,
|
|
|
/* .cpu_mask = */ cm,
|
|
|
/* .cpu_strict = */ cs,
|
|
|
@@ -1044,6 +1068,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
/* .n_ubatch = */ nub,
|
|
|
/* .type_k = */ tk,
|
|
|
/* .type_v = */ tv,
|
|
|
+ /* .defrag_thold = */ defrag_thold,
|
|
|
/* .n_threads = */ nt,
|
|
|
/* .cpu_mask = */ cm,
|
|
|
/* .cpu_strict = */ cs,
|
|
|
@@ -1076,6 +1101,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
/* .n_ubatch = */ nub,
|
|
|
/* .type_k = */ tk,
|
|
|
/* .type_v = */ tv,
|
|
|
+ /* .defrag_thold = */ defrag_thold,
|
|
|
/* .n_threads = */ nt,
|
|
|
/* .cpu_mask = */ cm,
|
|
|
/* .cpu_strict = */ cs,
|
|
|
@@ -1117,6 +1143,7 @@ struct test {
|
|
|
int poll;
|
|
|
ggml_type type_k;
|
|
|
ggml_type type_v;
|
|
|
+ float defrag_thold;
|
|
|
int n_gpu_layers;
|
|
|
llama_split_mode split_mode;
|
|
|
int main_gpu;
|
|
|
@@ -1151,6 +1178,7 @@ struct test {
|
|
|
poll = inst.poll;
|
|
|
type_k = inst.type_k;
|
|
|
type_v = inst.type_v;
|
|
|
+ defrag_thold = inst.defrag_thold;
|
|
|
n_gpu_layers = inst.n_gpu_layers;
|
|
|
split_mode = inst.split_mode;
|
|
|
main_gpu = inst.main_gpu;
|
|
|
@@ -1206,6 +1234,7 @@ struct test {
|
|
|
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
|
|
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
|
|
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
|
|
+ "defrag_thold",
|
|
|
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
|
|
|
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
|
|
};
|
|
|
@@ -1225,7 +1254,7 @@ struct test {
|
|
|
field == "use_mmap" || field == "embeddings") {
|
|
|
return BOOL;
|
|
|
}
|
|
|
- if (field == "avg_ts" || field == "stddev_ts") {
|
|
|
+ if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
|
|
|
return FLOAT;
|
|
|
}
|
|
|
return STRING;
|
|
|
@@ -1292,6 +1321,7 @@ struct test {
|
|
|
std::to_string(flash_attn),
|
|
|
tensor_split_str,
|
|
|
tensor_buft_overrides_str,
|
|
|
+ std::to_string(defrag_thold),
|
|
|
std::to_string(use_mmap),
|
|
|
std::to_string(embeddings),
|
|
|
std::to_string(no_op_offload),
|
|
|
@@ -1558,6 +1588,9 @@ struct markdown_printer : public printer {
|
|
|
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
|
|
fields.emplace_back("type_v");
|
|
|
}
|
|
|
+ if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
|
|
|
+ fields.emplace_back("defrag_thold");
|
|
|
+ }
|
|
|
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
|
|
fields.emplace_back("main_gpu");
|
|
|
}
|