|
@@ -20,6 +20,7 @@
|
|
|
#include "llama.h"
|
|
#include "llama.h"
|
|
|
#include "common.h"
|
|
#include "common.h"
|
|
|
#include "ggml-cuda.h"
|
|
#include "ggml-cuda.h"
|
|
|
|
|
+#include "ggml-sycl.h"
|
|
|
|
|
|
|
|
// utils
|
|
// utils
|
|
|
static uint64_t get_time_ns() {
|
|
static uint64_t get_time_ns() {
|
|
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
|
|
|
id += "/";
|
|
id += "/";
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
+#endif
|
|
|
|
|
+#ifdef GGML_USE_SYCL
|
|
|
|
|
+ int device_list[GGML_SYCL_MAX_DEVICES];
|
|
|
|
|
+ ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
|
|
|
|
|
+
|
|
|
|
|
+ for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
|
|
|
|
|
+ if (device_list[i] >0 ){
|
|
|
|
|
+ char buf[128];
|
|
|
|
|
+ ggml_sycl_get_device_description(i, buf, sizeof(buf));
|
|
|
|
|
+ id += buf;
|
|
|
|
|
+ id += "/";
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ if (id.length() >2 ) {
|
|
|
|
|
+ id.pop_back();
|
|
|
|
|
+ }
|
|
|
#endif
|
|
#endif
|
|
|
// TODO: other backends
|
|
// TODO: other backends
|
|
|
return id;
|
|
return id;
|
|
@@ -161,6 +178,7 @@ struct cmd_params {
|
|
|
std::vector<bool> no_kv_offload;
|
|
std::vector<bool> no_kv_offload;
|
|
|
std::vector<bool> mul_mat_q;
|
|
std::vector<bool> mul_mat_q;
|
|
|
std::vector<std::vector<float>> tensor_split;
|
|
std::vector<std::vector<float>> tensor_split;
|
|
|
|
|
+ std::vector<bool> use_mmap;
|
|
|
int reps;
|
|
int reps;
|
|
|
bool verbose;
|
|
bool verbose;
|
|
|
output_formats output_format;
|
|
output_formats output_format;
|
|
@@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
|
|
|
/* no_kv_offload */ {false},
|
|
/* no_kv_offload */ {false},
|
|
|
/* mul_mat_q */ {true},
|
|
/* mul_mat_q */ {true},
|
|
|
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
|
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
|
|
|
|
|
+ /* use_mmap */ {true},
|
|
|
/* reps */ 5,
|
|
/* reps */ 5,
|
|
|
/* verbose */ false,
|
|
/* verbose */ false,
|
|
|
/* output_format */ MARKDOWN
|
|
/* output_format */ MARKDOWN
|
|
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
|
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
|
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
|
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
|
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
|
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
|
|
|
|
+ printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
|
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
|
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
|
|
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
|
printf(" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n");
|
|
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
|
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
}
|
|
}
|
|
|
auto p = split<bool>(argv[i], split_delim);
|
|
auto p = split<bool>(argv[i], split_delim);
|
|
|
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
|
|
params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
|
|
|
|
|
+ } else if (arg == "-mmp" || arg == "--mmap") {
|
|
|
|
|
+ if (++i >= argc) {
|
|
|
|
|
+ invalid_param = true;
|
|
|
|
|
+ break;
|
|
|
|
|
+ }
|
|
|
|
|
+ auto p = split<bool>(argv[i], split_delim);
|
|
|
|
|
+ params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
|
|
} else if (arg == "-ts" || arg == "--tensor-split") {
|
|
} else if (arg == "-ts" || arg == "--tensor-split") {
|
|
|
if (++i >= argc) {
|
|
if (++i >= argc) {
|
|
|
invalid_param = true;
|
|
invalid_param = true;
|
|
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
|
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
|
|
|
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
|
if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
|
|
|
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
|
if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
|
|
|
|
|
+ if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
|
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
|
|
|
|
|
|
|
return params;
|
|
return params;
|
|
@@ -460,6 +488,7 @@ struct cmd_params_instance {
|
|
|
bool no_kv_offload;
|
|
bool no_kv_offload;
|
|
|
bool mul_mat_q;
|
|
bool mul_mat_q;
|
|
|
std::vector<float> tensor_split;
|
|
std::vector<float> tensor_split;
|
|
|
|
|
+ bool use_mmap;
|
|
|
|
|
|
|
|
llama_model_params to_llama_mparams() const {
|
|
llama_model_params to_llama_mparams() const {
|
|
|
llama_model_params mparams = llama_model_default_params();
|
|
llama_model_params mparams = llama_model_default_params();
|
|
@@ -468,6 +497,7 @@ struct cmd_params_instance {
|
|
|
mparams.split_mode = split_mode;
|
|
mparams.split_mode = split_mode;
|
|
|
mparams.main_gpu = main_gpu;
|
|
mparams.main_gpu = main_gpu;
|
|
|
mparams.tensor_split = tensor_split.data();
|
|
mparams.tensor_split = tensor_split.data();
|
|
|
|
|
+ mparams.use_mmap = use_mmap;
|
|
|
|
|
|
|
|
return mparams;
|
|
return mparams;
|
|
|
}
|
|
}
|
|
@@ -477,6 +507,7 @@ struct cmd_params_instance {
|
|
|
n_gpu_layers == other.n_gpu_layers &&
|
|
n_gpu_layers == other.n_gpu_layers &&
|
|
|
split_mode == other.split_mode &&
|
|
split_mode == other.split_mode &&
|
|
|
main_gpu == other.main_gpu &&
|
|
main_gpu == other.main_gpu &&
|
|
|
|
|
+ use_mmap == other.use_mmap &&
|
|
|
tensor_split == other.tensor_split;
|
|
tensor_split == other.tensor_split;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
for (const auto & sm : params.split_mode)
|
|
for (const auto & sm : params.split_mode)
|
|
|
for (const auto & mg : params.main_gpu)
|
|
for (const auto & mg : params.main_gpu)
|
|
|
for (const auto & ts : params.tensor_split)
|
|
for (const auto & ts : params.tensor_split)
|
|
|
|
|
+ for (const auto & mmp : params.use_mmap)
|
|
|
for (const auto & nb : params.n_batch)
|
|
for (const auto & nb : params.n_batch)
|
|
|
for (const auto & tk : params.type_k)
|
|
for (const auto & tk : params.type_k)
|
|
|
for (const auto & tv : params.type_v)
|
|
for (const auto & tv : params.type_v)
|
|
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
/* .no_kv_offload= */ nkvo,
|
|
/* .no_kv_offload= */ nkvo,
|
|
|
/* .mul_mat_q = */ mmq,
|
|
/* .mul_mat_q = */ mmq,
|
|
|
/* .tensor_split = */ ts,
|
|
/* .tensor_split = */ ts,
|
|
|
|
|
+ /* .use_mmap = */ mmp,
|
|
|
};
|
|
};
|
|
|
instances.push_back(instance);
|
|
instances.push_back(instance);
|
|
|
}
|
|
}
|
|
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
/* .no_kv_offload= */ nkvo,
|
|
/* .no_kv_offload= */ nkvo,
|
|
|
/* .mul_mat_q = */ mmq,
|
|
/* .mul_mat_q = */ mmq,
|
|
|
/* .tensor_split = */ ts,
|
|
/* .tensor_split = */ ts,
|
|
|
|
|
+ /* .use_mmap = */ mmp,
|
|
|
};
|
|
};
|
|
|
instances.push_back(instance);
|
|
instances.push_back(instance);
|
|
|
}
|
|
}
|
|
@@ -565,6 +599,7 @@ struct test {
|
|
|
static const bool vulkan;
|
|
static const bool vulkan;
|
|
|
static const bool kompute;
|
|
static const bool kompute;
|
|
|
static const bool metal;
|
|
static const bool metal;
|
|
|
|
|
+ static const bool sycl;
|
|
|
static const bool gpu_blas;
|
|
static const bool gpu_blas;
|
|
|
static const bool blas;
|
|
static const bool blas;
|
|
|
static const std::string cpu_info;
|
|
static const std::string cpu_info;
|
|
@@ -583,6 +618,7 @@ struct test {
|
|
|
bool no_kv_offload;
|
|
bool no_kv_offload;
|
|
|
bool mul_mat_q;
|
|
bool mul_mat_q;
|
|
|
std::vector<float> tensor_split;
|
|
std::vector<float> tensor_split;
|
|
|
|
|
+ bool use_mmap;
|
|
|
int n_prompt;
|
|
int n_prompt;
|
|
|
int n_gen;
|
|
int n_gen;
|
|
|
std::string test_time;
|
|
std::string test_time;
|
|
@@ -605,6 +641,7 @@ struct test {
|
|
|
no_kv_offload = inst.no_kv_offload;
|
|
no_kv_offload = inst.no_kv_offload;
|
|
|
mul_mat_q = inst.mul_mat_q;
|
|
mul_mat_q = inst.mul_mat_q;
|
|
|
tensor_split = inst.tensor_split;
|
|
tensor_split = inst.tensor_split;
|
|
|
|
|
+ use_mmap = inst.use_mmap;
|
|
|
n_prompt = inst.n_prompt;
|
|
n_prompt = inst.n_prompt;
|
|
|
n_gen = inst.n_gen;
|
|
n_gen = inst.n_gen;
|
|
|
// RFC 3339 date-time format
|
|
// RFC 3339 date-time format
|
|
@@ -654,25 +691,29 @@ struct test {
|
|
|
if (metal) {
|
|
if (metal) {
|
|
|
return "Metal";
|
|
return "Metal";
|
|
|
}
|
|
}
|
|
|
|
|
+ if (sycl) {
|
|
|
|
|
+ return GGML_SYCL_NAME;
|
|
|
|
|
+ }
|
|
|
if (gpu_blas) {
|
|
if (gpu_blas) {
|
|
|
return "GPU BLAS";
|
|
return "GPU BLAS";
|
|
|
}
|
|
}
|
|
|
if (blas) {
|
|
if (blas) {
|
|
|
return "BLAS";
|
|
return "BLAS";
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
return "CPU";
|
|
return "CPU";
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
static const std::vector<std::string> & get_fields() {
|
|
static const std::vector<std::string> & get_fields() {
|
|
|
static const std::vector<std::string> fields = {
|
|
static const std::vector<std::string> fields = {
|
|
|
"build_commit", "build_number",
|
|
"build_commit", "build_number",
|
|
|
- "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
|
|
|
|
|
|
|
+ "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
|
|
|
"cpu_info", "gpu_info",
|
|
"cpu_info", "gpu_info",
|
|
|
"model_filename", "model_type", "model_size", "model_n_params",
|
|
"model_filename", "model_type", "model_size", "model_n_params",
|
|
|
"n_batch", "n_threads", "type_k", "type_v",
|
|
"n_batch", "n_threads", "type_k", "type_v",
|
|
|
"n_gpu_layers", "split_mode",
|
|
"n_gpu_layers", "split_mode",
|
|
|
"main_gpu", "no_kv_offload",
|
|
"main_gpu", "no_kv_offload",
|
|
|
- "mul_mat_q", "tensor_split",
|
|
|
|
|
|
|
+ "mul_mat_q", "tensor_split", "use_mmap",
|
|
|
"n_prompt", "n_gen", "test_time",
|
|
"n_prompt", "n_gen", "test_time",
|
|
|
"avg_ns", "stddev_ns",
|
|
"avg_ns", "stddev_ns",
|
|
|
"avg_ts", "stddev_ts"
|
|
"avg_ts", "stddev_ts"
|
|
@@ -691,8 +732,8 @@ struct test {
|
|
|
return INT;
|
|
return INT;
|
|
|
}
|
|
}
|
|
|
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
|
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
|
|
- field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
|
|
|
|
|
- field == "mul_mat_q") {
|
|
|
|
|
|
|
+ field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
|
|
|
|
+ field == "mul_mat_q" || field == "use_mmap") {
|
|
|
return BOOL;
|
|
return BOOL;
|
|
|
}
|
|
}
|
|
|
if (field == "avg_ts" || field == "stddev_ts") {
|
|
if (field == "avg_ts" || field == "stddev_ts") {
|
|
@@ -720,13 +761,13 @@ struct test {
|
|
|
std::vector<std::string> values = {
|
|
std::vector<std::string> values = {
|
|
|
build_commit, std::to_string(build_number),
|
|
build_commit, std::to_string(build_number),
|
|
|
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
|
|
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
|
|
|
- std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
|
|
|
|
|
|
+ std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
|
|
|
cpu_info, gpu_info,
|
|
cpu_info, gpu_info,
|
|
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
|
|
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
|
std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
|
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
|
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
|
|
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
|
std::to_string(main_gpu), std::to_string(no_kv_offload),
|
|
|
- std::to_string(mul_mat_q), tensor_split_str,
|
|
|
|
|
|
|
+ std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
|
|
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
|
|
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
|
std::to_string(avg_ns()), std::to_string(stdev_ns()),
|
|
|
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
|
std::to_string(avg_ts()), std::to_string(stdev_ts())
|
|
@@ -753,6 +794,7 @@ const bool test::kompute = !!ggml_cpu_has_kompute();
|
|
|
const bool test::metal = !!ggml_cpu_has_metal();
|
|
const bool test::metal = !!ggml_cpu_has_metal();
|
|
|
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
|
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
|
|
|
const bool test::blas = !!ggml_cpu_has_blas();
|
|
const bool test::blas = !!ggml_cpu_has_blas();
|
|
|
|
|
+const bool test::sycl = !!ggml_cpu_has_sycl();
|
|
|
const std::string test::cpu_info = get_cpu_info();
|
|
const std::string test::cpu_info = get_cpu_info();
|
|
|
const std::string test::gpu_info = get_gpu_info();
|
|
const std::string test::gpu_info = get_gpu_info();
|
|
|
|
|
|
|
@@ -895,6 +937,9 @@ struct markdown_printer : public printer {
|
|
|
if (field == "no_kv_offload") {
|
|
if (field == "no_kv_offload") {
|
|
|
return "nkvo";
|
|
return "nkvo";
|
|
|
}
|
|
}
|
|
|
|
|
+ if (field == "use_mmap") {
|
|
|
|
|
+ return "mmap";
|
|
|
|
|
+ }
|
|
|
if (field == "tensor_split") {
|
|
if (field == "tensor_split") {
|
|
|
return "ts";
|
|
return "ts";
|
|
|
}
|
|
}
|
|
@@ -938,6 +983,9 @@ struct markdown_printer : public printer {
|
|
|
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
|
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
|
|
fields.push_back("tensor_split");
|
|
fields.push_back("tensor_split");
|
|
|
}
|
|
}
|
|
|
|
|
+ if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
|
|
|
|
+ fields.push_back("use_mmap");
|
|
|
|
|
+ }
|
|
|
fields.push_back("test");
|
|
fields.push_back("test");
|
|
|
fields.push_back("t/s");
|
|
fields.push_back("t/s");
|
|
|
|
|
|