il y a 1 an · 128dcbd3c9
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -405,7 +405,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
															   llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
														
 
															-  Solution: add **--no-mmap**.
														
 
															+  Solution: add **--no-mmap** or **--mmap 0**.
														
 
															 ## Q&A
														
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -20,6 +20,7 @@
 
															 #include "llama.h"
														
 
															 #include "common.h"
														
 
															 #include "ggml-cuda.h"
														
 
															+#include "ggml-sycl.h"
														
 
															 // utils
														
 
															 static uint64_t get_time_ns() {
														
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
 
															             id += "/";
														
 
															         }
														
 
															     }
														
 
															+#endif
														
 
															+#ifdef GGML_USE_SYCL
														
 
															+    int device_list[GGML_SYCL_MAX_DEVICES];
														
 
															+    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
														
 
															+
														
 
															+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
														
 
															+        if (device_list[i] >0 ){
														
 
															+            char buf[128];
														
 
															+            ggml_sycl_get_device_description(i, buf, sizeof(buf));
														
 
															+            id += buf;
														
 
															+            id += "/";
														
 
															+        }
														
 
															+    }
														
 
															+    if (id.length() >2 ) {
														
 
															+        id.pop_back();
														
 
															+    }
														
 
															 #endif
														
 
															     // TODO: other backends
														
 
															     return id;
														
@@ -161,6 +178,7 @@ struct cmd_params {
 
															     std::vector<bool> no_kv_offload;
														
 
															     std::vector<bool> mul_mat_q;
														
 
															     std::vector<std::vector<float>> tensor_split;
														
 
															+    std::vector<bool> use_mmap;
														
 
															     int reps;
														
 
															     bool verbose;
														
 
															     output_formats output_format;
														
@@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
 
															     /* no_kv_offload */ {false},
														
 
															     /* mul_mat_q     */ {true},
														
 
															     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
														
 
															+    /* use_mmap      */ {true},
														
 
															     /* reps          */ 5,
														
 
															     /* verbose       */ false,
														
 
															     /* output_format */ MARKDOWN
														
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
 
															     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
														
 
															     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
														
 
															     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
														
 
															+    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
														
 
															     printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
														
 
															     printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
														
 
															     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
														
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
															             }
														
 
															             auto p = split<bool>(argv[i], split_delim);
														
 
															             params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
														
 
															+        } else if (arg == "-mmp" || arg == "--mmap") {
														
 
															+            if (++i >= argc) {
														
 
															+                invalid_param = true;
														
 
															+                break;
														
 
															+            }
														
 
															+            auto p = split<bool>(argv[i], split_delim);
														
 
															+            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
														
 
															         } else if (arg == "-ts" || arg == "--tensor-split") {
														
 
															             if (++i >= argc) {
														
 
															                 invalid_param = true;
														
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
 
															     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
														
 
															     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
														
 
															     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
														
 
															+    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
														
 
															     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
														
 
															     return params;
														
@@ -460,6 +488,7 @@ struct cmd_params_instance {
 
															     bool no_kv_offload;
														
 
															     bool mul_mat_q;
														
 
															     std::vector<float> tensor_split;
														
 
															+    bool use_mmap;
														
 
															     llama_model_params to_llama_mparams() const {
														
 
															         llama_model_params mparams = llama_model_default_params();
														
@@ -468,6 +497,7 @@ struct cmd_params_instance {
 
															         mparams.split_mode = split_mode;
														
 
															         mparams.main_gpu = main_gpu;
														
 
															         mparams.tensor_split = tensor_split.data();
														
 
															+        mparams.use_mmap = use_mmap;
														
 
															         return mparams;
														
 
															     }
														
@@ -477,6 +507,7 @@ struct cmd_params_instance {
 
															                n_gpu_layers == other.n_gpu_layers &&
														
 
															                split_mode == other.split_mode &&
														
 
															                main_gpu == other.main_gpu &&
														
 
															+               use_mmap == other.use_mmap &&
														
 
															                tensor_split == other.tensor_split;
														
 
															     }
														
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															     for (const auto & sm : params.split_mode)
														
 
															     for (const auto & mg : params.main_gpu)
														
 
															     for (const auto & ts : params.tensor_split)
														
 
															+    for (const auto & mmp : params.use_mmap)
														
 
															     for (const auto & nb : params.n_batch)
														
 
															     for (const auto & tk : params.type_k)
														
 
															     for (const auto & tv : params.type_v)
														
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															                 /* .no_kv_offload= */ nkvo,
														
 
															                 /* .mul_mat_q    = */ mmq,
														
 
															                 /* .tensor_split = */ ts,
														
 
															+                /* .use_mmap     = */ mmp,
														
 
															             };
														
 
															             instances.push_back(instance);
														
 
															         }
														
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 
															                 /* .no_kv_offload= */ nkvo,
														
 
															                 /* .mul_mat_q    = */ mmq,
														
 
															                 /* .tensor_split = */ ts,
														
 
															+                /* .use_mmap     = */ mmp,
														
 
															             };
														
 
															             instances.push_back(instance);
														
 
															         }
														
@@ -565,6 +599,7 @@ struct test {
 
															     static const bool vulkan;
														
 
															     static const bool kompute;
														
 
															     static const bool metal;
														
 
															+    static const bool sycl;
														
 
															     static const bool gpu_blas;
														
 
															     static const bool blas;
														
 
															     static const std::string cpu_info;
														
@@ -583,6 +618,7 @@ struct test {
 
															     bool no_kv_offload;
														
 
															     bool mul_mat_q;
														
 
															     std::vector<float> tensor_split;
														
 
															+    bool use_mmap;
														
 
															     int n_prompt;
														
 
															     int n_gen;
														
 
															     std::string test_time;
														
@@ -605,6 +641,7 @@ struct test {
 
															         no_kv_offload = inst.no_kv_offload;
														
 
															         mul_mat_q = inst.mul_mat_q;
														
 
															         tensor_split = inst.tensor_split;
														
 
															+        use_mmap = inst.use_mmap;
														
 
															         n_prompt = inst.n_prompt;
														
 
															         n_gen = inst.n_gen;
														
 
															         // RFC 3339 date-time format
														
@@ -654,25 +691,29 @@ struct test {
 
															         if (metal) {
														
 
															             return "Metal";
														
 
															         }
														
 
															+        if (sycl) {
														
 
															+            return GGML_SYCL_NAME;
														
 
															+        }
														
 
															         if (gpu_blas) {
														
 
															             return "GPU BLAS";
														
 
															         }
														
 
															         if (blas) {
														
 
															             return "BLAS";
														
 
															         }
														
 
															+
														
 
															         return "CPU";
														
 
															     }
														
 
															     static const std::vector<std::string> & get_fields() {
														
 
															         static const std::vector<std::string> fields = {
														
 
															             "build_commit", "build_number",
														
 
															-            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
														
 
															+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
														
 
															             "cpu_info", "gpu_info",
														
 
															             "model_filename", "model_type", "model_size", "model_n_params",
														
 
															             "n_batch", "n_threads", "type_k", "type_v",
														
 
															             "n_gpu_layers", "split_mode",
														
 
															             "main_gpu", "no_kv_offload",
														
 
															-            "mul_mat_q", "tensor_split",
														
 
															+            "mul_mat_q", "tensor_split", "use_mmap",
														
 
															             "n_prompt", "n_gen", "test_time",
														
 
															             "avg_ns", "stddev_ns",
														
 
															             "avg_ts", "stddev_ts"
														
@@ -691,8 +732,8 @@ struct test {
 
															             return INT;
														
 
															         }
														
 
															         if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
														
 
															-            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
														
 
															-            field == "mul_mat_q") {
														
 
															+            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
														
 
															+            field == "mul_mat_q" || field == "use_mmap") {
														
 
															             return BOOL;
														
 
															         }
														
 
															         if (field == "avg_ts" || field == "stddev_ts") {
														
@@ -720,13 +761,13 @@ struct test {
 
															         std::vector<std::string> values = {
														
 
															             build_commit, std::to_string(build_number),
														
 
															             std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
														
 
															-            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
														
 
															+            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
														
 
															             cpu_info, gpu_info,
														
 
															             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
														
 
															             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
														
 
															             std::to_string(n_gpu_layers), split_mode_str(split_mode),
														
 
															             std::to_string(main_gpu), std::to_string(no_kv_offload),
														
 
															-            std::to_string(mul_mat_q), tensor_split_str,
														
 
															+            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
														
 
															             std::to_string(n_prompt), std::to_string(n_gen), test_time,
														
 
															             std::to_string(avg_ns()), std::to_string(stdev_ns()),
														
 
															             std::to_string(avg_ts()), std::to_string(stdev_ts())
														
@@ -753,6 +794,7 @@ const bool        test::kompute      = !!ggml_cpu_has_kompute();
 
															 const bool        test::metal        = !!ggml_cpu_has_metal();
														
 
															 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
														
 
															 const bool        test::blas         = !!ggml_cpu_has_blas();
														
 
															+const bool        test::sycl         = !!ggml_cpu_has_sycl();
														
 
															 const std::string test::cpu_info     = get_cpu_info();
														
 
															 const std::string test::gpu_info     = get_gpu_info();
														
@@ -895,6 +937,9 @@ struct markdown_printer : public printer {
 
															         if (field == "no_kv_offload") {
														
 
															             return "nkvo";
														
 
															         }
														
 
															+        if (field == "use_mmap") {
														
 
															+            return "mmap";
														
 
															+        }
														
 
															         if (field == "tensor_split") {
														
 
															             return "ts";
														
 
															         }
														
@@ -938,6 +983,9 @@ struct markdown_printer : public printer {
 
															         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
														
 
															             fields.push_back("tensor_split");
														
 
															         }
														
 
															+        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
														
 
															+            fields.push_back("use_mmap");
														
 
															+        }
														
 
															         fields.push_back("test");
														
 
															         fields.push_back("t/s");
														
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -2928,7 +2928,6 @@ void   ggml_sycl_set_main_device(int main_device);
 
															 void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
														
 
															 void   ggml_sycl_set_scratch_size(size_t scratch_size);
														
 
															 void   ggml_sycl_free_scratch(void);
														
 
															-int    ggml_sycl_get_device_count(void);
														
 
															 void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
														
 
															 bool   ggml_backend_is_sycl(ggml_backend_t backend);
														
 
															 int    ggml_backend_sycl_get_device(ggml_backend_t backend);
														
@@ -14493,6 +14492,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
 
															     return true;
														
 
															 }
														
 
															+GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
														
 
															+    int max_compute_units = -1;
														
 
															+    for(int i=0;i<max_len;i++) id_list[i] = 0;
														
 
															+
														
 
															+    int device_count = dpct::dev_mgr::instance().device_count();
														
 
															+
														
 
															+    for(int id=0; id< device_count; id++){
														
 
															+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
														
 
															+        if (!device.is_gpu()) continue;
														
 
															+        dpct::device_info prop;
														
 
															+        dpct::get_device_info(prop, device);
														
 
															+        if(max_compute_units < prop.get_max_compute_units()) max_compute_units = prop.get_max_compute_units();
														
 
															+    }
														
 
															+
														
 
															+    for(int id=0;id< device_count;id++){
														
 
															+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
														
 
															+        if (!device.is_gpu()) continue;
														
 
															+        dpct::device_info prop;
														
 
															+        dpct::get_device_info(prop, device);
														
 
															+        if(max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1 ){
														
 
															+            id_list[id] = 1;
														
 
															+        }
														
 
															+    }
														
 
															+    return;
														
 
															+}
														
 
															+catch (sycl::exception const &exc) {
														
 
															+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
														
 
															+            << ", line:" << __LINE__ << std::endl;
														
 
															+  std::exit(1);
														
 
															+}
														
 
															+
														
 
															 int ggml_sycl_get_device_count() try {
														
 
															     int device_count;
														
 
															     if (CHECK_TRY_ERROR(device_count =
														
@@ -14507,7 +14537,7 @@ catch (sycl::exception const &exc) {
 
															   std::exit(1);
														
 
															 }
														
 
															-void ggml_sycl_get_device_description(int device, char *description,
														
 
															+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
														
 
															                                       size_t description_size) try {
														
 
															     dpct::device_info prop;
														
 
															     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
														
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -22,7 +22,8 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 
															 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
														
 
															 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
														
 
															 GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
														
 
															-
														
 
															+GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
														
 
															+GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
														
 
															 #ifdef  __cplusplus
														
 
															 }
														
 
															 #endif