Parcourir la source

add --no-mmap in llama-bench (#5257)

* add --no-mmap, show sycl backend

* fix conflict

* fix code format, change print for --no-mmap

* rename no_mmap to mmap; show mmap in the printer when it is not the default value

* update guide for mmap

* move position to reduce model reloads
Neo Zhang Jianyu il y a 1 an
Parent
commit
128dcbd3c9
4 fichiers modifiés avec 89 ajouts et 10 suppressions
  1. 1 1
      README-sycl.md
  2. 54 6
      examples/llama-bench/llama-bench.cpp
  3. 32 2
      ggml-sycl.cpp
  4. 2 1
      ggml-sycl.h

+ 1 - 1
README-sycl.md

@@ -405,7 +405,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
 
   llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy may behave abnormally and block.
   llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, memcpy may behave abnormally and block.
 
 
-  Solution: add **--no-mmap**.
+  Solution: add **--no-mmap** or **--mmap 0**.
 
 
 ## Q&A
 ## Q&A
 
 

+ 54 - 6
examples/llama-bench/llama-bench.cpp

@@ -20,6 +20,7 @@
 #include "llama.h"
 #include "llama.h"
 #include "common.h"
 #include "common.h"
 #include "ggml-cuda.h"
 #include "ggml-cuda.h"
+#include "ggml-sycl.h"
 
 
 // utils
 // utils
 static uint64_t get_time_ns() {
 static uint64_t get_time_ns() {
@@ -120,6 +121,22 @@ static std::string get_gpu_info() {
             id += "/";
             id += "/";
         }
         }
     }
     }
+#endif
+#ifdef GGML_USE_SYCL
+    int device_list[GGML_SYCL_MAX_DEVICES];
+    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
+
+    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
+        if (device_list[i] >0 ){
+            char buf[128];
+            ggml_sycl_get_device_description(i, buf, sizeof(buf));
+            id += buf;
+            id += "/";
+        }
+    }
+    if (id.length() >2 ) {
+        id.pop_back();
+    }
 #endif
 #endif
     // TODO: other backends
     // TODO: other backends
     return id;
     return id;
@@ -161,6 +178,7 @@ struct cmd_params {
     std::vector<bool> no_kv_offload;
     std::vector<bool> no_kv_offload;
     std::vector<bool> mul_mat_q;
     std::vector<bool> mul_mat_q;
     std::vector<std::vector<float>> tensor_split;
     std::vector<std::vector<float>> tensor_split;
+    std::vector<bool> use_mmap;
     int reps;
     int reps;
     bool verbose;
     bool verbose;
     output_formats output_format;
     output_formats output_format;
@@ -180,6 +198,7 @@ static const cmd_params cmd_params_defaults = {
     /* no_kv_offload */ {false},
     /* no_kv_offload */ {false},
     /* mul_mat_q     */ {true},
     /* mul_mat_q     */ {true},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* tensor_split  */ {std::vector<float>(llama_max_devices(), 0.0f)},
+    /* use_mmap      */ {true},
     /* reps          */ 5,
     /* reps          */ 5,
     /* verbose       */ false,
     /* verbose       */ false,
     /* output_format */ MARKDOWN
     /* output_format */ MARKDOWN
@@ -201,6 +220,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
     printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
     printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -370,6 +390,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             }
             auto p = split<bool>(argv[i], split_delim);
             auto p = split<bool>(argv[i], split_delim);
             params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
             params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+        } else if (arg == "-mmp" || arg == "--mmap") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
             if (++i >= argc) {
                 invalid_param = true;
                 invalid_param = true;
@@ -441,6 +468,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
+    if (params.use_mmap.empty())     { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
     if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
 
 
     return params;
     return params;
@@ -460,6 +488,7 @@ struct cmd_params_instance {
     bool no_kv_offload;
     bool no_kv_offload;
     bool mul_mat_q;
     bool mul_mat_q;
     std::vector<float> tensor_split;
     std::vector<float> tensor_split;
+    bool use_mmap;
 
 
     llama_model_params to_llama_mparams() const {
     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
         llama_model_params mparams = llama_model_default_params();
@@ -468,6 +497,7 @@ struct cmd_params_instance {
         mparams.split_mode = split_mode;
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
         mparams.main_gpu = main_gpu;
         mparams.tensor_split = tensor_split.data();
         mparams.tensor_split = tensor_split.data();
+        mparams.use_mmap = use_mmap;
 
 
         return mparams;
         return mparams;
     }
     }
@@ -477,6 +507,7 @@ struct cmd_params_instance {
                n_gpu_layers == other.n_gpu_layers &&
                n_gpu_layers == other.n_gpu_layers &&
                split_mode == other.split_mode &&
                split_mode == other.split_mode &&
                main_gpu == other.main_gpu &&
                main_gpu == other.main_gpu &&
+               use_mmap == other.use_mmap &&
                tensor_split == other.tensor_split;
                tensor_split == other.tensor_split;
     }
     }
 
 
@@ -503,6 +534,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & sm : params.split_mode)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
     for (const auto & mg : params.main_gpu)
     for (const auto & ts : params.tensor_split)
     for (const auto & ts : params.tensor_split)
+    for (const auto & mmp : params.use_mmap)
     for (const auto & nb : params.n_batch)
     for (const auto & nb : params.n_batch)
     for (const auto & tk : params.type_k)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & tv : params.type_v)
@@ -527,6 +559,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .no_kv_offload= */ nkvo,
                 /* .no_kv_offload= */ nkvo,
                 /* .mul_mat_q    = */ mmq,
                 /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .tensor_split = */ ts,
+                /* .use_mmap     = */ mmp,
             };
             };
             instances.push_back(instance);
             instances.push_back(instance);
         }
         }
@@ -549,6 +582,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .no_kv_offload= */ nkvo,
                 /* .no_kv_offload= */ nkvo,
                 /* .mul_mat_q    = */ mmq,
                 /* .mul_mat_q    = */ mmq,
                 /* .tensor_split = */ ts,
                 /* .tensor_split = */ ts,
+                /* .use_mmap     = */ mmp,
             };
             };
             instances.push_back(instance);
             instances.push_back(instance);
         }
         }
@@ -565,6 +599,7 @@ struct test {
     static const bool vulkan;
     static const bool vulkan;
     static const bool kompute;
     static const bool kompute;
     static const bool metal;
     static const bool metal;
+    static const bool sycl;
     static const bool gpu_blas;
     static const bool gpu_blas;
     static const bool blas;
     static const bool blas;
     static const std::string cpu_info;
     static const std::string cpu_info;
@@ -583,6 +618,7 @@ struct test {
     bool no_kv_offload;
     bool no_kv_offload;
     bool mul_mat_q;
     bool mul_mat_q;
     std::vector<float> tensor_split;
     std::vector<float> tensor_split;
+    bool use_mmap;
     int n_prompt;
     int n_prompt;
     int n_gen;
     int n_gen;
     std::string test_time;
     std::string test_time;
@@ -605,6 +641,7 @@ struct test {
         no_kv_offload = inst.no_kv_offload;
         no_kv_offload = inst.no_kv_offload;
         mul_mat_q = inst.mul_mat_q;
         mul_mat_q = inst.mul_mat_q;
         tensor_split = inst.tensor_split;
         tensor_split = inst.tensor_split;
+        use_mmap = inst.use_mmap;
         n_prompt = inst.n_prompt;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         n_gen = inst.n_gen;
         // RFC 3339 date-time format
         // RFC 3339 date-time format
@@ -654,25 +691,29 @@ struct test {
         if (metal) {
         if (metal) {
             return "Metal";
             return "Metal";
         }
         }
+        if (sycl) {
+            return GGML_SYCL_NAME;
+        }
         if (gpu_blas) {
         if (gpu_blas) {
             return "GPU BLAS";
             return "GPU BLAS";
         }
         }
         if (blas) {
         if (blas) {
             return "BLAS";
             return "BLAS";
         }
         }
+
         return "CPU";
         return "CPU";
     }
     }
 
 
     static const std::vector<std::string> & get_fields() {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
             "build_commit", "build_number",
-            "cuda", "opencl", "vulkan", "kompute", "metal", "gpu_blas", "blas",
+            "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_threads", "type_k", "type_v",
             "n_batch", "n_threads", "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload",
             "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split",
+            "mul_mat_q", "tensor_split", "use_mmap",
             "n_prompt", "n_gen", "test_time",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
             "avg_ns", "stddev_ns",
             "avg_ts", "stddev_ts"
             "avg_ts", "stddev_ts"
@@ -691,8 +732,8 @@ struct test {
             return INT;
             return INT;
         }
         }
         if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
         if (field == "cuda" || field == "opencl"  || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "f16_kv" || field == "no_kv_offload" ||
-            field == "mul_mat_q") {
+            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+            field == "mul_mat_q" || field == "use_mmap") {
             return BOOL;
             return BOOL;
         }
         }
         if (field == "avg_ts" || field == "stddev_ts") {
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -720,13 +761,13 @@ struct test {
         std::vector<std::string> values = {
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
             build_commit, std::to_string(build_number),
             std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
             std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
             std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
             std::to_string(avg_ts()), std::to_string(stdev_ts())
             std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -753,6 +794,7 @@ const bool        test::kompute      = !!ggml_cpu_has_kompute();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::metal        = !!ggml_cpu_has_metal();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
 const bool        test::blas         = !!ggml_cpu_has_blas();
+const bool        test::sycl         = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 
 
@@ -895,6 +937,9 @@ struct markdown_printer : public printer {
         if (field == "no_kv_offload") {
         if (field == "no_kv_offload") {
             return "nkvo";
             return "nkvo";
         }
         }
+        if (field == "use_mmap") {
+            return "mmap";
+        }
         if (field == "tensor_split") {
         if (field == "tensor_split") {
             return "ts";
             return "ts";
         }
         }
@@ -938,6 +983,9 @@ struct markdown_printer : public printer {
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.push_back("tensor_split");
             fields.push_back("tensor_split");
         }
         }
+        if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
+            fields.push_back("use_mmap");
+        }
         fields.push_back("test");
         fields.push_back("test");
         fields.push_back("t/s");
         fields.push_back("t/s");
 
 

+ 32 - 2
ggml-sycl.cpp

@@ -2928,7 +2928,6 @@ void   ggml_sycl_set_main_device(int main_device);
 void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
 void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
 void   ggml_sycl_set_scratch_size(size_t scratch_size);
 void   ggml_sycl_set_scratch_size(size_t scratch_size);
 void   ggml_sycl_free_scratch(void);
 void   ggml_sycl_free_scratch(void);
-int    ggml_sycl_get_device_count(void);
 void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
 void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
 bool   ggml_backend_is_sycl(ggml_backend_t backend);
 bool   ggml_backend_is_sycl(ggml_backend_t backend);
 int    ggml_backend_sycl_get_device(ggml_backend_t backend);
 int    ggml_backend_sycl_get_device(ggml_backend_t backend);
@@ -14493,6 +14492,37 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
     return true;
 }
 }
 
 
+// Fills id_list[0..max_len) with one 0/1 flag per SYCL device index.
+// A slot is set to 1 when the device at that index is a GPU whose compute-unit
+// count equals the largest count found among all GPU devices and whose
+// reported major version is 1; every other slot is left at 0.
+// NOTE(review): the major-version == 1 filter presumably targets one specific
+// driver/backend generation - confirm against the dpct device_info docs.
+//
+// id_list : caller-owned array with room for at least max_len ints.
+// max_len : number of slots in id_list; devices at indices >= max_len are
+//           never written, so the caller's buffer cannot be overrun.
+//
+// A sycl::exception is reported on stderr and ends the process with exit code 1.
+GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
+    for (int i = 0; i < max_len; i++) {
+        id_list[i] = 0;
+    }
+
+    const int device_count = dpct::dev_mgr::instance().device_count();
+
+    // Pass 1: find the largest compute-unit count among all GPU devices.
+    int max_compute_units = -1;
+    for (int id = 0; id < device_count; id++) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu()) continue;
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if (max_compute_units < prop.get_max_compute_units()) {
+            max_compute_units = prop.get_max_compute_units();
+        }
+    }
+
+    // Pass 2: flag the GPUs that match that maximum. Only indices that fit in
+    // the caller's buffer are visited; the original loop ran to device_count
+    // and could write past id_list when device_count > max_len.
+    const int limit = device_count < max_len ? device_count : max_len;
+    for (int id = 0; id < limit; id++) {
+        sycl::device device = dpct::dev_mgr::instance().get_device(id);
+        if (!device.is_gpu()) continue;
+        dpct::device_info prop;
+        dpct::get_device_info(prop, device);
+        if (max_compute_units == prop.get_max_compute_units() && prop.get_major_version() == 1) {
+            id_list[id] = 1;
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
 int ggml_sycl_get_device_count() try {
 int ggml_sycl_get_device_count() try {
     int device_count;
     int device_count;
     if (CHECK_TRY_ERROR(device_count =
     if (CHECK_TRY_ERROR(device_count =
@@ -14507,7 +14537,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
   std::exit(1);
 }
 }
 
 
-void ggml_sycl_get_device_description(int device, char *description,
+GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
                                       size_t description_size) try {
                                       size_t description_size) try {
     dpct::device_info prop;
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(

+ 2 - 1
ggml-sycl.h

@@ -22,7 +22,8 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
 GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
-
+GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
 #ifdef  __cplusplus
 #ifdef  __cplusplus
 }
 }
 #endif
 #endif