Support multiple GPUs (split mode) on SYCL backend (#5806)

* support multiple cards: split-mode - layer|row

* rm warning

* rebase with master, support two new OPs, close feature for -sm=row, fix for unit test

* update news

* fix merge error

* update according to review comments
Neo Zhang Jianyu, 1 year ago
commit 715641391d
8 changed files with 547 additions and 241 deletions
  1. README-sycl.md (+21, -0)
  2. common/common.cpp (+4, -0)
  3. examples/llama-bench/llama-bench.cpp (+6, -11)
  4. examples/sycl/ls-sycl-device.cpp (+1, -1)
  5. examples/sycl/run-llama2.sh (+12, -5)
  6. ggml-sycl.cpp (+457, -216)
  7. ggml-sycl.h (+5, -0)
  8. llama.cpp (+41, -8)

+ 21 - 0
README-sycl.md

@@ -1,6 +1,7 @@
 # llama.cpp for SYCL
 
 - [Background](#background)
+- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -25,6 +26,21 @@ The llama.cpp for SYCL is used to support Intel GPUs.
 
 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 
+## News
+
+- 2024.3
+  - Support multiple cards: **--split-mode**: [none|layer]; [row] is not supported yet and is under development.
+  - Support assigning the main GPU with **--main-gpu**, replacing $GGML_SYCL_DEVICE.
+  - Support detecting all Level Zero GPUs with the same top **Max compute units**.
+  - Support OPs:
+    - hardsigmoid
+    - hardswish
+    - pool2d
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build.
+
 ## OS
 
 |OS|Status|Verified|
@@ -449,6 +465,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN|0 (default) or 1|Support getting the free memory of the GPU via sycl::aspect::ext_intel_free_memory.<br>Recommended when --split-mode = layer|
 
 ## Known Issue
 
@@ -458,6 +475,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
   Solution: add **--no-mmap** or **--mmap 0**.
 
+- Split-mode [row] is not supported
+
+  It is still under development.
+
 ## Q&A
 
 - Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
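The ZES_ENABLE_SYSMAN entry above refers to querying free GPU memory. As a quick, hedged illustration (not part of this commit), the following C++ sketch reports free and total memory per SYCL device using the helpers this commit adds to ggml-sycl.h; the parameter order follows the header declaration shown further below, and setting ZES_ENABLE_SYSMAN=1 in the environment is an assumption taken from the table above.

// Sketch: report free/total memory for each SYCL device.
// Assumes a SYCL build and ZES_ENABLE_SYSMAN=1 in the environment so the
// free-memory value can be obtained via sycl::aspect::ext_intel_free_memory.
#include <cstdio>
#include "ggml-sycl.h"

int main() {
    int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        size_t free = 0, total = 0;
        // parameter order as declared in ggml-sycl.h: (device, &free, &total)
        ggml_backend_sycl_get_device_memory(i, &free, &total);
        printf("device %d: free %zu MiB, total %zu MiB\n",
               i, free / (1024 * 1024), total / (1024 * 1024));
    }
    return 0;
}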

+ 4 - 0
common/common.cpp

@@ -640,6 +640,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
+#ifdef GGML_USE_SYCL
+                fprintf(stderr, "warning: The split mode value [row] is not supported by llama.cpp with SYCL yet. It is under development.\nExit!\n");
+                exit(1);
+#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 invalid_param = true;

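For readers skimming the diff, here is a hedged, standalone C++ sketch of the same --split-mode handling; the parse_split_mode helper and the local enum are illustrative stand-ins for the gpt_params parsing and the LLAMA_SPLIT_MODE_* values used above, not the actual parser.

// Illustrative sketch of the --split-mode parsing pattern above.
#include <cstdio>
#include <cstdlib>
#include <string>

enum split_mode { SPLIT_NONE, SPLIT_LAYER, SPLIT_ROW };  // stand-in for LLAMA_SPLIT_MODE_*

static bool parse_split_mode(const std::string & arg, split_mode & out) {
    if (arg == "none") {
        out = SPLIT_NONE;
    } else if (arg == "layer") {
        out = SPLIT_LAYER;
    } else if (arg == "row") {
#ifdef GGML_USE_SYCL
        // the SYCL backend does not support row split yet, so fail early
        fprintf(stderr, "warning: split mode [row] is not supported by the SYCL backend yet\n");
        exit(1);
#endif
        out = SPLIT_ROW;
    } else {
        return false;  // unknown value, caller treats it as an invalid parameter
    }
    return true;
}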
+ 6 - 11
examples/llama-bench/llama-bench.cpp

@@ -123,20 +123,15 @@ static std::string get_gpu_info() {
     }
 #endif
 #ifdef GGML_USE_SYCL
-    int device_list[GGML_SYCL_MAX_DEVICES];
-    ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES);
-
-    for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
-        if (device_list[i] >0 ){
-            char buf[128];
-            ggml_sycl_get_device_description(i, buf, sizeof(buf));
-            id += buf;
+    int count = ggml_backend_sycl_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_sycl_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
             id += "/";
         }
     }
-    if (id.length() >2 ) {
-        id.pop_back();
-    }
 #endif
     // TODO: other backends
     return id;
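As a self-contained illustration of the new enumeration path (assuming a SYCL build and ggml-sycl.h on the include path), the sketch below builds the same "/"-separated device string from the two helpers used above, ggml_backend_sycl_get_device_count and ggml_sycl_get_device_description:

// Sketch: join SYCL device descriptions with "/" as get_gpu_info() does above.
#include <string>
#include "ggml-sycl.h"

static std::string sycl_gpu_info() {
    std::string id;
    const int count = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < count; i++) {
        char buf[128];
        ggml_sycl_get_device_description(i, buf, sizeof(buf));
        id += buf;
        if (i < count - 1) {
            id += "/";  // separator only between devices, no trailing slash
        }
    }
    return id;
}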

+ 1 - 1
examples/sycl/ls-sycl-device.cpp

@@ -7,7 +7,7 @@
 
 #include "ggml-sycl.h"
 
-int main(int argc, char ** argv) {
+int main() {
     ggml_backend_sycl_print_sycl_devices();
     return 0;
 }

+ 12 - 5
examples/sycl/run-llama2.sh

@@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
 if [ $# -gt 0 ]; then
-    export GGML_SYCL_DEVICE=$1
+    GGML_SYCL_DEVICE=$1
 else
-    export GGML_SYCL_DEVICE=0
+    GGML_SYCL_DEVICE=0
 fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+echo "use $GGML_SYCL_DEVICE as main GPU"
 #export GGML_SYCL_DEBUG=1
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+
+
+#ZES_ENABLE_SYSMAN=1: support getting the free GPU memory via sycl::aspect::ext_intel_free_memory. Recommended when --split-mode = layer.
+
+#use all GPUs with same max compute units
+ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+
+#use main GPU only
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
 

File diff suppressed because it is too large
+ 457 - 216
ggml-sycl.cpp


+ 5 - 0
ggml-sycl.h

@@ -24,6 +24,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
 GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
 GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API GGML_CALL int   ggml_backend_sycl_get_device_count();
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
+
 #ifdef  __cplusplus
 }
 #endif
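To show how these new entry points fit together, here is a hedged sketch (assumptions: a SYCL build, more than one device, equal per-device weights as a stand-in for a user-provided tensor_split, and weight normalization inside the split buffer type as in the analogous CUDA path):

// Sketch: request the SYCL split buffer type with an even per-device split.
// The equal weights are illustrative; llama.cpp passes the user-provided tensor_split.
#include <vector>
#include "ggml-sycl.h"

static ggml_backend_buffer_type_t even_sycl_split_buft() {
    const int count = ggml_backend_sycl_get_device_count();
    if (count <= 1) {
        return nullptr;  // mirror llama.cpp: the split type is only used with more than one device
    }
    std::vector<float> tensor_split(count, 1.0f);  // even weights across all devices
    return ggml_backend_sycl_split_buffer_type(tensor_split.data());
}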

+ 41 - 8
llama.cpp

@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
             ggml_set_name(ctx->inp_cls,     "inp_cls");
 
             ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
             LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
                     ggml_backend_buffer_name(ctx->buf_input),
                     ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
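The split-mode dispatch in the hunk above can be summarized in isolation. The sketch below is a hedged rewrite, not the literal llama_new_context_with_model code: split modes none and row initialize only the main GPU's backend, while layer initializes one SYCL backend per detected GPU. The enum llama_split_mode type from llama.h and the simplified error handling are assumptions.

// Sketch of the per-split-mode SYCL backend initialization above.
// NONE/ROW -> main GPU only; LAYER -> one backend per detected GPU.
#include <vector>
#include "ggml-sycl.h"
#include "llama.h"  // llama_split_mode, LLAMA_SPLIT_MODE_*

static std::vector<ggml_backend_t> init_sycl_backends(enum llama_split_mode split_mode, int main_gpu) {
    std::vector<ggml_backend_t> backends;
    if (split_mode == LLAMA_SPLIT_MODE_NONE || split_mode == LLAMA_SPLIT_MODE_ROW) {
        // only the main GPU backend is used; map its device id to a backend index first
        const int main_gpu_index = ggml_backend_sycl_get_device_index(main_gpu);
        ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
        if (backend != nullptr) {
            backends.push_back(backend);
        }
    } else {
        // LLAMA_SPLIT_MODE_LAYER: create a backend for each detected GPU
        for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
            ggml_backend_t backend = ggml_backend_sycl_init(i);
            if (backend != nullptr) {
                backends.push_back(backend);
            }
        }
    }
    return backends;
}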
