@@ -3559,31 +3559,12 @@ class sycl_gpu_mgr {
     int work_group_size = 0;
     std::string gpus_list = "";
 
-    /*
-    Use all GPU with same top max compute units
-    */
     sycl_gpu_mgr() {
         detect_sycl_gpu_list_with_max_cu();
         get_allow_gpus();
         create_context_with_gpus();
     }
 
-    /*
-    Use the assigned GPU as only one
-    */
-    sycl_gpu_mgr(int main_gpu_id) {
-        sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
-        dpct::device_info prop;
-        dpct::get_device_info(prop, device);
-        gpus.push_back(main_gpu_id);
-        devices.push_back(device);
-        work_group_size = prop.get_max_work_group_size();
-        max_compute_units = prop.get_max_compute_units();
-
-        get_allow_gpus();
-        create_context_with_gpus();
-    }
-
     void create_context_with_gpus() {
         sycl::context ctx = sycl::context(devices);
         assert(gpus.size() > 0);
@@ -3599,7 +3580,7 @@ class sycl_gpu_mgr {
             gpus_list += std::to_string(gpus[i]);
             gpus_list += ",";
         }
-        if (gpus_list.length() > 1) {
+        if (gpus_list.length() > 2) {
             gpus_list.pop_back();
         }
     }
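Note: the hunk above only tightens the length guard before the trailing comma is popped from gpus_list, which is built as one "<id>," fragment per GPU (e.g. "0,1,2,"). As an aside, the post-hoc trim can be avoided entirely by emitting the separator only between elements; a minimal sketch, where join_ids is a hypothetical helper and not part of this patch:

    #include <string>
    #include <vector>

    // Builds "0,1,2" with no trailing comma to trim afterwards.
    static std::string join_ids(const std::vector<int> &ids) {
        std::string out;
        for (size_t i = 0; i < ids.size(); ++i) {
            if (i > 0) out += ",";  // separator only between elements
            out += std::to_string(ids[i]);
        }
        return out;
    }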
@@ -3648,8 +3629,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        printf("miss to get device index by id=%d\n", id);
-        GGML_ASSERT(false);
+        assert(false);
+        return -1;
     }
 
     int get_next_index(int id) {
@@ -3658,7 +3639,8 @@ class sycl_gpu_mgr {
             if (gpus[i] == id)
                 return i;
         }
-        GGML_ASSERT(false);
+        assert(false);
+        return -1;
     }
 };
 
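Note: both the get_index and get_next_index hunks replace GGML_ASSERT(false), which aborts even in release builds, with plain assert(false) plus an explicit return -1. Since the standard assert compiles to a no-op under NDEBUG, the added return keeps every control path well-formed and hands callers a sentinel value. A self-contained sketch of the resulting lookup shape, using a plain std::vector<int> in place of the manager's gpus member:

    #include <cassert>
    #include <vector>

    // Returns the position of id in gpus, or -1 if absent.
    static int get_index_of(const std::vector<int> &gpus, int id) {
        for (size_t i = 0; i < gpus.size(); ++i) {
            if (gpus[i] == id)
                return static_cast<int>(i);
        }
        assert(false);  // unknown device id; no-op when NDEBUG is set
        return -1;      // sentinel, reached only with asserts disabled
    }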
@@ -3667,7 +3649,6 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
-static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
 
@@ -13244,7 +13225,7 @@ void ggml_backend_sycl_print_sycl_devices() {
 }
 
 void print_gpu_device_list() {
-    fprintf(stderr, "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n",
+    fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
             g_sycl_gpu_mgr->get_gpu_count(),
             g_sycl_gpu_mgr->gpus_list.c_str(),
             g_sycl_gpu_mgr->max_compute_units);
@@ -13283,15 +13264,6 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
-
-/* NOT REMOVE, keep it for next optimize for XMX.
-#if defined(SYCL_USE_XMX)
-        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
-#endif
-*/
-
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                 dpct::dev_mgr::instance().device_count()) != 0) {
             initialized = true;
@@ -13300,61 +13272,68 @@ void ggml_init_sycl() try {
         }
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         ggml_backend_sycl_print_sycl_devices();
-        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-        print_gpu_device_list();
-        initialized = true;
-        g_sycl_loaded = true;
-    }
-
-    g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-    g_work_group_size = g_sycl_gpu_mgr->work_group_size;
-
-    int64_t total_vram = 0;
-
-    for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
-        g_device_caps[id].vmm = 0;
-        g_device_caps[id].device_id = -1;
-        g_device_caps[id].cc = 0;
-        g_tensor_split[id] = 0;
-        g_default_tensor_split[id] = 0;
-    }
-
-    for (int i = 0; i < g_device_count; ++i) {
-        int device_id = g_sycl_gpu_mgr->gpus[i];
-        g_device_caps[i].vmm = 0;
-
-        dpct::device_info prop;
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(device_id))));
-
-        g_default_tensor_split[i] = total_vram;
-        total_vram += prop.get_global_mem_size();
-
-        g_device_caps[i].cc =
-            100 * prop.get_major_version() + 10 * prop.get_minor_version();
-    }
-
-    for (int i = 0; i < g_device_count; ++i) {
-        g_default_tensor_split[i] /= total_vram;
-    }
-
-    for (int i = 0; i < g_device_count; ++i) {
-        SYCL_CHECK(ggml_sycl_set_device(i));
-
-        // create sycl streams
-        for (int is = 0; is < MAX_STREAMS; ++is) {
-            SYCL_CHECK(CHECK_TRY_ERROR(
-                g_syclStreams[i][is] =
-                    dpct::get_current_device().create_queue(
-                        g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
-        }
-
-        const dpct::queue_ptr stream = g_syclStreams[i][0];
-        // create sycl handle
-        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
+
+        g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+        g_work_group_size = g_sycl_gpu_mgr->work_group_size;
+
+        print_gpu_device_list();
+
+        int64_t total_vram = 0;
+
+/* NOT REMOVE, keep it for next optimize for XMX.
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
+#endif
+*/
+        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
+            g_device_caps[id].vmm = 0;
+            g_device_caps[id].device_id = -1;
+            g_device_caps[id].cc = 0;
+            g_tensor_split[id] = 0;
+            g_default_tensor_split[id] = 0;
+        }
+
+        for (int i = 0; i < g_device_count; ++i) {
+            int device_id = g_sycl_gpu_mgr->gpus[i];
+            g_device_caps[i].vmm = 0;
+
+            dpct::device_info prop;
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(device_id))));
+
+            g_default_tensor_split[i] = total_vram;
+            total_vram += prop.get_global_mem_size();
+
+            g_device_caps[i].cc =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+        }
+
+        for (int i = 0; i < g_device_count; ++i) {
+            g_default_tensor_split[i] /= total_vram;
+        }
+
+        for (int i = 0; i < g_device_count; ++i) {
+            SYCL_CHECK(ggml_sycl_set_device(i));
+
+            // create sycl streams
+            for (int is = 0; is < MAX_STREAMS; ++is) {
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    g_syclStreams[i][is] =
+                        dpct::get_current_device().create_queue(
+                            g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
+            }
+
+            const dpct::queue_ptr stream = g_syclStreams[i][0];
+            // create sycl handle
+            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
+        }
+
+        initialized = true;
+        g_sycl_loaded = true;
     }
 }
 catch (sycl::exception const &exc) {
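Note: the hunk above moves the whole device setup (capability table, tensor split, stream and handle creation) inside the if (!initialized) block and sets initialized / g_sycl_loaded only after that work completes, so repeated ggml_init_sycl() calls become no-ops. A minimal sketch of the init-once shape this converges on, with hypothetical stand-ins (init_devices, create_streams) for the real per-device work:

    #include <cstdio>

    static void init_devices()   { std::puts("fill per-device caps"); }
    static void create_streams() { std::puts("create queues and handles"); }

    void backend_init() {
        static bool initialized = false;
        if (initialized) return;  // every later call is a no-op
        init_devices();
        create_streams();
        initialized = true;       // set last, after all setup succeeded
    }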
@@ -16753,24 +16732,22 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
     /* .is_host = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
-    if (device_index>=g_device_count or device_index<0) {
-        printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
-            device_index, g_device_count-1);
-        GGML_ASSERT(device_index<g_device_count);
-    }
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    if (!g_ggml_backend_sycl_buffer_type_initialized) {
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
+
+    if (!ggml_backend_sycl_buffer_type_initialized) {
         for (int i = 0; i < g_device_count; i++) {
             ggml_backend_sycl_buffer_types[i] = {
                 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
             };
         }
-        g_ggml_backend_sycl_buffer_type_initialized = true;
+        ggml_backend_sycl_buffer_type_initialized = true;
     }
-    return &ggml_backend_sycl_buffer_types[device_index];
+
+    return &ggml_backend_sycl_buffer_types[device];
 }
 
 // sycl split buffer type
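Note: with the range check and its printf gone, the remaining change here swaps the file-scope flag g_ggml_backend_sycl_buffer_type_initialized (deleted in the @@ -3667 hunk above) for a function-local static, so the lazy-init state is visible only to ggml_backend_sycl_buffer_type itself. A sketch of that function-local registry pattern, with a toy Entry type and kMaxDevices standing in for the real ggml structures:

    #include <array>
    #include <string>

    constexpr int kMaxDevices = 16;
    struct Entry { std::string name; };

    // Builds the table on first use; device_count must be <= kMaxDevices.
    Entry *get_entry(int device, int device_count) {
        static std::array<Entry, kMaxDevices> entries;
        static bool initialized = false;  // scoped to this function only
        if (!initialized) {
            for (int i = 0; i < device_count; i++) {
                entries[i] = Entry{"SYCL" + std::to_string(i)};
            }
            initialized = true;
        }
        return &entries[device];
    }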
@@ -17519,17 +17496,6 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
     return g_sycl_gpu_mgr->get_index(device_id);
 }
 
-GGML_API GGML_CALL void ggml_backend_sycl_set_single_device(int main_gpu_id) {
-    GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
-    printf("ggml_backend_sycl_set_single_device: use single device: %d\n", main_gpu_id);
-    if (g_sycl_gpu_mgr) {
-        delete g_sycl_gpu_mgr;
-    }
-    g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
-    ggml_init_sycl();
-    g_ggml_backend_sycl_buffer_type_initialized = false;
-}
-
 extern "C" int ggml_backend_sycl_reg_devices();
 
 int ggml_backend_sycl_reg_devices() {