@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES 8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
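// As in the CUDA branch just above, the SYCL host buffer type is now only requested when
// host_buffer is true; otherwise buft stays nullptr and the function's existing fallback
// (not shown in this hunk) returns the plain CPU buffer type instead.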
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
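// Same pattern as the CUBLAS block that ends at the #endif above: a SYCL split buffer type is
// only requested when more than one SYCL device is present; with a single device (or if the
// call yields nullptr) the existing fallback to llama_default_buffer_type_offload(fallback_gpu)
// still applies.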
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
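// With this case, llama_get_device_count() reports the actual SYCL GPU count instead of the
// fallback value in the #else branch, so per-device logic elsewhere in llama.cpp
// (default tensor splits, per-device buffers) can see every SYCL GPU.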
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
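// The SYCL branch follows the CUDA and Vulkan branches exactly: query (total, free) for the
// given device and return only the free amount, which llama.cpp uses, for example, to weight
// the default tensor split by free memory per device.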
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
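// Two id spaces are in play here: model->main_gpu is a user-facing SYCL device id that
// ggml_backend_sycl_get_device_index() maps to a backend index before init, while the
// LLAMA_SPLIT_MODE_LAYER path iterates backend indices directly and only looks up the
// device id (via ggml_sycl_get_gpu_list) for the error message. In short:
//
//   split_mode NONE/ROW -> one backend:     ggml_backend_sycl_init(get_device_index(main_gpu))
//   split_mode LAYER    -> one per device:  ggml_backend_sycl_init(i) for i in [0, device_count)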
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
             ggml_set_name(ctx->inp_cls, "inp_cls");
 
             ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
             LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__,
                     ggml_backend_buffer_name(ctx->buf_input),
                     ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);