@@ -1355,7 +1355,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
 #elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type();
+    buft = ggml_backend_vk_buffer_type(gpu);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
@@ -1392,6 +1392,33 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     GGML_UNUSED(tensor_split);
 }

+static size_t llama_get_device_count() {
+#if defined(GGML_USE_CUBLAS)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+}
+
+static size_t llama_get_device_memory(int device) {
+#if defined(GGML_USE_CUBLAS)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &total, &free);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &total, &free);
+    return free;
+#else
+    return 1;
+    GGML_UNUSED(device);
+#endif
+}
+
 //
 // globals
 //
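Note: the two static helpers added above give llm_load_tensors a backend-agnostic way to ask how many devices exist and how much memory is free on each, instead of calling the CUDA API directly. A minimal sketch of the intended call pattern inside llama.cpp; the loop and the log line are illustrative only, not part of the patch:

    for (int i = 0; i < (int) llama_get_device_count(); ++i) {
        // free bytes on device i (falls back to 1 on CPU-only builds)
        size_t free_mem = llama_get_device_memory(i);
        LLAMA_LOG_INFO("%s: device %d has %zu bytes free\n", __func__, i, free_mem);
    }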
@@ -1763,6 +1790,10 @@ struct llama_context {
             ggml_backend_free(backend);
         }

+#ifdef GGML_USE_VULKAN
+        ggml_vk_free_cpu_assist();
+#endif
+
         ggml_backend_buffer_free(buf_input);
         ggml_free(ctx_input);
     }
@@ -3436,22 +3467,18 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }

-#ifdef GGML_USE_CUBLAS
     if (split_mode == LLAMA_SPLIT_LAYER) {
         // calculate the split points
-        int device_count = ggml_backend_cuda_get_device_count();
+        int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-        float splits[GGML_CUDA_MAX_DEVICES];
+        std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                size_t total;
-                size_t free;
-                ggml_backend_cuda_get_device_memory(i, &total, &free);
-                splits[i] = free;
+                splits[i] = llama_get_device_memory(i);
             }
         } else {
-            std::copy(tensor_split, tensor_split + device_count, splits);
+            std::copy(tensor_split, tensor_split + device_count, splits.begin());
         }

         // sum and normalize the splits to get the split points
@@ -3467,19 +3494,17 @@ static bool llm_load_tensors(
         // assign the repeating layers to the devices according to the splits
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
-            int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
+            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
             model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
-            int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
+            int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
             model.buft_output = llama_default_buffer_type_offload(layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
-    } else
-#endif
-    {
+    } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
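To see what the normalized splits and the std::upper_bound calls above compute, here is a small self-contained sketch. The 8 GiB / 24 GiB figures are hypothetical, and the cumulative-sum normalization mirrors the "sum and normalize" step referenced by the context comment between the two hunks:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical free memory per device: 8 GiB on device 0, 24 GiB on device 1
        std::vector<float> splits = { 8.0f, 24.0f };
        const int device_count = (int) splits.size();

        // cumulative sum, then normalize: splits becomes { 0.25f, 1.0f }
        float split_sum = 0.0f;
        for (int i = 0; i < device_count; ++i) {
            split_sum += splits[i];
            splits[i]  = split_sum;
        }
        for (int i = 0; i < device_count; ++i) {
            splits[i] /= split_sum;
        }

        // each offloaded layer goes to the first device whose split point exceeds
        // the layer's fractional position, as the std::upper_bound calls above do;
        // with 32 layers this puts layers 0-7 on device 0 and 8-31 on device 1
        const int act_gpu_layers = 32;
        for (int i = 0; i < act_gpu_layers; ++i) {
            int layer_gpu = (int) (std::upper_bound(splits.begin(), splits.begin() + device_count,
                                                    float(i) / act_gpu_layers) - splits.begin());
            std::printf("layer %2d -> device %d\n", i, layer_gpu);
        }
        return 0;
    }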
@@ -10483,6 +10508,8 @@ size_t llama_max_devices(void) {
     return GGML_CUDA_MAX_DEVICES;
 #elif defined(GGML_USE_SYCL)
     return GGML_SYCL_MAX_DEVICES;
+#elif defined(GGML_USE_VULKAN)
+    return GGML_VK_MAX_DEVICES;
 #else
     return 1;
 #endif
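On the caller side, llama_max_devices() is the size callers are expected to use for the tensor_split array in llama_model_params, so with the Vulkan branch added above a buffer sized this way keeps working regardless of which backend the library was built with. A hypothetical fragment (assumes "llama.h" and <vector>; the 1.0f split value is illustrative only):

    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    tensor_split[0] = 1.0f; // put the whole model on the first device

    llama_model_params mparams = llama_model_default_params();
    mparams.tensor_split = tensor_split.data();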
@@ -10690,13 +10717,15 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_VULKAN)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_vk_init();
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
-                llama_free(ctx);
-                return nullptr;
+            for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
+                ggml_backend_t backend = ggml_backend_vk_init(device);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {