|
|
@@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
|
|
|
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
|
|
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
|
|
|
|
|
+ int used_devices = 0;
|
|
|
+
|
|
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
|
// by default, use all rows
|
|
|
row_low[id] = 0;
|
|
|
@@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
|
|
|
continue;
|
|
|
}
|
|
|
|
|
|
+ used_devices++;
|
|
|
+
|
|
|
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
|
|
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
|
|
|
|
|
@@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
|
|
|
|
|
|
// if multiple devices are used they need to wait for the main device
|
|
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
|
|
- if (split && g_device_count > 1) {
|
|
|
+ if (split && used_devices > 1) {
|
|
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
|
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
|
|
}
|
|
|
|
|
|
- const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
|
|
+ const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
|
|
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
|
|
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
|
|
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
|
|
@@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
|
|
|
}
|
|
|
|
|
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
|
+ if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
|
|
|
|
|
// free buffers again when done
|
|
|
@@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
|
|
|
|
|
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
|
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
|
+ if (row_low[id] == row_high[id]) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
for (int64_t is = 0; is < is_max; ++is) {
|
|
|
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
|
|
}
|
|
|
@@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|
|
|
|
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
|
const bool all_on_device =
|
|
|
- (src0->backend == GGML_BACKEND_GPU) &&
|
|
|
+ (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
|
|
(src1->backend == GGML_BACKEND_GPU) &&
|
|
|
( dst->backend == GGML_BACKEND_GPU);
|
|
|
|