2 rokov pred · 132d25b8a6
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
 
				     int64_t  row_low[GGML_CUDA_MAX_DEVICES];
			
 
				     int64_t row_high[GGML_CUDA_MAX_DEVICES];
			
 
				 
			
 
				+    int used_devices = 0;
			
 
				+
			
 
				     for (int64_t id = 0; id < g_device_count; ++id) {
			
 
				         // by default, use all rows
			
 
				         row_low[id]  = 0;
			
@@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
 
				             continue;
			
 
				         }
			
 
				 
			
 
				+        used_devices++;
			
 
				+
			
 
				         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
			
 
				         const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
			
 
				 
			
@@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
 
				 
			
 
				     // if multiple devices are used they need to wait for the main device
			
 
				     // here an event is recorded that signals that the main device has finished calculating the input data
			
 
				-    if (split && g_device_count > 1) {
			
 
				+    if (split && used_devices > 1) {
			
 
				         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
			
 
				         CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
			
 
				     }
			
 
				 
			
 
				-    const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
			
 
				+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
			
 
				     for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
			
 
				         const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
			
 
				         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
			
@@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
 
				     }
			
 
				 
			
 
				     for (int64_t id = 0; id < g_device_count; ++id) {
			
 
				+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
			
 
				+            continue;
			
 
				+        }
			
 
				         CUDA_CHECK(ggml_cuda_set_device(id));
			
 
				 
			
 
				         // free buffers again when done
			
@@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
 
				 
			
 
				         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
			
 
				         for (int64_t id = 0; id < g_device_count; ++id) {
			
 
				+            if (row_low[id] == row_high[id]) {
			
 
				+                continue;
			
 
				+            }
			
 
				             for (int64_t is = 0; is < is_max; ++is) {
			
 
				                 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
			
 
				             }
			
@@ -7400,7 +7410,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
 
				 
			
 
				 static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
			
 
				     const bool all_on_device =
			
 
				-        (src0->backend == GGML_BACKEND_GPU) &&
			
 
				+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
			
 
				         (src1->backend == GGML_BACKEND_GPU) &&
			
 
				         ( dst->backend == GGML_BACKEND_GPU);