Browse Source

cuda: fix race condition in cumsum (#18448)

* ggml-cuda: fix race condition in cumsum

* remove unneccesary sync_threads
Aman Gupta 1 month ago
parent
commit
5fa66c6e67
1 changed files with 5 additions and 3 deletions
  1. 5 3
      ggml/src/ggml-cuda/cumsum.cu

+ 5 - 3
ggml/src/ggml-cuda/cumsum.cu

@@ -61,7 +61,7 @@ static __global__ void cumsum_cub_kernel(
 
         // Add offset to each item and store
         T thread_offset = thread_prefix - thread_sum + block_carry;
-        #pragma unroll
+#pragma unroll
         for (int i = 0; i < UNROLL_FACTOR; i++) {
             int64_t idx = start + tid * UNROLL_FACTOR + i;
             if (idx < ne00) {
@@ -69,11 +69,12 @@ static __global__ void cumsum_cub_kernel(
             }
         }
 
+        __syncthreads();
+
         // Update carry for next tile
         if (tid == 0) {
             block_carry += block_total;
         }
-        __syncthreads();
     }
 #else
     NO_DEVICE_CODE;
@@ -175,11 +176,12 @@ static __global__ void cumsum_kernel(
             }
         }
 
+        __syncthreads();
+
         // Update carry for next chunk
         if (tid == 0) {
             *s_carry += *s_chunk_total;
         }
-        __syncthreads();
     }
 }