1 month ago · 5fa66c6e67
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -61,7 +61,7 @@ static __global__ void cumsum_cub_kernel(
 
				 
			
 
				         // Add offset to each item and store
			
 
				         T thread_offset = thread_prefix - thread_sum + block_carry;
			
 
				-        #pragma unroll
			
 
				+#pragma unroll
			
 
				         for (int i = 0; i < UNROLL_FACTOR; i++) {
			
 
				             int64_t idx = start + tid * UNROLL_FACTOR + i;
			
 
				             if (idx < ne00) {
			
@@ -69,11 +69,12 @@ static __global__ void cumsum_cub_kernel(
 
				             }
			
 
				         }
			
 
				 
			
 
				+        __syncthreads();
			
 
				+
			
 
				         // Update carry for next tile
			
 
				         if (tid == 0) {
			
 
				             block_carry += block_total;
			
 
				         }
			
 
				-        __syncthreads();
			
 
				     }
			
 
				 #else
			
 
				     NO_DEVICE_CODE;
			
@@ -175,11 +176,12 @@ static __global__ void cumsum_kernel(
 
				             }
			
 
				         }
			
 
				 
			
 
				+        __syncthreads();
			
 
				+
			
 
				         // Update carry for next chunk
			
 
				         if (tid == 0) {
			
 
				             *s_carry += *s_chunk_total;
			
 
				         }
			
 
				-        __syncthreads();
			
 
				     }
			
 
				 }