@@ -180,8 +180,8 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
     return __reduce_add_sync(0xffffffff, x);
 #else
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
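
For context on what these hunks touch: on architectures at or above CC_AMPERE the integer sum uses the hardware `__reduce_add_sync` intrinsic, while the fallback loop is the classic butterfly reduction, XOR-shuffling with offsets 16, 8, 4, 2, 1 so that after five steps every lane of the 32-lane warp holds the full sum. A minimal self-contained sketch of that fallback pattern (the kernel name and harness are illustrative, not part of the patch):

#include <cstdio>

__global__ void warp_sum_demo(int *out) {
    int x = threadIdx.x;  // each lane contributes its lane index
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // lane i exchanges values with lane i ^ offset; after all five
        // steps every lane holds 0 + 1 + ... + 31 = 496
        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
    }
    if (threadIdx.x == 0) {
        *out = x;
    }
}

int main() {
    int *d_out, h_out = 0;
    cudaMalloc(&d_out, sizeof(int));
    warp_sum_demo<<<1, 32>>>(d_out);
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("warp sum = %d\n", h_out);  // expect 496
    cudaFree(d_out);
    return 0;
}

Renaming the loop variable from `mask` to `offset` presumably avoids confusion with the first argument of `__shfl_xor_sync`, which is the participating-thread mask `0xffffffff`; `offset` also matches what the value actually is, the lane-index XOR distance.
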
@@ -189,17 +189,17 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {

 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, offset, 32);
     }
     return x;
 }

 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, offset, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, offset, 32);
     }
     return a;
 }
@@ -209,16 +209,16 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {

 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, offset, 32);
         reinterpret_cast<half&>(a.x) += __low2half(a_other);
         reinterpret_cast<half&>(a.y) += __high2half(a_other);
     }
     return a;
 #else
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
     }
     return a;
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
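
The half2 variant reduces two independent half values packed into one 32-bit register. On the CUDA path, `__hadd2` adds the low halves and the high halves in a single instruction, so both packed accumulators reduce at once; the HIP path unpacks with `__low2half`/`__high2half` instead. A hedged sketch of the packed pattern (the helper name is illustrative, not from the patch; requires a GPU with native half arithmetic):

#include <cuda_fp16.h>

// Illustrative helper: butterfly-reduces the two packed halves of
// every lane's half2 across a 32-lane warp in a single pass.
static __device__ half2 warp_sum_half2_demo(half2 a) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // __hadd2 adds low-to-low and high-to-high, so the two packed
        // sums are computed simultaneously in one instruction
        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, offset, 32));
    }
    return a;
}
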
@@ -231,8 +231,8 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {

 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
     }
     return x;
 }
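
Since the whole patch is a mechanical rename, a quick smoke test of the max variant confirms the behavior is unchanged: with each lane contributing its lane index, every lane should end up holding 31. A minimal harness (all names illustrative, not part of ggml):

#include <cstdio>

__global__ void warp_max_demo(float *out) {
    float x = (float) threadIdx.x;  // lane index as the candidate value
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // same butterfly as the sum, with fmaxf as the combine op
        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
    }
    if (threadIdx.x == 0) {
        *out = x;  // expect 31.0f
    }
}

int main() {
    float *d_out, h_out = 0.0f;
    cudaMalloc(&d_out, sizeof(float));
    warp_max_demo<<<1, 32>>>(d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp max = %.1f\n", h_out);
    cudaFree(d_out);
    return 0;
}
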
@@ -275,8 +275,8 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
 static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 #pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, offset, 32));
     }
     return x;
 #else