hai 5 meses · 6028bf7435
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -87,6 +87,10 @@
 
				 #define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
			
 
				 #define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)
			
 
				 
			
 
				+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
			
 
				+#    define GGML_CUDA_USE_CUB
			
 
				+#endif  // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
			
 
				+
			
 
				 #ifdef __CUDA_ARCH_LIST__
			
 
				 constexpr bool ggml_cuda_has_arch_impl(int) {
			
 
				     return false;
			
@@ -420,26 +424,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 
				 #endif // FP16_AVAILABLE
			
 
				 }
			
 
				 
			
 
				-// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
			
 
				-template<bool norm>
			
 
				-static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
			
 
				-    const int row = blockIdx.x;
			
 
				-    const int col = threadIdx.x;
			
 
				-
			
 
				-    float sum = 0.0f;
			
 
				-    for (int i = col; i < ncols; i += blockDim.x) {
			
 
				-        sum += x[row * ncols + i];
			
 
				-    }
			
 
				-
			
 
				-    sum = warp_reduce_sum(sum);
			
 
				-
			
 
				-    if (col != 0) {
			
 
				-        return;
			
 
				-    }
			
 
				-
			
 
				-    dst[row] = norm ? sum / ncols : sum;
			
 
				-}
			
 
				-
			
 
				 template<int width = WARP_SIZE>
			
 
				 static __device__ __forceinline__ int warp_reduce_all(int x) {
			
 
				 #ifdef GGML_USE_HIP
			
--- a/ggml/src/ggml-cuda/mean.cu
+++ b/ggml/src/ggml-cuda/mean.cu
@@ -1,4 +1,14 @@
 
				 #include "mean.cuh"
			
 
				+#include "reduce_rows.cuh"
			
 
				+
			
 
				+#ifdef GGML_CUDA_USE_CUB
			
 
				+#include <cub/cub.cuh>
			
 
				+using namespace cub;
			
 
				+#endif  // GGML_CUDA_USE_CUB
			
 
				+
			
 
				+template <typename T> __global__ void divide_by_count(T * result, size_t count) {
			
 
				+    *result /= static_cast<T>(count);
			
 
				+}
			
 
				 
			
 
				 void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
 
				     const ggml_tensor * src0   = dst->src[0];
			
@@ -13,7 +23,45 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
				     const int64_t ncols = src0->ne[0];
			
 
				     const int64_t nrows = ggml_nrows(src0);
			
 
				 
			
 
				-    const dim3 block_dims(WARP_SIZE, 1, 1);
			
 
				+// Special case for reducing vectors
			
 
				+#ifdef GGML_CUDA_USE_CUB
			
 
				+    cudaStreamCaptureStatus iscapturing;
			
 
				+    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
			
 
				+    if ((nrows == 1) &&
			
 
				+            // CUDA_GRAPHS_DISABLED
			
 
				+            ((ncols > 65536) &&
			
 
				+             ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
			
 
				+              ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
			
 
				+              ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
			
 
				+        // CUDA_GRAPHS ENABLED
			
 
				+        ((ncols > 32768) &&
			
 
				+         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
			
 
				+           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
			
 
				+           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
			
 
				+        // Single row - use device-wide reduction
			
 
				+        size_t           tmp_size = 0;
			
 
				+        ggml_cuda_pool & pool     = ctx.pool();
			
 
				+
			
 
				+        DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
			
 
				+
			
 
				+        ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
			
 
				+        DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
			
 
				+
			
 
				+        // Divide by ncols
			
 
				+        divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
			
 
				+        return;
			
 
				+    }
			
 
				+#endif
			
 
				+
			
 
				     const dim3 block_nums(nrows, 1, 1);
			
 
				-    reduce_rows_f32</*norm*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
			
 
				+
			
 
				+    const int id  = ggml_cuda_get_device();
			
 
				+    const int nsm = ggml_cuda_info().devices[id].nsm;
			
 
				+    if ((nrows / nsm) < 2) {
			
 
				+        const dim3 block_dims(512, 1, 1);
			
 
				+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
			
 
				+    } else {
			
 
				+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
			
 
				+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
			
 
				+    }
			
 
				 }
			
--- a/ggml/src/ggml-cuda/reduce_rows.cuh
+++ b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -0,0 +1,53 @@
 
				+#include "common.cuh"
			
 
				+
			
 
				+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
			
 
				+template <bool norm>
			
 
				+static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
			
 
				+    const int row = blockIdx.x;
			
 
				+    const int col = threadIdx.x;
			
 
				+
			
 
				+    float     sum        = 0.0f;
			
 
				+    const int num_unroll = 8;
			
 
				+    float     temp[num_unroll];
			
 
				+    float     sum_temp[num_unroll] = { 0.0f };
			
 
				+    for (int i = col; i < ncols;) {
			
 
				+        for (int j = 0; j < num_unroll; ++j) {
			
 
				+            if (i < ncols) {
			
 
				+                temp[j] = x[row * ncols + i];
			
 
				+            } else {
			
 
				+                temp[j] = 0;
			
 
				+            }
			
 
				+            i += blockDim.x;
			
 
				+        }
			
 
				+        for (int j = 0; j < num_unroll; ++j) {
			
 
				+            sum_temp[j] += temp[j];
			
 
				+        }
			
 
				+    }
			
 
				+    for (int j = 0; j < num_unroll; ++j) {
			
 
				+        sum += sum_temp[j];
			
 
				+    }
			
 
				+
			
 
				+    // sum up partial sums
			
 
				+    sum = warp_reduce_sum(sum);
			
 
				+    if (blockDim.x > WARP_SIZE) {
			
 
				+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
			
 
				+        __shared__ float s_sum[32];
			
 
				+        const int        warp_id = threadIdx.x / WARP_SIZE;
			
 
				+        const int        lane_id = threadIdx.x % WARP_SIZE;
			
 
				+        if (lane_id == 0) {
			
 
				+            s_sum[warp_id] = sum;
			
 
				+        }
			
 
				+        __syncthreads();
			
 
				+        sum = 0.0f;
			
 
				+        if (lane_id < (blockDim.x / WARP_SIZE)) {
			
 
				+            sum = s_sum[lane_id];
			
 
				+        }
			
 
				+        sum = warp_reduce_sum(sum);
			
 
				+    }
			
 
				+
			
 
				+    if (col != 0) {
			
 
				+        return;
			
 
				+    }
			
 
				+
			
 
				+    dst[row] = norm ? sum / ncols : sum;
			
 
				+}
			
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -1,19 +1,15 @@
 
				-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
			
 
				-#define USE_CUB
			
 
				-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
			
 
				+#include "sum.cuh"
			
 
				+#include "sumrows.cuh"
			
 
				 
			
 
				-#ifdef USE_CUB
			
 
				+#ifdef GGML_CUDA_USE_CUB
			
 
				 #include <cub/cub.cuh>
			
 
				 using namespace cub;
			
 
				-#endif // USE_CUB
			
 
				-
			
 
				-#include "sumrows.cuh"
			
 
				-#include "sum.cuh"
			
 
				+#endif  // GGML_CUDA_USE_CUB
			
 
				 
			
 
				 #include <cstdint>
			
 
				 
			
 
				 void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
			
 
				-#ifdef USE_CUB
			
 
				+#ifdef GGML_CUDA_USE_CUB
			
 
				     size_t tmp_size = 0;
			
 
				     DeviceReduce::Sum(nullptr,       tmp_size, x, dst, ne, stream);
			
 
				     ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
			
@@ -23,7 +19,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
 
				     // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
			
 
				     sum_rows_f32_cuda(x, dst, ne, 1, stream);
			
 
				     GGML_UNUSED(pool);
			
 
				-#endif // USE_CUB
			
 
				+#endif // GGML_CUDA_USE_CUB
			
 
				 }
			
 
				 
			
 
				 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
--- a/ggml/src/ggml-cuda/sumrows.cu
+++ b/ggml/src/ggml-cuda/sumrows.cu
@@ -1,9 +1,17 @@
 
				+#include "reduce_rows.cuh"
			
 
				 #include "sumrows.cuh"
			
 
				 
			
 
				 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
			
 
				-    const dim3 block_dims(WARP_SIZE, 1, 1);
			
 
				+    const int  id  = ggml_cuda_get_device();
			
 
				+    const int  nsm = ggml_cuda_info().devices[id].nsm;
			
 
				     const dim3 block_nums(nrows, 1, 1);
			
 
				-    reduce_rows_f32</*norm*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
			
 
				+    if ((nrows / nsm) < 2) {
			
 
				+        const dim3 block_dims(512, 1, 1);
			
 
				+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
			
 
				+    } else {
			
 
				+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
			
 
				+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
			
@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 
				     const int64_t ncols = src0->ne[0];
			
 
				     const int64_t nrows = ggml_nrows(src0);
			
 
				 
			
 
				-    const dim3 block_dims(WARP_SIZE, 1, 1);
			
 
				     const dim3 block_nums(nrows, 1, 1);
			
 
				 
			
 
				-    reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
			
 
				+    const int id  = ggml_cuda_get_device();
			
 
				+    const int nsm = ggml_cuda_info().devices[id].nsm;
			
 
				+    if ((nrows / nsm) < 2) {
			
 
				+        // Increase num threads to 512 for small nrows to better hide the latency
			
 
				+        const dim3 block_dims(512, 1, 1);
			
 
				+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
			
 
				+    } else {
			
 
				+        // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
			
 
				+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
			
 
				+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
			
 
				+    }
			
 
				 }
			
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5998,6 +5998,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
				     test_cases.emplace_back(new test_sum());
			
 
				     test_cases.emplace_back(new test_sum_rows());
			
 
				     test_cases.emplace_back(new test_mean());
			
 
				+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
			
 
				+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
			
 
				     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
			
 
				     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
			
 
				     test_cases.emplace_back(new test_acc());
			
@@ -6179,6 +6188,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
				         test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token));
			
 
				     }
			
 
				 
			
 
				+    std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
			
 
				+        { 8192, 1,    1, 1 },
			
 
				+        { 8192, 8192, 1, 1 },
			
 
				+        { 128,  8192, 1, 1 },
			
 
				+    };
			
 
				+
			
 
				+    for (auto it: reduce_rows_cases){
			
 
				+        test_cases.emplace_back(new test_mean(GGML_TYPE_F32, it));
			
 
				+        test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, it));
			
 
				+        test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
			
 
				+    }
			
 
				+
			
 
				     return test_cases;
			
 
				 }