9 месяцев назад · f01bd02376
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -353,6 +353,7 @@ struct vk_device_struct {
 
				     vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
			
 
				     vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
			
 
				     vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
			
 
				+    vk_pipeline pipeline_flash_attn_split_k_reduce;
			
 
				 
			
 
				     std::unordered_map<std::string, vk_pipeline_ref> pipelines;
			
 
				     std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
			
@@ -504,6 +505,8 @@ struct vk_flash_attn_push_constants {
 
				     float m1;
			
 
				 
			
 
				     uint32_t gqa_ratio;
			
 
				+    uint32_t split_kv;
			
 
				+    uint32_t k_num;
			
 
				 };
			
 
				 
			
 
				 struct vk_op_push_constants {
			
@@ -1476,7 +1479,7 @@ static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ
 
				 
			
 
				     // small rows, large cols
			
 
				     if (small_rows) {
			
 
				-        return {flash_attention_num_small_rows, 128};
			
 
				+        return {flash_attention_num_small_rows, 64};
			
 
				     }
			
 
				     // small cols to reduce register count
			
 
				     if (ggml_is_quantized(type) || D == 256) {
			
@@ -2332,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
 
				     ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL],  "get_rows_iq4_nl_f32",  get_rows_iq4_nl_f32_len,  get_rows_iq4_nl_f32_data,  "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
			
 
				 
			
 
				     ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1);
			
 
				+    ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true);
			
 
				     ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 1, 1}, { device->subgroup_size }, 1);
			
 
				 
			
 
				     for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) {
			
@@ -5479,9 +5483,38 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
 
				         workgroups_y /= N;
			
 
				     }
			
 
				 
			
 
				+    uint32_t split_kv = KV;
			
 
				+    uint32_t split_k = 1;
			
 
				+
			
 
				+    if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) {
			
 
				+        GGML_ASSERT(workgroups_x == 1);
			
 
				+        // Try to run two workgroups per SM.
			
 
				+        split_k = ctx->device->shader_core_count * 2 / workgroups_y;
			
 
				+        if (split_k > 1) {
			
 
				+            // Split KV into about split_k chunks whose length is a multiple of "align", then recompute split_k.
			
 
				+            // Clamp the chunk length to >= 1 first: KV < split_k would round to split_kv == 0 and divide by zero.
			
 
				+            split_kv = ROUNDUP_POW2(KV / split_k > 0 ? KV / split_k : 1, pipelines[1]->align);
			
 
				+            split_k = CEIL_DIV(KV, split_kv);
			
 
				+            workgroups_x = split_k;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1)
			
 
				+    // and the per-row m and L values (ne1 rows).
			
 
				+    const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0;
			
 
				+    if (split_k_size > ctx->device->max_memory_allocation_size) {
			
 
				+        GGML_ABORT("Requested preallocation size is too large");
			
 
				+    }
			
 
				+    if (ctx->prealloc_size_split_k < split_k_size) {
			
 
				+        ctx->prealloc_size_split_k = split_k_size;
			
 
				+    }
			
 
				+
			
 
				     if (dryrun) {
			
 
				         // Request descriptor sets
			
 
				         ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1);
			
 
				+        if (split_k > 1) {
			
 
				+            ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
			
 
				+        }
			
 
				         return;
			
 
				     }
			
 
				 
			
@@ -5502,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
 
				     const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
			
 
				     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
			
 
				 
			
 
				-    ggml_vk_sync_buffers(subctx);
			
 
				-
			
 
				     vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr;
			
 
				     size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0;
			
 
				 
			
@@ -5568,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
 
				                                               v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
			
 
				                                               nbm1,
			
 
				                                               scale, max_bias, logit_softcap,
			
 
				-                                              mask != nullptr, n_head_log2, m0, m1, gqa_ratio };
			
 
				-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
			
 
				-                                {
			
 
				-                                    vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
			
 
				-                                    vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
			
 
				-                                    vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
			
 
				-                                    vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
			
 
				-                                    vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
			
 
				-                                },
			
 
				-                                sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
			
 
				+                                              mask != nullptr, n_head_log2, m0, m1,
			
 
				+                                              gqa_ratio, split_kv, split_k };
			
 
				+
			
 
				+    ggml_vk_sync_buffers(subctx);
			
 
				+
			
 
				+    if (split_k > 1) {
			
 
				+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
			
 
				+                                    {
			
 
				+                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
			
 
				+                                    },
			
 
				+                                    // We only use split_k when group query attention is enabled, which means
			
 
				+                                    // there's no more than one tile of rows (i.e. workgroups_x would have been
			
 
				+                                    // one). We reuse workgroups_x to mean the number of splits, so we need to
			
 
				+                                    // cancel out the divide by wg_denoms[0].
			
 
				+                                    sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
			
 
				+
			
 
				+        ggml_vk_sync_buffers(subctx);
			
 
				+        const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
			
 
				+        ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce,
			
 
				+                                    {
			
 
				+                                        vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                    },
			
 
				+                                    pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 });
			
 
				+    } else {
			
 
				+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
			
 
				+                                    {
			
 
				+                                        vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                        vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
			
 
				+                                    },
			
 
				+                                    sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z });
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
			
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -63,6 +63,8 @@ layout (push_constant) uniform parameter {
 
				     float m1;
			
 
				 
			
 
				     uint32_t gqa_ratio;
			
 
				+    uint32_t split_kv;
			
 
				+    uint32_t k_num;
			
 
				 } p;
			
 
				 
			
 
				 layout (binding = 0) readonly buffer Q {uint8_t data_q[];};
			
@@ -116,6 +118,16 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY
 
				     return elem;
			
 
				 }
			
 
				 
			
 
				+// Writes column 0 of every row r < N to data_o (saving the per-row m and L values for split_k); elem is returned as-is.
			
 
				+ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
			
 
				+{
			
 
				+    if (c != 0 || r >= N) {
			
 
				+        return elem;
			
 
				+    }
			
 
				+    data_o[o_offset + iq2 + r] = D_TYPE(elem);
			
 
				+    return elem;
			
 
				+}
			
 
				+
			
 
				 // Load the slope matrix, indexed by Q's dimension 2.
			
 
				 ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
			
 
				 {
			
@@ -135,10 +147,18 @@ void main() {
 
				     const uint32_t N = p.N;
			
 
				     const uint32_t KV = p.KV;
			
 
				 
			
 
				+    uint32_t i = gl_WorkGroupID.x;
			
 
				+    uint32_t split_k_index = 0;
			
 
				+
			
 
				+    if (p.k_num > 1) {
			
 
				+        i = 0;
			
 
				+        split_k_index = gl_WorkGroupID.x;
			
 
				+    }
			
 
				+
			
 
				     const uint32_t Tr = CEIL_DIV(N, Br);
			
 
				-    const uint32_t Tc = CEIL_DIV(KV, Bc);
			
 
				 
			
 
				-    const uint32_t i = gl_WorkGroupID.x;
			
 
				+    const uint32_t start_j = split_k_index * p.split_kv / Bc;
			
 
				+    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
			
 
				 
			
 
				     // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
			
 
				     // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
			
@@ -218,7 +238,7 @@ void main() {
 
				     }
			
 
				 
			
 
				     [[dont_unroll]]
			
 
				-    for (uint32_t j = 0; j < Tc; ++j) {
			
 
				+    for (uint32_t j = start_j; j < end_j; ++j) {
			
 
				 
			
 
				         coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> S = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(0);
			
 
				 
			
@@ -312,6 +332,20 @@ void main() {
 
				         O = coopMatMulAdd(P_A, V, O);
			
 
				     }
			
 
				 
			
 
				+    // If there is split_k, then the split_k resolve shader does the final
			
 
				+    // division by L. Store the intermediate O value and per-row m and L values.
			
 
				+    if (p.k_num > 1) {
			
 
				+        coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator>(O);
			
 
				+
			
 
				+        uint32_t o_offset = D * p.ne1 * split_k_index;
			
 
				+        coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
			
 
				+
			
 
				+        o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
			
 
				+        coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N);
			
 
				+        coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N);
			
 
				+        return;
			
 
				+    }
			
 
				+
			
 
				     coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseAccumulator> Ldiag;
			
 
				 
			
 
				     // resize L by using smear/reduce
			
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp
@@ -0,0 +1,59 @@
 
				+#version 450
			
 
				+
			
 
				+#extension GL_EXT_control_flow_attributes : enable
			
 
				+
			
 
				+#define BLOCK_SIZE 32 // workgroup width in threads; also the stride of the per-row element loop in main()
			
 
				+
			
 
				+layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; // 1-D workgroup of BLOCK_SIZE invocations
			
 
				+
			
 
				+layout (binding = 0) readonly buffer A {float data_a[];}; // input: k_num partial O matrices, then per-split L and m values for each row
			
 
				+layout (binding = 1) writeonly buffer D {float data_d[];}; // output: reduced result, D floats per row
			
 
				+
			
 
				+layout (push_constant) uniform parameter { // three uints; the host passes { D, ne1, split_k } in this order
			
 
				+    uint D; // number of elements in one output row
			
 
				+    uint N; // number of rows; main() handles row gl_WorkGroupID.x
			
 
				+    uint k_num; // number of split_k partial results stored in data_a
			
 
				+} p;
			
 
				+
			
 
				+void main() {
			
 
				+    // One workgroup per output row; its threads stride across the D elements of that row.
			
 
				+    const uint row  = gl_WorkGroupID.x;
			
 
				+    const uint lane = gl_LocalInvocationID.x;
			
 
				+
			
 
				+    const uint head_dim   = p.D;
			
 
				+    const uint num_rows   = p.N;
			
 
				+    const uint num_splits = p.k_num;
			
 
				+
			
 
				+    // data_a layout: num_splits partial O matrices (head_dim * num_rows floats each),
			
 
				+    // then for each split num_rows L values followed by num_rows m values.
			
 
				+    const uint o_split_stride  = head_dim * num_rows;
			
 
				+    const uint lm_split_stride = 2 * num_rows;
			
 
				+    const uint l_base = o_split_stride * num_splits + row;
			
 
				+    const uint m_base = l_base + num_rows;
			
 
				+
			
 
				+    // Pass 1: running maximum of m across the splits.
			
 
				+    float row_max = -1.0/0.0;
			
 
				+    [[unroll]] for (uint s = 0; s < num_splits; ++s) {
			
 
				+        row_max = max(row_max, data_a[m_base + s * lm_split_stride]);
			
 
				+    }
			
 
				+
			
 
				+    // Pass 2: sum the L values, each rescaled from its split's m to row_max.
			
 
				+    float l_sum = 0;
			
 
				+    [[unroll]] for (uint s = 0; s < num_splits; ++s) {
			
 
				+        const uint lm_index = s * lm_split_stride;
			
 
				+        l_sum += exp(data_a[m_base + lm_index] - row_max) * data_a[l_base + lm_index];
			
 
				+    }
			
 
				+
			
 
				+    const float inv_l = 1.0 / l_sum;
			
 
				+
			
 
				+    // Pass 3: rescale and accumulate the partial O values, normalize, and write the row.
			
 
				+    const uint out_row = head_dim * row;
			
 
				+    for (uint col = lane; col < head_dim; col += BLOCK_SIZE) {
			
 
				+        float acc = 0.0;
			
 
				+        [[unroll]] for (uint s = 0; s < num_splits; ++s) {
			
 
				+            const float split_m = data_a[m_base + s * lm_split_stride];
			
 
				+            acc += exp(split_m - row_max) * data_a[o_split_stride * s + out_row + col];
			
 
				+        }
			
 
				+        data_d[out_row + col] = acc * inv_l;
			
 
				+    }
			
 
				+}
			
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -465,6 +465,7 @@ void process_shaders() {
 
				     string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
			
 
				 
			
 
				     string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
			
 
				+    string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {});
			
 
				     string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {});
			
 
				 
			
 
				     string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
			
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4516,6 +4516,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
				         }
			
 
				     }
			
 
				 
			
 
				+    for (int kv : { 4096, 8192, 16384, }) {
			
 
				+        for (int hs : { 64, 128, }) {
			
 
				+            test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				     return test_cases;
			
 
				 }