1 개월 전 · 4dff236a52
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2305,13 +2305,11 @@ extern "C" {
 
				             float                 stop,
			
 
				             float                 step);
			
 
				 
			
 
				-#define GGML_KQ_MASK_PAD 1
			
 
				-
			
 
				-    // q:    [n_embd_k, n_batch,     n_head,    ne3 ]
			
 
				-    // k:    [n_embd_k, n_kv,        n_head_kv, ne3 ]
			
 
				-    // v:    [n_embd_v, n_kv,        n_head_kv, ne3 ] !! not transposed !!
			
 
				-    // mask: [n_kv,     n_batch_pad, ne32,      ne33] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
			
 
				-    // res:  [n_embd_v, n_head,      n_batch,   ne3 ] !! permuted !!
			
 
				+    // q:    [n_embd_k, n_batch, n_head,    ne3 ]
			
 
				+    // k:    [n_embd_k, n_kv,    n_head_kv, ne3 ]
			
 
				+    // v:    [n_embd_v, n_kv,    n_head_kv, ne3 ] !! not transposed !!
			
 
				+    // mask: [n_kv,     n_batch, ne32,      ne33]
			
 
				+    // res:  [n_embd_v, n_head,  n_batch,   ne3 ] !! permuted !!
			
 
				     //
			
 
				     // broadcast:
			
 
				     //   n_head % n_head_kv == 0
			
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -5260,8 +5260,6 @@ struct ggml_tensor * ggml_flash_attn_ext(
 
				 
			
 
				     if (mask) {
			
 
				         GGML_ASSERT(ggml_is_contiguous(mask));
			
 
				-        GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) &&
			
 
				-                "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big");
			
 
				         //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
			
 
				 
			
 
				         GGML_ASSERT(q->ne[2] % mask->ne[2] == 0);
			
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -93,14 +93,6 @@ llama_context::llama_context(
 
				     // with causal attention, the batch size is limited by the context size
			
 
				     cparams.n_batch = cparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
			
 
				 
			
 
				-    // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask
			
 
				-    // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext)
			
 
				-    // ref: https://github.com/ggerganov/llama.cpp/pull/5021
			
 
				-    // TODO: this padding is not needed for the cache-less context so we should probably move it to llama_memory
			
 
				-    if (cparams.n_batch < GGML_KQ_MASK_PAD) {
			
 
				-        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
			
 
				-        cparams.n_batch = GGML_KQ_MASK_PAD;
			
 
				-    }
			
 
				     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
			
 
				 
			
 
				     cparams.op_offload = params.op_offload;
			
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -385,7 +385,7 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
 
				   //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
			
 
				 
			
 
				     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
			
 
				-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
			
 
				+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
			
 
				 
			
 
				     return res;
			
 
				 }
			
@@ -416,10 +416,10 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
 
				   //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
			
 
				 
			
 
				     res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
			
 
				-    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
			
 
				+    res &= self_kq_mask->ne[1] == params.ubatch.n_tokens;
			
 
				 
			
 
				     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
			
 
				-    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
			
 
				+    res &= self_kq_mask_swa->ne[1] == params.ubatch.n_tokens;
			
 
				 
			
 
				     return res;
			
 
				 }
			
@@ -452,7 +452,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
 
				             }
			
 
				         }
			
 
				 
			
 
				-        for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
			
 
				+        for (int i = n_tokens; i < n_tokens; ++i) {
			
 
				             for (int j = 0; j < n_enc; ++j) {
			
 
				                 data[h*(n_enc*n_tokens) + i*n_enc + j] = -INFINITY;
			
 
				             }
			
@@ -1470,13 +1470,13 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
 
				     auto inp = std::make_unique<llm_graph_input_attn_no_cache>(hparams, cparams);
			
 
				 
			
 
				     // note: there is no KV cache, so the number of KV values is equal to the number of tokens in the batch
			
 
				-    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
			
 
				+    inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
			
 
				     ggml_set_input(inp->self_kq_mask);
			
 
				 
			
 
				     inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
			
 
				 
			
 
				     if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
			
 
				-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
			
 
				+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens, 1, 1);
			
 
				         ggml_set_input(inp->self_kq_mask_swa);
			
 
				 
			
 
				         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
			
@@ -1558,7 +1558,7 @@ static std::unique_ptr<llm_graph_input_attn_kv> build_attn_inp_kv_impl(
 
				         inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
			
 
				         inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
			
 
				 
			
 
				-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
			
 
				+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
			
 
				         ggml_set_input(inp->self_kq_mask);
			
 
				 
			
 
				         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
			
@@ -1701,7 +1701,7 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
 
				 
			
 
				     const int32_t n_enc = !cross->v_embd.empty() ? cross->n_enc : hparams.n_ctx_train;
			
 
				 
			
 
				-    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
			
 
				+    inp->cross_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_enc, n_tokens, 1, 1);
			
 
				     ggml_set_input(inp->cross_kq_mask);
			
 
				 
			
 
				     inp->cross_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->cross_kq_mask, GGML_TYPE_F16) : inp->cross_kq_mask;
			
@@ -1767,7 +1767,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
				         inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
			
 
				         inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
			
 
				 
			
 
				-        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
			
 
				+        inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
			
 
				         ggml_set_input(inp->self_kq_mask);
			
 
				 
			
 
				         inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
			
@@ -1781,7 +1781,7 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
 
				         inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
			
 
				         inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
			
 
				 
			
 
				-        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
			
 
				+        inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, n_tokens/n_stream, 1, n_stream);
			
 
				         ggml_set_input(inp->self_kq_mask_swa);
			
 
				 
			
 
				         inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
			
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -1232,8 +1232,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
 
				     GGML_ASSERT(n_tokens%n_stream == 0);
			
 
				 
			
 
				     // n_tps == n_tokens_per_stream
			
 
				-    const int64_t n_tps     = n_tokens/n_stream;
			
 
				-    const int64_t n_tps_pad = GGML_PAD(n_tps, GGML_KQ_MASK_PAD);
			
 
				+    const int64_t n_tps = n_tokens/n_stream;
			
 
				 
			
 
				     std::fill(data, data + ggml_nelements(dst), -INFINITY);
			
 
				 
			
@@ -1266,7 +1265,7 @@ void llama_kv_cache::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * u
 
				                 const llama_pos p1_x = is_2d ? ubatch->pos[i + ubatch->n_tokens*2] : 0;
			
 
				                 const llama_pos p1_y = is_2d ? ubatch->pos[i + ubatch->n_tokens]   : 0;
			
 
				 
			
 
				-                const uint64_t idst = n_kv*(h*n_stream*n_tps_pad + s*n_tps_pad + ii);
			
 
				+                const uint64_t idst = n_kv*(h*n_stream*n_tps + s*n_tps + ii);
			
 
				 
			
 
				                 for (uint32_t j = 0; j < n_kv; ++j) {
			
 
				                     if (cells.is_empty(j)) {
			
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5875,7 +5875,7 @@ struct test_flash_attn_ext : public test_case {
 
				 
			
 
				         ggml_tensor * m = nullptr;
			
 
				         if (mask) {
			
 
				-            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, nr23[1]);
			
 
				+            m = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, nb, 1, nr23[1]);
			
 
				             ggml_set_name(m, "m");
			
 
				         }
			
 
				 
			
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -775,10 +775,6 @@ struct clip_graph {
 
				 
			
 
				             // if flash attn is used, we need to pad the mask and cast to f16
			
 
				             if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
			
 
				-                int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
			
 
				-                if (n_pad > 0) {
			
 
				-                    window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
			
 
				-                }
			
 
				                 window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
			
 
				             }
			
 
				 
			
@@ -791,7 +787,7 @@ struct clip_graph {
 
				 
			
 
				         // loop over layers
			
 
				         for (int il = 0; il < n_layer; il++) {
			
 
				-            auto & layer = model.layers[il];
			
 
				+            const auto & layer = model.layers[il];
			
 
				             const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
			
 
				 
			
 
				             ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states