@@ -173,6 +173,7 @@ struct ggml_tensor * llm_build_qwen3next::build_qwen3next_attention_layer(ggml_t
     cb(Vcur, "Vcur", il);
 
     // Apply K normalization
+    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
     Kcur = build_q3n_norm(Kcur, model.layers[il].attn_k_norm, il);
     cb(Kcur, "Kcur_normed", il);
 
@@ -180,8 +181,6 @@ struct ggml_tensor * llm_build_qwen3next::build_qwen3next_attention_layer(ggml_t
     gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
     cb(gate, "gate_reshaped", il);
 
-    Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
     Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
     // Apply RoPE
@@ -204,7 +203,6 @@ struct ggml_tensor * llm_build_qwen3next::build_qwen3next_attention_layer(ggml_t
     struct ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
     cb(gate_sigmoid, "gate_sigmoid", il);
 
-    // Apply gating directly using the original gate tensor
     cur = ggml_mul(ctx0, cur, gate_sigmoid);
     cb(cur, "attn_gated", il);
 
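For context on the first two hunks: the K reshape is moved ahead of `build_q3n_norm`, so the norm presumably sees rows of length `n_embd_head` and normalizes each KV head separately rather than the concatenated heads. Below is a minimal standalone sketch of that shape effect, not taken from the patch: the dimensions are made-up example values and `ggml_rms_norm` stands in for `build_q3n_norm` as an assumption.

```cpp
// Sketch: reshape K to [head_dim, n_head_kv, n_tokens] before the norm,
// so the norm operates over the first dimension (one head at a time).
#include "ggml.h"
#include <cstdio>

int main() {
    const int64_t head_dim  = 128; // assumed per-head width
    const int64_t n_head_kv = 4;   // assumed number of KV heads
    const int64_t n_tokens  = 8;   // assumed number of tokens

    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // K as produced by the projection: one flat row of head_dim * n_head_kv per token
    struct ggml_tensor * k = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, head_dim * n_head_kv, n_tokens);

    // Reshape first, then normalize: the norm's rows are now per-head slices
    struct ggml_tensor * k3d    = ggml_reshape_3d(ctx, k, head_dim, n_head_kv, n_tokens);
    struct ggml_tensor * k_norm = ggml_rms_norm(ctx, k3d, 1e-6f); // stand-in for build_q3n_norm

    printf("k:      [%lld, %lld]\n",
           (long long) k->ne[0], (long long) k->ne[1]);
    printf("k_norm: [%lld, %lld, %lld]\n",
           (long long) k_norm->ne[0], (long long) k_norm->ne[1], (long long) k_norm->ne[2]);

    ggml_free(ctx);
    return 0;
}
```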