
Some attempts to get the convolution input right.

Piotr Wilkin, 3 months ago
commit 32dcee47ef

src/llama-model.cpp (+1, -1)

@@ -7138,6 +7138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ARCEE:
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
+        case LLM_ARCH_QWEN3NEXT:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
@@ -7157,7 +7158,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN2MOE:
         case LLM_ARCH_QWEN3:
         case LLM_ARCH_QWEN3MOE:
-        case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_LLADA_MOE:
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
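
This hunk moves LLM_ARCH_QWEN3NEXT out of the group that the switch labels as "the pairs of head values are offset by n_rot/2" (the NEOX-style layout used by the other Qwen architectures) and into the group returning LLAMA_ROPE_TYPE_NORM, where the rotation presumably acts on adjacent element pairs within each head. A minimal sketch of the difference between the two pairings, on a toy float vector with made-up angles (illustrative only, not llama.cpp's RoPE implementation):

    // Sketch: NORM vs NEOX RoPE pairing (hypothetical angles, toy head size).
    #include <cmath>
    #include <cstdio>
    #include <vector>

    // rotate the pair (a, b) by angle theta
    static void rotate_pair(float & a, float & b, float theta) {
        const float a0 = a, b0 = b;
        a = a0 * std::cos(theta) - b0 * std::sin(theta);
        b = a0 * std::sin(theta) + b0 * std::cos(theta);
    }

    int main() {
        const int n_rot = 8;                        // toy head size
        std::vector<float> norm_head(n_rot, 1.0f);  // NORM-style pairing
        std::vector<float> neox_head(n_rot, 1.0f);  // NEOX-style pairing

        for (int i = 0; i < n_rot / 2; ++i) {
            const float theta = 0.1f * (i + 1);     // placeholder per-pair angle
            rotate_pair(norm_head[2 * i], norm_head[2 * i + 1], theta);     // adjacent pairs
            rotate_pair(neox_head[i],     neox_head[i + n_rot / 2], theta); // offset by n_rot/2
        }

        for (int i = 0; i < n_rot; ++i) {
            std::printf("%d: norm=%+.3f  neox=%+.3f\n", i, norm_head[i], neox_head[i]);
        }
        return 0;
    }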

src/models/llm_build_qwen3next.cpp (+34, -19)

@@ -21,9 +21,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
-
-        // Pre-norm for attention/linear attention
-        cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(inpL, model.layers[il].attn_norm, il);
         cb(cur, "attn_norm", il);
 
         // Determine layer type and build appropriate attention mechanism
@@ -35,7 +33,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
             cur = build_qwen3next_attention_layer(cur, inp_pos, inp->get_attn(), model, n_embd_head, il);
         }
         // Post-attention norm
-        cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
+        cur = build_q3n_norm(cur, model.layers[il].attn_post_norm, il);
         cb(cur, "attn_post_norm", il);
 
         if (il == n_layer - 1 && inp_out_ids) {
@@ -48,6 +46,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
 
         // FFN layer (MoE or dense)
         cur = build_layer_ffn(cur, model, il);
+        cb(cur, "post_moe", il);
 
         // Input for next layer
         inpL = cur;
@@ -55,7 +54,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     cur = inpL;
 
     // Final norm
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cur = build_q3n_norm(cur, model.output_norm, -1);
 
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
@@ -70,6 +69,11 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
     ggml_build_forward_expand(gf, cur);
 }
 
+struct ggml_tensor * llm_build_qwen3next::build_q3n_norm(struct ggml_tensor * input, struct ggml_tensor * weights, int layer) {
+    ggml_tensor * input_norm = ggml_scale_bias(ctx0, weights, 1.0f, 1.0f);
+    return build_norm(input, input_norm, nullptr, LLM_NORM_RMS, layer);
+}
+
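
The new build_q3n_norm helper is what the call sites above were switched to: ggml_scale_bias(ctx0, weights, 1.0f, 1.0f) should evaluate to weights + 1 here, so the stored gamma is treated as zero-centered and the layer effectively applies RMSNorm with (1 + w). A minimal standalone reference of that formulation on a single feature vector (plain C++, not ggml; epsilon and names are illustrative):

    // Sketch: RMSNorm with a zero-centered weight, y = x / rms(x) * (1 + w).
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static std::vector<float> rms_norm_zero_centered(const std::vector<float> & x,
                                                     const std::vector<float> & w,
                                                     float eps = 1e-6f) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;                       // sum of squares
        const float inv_rms = 1.0f / std::sqrt(ss / x.size() + eps);

        std::vector<float> y(x.size());
        for (size_t i = 0; i < x.size(); ++i) {
            y[i] = x[i] * inv_rms * (1.0f + w[i]);           // gamma stored around 0
        }
        return y;
    }

    int main() {
        const std::vector<float> x = {0.5f, -1.0f, 2.0f, 0.25f};
        const std::vector<float> w = {0.0f, 0.1f, -0.05f, 0.0f};
        for (float v : rms_norm_zero_centered(x, w)) {
            std::printf("%.4f\n", v);
        }
        return 0;
    }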
 // ggml_delta_net
 struct ggml_tensor * llm_build_qwen3next::ggml_delta_net(struct ggml_tensor * k,
                                                          struct ggml_tensor * v,
@@ -386,27 +390,38 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
     ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
     cb(conv_states, "conv_states", il);
 
-    // Combine query, key, value for convolution input
-    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query, key, 1);
-    qkv_mixed               = ggml_concat(ctx0, qkv_mixed, value_reshaped, 1);
+    // After creating query, key, and value_reshaped, reshape each to flatten the head dimensions
+    // query: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * query_flat = ggml_reshape_3d(ctx0, query, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(query_flat, "query_flat", il);
+
+    // key: [head_k_dim, num_k_heads, n_tokens, n_seqs] -> [head_k_dim * num_k_heads, n_tokens, n_seqs]
+    ggml_tensor * key_flat = ggml_reshape_3d(ctx0, key, head_k_dim * num_k_heads, n_tokens, n_seqs);
+    cb(key_flat, "key_flat", il);
+
+    // value_reshaped: [head_v_dim, num_v_heads, n_tokens, n_seqs] -> [head_v_dim * num_v_heads, n_tokens, n_seqs]
+    ggml_tensor * value_flat = ggml_reshape_3d(ctx0, value_reshaped, head_v_dim * num_v_heads, n_tokens, n_seqs);
+    cb(value_flat, "value_flat", il);
 
+    // Now concatenate along the feature dimension (dim 0) to get [conv_dim, n_tokens, n_seqs]
+    ggml_tensor * qkv_mixed = ggml_concat(ctx0, query_flat, key_flat, 0);
+    qkv_mixed               = ggml_concat(ctx0, qkv_mixed, value_flat, 0);
+    cb(qkv_mixed, "qkv_mixed_concatenated", il);
+
+    // Calculate the total conv dimension
     int64_t qkv_dim = head_k_dim * num_k_heads * 2 + head_v_dim * num_v_heads;
 
+    // Reshape to [n_tokens, qkv_dim, n_seqs] for proper convolution input format
+    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, n_tokens, qkv_dim, n_seqs);
+    cb(qkv_mixed, "qkv_mixed_for_conv", il);
+
     // Calculate convolution kernel size
     const int64_t conv_kernel_size = model.layers[il].ssm_conv1d->ne[0];
-    conv_states                    = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1,
-                                                     d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
+    conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state, n_seqs);
     cb(conv_states, "conv_states_reshaped", il);
 
-    // Reshape to [input_dim, n_seq_tokens, n_seqs] for concatenation
-    qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_dim, n_seq_tokens, n_seqs);
-    cb(qkv_mixed, "qkv_mixed_for_conv", il);
-
-    // Concatenate cached conv states with current input
-    // conv_states: [conv_kernel_size - 1, input_dim, n_seqs]
-    // qkv_mixed: [input_dim, n_seq_tokens, n_seqs]
-    // After transpose: [n_seq_tokens, input_dim, n_seqs]
-    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, ggml_transpose(ctx0, qkv_mixed), 0);
+    // Now concatenate along the sequence dimension (dim 0 in llama.cpp)
+    ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
     cb(conv_input, "conv_input", il);
 
     // Apply convolution
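
The intent of this hunk is to flatten query, key and value to [head_dim * n_heads, n_tokens, n_seqs], concatenate them along the feature axis into qkv_dim = 2 * head_k_dim * num_k_heads + head_v_dim * num_v_heads channels, and then prepend the cached conv_states (the last conv_kernel_size - 1 positions per channel) along the token axis, so the short causal convolution sees a full window at every new token. A plain-C++ sketch of a depthwise causal conv over [cached state ++ new tokens], with made-up sizes (illustrative only; the real graph builds this with ggml tensors):

    // Sketch: per-channel causal conv over [cached_state (K-1 steps) ++ new tokens].
    // Toy sizes; conv_dim stands in for 2*Dk*Hk + Dv*Hv from the model.
    #include <cstdio>
    #include <vector>

    int main() {
        const int K        = 4;  // conv kernel size
        const int n_tokens = 3;  // new tokens in this ubatch
        const int conv_dim = 2;  // channels

        // cached state: the last K-1 inputs per channel, oldest first
        std::vector<std::vector<float>> state(conv_dim, std::vector<float>(K - 1, 0.0f));
        // new inputs per channel (one value per token)
        std::vector<std::vector<float>> x(conv_dim, {1.0f, 2.0f, 3.0f});
        // depthwise kernel: one K-tap filter per channel
        std::vector<std::vector<float>> w(conv_dim, {0.1f, 0.2f, 0.3f, 0.4f});

        for (int c = 0; c < conv_dim; ++c) {
            // conv input for this channel: [K-1 cached positions] ++ [n_tokens new positions]
            std::vector<float> conv_input = state[c];
            conv_input.insert(conv_input.end(), x[c].begin(), x[c].end());

            // causal conv: output at token t uses the K most recent inputs up to t
            for (int t = 0; t < n_tokens; ++t) {
                float y = 0.0f;
                for (int k = 0; k < K; ++k) {
                    y += w[c][k] * conv_input[t + k];
                }
                std::printf("channel %d, token %d: y = %.2f\n", c, t, y);
            }
        }
        return 0;
    }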

src/models/llm_build_qwen3next.h (+3, -0)

@@ -47,4 +47,7 @@ private:
     ggml_tensor * build_layer_ffn(ggml_tensor * cur, const llama_model & model, const int il);
 
     ggml_tensor * softplus(ggml_tensor * alpha, ggml_tensor * dt_bias);
+
+    ggml_tensor * build_q3n_norm(struct ggml_tensor * input, struct ggml_tensor * weights, int layer);
+
 };