před 2 měsíci · 2fdbf16eb1
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -1144,3 +1144,7 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const {
 
															 int32_t llama_memory_recurrent_context::s_copy(int i) const {
														
 
															     return  mem->cells[i + mem->head].src0;
														
 
															 }
														
 
															+
														
 
															+bool llama_memory_recurrent_context::has_previous_state() const {
														
 
															+    return mem->cells[mem->head].pos >= 0;
														
 
															+}
														
--- a/src/llama-memory-recurrent.h
+++ b/src/llama-memory-recurrent.h
@@ -160,6 +160,7 @@ public:
 
															     ggml_tensor * get_s_l(int32_t il) const;
														
 
															     int32_t s_copy(int i) const;
														
 
															+    bool has_previous_state() const;
														
 
															 private:
														
 
															     const llama_memory_status status;
														
--- a/src/models/llm_build_qwen3next.cpp
+++ b/src/models/llm_build_qwen3next.cpp
@@ -605,7 +605,7 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
 
															     ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
														
 
															     ggml_tensor * ssm_states_all  = mctx_cur->get_s_l(il);
														
 
															-    bool is_generation = mctx_cur->get_rs_z() < 0;
														
 
															+    bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
														
 
															     // Build the convolution states tensor
														
 
															     ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
														
@@ -719,7 +719,7 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
 
															     // Choose between delta_net and delta_net_recurrent based on generation mode
														
 
															     ggml_tensor * attn_out;
														
 
															-    if (is_generation) {
														
 
															+    if (use_precomputed_states) {
														
 
															         // Use delta_net_recurrent for single token generation
														
 
															         attn_out = delta_net_recurrent(ctx0, q_conv, k_conv, v_conv, gate, beta, state, true, hparams.f_norm_rms_eps, il);
														
 
															     } else {