Delta.net chunked reimplemented

Piotr Wilkin, 3 months ago
Commit 75586ea36e
3 changed files with 521 additions and 173 deletions
  1. ggml/src/ggml-cpu/ggml-cpu.c (+1, -1)
  2. ggml/src/ggml-cpu/ops.cpp (+516, -171)
  3. src/models/llm_build_qwen3next.cpp (+4, -1)

+ 1 - 1
ggml/src/ggml-cpu/ggml-cpu.c

@@ -2295,6 +2295,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_POOL_2D:
         case GGML_OP_POOL_2D_BACK:
         case GGML_OP_DELTA_NET_RECURRENT:
+        case GGML_OP_DELTA_NET:
             {
                 n_tasks = 1;
             } break;
@@ -2312,7 +2313,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_RWKV_WKV6:
         case GGML_OP_GATED_LINEAR_ATTN:
         case GGML_OP_RWKV_WKV7:
-        case GGML_OP_DELTA_NET:
             {
                 n_tasks = n_threads;
             } break;
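Note on this hunk: GGML_OP_DELTA_NET moves from the n_tasks = n_threads group into the single-task group, next to GGML_OP_DELTA_NET_RECURRENT. This matches the reworked kernel in ops.cpp below, which now loops over all sequences, heads and chunks itself (the old per-thread partitioning is commented out) and does nothing on threads other than thread 0. A minimal sketch of the guard the single-task registration pairs with; in the actual kernel it is folded into the memset branch:

    // Only thread 0 does any work for the chunked DeltaNet op; the other
    // threads return immediately, which is why n_tasks is set to 1 above.
    if (params->ith != 0) {
        return;
    }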

+ 516 - 171
ggml/src/ggml-cpu/ops.cpp

@@ -10529,7 +10529,69 @@ static void delta_compute_k_beta_key_t_f32(const float * k_beta, const float * k
     }
 }
 
-// Helper function to apply triangular updates
+// Helper function to apply triangular updates to entire chunk (all sequences and heads)
+static void delta_apply_triangular_updates_chunk_f32(float * attn, const int64_t chunk_size,
+                                                    const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            float * attn_ptr = attn + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+            
+            // Apply triangular updates following the Python reference exactly:
+            // for i in range(1, chunk_size):
+            //     row = attn[..., i, :i].clone()
+            //     sub = attn[..., :i, :i].clone()
+            //     attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+            for (int64_t i = 1; i < chunk_size; i++) {
+                // Create temporary storage for row and sub to avoid modifying during computation
+                float * row = (float *) malloc(i * sizeof(float));
+                float * sub = (float *) malloc(i * i * sizeof(float));
+                
+                // Copy row = attn[..., i, :i]
+                for (int64_t j = 0; j < i; j++) {
+                    row[j] = attn_ptr[i * chunk_size + j];
+                }
+                
+                // Copy sub = attn[..., :i, :i]
+                for (int64_t k = 0; k < i; k++) {
+                    for (int64_t j = 0; j < i; j++) {
+                        sub[k * i + j] = attn_ptr[k * chunk_size + j];
+                    }
+                }
+                
+                // Compute updates for each j in :i
+                for (int64_t j = 0; j < i; j++) {
+                    // Compute (row.unsqueeze(-1) * sub).sum(-2)
+                    float sum_val = 0.0f;
+                    for (int64_t k = 0; k < i; k++) {
+                        sum_val += row[k] * sub[k * i + j];
+                    }
+                                       
+                    // Update: attn[..., i, j] = row[j] + sum_val
+                    attn_ptr[i * chunk_size + j] = row[j] + sum_val;
+                }
+                
+                free(row);
+                free(sub);
+            }
+        }
+    }
+}
+
+// Helper function to add identity matrix to entire chunk (all sequences and heads)
+static void delta_add_identity_matrix_chunk_f32(float * matrix, const int64_t chunk_size,
+                                                const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            float * matrix_ptr = matrix + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+            // Add identity matrix directly
+            for (int64_t i = 0; i < chunk_size; i++) {
+                matrix_ptr[i * chunk_size + i] += 1.0f;
+            }
+        }
+    }
+}
+
+// Helper function to apply triangular updates (original version for individual matrices)
 static void delta_apply_triangular_updates_f32(float * attn, const int64_t chunk_size) {
     for (int64_t i = 1; i < chunk_size; i++) {
         for (int64_t j = 0; j < i; j++) {
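The row update in delta_apply_triangular_updates_chunk_f32 is a forward substitution: row i is replaced by row + row @ attn[:i, :i], so once the identity is added each chunk_size x chunk_size block holds, in effect, (I - A)^{-1} of its original strictly lower-triangular contents A. A minimal standalone sketch (hypothetical 3x3 values, not taken from the kernel) that reproduces the recurrence and checks the inverse property:

    #include <cstdio>
    #include <cstring>

    int main() {
        const int n = 3;
        // Hypothetical strictly lower-triangular block A.
        float A[9] = { 0.0f,   0.0f,  0.0f,
                       0.5f,   0.0f,  0.0f,
                       0.25f, -0.75f, 0.0f };
        float attn[9];
        memcpy(attn, A, sizeof(A));

        // Same recurrence as delta_apply_triangular_updates_chunk_f32:
        // attn[i, :i] = row + row @ attn[:i, :i]
        for (int i = 1; i < n; i++) {
            float row[3], sub[9];
            for (int j = 0; j < i; j++) { row[j] = attn[i * n + j]; }
            for (int k = 0; k < i; k++) {
                for (int j = 0; j < i; j++) { sub[k * i + j] = attn[k * n + j]; }
            }
            for (int j = 0; j < i; j++) {
                float s = 0.0f;
                for (int k = 0; k < i; k++) { s += row[k] * sub[k * i + j]; }
                attn[i * n + j] = row[j] + s;
            }
        }
        for (int i = 0; i < n; i++) { attn[i * n + i] += 1.0f; }  // + identity

        // (I - A) @ attn should print the identity matrix.
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                float s = 0.0f;
                for (int k = 0; k < n; k++) {
                    s += ((i == k ? 1.0f : 0.0f) - A[i * n + k]) * attn[k * n + j];
                }
                printf("%6.3f ", s);
            }
            printf("\n");
        }
        return 0;
    }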
@@ -10542,40 +10604,41 @@ static void delta_apply_triangular_updates_f32(float * attn, const int64_t chunk
     }
 }
 
-// Helper function to add identity matrix
+// Helper function to add identity matrix (original version for individual matrices)
 static void delta_add_identity_matrix_f32(float * matrix, const int64_t size) {
     for (int64_t i = 0; i < size; i++) {
         matrix[i * size + i] += 1.0f;
     }
 }
 
-// Helper function to compute value = attn @ v_beta
-static void delta_compute_value_f32(const float * attn, const float * v_beta,
-                                      float * value,
-                                      const int64_t chunk_size, const int64_t v_head_dim) {
-    for (int64_t i = 0; i < chunk_size; i++) {
-        for (int64_t d = 0; d < v_head_dim; d++) {
-            float sum = 0.0f;
-            for (int64_t j = 0; j < chunk_size; j++) {
-                int64_t v_beta_idx = j * v_head_dim + d;
-                sum += attn[i * chunk_size + j] * v_beta[v_beta_idx];
-            }
-            value[i * v_head_dim + d] = sum;
+static void delta_compute_value_f32(const float * attn,
+                                    const float * v_beta,
+                                    float *       value,
+                                    const int64_t chunk_size,
+                                    const int64_t v_head_dim,
+                                    const int64_t n_heads,
+                                    const int64_t n_seqs) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < n_heads; head++) {
+            delta_matmul_f32(
+                attn + (chunk_size * chunk_size * n_heads) * seq + (chunk_size * chunk_size) * head, 
+                v_beta + (chunk_size * v_head_dim * n_heads) * seq + (chunk_size * v_head_dim) * head, 
+                value + (chunk_size * v_head_dim * n_heads) * seq + (chunk_size * v_head_dim) * head, 
+                chunk_size, v_head_dim, chunk_size);
         }
     }
 }
 
-// Helper function to compute k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
+// Helper function to compute k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1)) for single head/sequence
 static void delta_compute_k_cumdecay_f32(const float * attn, const float * k_beta, const float * g,
-                                           float * k_cumdecay, const int64_t chunk_size, const int64_t k_head_dim) {
+                                        float * k_cumdecay, const int64_t chunk_size, const int64_t k_head_dim) {
     for (int64_t i = 0; i < chunk_size; i++) {
-        for (int64_t d = 0; d < k_head_dim; d++) {
+        for (int64_t j = 0; j < k_head_dim; j++) {
             float sum = 0.0f;
-            for (int64_t j = 0; j < chunk_size; j++) {
-                int64_t k_beta_idx = j * k_head_dim + d;
-                sum += attn[i * chunk_size + j] * k_beta[k_beta_idx] * expf(g[j]);
+            for (int64_t k = 0; k < chunk_size; k++) {
+                sum += attn[i * chunk_size + k] * k_beta[k * k_head_dim + j] * expf(g[k]);
             }
-            k_cumdecay[i * k_head_dim + d] = sum;
+            k_cumdecay[i * k_head_dim + j] = sum;
         }
     }
 }
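The chunk-level helpers in this file all address their flat buffers the same way: data is laid out as [n_seqs][H_v][chunk_size][dim] in row-major order, so a (seq, head) block starts at seq * (chunk_size * dim * H_v) + head * (chunk_size * dim). A hypothetical helper (not part of the patch) that captures this indexing:

    #include <cstdint>

    // Illustration only: start of the (seq, head) block inside a flat buffer
    // laid out as [n_seqs][H_v][chunk_size][dim], row-major.
    static inline float * chunk_block(float * base, int64_t seq, int64_t head,
                                      int64_t chunk_size, int64_t dim, int64_t H_v) {
        return base + seq * (chunk_size * dim * H_v) + head * (chunk_size * dim);
    }

In this notation, the new delta_compute_value_f32 computes value = attn @ v_beta once per (seq, head) block via delta_matmul_f32, while delta_compute_k_cumdecay_f32 still works on a single block and computes k_cumdecay[i, d] = sum_k attn[i, k] * k_beta[k, d] * exp(g[k]).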
@@ -10625,7 +10688,9 @@ static void delta_matmul_state_f32(const float * a, const float * state, float *
             for (int64_t k = 0; k < cols_a; k++) {
                 int64_t a_idx = i * cols_a + k;
                 int64_t state_idx = k * cols_state + j;
-                sum += a[a_idx] * state[state_idx];
+                float a_val = a[a_idx];
+                float state_val = state[state_idx];
+                sum += a_val * state_val;
             }
             dst[i * cols_state + j] = sum;
         }
@@ -10670,6 +10735,108 @@ static void delta_update_recurrent_state_f32(const float * last_state, const flo
     }
 }
 
+// Helper function to compute q_i @ k_i.transpose(-1, -2) * decay_mask and apply mask for entire chunk
+static void delta_compute_q_k_attn_chunk_f32(const float * q, const float * k, const float * decay_mask,
+                                             float * attn, const bool * mask,
+                                             const int64_t chunk_size, const int64_t head_dim,
+                                             const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            const float * q_ptr = q + seq * (chunk_size * head_dim * H_v) + head * (chunk_size * head_dim);
+            const float * k_ptr = k + seq * (chunk_size * head_dim * H_v) + head * (chunk_size * head_dim);
+            const float * decay_mask_ptr = decay_mask + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+            float * attn_ptr = attn + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+            delta_compute_q_k_attn_f32(q_ptr, k_ptr, decay_mask_ptr, attn_ptr, mask, chunk_size, head_dim);
+        }
+    }
+}
+
+// Helper function for matrix multiplication with state tensors for entire chunk
+static void delta_matmul_state_chunk_f32(const float * a, const float * state, float * dst,
+                                        const int64_t rows_a, const int64_t cols_a, const int64_t cols_state,
+                                        const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            const float * a_ptr = a + seq * (rows_a * cols_a * H_v) + head * (rows_a * cols_a);
+            const float * state_ptr = state + seq * (cols_a * cols_state * H_v) + head * (cols_a * cols_state);
+            float * dst_ptr = dst + seq * (rows_a * cols_state * H_v) + head * (rows_a * cols_state);
+            delta_matmul_state_f32(a_ptr, state_ptr, dst_ptr, rows_a, cols_a, cols_state);
+        }
+    }
+}
+
+// Helper function to update recurrent state for entire chunk
+static void delta_update_recurrent_state_chunk_f32(const float * state, const float * g_last,
+                                                  const float * k, const float * g_diff_exp, const float * v_new, float * new_state,
+                                                  const int64_t chunk_size, const int64_t k_head_dim, const int64_t v_head_dim,
+                                                  const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            const float * state_ptr = state + seq * (k_head_dim * v_head_dim * H_v) + head * (k_head_dim * v_head_dim);
+            const float * k_ptr = k + seq * (chunk_size * k_head_dim * H_v) + head * (chunk_size * k_head_dim);
+            const float * g_diff_exp_ptr = g_diff_exp + seq * (chunk_size * H_v) + head * chunk_size;
+            const float * v_new_ptr = v_new + seq * (chunk_size * v_head_dim * H_v) + head * (chunk_size * v_head_dim);
+            float * new_state_ptr = new_state + seq * (k_head_dim * v_head_dim * H_v) + head * (k_head_dim * v_head_dim);
+                        
+            for (int64_t i = 0; i < k_head_dim; i++) {
+                for (int64_t j = 0; j < v_head_dim; j++) {
+                    int64_t state_idx = i * v_head_dim + j;
+                    
+                    // last_recurrent_state * g_last
+                    float term1 = state_ptr[state_idx] * g_last[seq * H_v + head];
+                    
+                    // (k_i * g_diff_exp).transpose(-1, -2) @ v_new
+                    float term2 = 0.0f;
+                    for (int64_t k = 0; k < chunk_size; k++) {
+                        int64_t k_idx = k * k_head_dim + i;
+                        int64_t v_idx = k * v_head_dim + j;
+                        term2 += k_ptr[k_idx] * g_diff_exp_ptr[k] * v_new_ptr[v_idx];
+                    }
+                    
+                    new_state_ptr[state_idx] = term1 + term2;
+                }
+            }
+        }
+    }
+}
+
+// Helper function for element-wise tensor subtraction for entire chunk
+static void delta_tensor_subtract_chunk_f32(const float * a, const float * b, float * dst, const int64_t size,
+                                           const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            const float * a_ptr = a + seq * (size * H_v) + head * size;
+            const float * b_ptr = b + seq * (size * H_v) + head * size;
+            float * dst_ptr = dst + seq * (size * H_v) + head * size;
+            delta_tensor_subtract_f32(a_ptr, b_ptr, dst_ptr, size);
+        }
+    }
+}
+
+// Helper function for element-wise tensor addition for entire chunk
+static void delta_tensor_add_chunk_f32(const float * a, const float * b, float * dst, const int64_t size,
+                                       const int64_t n_seqs, const int64_t H_v) {
+    for (int64_t seq = 0; seq < n_seqs; seq++) {
+        for (int64_t head = 0; head < H_v; head++) {
+            const float * a_ptr = a + seq * (size * H_v) + head * size;
+            const float * b_ptr = b + seq * (size * H_v) + head * size;
+            float * dst_ptr = dst + seq * (size * H_v) + head * size;
+            delta_tensor_add_f32(a_ptr, b_ptr, dst_ptr, size);
+        }
+    }
+}
+
+
+static void print_debug_info(float * data, size_t size, const char * name, int64_t token) {
+    GGML_LOG_INFO("\nggml-debug: %s (%ld) first 5 values: [%.6f, %.6f, %.6f, %.6f, %.6f, ...]\n", 
+        name, token, data[0], data[1], data[2], data[3], data[4]);
+    double sum = 0.0;
+    for (unsigned int i = 0; i < size; i++) {
+        sum += data[i];
+    }
+    GGML_LOG_INFO("total elements: %ld, sum = %.10f\n", size, sum);
+}
+
 void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];  // q (already normalized and scaled)
     const struct ggml_tensor * src1 = dst->src[1];  // k (already normalized)
@@ -10682,7 +10849,6 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
     const struct ggml_tensor * src8 = dst->src[8];  // attn
 
     const int64_t H_v               = (int64_t) dst->op_params[0];
-    const int64_t S_k               = (int64_t) dst->op_params[1];
     const int64_t S_v               = (int64_t) dst->op_params[2];
     const int64_t original_n_tokens = (int64_t) dst->op_params[3];  // Get original sequence length
     const int64_t n_tokens          = original_n_tokens;            // Use the original sequence length
@@ -10698,15 +10864,17 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
 
     float * dst_data  = (float *) dst->data;
     // Following GLA pattern: output is first part, state is second part
-    float * output    = dst_data; // [S_v * H_v, n_tokens, 1, 1] - only real sequence length, not padded
-    float * new_state = dst_data + (S_v * H_v * n_tokens);  // [S_v * H_v, S_v * n_seqs, 1, 1]
+    float * output    = dst_data; // [S_v * H_v, n_tokens, 1, n_seqs] - only real sequence length, not padded
+    float * new_state = dst_data + (S_v * H_v * n_tokens * n_seqs);  // [S_v * H_v, S_v * n_seqs, 1, 1]
 
     const int ith = params->ith;
-    const int nth = params->nth;  // nth is unused
+    // const int nth = params->nth;  // nth is unused
 
     // Clear output and new state section
     if (ith == 0) {
         memset(output, 0, ((S_v * H_v * n_tokens * n_seqs) + (S_v * S_v * H_v * n_seqs)) * sizeof(float));
+    } else {
+        return;
     }
 
     // Calculate chunk size
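As the pointer setup in the hunk above shows, the destination tensor is packed like the GLA ops: the per-token outputs come first (S_v * H_v * n_tokens * n_seqs floats) and the updated recurrent state follows immediately (S_v * S_v * H_v * n_seqs floats); the GGML_ASSERT at the end of the kernel checks exactly this. A small worked example with hypothetical sizes, only to illustrate the packed layout:

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Hypothetical dimensions; not taken from any particular model.
        const int64_t S_v = 128, H_v = 16, n_tokens = 100, n_seqs = 2;
        const int64_t out_elems   = S_v * H_v * n_tokens * n_seqs;  // output block
        const int64_t state_elems = S_v * S_v * H_v * n_seqs;       // state block
        printf("output: %lld floats, new_state starts at offset %lld, total %lld\n",
               (long long) out_elems, (long long) out_elems,
               (long long) (out_elems + state_elems));
        return 0;
    }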
@@ -10714,16 +10882,7 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
     const int64_t pad_size = (chunk_size - n_tokens % chunk_size) % chunk_size;
     const int64_t num_chunks = (n_tokens + pad_size) / chunk_size;
 
-    // Apply triangular updates to the precomputed attention matrix
-    float * attn_data = (float *) src8->data;
-    float * v_beta_data = (float *) src6->data;
-    float * k_beta_data = (float *) src7->data;
-    float * g_data = (float *) src3->data;
-    float * q_data = (float *) src0->data;
-    float * k_data = (float *) src1->data;
-    //float * v_data = (float *) src2->data;
     float * state_data = (float *) src4->data;
-    float * decay_mask_data = (float *) src5->data;
 
     GGML_ASSERT(ggml_is_contiguous(src0));
     GGML_ASSERT(ggml_is_contiguous(src1));
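Worked example for the padding arithmetic above (the chunk_size assignment itself is elided between the hunks; assuming a value of 64 purely for illustration): with n_tokens = 100, pad_size = (64 - 100 % 64) % 64 = 28 and num_chunks = (100 + 28) / 64 = 2, so the last chunk carries 28 padded positions.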
@@ -10735,161 +10894,347 @@ void ggml_compute_forward_delta_net_f32(const ggml_compute_params * params, ggml
     GGML_ASSERT(ggml_is_contiguous(src7));
     GGML_ASSERT(ggml_is_contiguous(src8));
 
-    int64_t total_params = n_seqs * H_v * num_chunks;
-    int64_t per_thread = (total_params % nth == 0) ? total_params / nth : (total_params / nth) + 1;
+    // int64_t total_params = n_seqs * H_v * num_chunks;
+    // int64_t per_thread = (total_params % nth == 0) ? total_params / nth : (total_params / nth) + 1;
+
+    // Create helper lambda for state tensor access
+    const auto state_ptr = [state_data, src4] (int64_t seq, int64_t head, int64_t i, int64_t j) {
+        return state_data + (j * src4->nb[0] / sizeof(float)) + (i * src4->nb[1] / sizeof(float)) +
+            (head * src4->nb[2] / sizeof(float)) + (seq * src4->nb[3] / sizeof(float));
+    };
+    
+    float * attn          = (float *) malloc(chunk_size * chunk_size * H_v * n_seqs * sizeof(float));
+    float * value         = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+    float * k_cumdecay    = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+    bool *  mask          = (bool *) malloc(chunk_size * chunk_size * sizeof(bool));
+    float * g =             (float *) malloc(chunk_size * H_v * n_seqs * sizeof(float));
+
+    // Create upper triangular mask for causal attention (exclude diagonal)
+    for (int64_t i = 0; i < chunk_size; i++) {
+        for (int64_t j = 0; j < chunk_size; j++) {
+            mask[i * chunk_size + j] = (j > i);  // True for upper triangular (excluding diagonal)
+        }
+    }
+
+    // Make a copy of the attention tensor and the gate cumsum tensor
+    memcpy(attn, src8->data, ggml_nbytes(src8));
+    memcpy(g, src3->data, ggml_nbytes(src3));
+
+    // Prepare the initial attention matrix with triangular updates and identity (for entire chunks)
+    // This corresponds to the reference implementation:
+    // for i in range(1, chunk_size): attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
+    // attn = attn + torch.eye(chunk_size)
+    delta_apply_triangular_updates_chunk_f32(attn, chunk_size, n_seqs, H_v);
+    delta_add_identity_matrix_chunk_f32(attn, chunk_size, n_seqs, H_v);
 
+    // Compute value = attn @ v_beta
+    delta_compute_value_f32(attn, (const float *) src6->data, value, chunk_size, S_v, H_v, n_seqs);
     for (int64_t seq = 0; seq < n_seqs; seq++) {
         for (int64_t head = 0; head < H_v; head++) {
-            for (int64_t chunk = 0; chunk < num_chunks; chunk++) {
-                int64_t tidx = seq * (H_v * num_chunks) + head * num_chunks + chunk;
-                if (tidx < ith * per_thread || tidx >= (ith + 1) * per_thread) {
-                    continue; // not our thread;
-                }
-                float * attn_data_for_chs = attn_data + (src8->nb[3] / sizeof(float)) * seq + (src8->nb[2] / sizeof(float)) * (chunk + head * num_chunks);
-                float * value_chunk = (float *) malloc(S_v * chunk_size * H_v * n_seqs * sizeof(float));
-                float * k_cumdecay = (float *) malloc(S_v * chunk_size * H_v * n_seqs * sizeof(float));
-                delta_apply_triangular_updates_f32(attn_data_for_chs, chunk_size);
-                delta_add_identity_matrix_f32(attn_data_for_chs, chunk_size);
-                // Calculate the correct v_beta and k_beta pointers for this head and sequence
-                float * v_beta_chunk = v_beta_data + (src6->nb[3] / sizeof(float)) * seq + (src6->nb[2] / sizeof(float)) * (chunk + head * num_chunks);
-                float * k_beta_chunk = k_beta_data + (src7->nb[3] / sizeof(float)) * seq + (src7->nb[2] / sizeof(float)) * (chunk + head * num_chunks);
-                // The g tensor has dimensions [8, 64, 2, 1] = [features, tokens, heads, sequences]
-                // We need to access the correct head data
-                // For each head, we need to access the correct feature for all tokens in the chunk
-                // Let's try accessing feature index chunk (since we have 8 features and chunk=0)
-                float * g_chunk = g_data + (src3->nb[3] / sizeof(float)) * seq + (src3->nb[2] / sizeof(float)) * head + (src3->nb[1] / sizeof(float)) * (chunk * chunk_size);
-                delta_compute_value_f32(attn_data_for_chs, v_beta_chunk, value_chunk, chunk_size, S_v);
-                delta_compute_k_cumdecay_f32(attn_data_for_chs, k_beta_chunk, g_chunk, k_cumdecay, chunk_size, S_k);
-                // Now compute the per-chunk-specific part (corresponding to the inner loop in Python)
-                float * q_chunk = q_data + (src0->nb[3] / sizeof(float)) * seq + (src0->nb[2] / sizeof(float)) * (chunk + head * num_chunks);
-                float * k_chunk = k_data + (src1->nb[3] / sizeof(float)) * seq + (src1->nb[2] / sizeof(float)) * (chunk + head * num_chunks);
-                float * decay_mask_chunk = decay_mask_data + (src5->nb[3] / sizeof(float)) * seq + (src5->nb[2] / sizeof(float)) * (chunk + head * num_chunks);
-                float * k_cumdecay_chunk = k_cumdecay + (S_v * chunk_size * H_v) * seq + (S_v * chunk_size) * head;
-                
-                // Allocate temporary variables for the loop
-                float * attn = (float *) malloc(chunk_size * chunk_size * sizeof(float));
-                float * v_prime = (float *) malloc(chunk_size * S_v * sizeof(float));
-                float * v_new = (float *) malloc(chunk_size * S_v * sizeof(float));
-                float * attn_inter = (float *) malloc(chunk_size * S_v * sizeof(float));
-                float * core_attn_out_chunk = (float *) malloc(chunk_size * S_v * sizeof(float));
-                float * g_last = (float *) malloc(sizeof(float));
-                float * g_diff_exp = (float *) malloc(chunk_size * sizeof(float));
-                bool * mask = (bool *) malloc(chunk_size * chunk_size * sizeof(bool));
-                
-                // Create upper triangular mask for causal attention (exclude diagonal)
+                delta_compute_k_cumdecay_f32(attn + (chunk_size * chunk_size * H_v) * seq + (chunk_size * chunk_size) * head, 
+                    (float *) src7->data + (chunk_size * S_v * H_v) * seq + (chunk_size * S_v) * head,
+                    g + (chunk_size * H_v) * seq + chunk_size * head,
+                    k_cumdecay + (chunk_size * S_v * H_v) * seq + (chunk_size * S_v) * head,
+                    chunk_size, S_v);
+        }
+    }
+    print_debug_info(k_cumdecay, chunk_size * S_v * H_v * n_seqs, "k_cumdecay", -1);
+
+    // Process each chunk with all sequences and heads together
+    for (int64_t chunk = 0; chunk < num_chunks; chunk++) {
+        GGML_LOG_INFO("\n=== Processing chunk %ld ===\n", chunk);
+
+        // Create lambdas for tensor access similar to recurrent function
+        const auto q_chunk = [chunk, src0](int64_t seq, int64_t head, int64_t token_idx, int64_t i) {
+            return ggml_get_f32_nd(src0, i, chunk * chunk_size + token_idx, head, seq);
+        };
+        const auto k_chunk = [chunk, src1](int64_t seq, int64_t head, int64_t token_idx, int64_t i) {
+            return ggml_get_f32_nd(src1, i, chunk * chunk_size + token_idx, head, seq);
+        };
+        const auto g_chunk = [chunk, src3](int64_t seq, int64_t head, int64_t token_idx) {
+            return ggml_get_f32_nd(src3, chunk * chunk_size + token_idx, 0, head, seq);
+        };
+
+        // Allocate per-chunk arrays containing all sequences and heads
+        float * temp_state    = (float *) malloc(S_v * S_v * H_v * n_seqs * sizeof(float));
+        float * core_attn_out = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * attn_inter    = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * v_new         = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * v_prime       = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * g_diff_exp    = (float *) malloc(chunk_size * H_v * n_seqs * sizeof(float));
+        float * g_last        = (float *) malloc(H_v * n_seqs * sizeof(float));
+
+        // Initialize temp_state with zeros for all sequences and heads (state should be empty initially)
+        memset(temp_state, 0, S_v * S_v * H_v * n_seqs * sizeof(float));
+    
+        // Create temporary arrays for entire chunk
+        float * q_chunk_data    = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * k_chunk_data    = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * q_g_exp         = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        float * attn_v_new      = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+
+        // Fill temporary arrays with data from all sequences and heads
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                float * q_ptr = q_chunk_data + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+                float * k_ptr = k_chunk_data + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+                float * g_ptr       = g + seq * (chunk_size * H_v) + head * chunk_size;
+                float * q_g_exp_ptr = q_g_exp + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+
+                // Fill q, k, decay_mask, and g data
                 for (int64_t i = 0; i < chunk_size; i++) {
-                    for (int64_t j = 0; j < chunk_size; j++) {
-                        mask[i * chunk_size + j] = (j > i); // True for upper triangular (excluding diagonal)
+                    for (int64_t d = 0; d < S_v; d++) {
+                        q_ptr[i * S_v + d] = q_chunk(seq, head, i, d);
+                        k_ptr[i * S_v + d] = k_chunk(seq, head, i, d);
                     }
+                    g_ptr[i] = g_chunk(seq, head, i);
                 }
-                                                
-                // Python loop implementation:
-                // q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
-                // attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
-                delta_compute_q_k_attn_f32(q_chunk, k_chunk, decay_mask_chunk, attn, mask, chunk_size, S_k);
-                
-                // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
-                // Calculate the correct state pointer for this head and sequence
-                float * head_state_data = state_data + (seq * S_v * S_v * H_v) + (head * S_v * S_v);
-                
-                
-                delta_matmul_state_f32(k_cumdecay_chunk, head_state_data, v_prime, chunk_size, S_k, S_v);
-                
-                // v_new = v_i - v_prime
-                delta_tensor_subtract_f32(value_chunk, v_prime, v_new, chunk_size * S_v);
-                
-                // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
-                float * q_g_exp = (float *) malloc(chunk_size * S_k * sizeof(float));
+
+                // Compute q_g_exp = q * g.exp()
                 for (int64_t i = 0; i < chunk_size; i++) {
-                    for (int64_t d = 0; d < S_k; d++) {
-                        int64_t q_idx = i * S_k + d;
-                        q_g_exp[q_idx] = q_chunk[q_idx] * expf(g_chunk[i]);
+                    for (int64_t d = 0; d < S_v; d++) {
+                        q_g_exp_ptr[i * S_v + d] = q_ptr[i * S_v + d] * expf(g_ptr[i]);
                     }
                 }
-                delta_matmul_state_f32(q_g_exp, head_state_data, attn_inter, chunk_size, S_k, S_v);
-                
-                // core_attn_out[:, :, i] = attn_inter + attn @ v_new
-                float * attn_v_new = (float *) malloc(chunk_size * S_v * sizeof(float));
-                delta_matmul_state_f32(attn, v_new, attn_v_new, chunk_size, chunk_size, S_v);
-                delta_tensor_add_f32(attn_inter, attn_v_new, core_attn_out_chunk, chunk_size * S_v);
-                
-                // Store the result in the output tensor
+            }
+        }
+
+        print_debug_info(q_chunk_data, chunk_size * S_v * H_v * n_seqs, "q_i_chunk", chunk);
+        print_debug_info(k_chunk_data, chunk_size * S_v * H_v * n_seqs, "k_i_chunk", chunk);
+
+        // Step 4: Compute NEW attention matrix for this chunk: attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
+        // Note: decay_mask[:, :, i] means we need to use the decay_mask for this specific chunk
+        // The mask applied is the simple causal attention mask: torch.triu(torch.ones(chunk_size, chunk_size), diagonal=1)
+        
+        // Now compute attention for all sequences and heads together
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                float * attn_ptr = attn + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+                const float * q_ptr = q_chunk_data + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+                const float * k_ptr = k_chunk_data + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+
+                float * k_trans = (float *) malloc(chunk_size * S_v * sizeof(float));
+                for (int i = 0; i < S_v; i++) {
+                    for (int j = 0; j < chunk_size; j++) {
+                        k_trans[i * chunk_size + j] = k_ptr[j * S_v + i];
+                    }
+                }
+
+                delta_matmul_f32(q_ptr, k_trans, attn_ptr, chunk_size, chunk_size, S_v);
+            }
+        }
+        print_debug_info(attn, chunk_size * chunk_size * H_v * n_seqs, "q_k_trans", chunk);
+
+
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
                 for (int64_t i = 0; i < chunk_size; i++) {
-                    for (int64_t d = 0; d < S_v; d++) {
-                        if ((chunk * chunk_size + i) >= n_tokens) continue;
-                        int64_t output_idx = seq * (n_tokens * S_v * H_v) + head * (n_tokens * S_v) + (chunk * chunk_size + i) * S_v + d; 
-                        output[output_idx] = core_attn_out_chunk[i * S_v + d];
+                    for (int64_t j = 0; j < chunk_size; j++) {
+                        float * attn_ptr = attn + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+                        const float * decay_mask_ptr = (float *) src5->data + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+                        float attn_val = attn_ptr[i * chunk_size + j] * decay_mask_ptr[i * chunk_size + j];
+                        // Apply simple causal attention mask (upper triangular with diagonal=1)
+                        // This corresponds to: torch.triu(torch.ones(chunk_size, chunk_size), diagonal=1)
+                        if (j > i) {
+                            attn_val = 0.0f;
+                        }
+                        attn_ptr[i * chunk_size + j] = attn_val;
                     }
                 }
-                
-                // g_last = g[:, :, i, -1, None, None].exp()
-                *g_last = expf(g_chunk[chunk_size - 1]);
-                
-                // Prepare g_diff_exp = (g[:, :, i, -1, None] - g[:, :, i]).exp()
-                float g_last_val = g_chunk[chunk_size - 1];
+            }
+        }
+        
+        print_debug_info(attn, chunk_size * chunk_size * H_v * n_seqs, "attn_step4_new_chunk", chunk);
+
+        // v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
+        // k_cumdecay has shape [chunk_size, v_head_dim], state has shape [v_head_dim, v_head_dim]
+        delta_matmul_state_chunk_f32(k_cumdecay, state_data, v_prime, chunk_size, S_v, S_v, n_seqs, H_v);
+        print_debug_info(v_prime, chunk_size * S_v * H_v * n_seqs, "v_prime_chunk", chunk);
+
+        // v_new = v_i - v_prime
+        delta_tensor_subtract_chunk_f32(value, v_prime, v_new, chunk_size * S_v, n_seqs, H_v);
+        print_debug_info(v_new, chunk_size * S_v * H_v * n_seqs, "v_new_chunk", chunk);
+
+        // attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
+        delta_matmul_state_chunk_f32(q_g_exp, state_data, attn_inter, chunk_size, S_v, S_v, n_seqs, H_v);
+        print_debug_info(attn_inter, chunk_size * S_v * H_v * n_seqs, "attn_inter_chunk", chunk);
+
+        // core_attn_out[:, :, i] = attn_inter + attn @ v_new
+        // Use regular matrix multiplication for attn @ v_new
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                const float * attn_ptr = attn + seq * (chunk_size * chunk_size * H_v) + head * (chunk_size * chunk_size);
+                const float * v_new_ptr = v_new + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+                float * attn_v_new_ptr = attn_v_new + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+                                
+                // Compute attn @ v_new: [chunk_size, chunk_size] @ [chunk_size, S_v] -> [chunk_size, S_v]
+                delta_matmul_f32(attn_ptr, v_new_ptr, attn_v_new_ptr, chunk_size, S_v, chunk_size);
+            }
+        }
+        print_debug_info(attn_v_new, chunk_size * S_v * H_v * n_seqs, "attn_v_new_chunk", chunk);
+        delta_tensor_add_chunk_f32(attn_inter, attn_v_new, core_attn_out, chunk_size * S_v, n_seqs, H_v);
+        print_debug_info(core_attn_out, chunk_size * S_v * H_v * n_seqs, "core_attn_out_chunk", chunk);
+
+        // Prepare g_last and g_diff_exp for state update
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                float * g_ptr = g + seq * (chunk_size * H_v) + head * chunk_size;
+                float g_last_val         = g_ptr[chunk_size - 1];
+                g_last[seq * H_v + head] = expf(g_last_val);
+
+                float * g_diff_exp_ptr = g_diff_exp + seq * (chunk_size * H_v) + head * chunk_size;
                 for (int64_t i = 0; i < chunk_size; i++) {
-                    g_diff_exp[i] = expf(g_last_val - g_chunk[i]);
+                    float diff        = g_last_val - g_ptr[i];
+                    g_diff_exp_ptr[i] = expf(diff);
                 }
-                
-                // last_recurrent_state = (
-                //     last_recurrent_state * g_last
-                //     + (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new
-                // )
-                float * new_recurrent_state = (float *) malloc(S_v * S_v * sizeof(float));
-                
-                
-                delta_update_recurrent_state_f32(head_state_data, g_last, k_chunk, g_diff_exp, v_new,
-                                                 new_recurrent_state, chunk_size, S_v, S_v);
-                
-                
-                // Store the new state
-                for (int64_t i = 0; i < S_v; i++) {
+            }
+        }
+
+        print_debug_info(g_last, H_v * n_seqs, "g_last_chunk", chunk);
+        print_debug_info(g_diff_exp, chunk_size * H_v * n_seqs, "g_diff_exp", chunk);
+
+        float * k_g_diffexp = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                for (int64_t i = 0; i < chunk_size; i++) {
                     for (int64_t j = 0; j < S_v; j++) {
-                        int64_t state_idx = seq * S_v * S_v * H_v + head * S_v * S_v + i * S_v + j;
-                        new_state[state_idx] = new_recurrent_state[i * S_v + j];
+                        k_g_diffexp[seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v) + i * S_v + j] = 
+                            k_chunk(seq, head, i, j) * g_diff_exp[seq * (chunk_size * H_v) + head * chunk_size + i];
                     }
                 }
-                
-                // Update the original state tensor with the new state for the next chunk
+            }
+        }
+        print_debug_info(k_g_diffexp, chunk_size * S_v * H_v * n_seqs, "k_g_diffexp", chunk);
+        float * k_g_diffexp_T = (float *) malloc(chunk_size * S_v * H_v * n_seqs * sizeof(float));
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                for (int64_t i = 0; i < S_v; i++) {
+                    for (int64_t j = 0; j < chunk_size; j++) {
+                        k_g_diffexp_T[seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v) + i * chunk_size + j] = 
+                            k_g_diffexp[seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v) + j * S_v + i];
+                    }
+                }
+            }
+        }
+
+        // for (int64_t seq = 0; seq < n_seqs; seq++) {
+        //     for (int64_t head = 0; head < H_v; head++) {
+        //         GGML_LOG_INFO("Sequence %ld, head %ld: \n[ ", seq, head);
+        //         for (int i = 0; i < chunk_size; i++) {
+        //             GGML_LOG_INFO("[ ");
+        //             for (int j = 0; j < S_v; j++) {
+        //                 GGML_LOG_INFO("%.6f", k_g_diffexp[(chunk_size * S_v * H_v) * seq + (chunk_size * S_v) * head + i * S_v + j]);
+        //                 if (j < chunk_size - 1) {
+        //                     GGML_LOG_INFO(", ");
+        //                 }
+        //             }
+        //             GGML_LOG_INFO("], \n");
+        //         }
+        //         GGML_LOG_INFO("]\n");
+        //     }
+        // }
+
+        print_debug_info(k_g_diffexp_T, chunk_size * S_v * H_v * n_seqs, "k_g_diffexp_T", chunk);
+
+        float * kgd_mul_vnew = (float *) malloc(S_v * S_v * H_v * n_seqs * sizeof(float));
+
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                delta_matmul_f32(k_g_diffexp_T + (chunk_size * S_v * H_v) * seq + (chunk_size * S_v) * head, 
+                    v_new + (chunk_size * S_v * H_v) * seq + (chunk_size * S_v) * head,
+                    kgd_mul_vnew + (S_v * S_v * H_v) * seq + (S_v * S_v) * head,
+                    S_v, S_v, chunk_size);
+            }
+        }
+        print_debug_info(kgd_mul_vnew, S_v * S_v * H_v * n_seqs, "kgd_mul_vnew", chunk);
+        
+        // for (int64_t seq = 0; seq < n_seqs; seq++) {
+        //     for (int64_t head = 0; head < H_v; head++) {
+        //         GGML_LOG_INFO("Sequence %ld, head %ld: \n[ ", seq, head);
+        //         for (int i = 0; i < S_v; i++) {
+        //             GGML_LOG_INFO("[ ");
+        //             for (int j = 0; j < S_v; j++) {
+        //                 GGML_LOG_INFO("%.6f", kgd_mul_vnew[(S_v * S_v * H_v) * seq + (S_v * S_v) * head + i * S_v + j]);
+        //                 if (j < S_v - 1) {
+        //                     GGML_LOG_INFO(", ");
+        //                 }
+        //             }
+        //             GGML_LOG_INFO("], \n");
+        //         }
+        //         GGML_LOG_INFO("]\n");
+        //     }
+        // }
+
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                for (int i = 0; i < S_v; i++) {
+                    for (int j = 0; j < S_v; j++) {
+                        temp_state[(S_v * S_v * H_v) * seq + (S_v * S_v) * head + S_v * i + j] = 
+                            state_data[(S_v * S_v * H_v) * seq + (S_v * S_v) * head + S_v * i + j] * g_last[seq * H_v + head] + 
+                            kgd_mul_vnew[(S_v * S_v * H_v) * seq + (S_v * S_v) * head + S_v * i + j];
+                    }
+                }
+            }
+        }
+        print_debug_info(temp_state, S_v * S_v * H_v * n_seqs, "temp_state", chunk);
+
+        // Free temporary memory
+        free(q_chunk_data);
+        free(k_chunk_data);
+        free(q_g_exp);
+        free(attn_v_new);
+        free(kgd_mul_vnew);
+        free(k_g_diffexp_T);
+        free(k_g_diffexp);
+
+        // Store output for this chunk (all sequences and heads)
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                float * core_attn_out_ptr = core_attn_out + seq * (chunk_size * S_v * H_v) + head * (chunk_size * S_v);
+
+                // Store output for this chunk
+                for (int64_t i = 0; i < n_tokens; i++) {
+                    for (int64_t d = 0; d < S_v; d++) {
+                        int64_t output_idx =
+                            seq * (n_tokens * S_v * H_v) + head * (n_tokens * S_v) + (chunk * chunk_size + i) * S_v + d;
+                        output[output_idx] = core_attn_out_ptr[i * S_v + d];
+                    }
+                }
+            }
+        }
+        print_debug_info(output, S_v * H_v * n_tokens * n_seqs, "output", chunk);
+
+        // Update state tensor (all sequences and heads)
+        for (int64_t seq = 0; seq < n_seqs; seq++) {
+            for (int64_t head = 0; head < H_v; head++) {
+                float * temp_state_ptr = temp_state + seq * (S_v * S_v * H_v) + head * (S_v * S_v);
+
                 for (int64_t i = 0; i < S_v; i++) {
                     for (int64_t j = 0; j < S_v; j++) {
-                        int64_t state_idx = i * S_v + j;
-                        head_state_data[state_idx] = new_recurrent_state[state_idx];
+                        int64_t state_idx             = seq * S_v * S_v * H_v + head * S_v * S_v + i * S_v + j;
+                        new_state[state_idx]          = temp_state_ptr[i * S_v + j];
+                        *(state_ptr(seq, head, i, j)) = temp_state_ptr[i * S_v + j];
                     }
                 }
-                
-                // Recalculate head_state_data to point to the updated state for the next iteration
-                head_state_data = state_data + (seq * S_v * S_v * H_v) + (head * S_v * S_v);
-                
-                // Free temporary memory
-                free(attn);
-                free(v_prime);
-                free(v_new);
-                free(attn_inter);
-                free(core_attn_out_chunk);
-                free(g_last);
-                free(g_diff_exp);
-                free(mask);
-                free(q_g_exp);
-                free(attn_v_new);
-                free(new_recurrent_state);
-                
-                // Free the value and k_cumdecay allocated at the beginning of the loop
-                free(value_chunk);
-                free(k_cumdecay);
             }
         }
-    }    
-}
+        print_debug_info(new_state, S_v * S_v * H_v * n_seqs, "new_state", chunk);
 
-static void print_debug_info(float * data, size_t size, const char * name, int64_t token) {
-    GGML_LOG_INFO("\nggml-debug: %s (%ld) first 5 values: [%.6f, %.6f, %.6f, %.6f, %.6f, ...]\n", 
-        name, token, data[0], data[1], data[2], data[3], data[4]);
-    double sum = 0.0;
-    for (unsigned int i = 0; i < size; i++) {
-        sum += data[i];
+        free(temp_state);
+        free(core_attn_out);
+        free(attn_inter);
+        free(v_new);
+        free(v_prime);
+        free(g_diff_exp);
+        free(g_last);
     }
-    GGML_LOG_INFO("sum = %.10f\n", sum);
+
+    GGML_ASSERT(output + S_v * H_v * n_tokens * n_seqs == new_state);
+    free(attn);
+    free(value);
+    free(k_cumdecay);
+    free(mask);
+    free(g);
 }
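For orientation, the chunked kernel above mirrors the Python reference quoted in its comments. Before the chunk loop it applies the triangular updates plus the identity to the precomputed attention block (src8) and derives value = attn @ v_beta and k_cumdecay = attn @ (k_beta * g.exp()); then, per chunk and per (seq, head) block, in the reference's notation:

    attn          = (q_i @ k_i.transpose(-1, -2) * decay_mask_i).masked_fill_(mask, 0)
    v_prime       = k_cumdecay @ last_recurrent_state
    v_new         = value - v_prime
    attn_inter    = (q_i * g_i.exp()) @ last_recurrent_state
    core_attn_out = attn_inter + attn @ v_new
    last_recurrent_state = last_recurrent_state * g_i[-1].exp()
                           + (k_i * (g_i[-1] - g_i).exp()).transpose(-1, -2) @ v_new

Each intermediate is dumped with print_debug_info, so the chunked path can be compared value-for-value against the recurrent implementation that follows, whose debug prints this commit re-enables as well.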
 
 void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * params, ggml_tensor * dst) {
@@ -10971,7 +11316,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        //print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state_copy", token);
+        print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state_copy", token);
 
         // 1. last_recurrent_state = last_recurrent_state * g_t (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -10985,7 +11330,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        //print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state_times_g_t", token);
+        print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state_times_g_t", token);
         
         // 2. kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2) (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -11000,7 +11345,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        //print_debug_info(kv_mem, n_seqs * H_v * S_v, "kv_mem", token);
+        print_debug_info(kv_mem, n_seqs * H_v * S_v, "kv_mem", token);
         
         // 3. delta = (v_t - kv_mem) * beta_t (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -11012,7 +11357,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        //print_debug_info(delta, n_seqs * H_v * S_v, "delta", token);
+        print_debug_info(delta, n_seqs * H_v * S_v, "delta", token);
         
         // 4. last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2) (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -11026,7 +11371,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        //print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state", token);
+        print_debug_info(temp_state, n_seqs * H_v * S_v * S_v, "temp_state", token);
         
         // 5. core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2) (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {
@@ -11040,7 +11385,7 @@ void ggml_compute_forward_delta_net_recurrent_f32(const ggml_compute_params * pa
                 }
             }
         }
-        //print_debug_info(attn_out_t, n_seqs * S_v * H_v, "attn_out_t", token);
+        print_debug_info(attn_out_t, n_seqs * S_v * H_v, "attn_out_t", token);
         
         // Store the output for this token (for all seqs and heads)
         for (int64_t seq = 0; seq < n_seqs; seq++) {

+ 4 - 1
src/models/llm_build_qwen3next.cpp

@@ -735,8 +735,11 @@ ggml_tensor * llm_build_qwen3next::build_qwen3next_linear_attn_layer(llm_graph_i
     ggml_tensor * attn_out_1d =
         ggml_view_1d(ctx0, attn_out, output_flat_size, 0);
     cb(attn_out_1d, "attn_out_1d", il);
+
+    ggml_tensor * attn_out_reshaped = ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, n_seq_tokens, num_v_heads, n_seqs);
+cb(attn_out_reshaped, "attn_out_reshaped", il);
     
-    ggml_tensor * attn_out_final = ggml_cont(ctx0, ggml_permute(ctx0, ggml_cont_4d(ctx0, attn_out_1d, head_v_dim, n_seq_tokens, num_v_heads, n_seqs), 0, 2, 1, 3));
+    ggml_tensor * attn_out_final = ggml_cont(ctx0, ggml_permute(ctx0, attn_out_reshaped, 0, 2, 1, 3));
     cb(attn_out_final, "attn_out_final", il);
    
     // Extract the state part (second part of the concatenated tensor)
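The split in this hunk is structural: naming the intermediate lets it be tagged with cb() for graph debugging. Assuming the shapes implied by the ggml_cont_4d call, and noting that ggml_permute(..., 0, 2, 1, 3) swaps dimensions 1 and 2, the three tensors relate as follows:

    // attn_out_1d       : head_v_dim * n_seq_tokens * num_v_heads * n_seqs elements, flat
    // attn_out_reshaped : [head_v_dim, n_seq_tokens, num_v_heads, n_seqs]
    // attn_out_final    : [head_v_dim, num_v_heads, n_seq_tokens, n_seqs]  (after permute + cont)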