Piotr Wilkin 4 months ago
Parent
Commit
e0c5dff2a7
4 changed files with 223 additions and 25 deletions
  1. examples/model-conversion/scripts/causal/run-org-model.py (+1 -1)
  2. ggml/include/ggml.h (+11 -1)
  3. ggml/src/ggml.c (+210 -22)
  4. src/llama-model.cpp (+1 -1)

+ 1 - 1
examples/model-conversion/scripts/causal/run-org-model.py

@@ -193,7 +193,7 @@ print(f"Input text: {repr(prompt)}")
 print(f"Tokenized: {tokenizer.convert_ids_to_tokens(input_ids[0])}")
 
 with torch.no_grad():
-    outputs = model(input_ids)
+    outputs = model(input_ids.to("cuda"))
     logits = outputs.logits
 
     # Extract logits for the last token (next token prediction)
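Note: hard-coding .to("cuda") assumes the model itself was loaded on a CUDA device; a device-agnostic alternative would be input_ids.to(model.device), since Hugging Face models expose the device their parameters live on.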

+ 11 - 1
ggml/include/ggml.h

@@ -2300,10 +2300,20 @@ extern "C" {
             struct ggml_tensor  * conv_bias,
             struct ggml_tensor  * beta,
             struct ggml_tensor  * state,
-            int                   chunk_size,
             bool                  use_qk_l2norm,
             float                 scale);
 
+    GGML_API struct ggml_tensor * ggml_delta_net_op(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * g,
+            struct ggml_tensor  * beta,
+            struct ggml_tensor  * state,
+            bool                  use_qk_l2norm,
+            float                 scale);
+
     GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
             struct ggml_context * ctx,
             struct ggml_tensor  * r,
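For orientation, a minimal sketch of calling the new entry point directly (shapes and values are illustrative, not from this commit; ggml_delta_net remains the convenience wrapper that derives these tensors from the raw projections):

    // illustrative shapes: S_k = S_v = 128 head dim, H_v = 16 heads, 32 tokens, 1 sequence
    struct ggml_tensor * q     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128,  16, 32, 1);
    struct ggml_tensor * k     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128,  16, 32, 1);
    struct ggml_tensor * v     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128,  16, 32, 1);
    struct ggml_tensor * g     = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,   1,  16, 32, 1);
    struct ggml_tensor * beta  = ggml_new_tensor_4d(ctx, GGML_TYPE_F32,   1,  16, 32, 1);
    struct ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 128, 128, 16, 1);

    // result packs [S_v*H_v, n_tokens] output rows followed by the updated state rows,
    // i.e. ne = [S_v*H_v, n_tokens + S_v*n_seqs]
    struct ggml_tensor * out = ggml_delta_net_op(ctx, q, k, v, g, beta, state,
                                                 /*use_qk_l2norm=*/true, /*scale=*/1.0f);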

+ 210 - 22
ggml/src/ggml.c

@@ -5431,9 +5431,9 @@ struct ggml_tensor * ggml_delta_net(
         struct ggml_tensor  * conv_bias,
         struct ggml_tensor  * beta,
         struct ggml_tensor  * state,
-        int                   chunk_size,
         bool                  use_qk_l2norm,
         float                 scale) {
+    
     GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
     GGML_ASSERT(ggml_is_contiguous(q));
@@ -5474,10 +5474,12 @@ struct ggml_tensor * ggml_delta_net(
     
     // Apply sigmoid to beta for gating
     struct ggml_tensor * beta_sigmoid = ggml_sigmoid(ctx, beta);
+    
+    // Concatenate q, k, v for convolution processing
     struct ggml_tensor * mixed_qkv = ggml_concat(ctx, q_norm, k_norm, 1);
     mixed_qkv = ggml_concat(ctx, mixed_qkv, v, 1);
 
-    u_int32_t dim = (S_v * H_v) + 2 * (H_k * S_k);
+    uint32_t dim = (S_v * H_v) + 2 * (H_k * S_k);
 
     mixed_qkv = ggml_reshape_3d(ctx, mixed_qkv, 1, dim, n_tokens);
     struct ggml_tensor * mixed_qkv_padded = ggml_pad(ctx, mixed_qkv, 3, 0, 0, 0);
@@ -5566,33 +5568,219 @@ struct ggml_tensor * ggml_delta_net(
         q_broadcast = ggml_reshape_4d(ctx, q_broadcast, S_k, H_v, n_tokens, 1);
         k_broadcast = ggml_reshape_4d(ctx, k_broadcast, S_k, H_v, n_tokens, 1);
     }
+
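+    // Broadcast q/k/v, the gate, beta and the recurrent state across n_seqs so the
+    // per-sequence kernel below sees fully materialized [..., n_tokens, n_seqs] inputs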
+    struct ggml_tensor * v_reshape = ggml_reshape_4d(ctx, v_conv, S_v, H_v, n_tokens, 1);
+    struct ggml_tensor * v_broadcast = ggml_repeat_4d(ctx, v_reshape, S_v, H_v, n_tokens, n_seqs);
+    struct ggml_tensor * g_reshape = ggml_reshape_4d(ctx, g, 1, H_v, n_tokens, n_seqs);
+    q_broadcast = ggml_repeat_4d(ctx, q_broadcast, S_k, H_v, n_tokens, n_seqs);
+    k_broadcast = ggml_repeat_4d(ctx, k_broadcast, S_k, H_v, n_tokens, n_seqs);
+    struct ggml_tensor * beta_reshape = ggml_reshape_4d(ctx, beta_sigmoid, 1, H_v, n_tokens, 1);
+    struct ggml_tensor * beta_broadcast = ggml_repeat_4d(ctx, beta_reshape, 1, H_v, n_tokens, n_seqs);
+    struct ggml_tensor * state_broadcast = ggml_repeat_4d(ctx, state, S_v, S_v, H_v, n_seqs);
     
-    // concat output and new_state
-    const int64_t ne[4] = { S_v * H_v, n_tokens + H_v * n_seqs, 1, 1 };
+    // Call tensor-level kernel with convolved and processed tensors
+    return ggml_delta_net_op(ctx, q_broadcast, k_broadcast, v_broadcast, g_reshape, beta_broadcast, state_broadcast, use_qk_l2norm, scale);
+}
+
+struct ggml_tensor * ggml_delta_net_op(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * g,
+        struct ggml_tensor  * beta,
+        struct ggml_tensor  * state,
+        bool                  use_qk_l2norm,
+        float                 scale) {
+    
+    // Validate dimensions
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(beta));
+    GGML_ASSERT(ggml_is_contiguous(state));
+    
+    const int64_t S_k = q->ne[0];  // head dimension for q/k
+    const int64_t H_v = q->ne[1];  // number of heads (already processed to match v)
+    const int64_t n_tokens = q->ne[2];
+    const int64_t n_seqs = q->ne[3];
+    
+    const int64_t S_v = v->ne[0];  // head dimension for v
+    
+    // Validate dimensions (q and k should now have same head count as v)
+    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_v && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+    GGML_ASSERT(v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
+    GGML_ASSERT(g->ne[0] == 1 && g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
+    GGML_ASSERT(beta->ne[0] == 1 && beta->ne[1] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+    GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);
+    
+    // Create output tensor: [S_v, H_v, n_tokens, n_seqs]
+    struct ggml_tensor * output = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S_v, H_v, n_tokens, n_seqs);
+    
+    // Create new state tensor: [S_v, S_v, H_v, n_seqs]
+    struct ggml_tensor * new_state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S_v, S_v, H_v, n_seqs);
+    
+    // Process each sequence independently
+    for (int64_t seq_idx = 0; seq_idx < n_seqs; ++seq_idx) {
+        // Extract current sequence data
+        struct ggml_tensor * q_seq = ggml_view_4d(ctx, q, S_k, H_v, n_tokens, 1,
+                                                  q->nb[1], q->nb[2], q->nb[3], 
+                                                  seq_idx * q->nb[3]);
+        struct ggml_tensor * k_seq = ggml_view_4d(ctx, k, S_k, H_v, n_tokens, 1,
+                                                  k->nb[1], k->nb[2], k->nb[3],
+                                                  seq_idx * k->nb[3]);
+        struct ggml_tensor * v_seq = ggml_view_4d(ctx, v, S_v, H_v, n_tokens, 1,
+                                                  v->nb[1], v->nb[2], v->nb[3],
+                                                  seq_idx * v->nb[3]);
+        struct ggml_tensor * g_seq = ggml_view_4d(ctx, g, 1, H_v, n_tokens, 1,
+                                                  g->nb[1], g->nb[2], g->nb[3],
+                                                  seq_idx * g->nb[3]);
+        struct ggml_tensor * beta_seq = ggml_view_4d(ctx, beta, 1, H_v, n_tokens, 1,
+                                                     beta->nb[1], beta->nb[2], beta->nb[3],
+                                                     seq_idx * beta->nb[3]);
+        struct ggml_tensor * state_seq = ggml_view_4d(ctx, state, S_v, S_v, H_v, 1,
+                                                      state->nb[1], state->nb[2], state->nb[3],
+                                                      seq_idx * state->nb[3]);
+        
+        // Process each head
+        for (int64_t head_idx = 0; head_idx < H_v; ++head_idx) {
+            // Extract current head data; rows advance by the token stride (nb[2])
+            // and the head offset uses the head stride (nb[1])
+            struct ggml_tensor * q_head = ggml_view_3d(ctx, q_seq, S_k, n_tokens, 1,
+                                                       q_seq->nb[2], q_seq->nb[3],
+                                                       head_idx * q_seq->nb[1]);
+            struct ggml_tensor * k_head = ggml_view_3d(ctx, k_seq, S_k, n_tokens, 1,
+                                                       k_seq->nb[2], k_seq->nb[3],
+                                                       head_idx * k_seq->nb[1]);
+            struct ggml_tensor * v_head = ggml_view_3d(ctx, v_seq, S_v, n_tokens, 1,
+                                                       v_seq->nb[2], v_seq->nb[3],
+                                                       head_idx * v_seq->nb[1]);
+            struct ggml_tensor * g_head = ggml_view_3d(ctx, g_seq, 1, n_tokens, 1,
+                                                       g_seq->nb[2], g_seq->nb[3],
+                                                       head_idx * g_seq->nb[1]);
+            struct ggml_tensor * beta_head = ggml_view_3d(ctx, beta_seq, 1, n_tokens, 1,
+                                                          beta_seq->nb[2], beta_seq->nb[3],
+                                                          head_idx * beta_seq->nb[1]);
+            struct ggml_tensor * state_head = ggml_view_3d(ctx, state_seq, S_v, S_v, 1,
+                                                           state_seq->nb[1], state_seq->nb[2],
+                                                           head_idx * state_seq->nb[2]);
+            
+            // Make each head view contiguous so token t's vector sits at offset t*S,
+            // which is what the per-token 1d views below assume
+            q_head = ggml_cont(ctx, q_head);        // [S_k, n_tokens]
+            k_head = ggml_cont(ctx, k_head);        // [S_k, n_tokens]
+            v_head = ggml_cont(ctx, v_head);        // [S_v, n_tokens]
+            g_head = ggml_cont(ctx, g_head);        // [1, n_tokens]
+            beta_head = ggml_cont(ctx, beta_head);  // [1, n_tokens]
+            
+            // Process each token - apply L2 normalization and scaling per token as original
+            struct ggml_tensor * current_state = state_head;
+            struct ggml_tensor * output_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S_v, n_tokens);
+            
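+            // Per-token gated delta rule:
+            //   S     <- exp(g_t) * S
+            //   delta <- beta_t * (v_t - S k_t)
+            //   S     <- S + delta k_t^T
+            //   o_t   <- S (scale * q_t)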
+            for (int64_t t = 0; t < n_tokens; ++t) {
+                // Extract current token data
+                struct ggml_tensor * q_t = ggml_view_1d(ctx, q_head, S_k, t * S_k * sizeof(float));
+                struct ggml_tensor * k_t = ggml_view_1d(ctx, k_head, S_k, t * S_k * sizeof(float));
+                struct ggml_tensor * v_t = ggml_view_1d(ctx, v_head, S_v, t * S_v * sizeof(float));
+                struct ggml_tensor * g_t = ggml_view_1d(ctx, g_head, 1, t * sizeof(float));
+                struct ggml_tensor * beta_t = ggml_view_1d(ctx, beta_head, 1, t * sizeof(float));
+                
+                // Apply L2 normalization if requested - per token as in original
+                if (use_qk_l2norm) {
+                    // Compute L2 norm for q_t and k_t
+                    struct ggml_tensor * q_norm = ggml_l2_norm(ctx, q_t, 1e-6f);
+                    struct ggml_tensor * k_norm = ggml_l2_norm(ctx, k_t, 1e-6f);
+                    q_t = q_norm;
+                    k_t = k_norm;
+                }
+                
+                // Apply scaling to query - per token as in original
+                q_t = ggml_scale(ctx, q_t, scale);
+                
+                // Apply gate decay to state: state = state * exp(g_t)
+                struct ggml_tensor * g_exp = ggml_exp(ctx, g_t);
+                // g_exp is a single element; ggml_mul broadcasts it across the state
+                current_state = ggml_mul(ctx, current_state, g_exp);
+                
+                // Compute kv_mem = state @ k_t^T
+                struct ggml_tensor * k_t_reshaped = ggml_reshape_2d(ctx, k_t, S_k, 1);
+                struct ggml_tensor * kv_mem = ggml_mul_mat(ctx, current_state, k_t_reshaped);  // [S_v, 1]
+                kv_mem = ggml_reshape_1d(ctx, kv_mem, S_v);
+                
+                // Compute delta = (v_t - kv_mem) * beta_t
+                struct ggml_tensor * v_minus_kv = ggml_sub(ctx, v_t, kv_mem);
+                // Broadcast beta_t through multiplication (GGML auto-broadcasts)
+                struct ggml_tensor * delta = ggml_mul(ctx, v_minus_kv, beta_t);
+                
+                // Update state: state += outer(delta, k_t), i.e. state[v][k] += delta[v] * k_t[k]
+                struct ggml_tensor * k_t_reshaped_2 = ggml_reshape_2d(ctx, k_t, 1, S_k);
+                struct ggml_tensor * delta_reshaped = ggml_reshape_2d(ctx, delta, 1, S_v);
+                struct ggml_tensor * outer_product = ggml_mul_mat(ctx, k_t_reshaped_2, delta_reshaped);  // [S_k, S_v]
+                
+                // Handle S_k != S_v case
+                if (S_k == S_v) {
+                    current_state = ggml_add(ctx, current_state, outer_product);
+                } else {
+                    // For S_k != S_v, handle dimension mismatch
+                    if (S_k < S_v) {
+                        // Pad outer_product with zeros (a fresh tensor is not
+                        // zero-initialized, so scale it by 0 explicitly)
+                        struct ggml_tensor * padding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S_v - S_k, S_v);
+                        padding = ggml_scale(ctx, padding, 0.0f);
+                        outer_product = ggml_concat(ctx, outer_product, padding, 0);
+                    } else if (S_k > S_v) {
+                        // Truncate outer_product
+                        outer_product = ggml_view_2d(ctx, outer_product, S_v, S_v, outer_product->nb[1], 0);
+                    }
+                    current_state = ggml_add(ctx, current_state, outer_product);
+                }
+                
+                // Compute output = current_state @ q_t^T
+                struct ggml_tensor * q_t_reshaped = ggml_reshape_2d(ctx, q_t, S_k, 1);
+                struct ggml_tensor * output_t = ggml_mul_mat(ctx, current_state, q_t_reshaped);  // [S_v, 1]
+                output_t = ggml_reshape_1d(ctx, output_t, S_v);
+                
+                // Store output for this token using view and copy
+                struct ggml_tensor * output_slice = ggml_view_1d(ctx, output_head, S_v, t * S_v * sizeof(float));
+                output_slice = ggml_cpy(ctx, output_t, output_slice);
+            }
+            
+            // Copy the per-head output into the final output tensor; output_head is
+            // already [S_v, n_tokens], so no permutation is needed. The view's row
+            // stride is the token stride nb[2], offset by sequence and head.
+            struct ggml_tensor * output_slice = ggml_view_2d(ctx, output, S_v, n_tokens,
+                                                            output->nb[2],
+                                                            seq_idx * output->nb[3] + head_idx * output->nb[1]);
+            output_slice = ggml_cpy(ctx, output_head, output_slice);
+            
+            // Copy the final per-head state into new_state, offset by sequence and head
+            struct ggml_tensor * state_slice = ggml_view_2d(ctx, new_state, S_v, S_v,
+                                                           new_state->nb[1],
+                                                           seq_idx * new_state->nb[3] + head_idx * new_state->nb[2]);
+            state_slice = ggml_cpy(ctx, current_state, state_slice);
+        }
+    }
+    
+    // Concatenate output and new_state into final result
+    const int64_t ne[4] = { S_v * H_v, n_tokens + S_v * n_seqs, 1, 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
     
-    // Set operation parameters for the delta rule computation
-    int32_t params[8] = {
-        chunk_size,
-        use_qk_l2norm ? 1 : 0,
-        0, 0,  // reserved
-        0, 0, 0  // scale and other params
-    };
-    memcpy(params + 4, &scale, sizeof(float));
-    ggml_set_op_params(result, params, sizeof(params));
+    // Copy output data using proper tensor operations
+    struct ggml_tensor * output_flat = ggml_reshape_2d(ctx, output, S_v * H_v, n_tokens);
+    struct ggml_tensor * output_result_slice = ggml_view_2d(ctx, result, S_v * H_v, n_tokens, 
+                                                           result->nb[1], 0);
+    output_result_slice = ggml_cpy(ctx, output_flat, output_result_slice);
     
-    // Use custom operation for the gated delta rule computation
-    result->op = GGML_OP_DELTA_NET;
-    result->src[0] = q_broadcast;
-    result->src[1] = k_broadcast;
-    result->src[2] = v_conv;
-    result->src[3] = g;
-    result->src[4] = beta_sigmoid;
-    result->src[5] = state;
+    // Copy new_state data; flatten it to rows of S_v*H_v so it packs directly below
+    // the output rows (S_v * n_seqs extra rows, matching the shape declared above)
+    struct ggml_tensor * new_state_flat = ggml_reshape_2d(ctx, new_state, S_v * H_v, S_v * n_seqs);
+    struct ggml_tensor * state_result_slice = ggml_view_2d(ctx, result, S_v * H_v, S_v * n_seqs,
+                                                          result->nb[1], n_tokens * result->nb[1]);
+    state_result_slice = ggml_cpy(ctx, new_state_flat, state_result_slice);
     
     return result;
 }
-
 // ggml_rwkv_wkv7
 
 struct ggml_tensor * ggml_rwkv_wkv7(
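For reference, here is a minimal scalar sketch (plain C, independent of ggml; all names are illustrative) of the recurrence the per-token graph above assembles for one head of one sequence. It omits the optional q/k L2 normalization and uses the same state layout as the graph code, S[v_index * d_k + k_index]:

    #include <math.h>

    /* One head of one sequence: q, k are [n_tokens][d_k], v is [n_tokens][d_v],
     * S is the running state (the graph code assumes d_k == d_v), and g, beta
     * are per-token scalars. */
    static void delta_net_head(int n_tokens, int d_k, int d_v,
                               const float * q, const float * k, const float * v,
                               const float * g, const float * beta,
                               float * S, float * out, float scale) {
        for (int t = 0; t < n_tokens; ++t) {
            const float * qt = q + t * d_k;
            const float * kt = k + t * d_k;
            const float * vt = v + t * d_v;

            // decay: S <- exp(g_t) * S
            const float decay = expf(g[t]);
            for (int i = 0; i < d_v * d_k; ++i) S[i] *= decay;

            // delta_t = beta_t * (v_t - S k_t); S <- S + delta_t k_t^T
            for (int i = 0; i < d_v; ++i) {
                float kv = 0.0f;
                for (int j = 0; j < d_k; ++j) kv += S[i * d_k + j] * kt[j];
                const float delta = beta[t] * (vt[i] - kv);
                for (int j = 0; j < d_k; ++j) S[i * d_k + j] += delta * kt[j];
            }

            // o_t = S (scale * q_t)
            for (int i = 0; i < d_v; ++i) {
                float acc = 0.0f;
                for (int j = 0; j < d_k; ++j) acc += S[i * d_k + j] * scale * qt[j];
                out[t * d_v + i] = acc;
            }
        }
    }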

+ 1 - 1
src/llama-model.cpp

@@ -19018,6 +19018,7 @@ struct llm_build_qwen3next : public llm_graph_context_mamba {
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
+        ggml_set_output(cur);
         res->t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
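A note on the ggml_set_output call added above: it sets GGML_TENSOR_FLAG_OUTPUT on the tensor, which tells the graph allocator that the buffer must survive graph execution and may not be reused for other nodes. A minimal sketch of the pattern (hypothetical tensor names):

    // mark a node as a graph output so its data can be read back after compute
    struct ggml_tensor * logits = ggml_mul_mat(ctx, w_out, hidden);  // hypothetical projection
    ggml_set_output(logits);                // allocator keeps this buffer intact
    ggml_build_forward_expand(gf, logits);  // make it a root of the graph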
@@ -19223,7 +19224,6 @@ private:
                                               conv_bias,       // conv_bias tensor (can be nullptr)
                                               beta,            // beta tensor
                                               state,           // state tensor
-                                              64,              // chunk_size (adjust as needed)
                                               true,            // use_qk_l2norm
                                               1.0f             // scale (adjust based on your model)
         );
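Since the comment leaves scale open, note that for an attention-style readout the conventional choice is the inverse square root of the q/k head dimension; a sketch, assuming that convention applies here:

    // assumption: standard 1/sqrt(d) query scaling; verify against the reference model
    const float scale = 1.0f / sqrtf((float) S_k);  // S_k = q/k head dimension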