@@ -5431,9 +5431,9 @@ struct ggml_tensor * ggml_delta_net(
         struct ggml_tensor * conv_bias,
         struct ggml_tensor * beta,
         struct ggml_tensor * state,
-        int chunk_size,
         bool use_qk_l2norm,
         float scale) {
+
     GGML_ASSERT(ggml_is_contiguous(k));
     GGML_ASSERT(ggml_is_contiguous(v));
     GGML_ASSERT(ggml_is_contiguous(q));
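With chunk_size removed from the public entry point, the path below no longer exposes chunked processing; the new ggml_delta_net_op introduced further down walks the sequence strictly token by token. Reading off that per-token loop, each head keeps a small matrix memory S that is updated per token as

    S'  = exp(g_t) * S_{t-1}
    S_t = S' + beta_t * (v_t - S' k_t) k_t^T
    o_t = S_t q_t

where beta_t is the sigmoid-gated write strength, g_t the log decay, and q_t is scaled by `scale` (and optionally L2-normalized together with k_t).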
@@ -5474,10 +5474,12 @@ struct ggml_tensor * ggml_delta_net(
     // Apply sigmoid to beta for gating
     struct ggml_tensor * beta_sigmoid = ggml_sigmoid(ctx, beta);
+
+    // Concatenate q, k, v for convolution processing
     struct ggml_tensor * mixed_qkv = ggml_concat(ctx, q_norm, k_norm, 1);
     mixed_qkv = ggml_concat(ctx, mixed_qkv, v, 1);
-    u_int32_t dim = (S_v * H_v) + 2 * (H_k * S_k);
+    uint32_t dim = (S_v * H_v) + 2 * (H_k * S_k);
     mixed_qkv = ggml_reshape_3d(ctx, mixed_qkv, 1, dim, n_tokens);
     struct ggml_tensor * mixed_qkv_padded = ggml_pad(ctx, mixed_qkv, 3, 0, 0, 0);
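Note on the dim computation above: it is the total channel count of the concatenated projections, with v contributing S_v * H_v channels and q and k contributing H_k * S_k each, which is why mixed_qkv is then reshaped to [1, dim, n_tokens] before the padded causal convolution. For illustration only, with hypothetical sizes S_k = 128, H_k = 16, S_v = 128, H_v = 32, this gives dim = 128*32 + 2*16*128 = 8192.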
@@ -5566,33 +5568,219 @@ struct ggml_tensor * ggml_delta_net(
         q_broadcast = ggml_reshape_4d(ctx, q_broadcast, S_k, H_v, n_tokens, 1);
         k_broadcast = ggml_reshape_4d(ctx, k_broadcast, S_k, H_v, n_tokens, 1);
     }
+
+    struct ggml_tensor * v_reshape = ggml_reshape_4d(ctx, v_conv, S_v, H_v, n_tokens, 1);
+    struct ggml_tensor * v_broadcast = ggml_repeat_4d(ctx, v_reshape, S_v, H_v, n_tokens, n_seqs);
+    struct ggml_tensor * g_reshape = ggml_reshape_4d(ctx, g, 1, H_v, n_tokens, n_seqs);
+    q_broadcast = ggml_repeat_4d(ctx, q_broadcast, S_k, H_v, n_tokens, n_seqs);
+    k_broadcast = ggml_repeat_4d(ctx, k_broadcast, S_k, H_v, n_tokens, n_seqs);
+    struct ggml_tensor * beta_reshape = ggml_reshape_4d(ctx, beta_sigmoid, 1, H_v, n_tokens, 1);
+    struct ggml_tensor * beta_broadcast = ggml_repeat_4d(ctx, beta_reshape, 1, H_v, n_tokens, n_seqs);
+    struct ggml_tensor * state_broadcast = ggml_repeat_4d(ctx, state, S_v, S_v, H_v, n_seqs);
-    // concat output and new_state
-    const int64_t ne[4] = { S_v * H_v, n_tokens + H_v * n_seqs, 1, 1 };
+    // Call tensor-level kernel with convolved and processed tensors
+    return ggml_delta_net_op(ctx, q_broadcast, k_broadcast, v_broadcast, g_reshape, beta_broadcast, state_broadcast, use_qk_l2norm, scale);
+}
+
+struct ggml_tensor * ggml_delta_net_op(
+        struct ggml_context * ctx,
+        struct ggml_tensor * q,
+        struct ggml_tensor * k,
+        struct ggml_tensor * v,
+        struct ggml_tensor * g,
+        struct ggml_tensor * beta,
+        struct ggml_tensor * state,
+        bool use_qk_l2norm,
+        float scale) {
+
+    // Validate dimensions
+    GGML_ASSERT(ggml_is_contiguous(q));
+    GGML_ASSERT(ggml_is_contiguous(k));
+    GGML_ASSERT(ggml_is_contiguous(v));
+    GGML_ASSERT(ggml_is_contiguous(g));
+    GGML_ASSERT(ggml_is_contiguous(beta));
+    GGML_ASSERT(ggml_is_contiguous(state));
+
+    const int64_t S_k      = q->ne[0]; // head dimension for q/k
+    const int64_t H_v      = q->ne[1]; // number of heads (already processed to match v)
+    const int64_t n_tokens = q->ne[2];
+    const int64_t n_seqs   = q->ne[3];
+
+    const int64_t S_v = v->ne[0]; // head dimension for v
+
+    // Validate dimensions (q and k should now have same head count as v)
+    GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_v && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
+    GGML_ASSERT(v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
+    GGML_ASSERT(g->ne[0] == 1 && g->ne[1] == H_v && g->ne[2] == n_tokens && g->ne[3] == n_seqs);
+    GGML_ASSERT(beta->ne[0] == 1 && beta->ne[1] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
+    GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v && state->ne[2] == H_v && state->ne[3] == n_seqs);
+
+    // Create output tensor: [S_v, H_v, n_tokens, n_seqs]
+    struct ggml_tensor * output = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S_v, H_v, n_tokens, n_seqs);
+
+    // Create new state tensor: [S_v, S_v, H_v, n_seqs]
+    struct ggml_tensor * new_state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S_v, S_v, H_v, n_seqs);
+
+    // Process each sequence independently
+    for (int64_t seq_idx = 0; seq_idx < n_seqs; ++seq_idx) {
+        // Extract current sequence data
+        struct ggml_tensor * q_seq = ggml_view_4d(ctx, q, S_k, H_v, n_tokens, 1,
+                                                  q->nb[1], q->nb[2], q->nb[3],
+                                                  seq_idx * q->nb[3]);
+        struct ggml_tensor * k_seq = ggml_view_4d(ctx, k, S_k, H_v, n_tokens, 1,
+                                                  k->nb[1], k->nb[2], k->nb[3],
+                                                  seq_idx * k->nb[3]);
+        struct ggml_tensor * v_seq = ggml_view_4d(ctx, v, S_v, H_v, n_tokens, 1,
+                                                  v->nb[1], v->nb[2], v->nb[3],
+                                                  seq_idx * v->nb[3]);
+        struct ggml_tensor * g_seq = ggml_view_4d(ctx, g, 1, H_v, n_tokens, 1,
+                                                  g->nb[1], g->nb[2], g->nb[3],
+                                                  seq_idx * g->nb[3]);
+        struct ggml_tensor * beta_seq = ggml_view_4d(ctx, beta, 1, H_v, n_tokens, 1,
+                                                     beta->nb[1], beta->nb[2], beta->nb[3],
+                                                     seq_idx * beta->nb[3]);
+        struct ggml_tensor * state_seq = ggml_view_4d(ctx, state, S_v, S_v, H_v, 1,
+                                                      state->nb[1], state->nb[2], state->nb[3],
+                                                      seq_idx * state->nb[3]);
+
+        // Process each head
+        for (int64_t head_idx = 0; head_idx < H_v; ++head_idx) {
+            // Extract current head data (offset selects the head, nb1 strides over tokens)
+            struct ggml_tensor * q_head = ggml_view_3d(ctx, q_seq, S_k, n_tokens, 1,
+                                                       q_seq->nb[2], q_seq->nb[3],
+                                                       head_idx * q_seq->nb[1]);
+            struct ggml_tensor * k_head = ggml_view_3d(ctx, k_seq, S_k, n_tokens, 1,
+                                                       k_seq->nb[2], k_seq->nb[3],
+                                                       head_idx * k_seq->nb[1]);
+            struct ggml_tensor * v_head = ggml_view_3d(ctx, v_seq, S_v, n_tokens, 1,
+                                                       v_seq->nb[2], v_seq->nb[3],
+                                                       head_idx * v_seq->nb[1]);
+            struct ggml_tensor * g_head = ggml_view_3d(ctx, g_seq, 1, n_tokens, 1,
+                                                       g_seq->nb[2], g_seq->nb[3],
+                                                       head_idx * g_seq->nb[1]);
+            struct ggml_tensor * beta_head = ggml_view_3d(ctx, beta_seq, 1, n_tokens, 1,
+                                                          beta_seq->nb[2], beta_seq->nb[3],
+                                                          head_idx * beta_seq->nb[1]);
+            struct ggml_tensor * state_head = ggml_view_3d(ctx, state_seq, S_v, S_v, 1,
+                                                           state_seq->nb[1], state_seq->nb[2],
+                                                           head_idx * state_seq->nb[2]);
+
+            // Make the per-head slices contiguous so the per-token 1d views below index them correctly
+            q_head = ggml_cont(ctx, q_head);       // [S_k, n_tokens]
+            k_head = ggml_cont(ctx, k_head);       // [S_k, n_tokens]
+            v_head = ggml_cont(ctx, v_head);       // [S_v, n_tokens]
+            g_head = ggml_cont(ctx, g_head);       // [1, n_tokens]
+            beta_head = ggml_cont(ctx, beta_head); // [1, n_tokens]
+
+            // Process each token - apply L2 normalization and scaling per token as original
+            struct ggml_tensor * current_state = state_head;
+            struct ggml_tensor * output_head = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S_v, n_tokens);
+
+            for (int64_t t = 0; t < n_tokens; ++t) {
+                // Extract current token data
+                struct ggml_tensor * q_t = ggml_view_1d(ctx, q_head, S_k, t * S_k * sizeof(float));
+                struct ggml_tensor * k_t = ggml_view_1d(ctx, k_head, S_k, t * S_k * sizeof(float));
+                struct ggml_tensor * v_t = ggml_view_1d(ctx, v_head, S_v, t * S_v * sizeof(float));
+                struct ggml_tensor * g_t = ggml_view_1d(ctx, g_head, 1, t * sizeof(float));
+                struct ggml_tensor * beta_t = ggml_view_1d(ctx, beta_head, 1, t * sizeof(float));
+
+                // Apply L2 normalization if requested - per token as in original
+                if (use_qk_l2norm) {
+                    // Compute L2 norm for q_t and k_t
+                    struct ggml_tensor * q_norm = ggml_l2_norm(ctx, q_t, 1e-6f);
+                    struct ggml_tensor * k_norm = ggml_l2_norm(ctx, k_t, 1e-6f);
+                    q_t = q_norm;
+                    k_t = k_norm;
+                }
+
+                // Apply scaling to query - per token as in original
+                q_t = ggml_scale(ctx, q_t, scale);
+
+                // Apply gate decay to state: state = state * exp(g_t)
+                struct ggml_tensor * g_exp = ggml_exp(ctx, g_t);
+                // g_exp has a single element, so ggml_mul broadcasts it across the [S_v, S_v] state
+                current_state = ggml_mul(ctx, current_state, g_exp);
+
+                // Compute kv_mem = state @ k_t^T
+                struct ggml_tensor * k_t_reshaped = ggml_reshape_2d(ctx, k_t, S_k, 1);
+                struct ggml_tensor * kv_mem = ggml_mul_mat(ctx, current_state, k_t_reshaped); // [S_v, 1]
+                kv_mem = ggml_reshape_1d(ctx, kv_mem, S_v);
+
+                // Compute delta = (v_t - kv_mem) * beta_t
+                struct ggml_tensor * v_minus_kv = ggml_sub(ctx, v_t, kv_mem);
+                // Broadcast beta_t through multiplication (GGML auto-broadcasts)
+                struct ggml_tensor * delta = ggml_mul(ctx, v_minus_kv, beta_t);
+
+                // Update state: state = state + outer(k_t, delta)
+                struct ggml_tensor * k_t_reshaped_2 = ggml_reshape_2d(ctx, k_t, 1, S_k);
+                struct ggml_tensor * delta_reshaped = ggml_reshape_2d(ctx, delta, 1, S_v);
+                struct ggml_tensor * outer_product = ggml_mul_mat(ctx, delta_reshaped, k_t_reshaped_2); // [S_k, S_v]
+
+                // Handle S_k != S_v case
+                if (S_k == S_v) {
+                    current_state = ggml_add(ctx, current_state, outer_product);
+                } else {
+                    // For S_k != S_v, handle dimension mismatch
+                    if (S_k < S_v) {
+                        // Pad outer_product with zeros
+                        struct ggml_tensor * padding = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, S_v - S_k, S_v);
+                        outer_product = ggml_concat(ctx, outer_product, padding, 0);
+                    } else if (S_k > S_v) {
+                        // Truncate outer_product
+                        outer_product = ggml_view_2d(ctx, outer_product, S_v, S_v, outer_product->nb[1], 0);
+                    }
+                    current_state = ggml_add(ctx, current_state, outer_product);
+                }
+
+                // Compute output = current_state @ q_t^T
+                struct ggml_tensor * q_t_reshaped = ggml_reshape_2d(ctx, q_t, S_k, 1);
+                struct ggml_tensor * output_t = ggml_mul_mat(ctx, current_state, q_t_reshaped); // [S_v, 1]
+                output_t = ggml_reshape_1d(ctx, output_t, S_v);
+
+                // Store output for this token using view and copy
+                struct ggml_tensor * output_slice = ggml_view_1d(ctx, output_head, S_v, t * S_v * sizeof(float));
+                output_slice = ggml_cpy(ctx, output_t, output_slice);
+            }
+
+            // Copy the accumulated per-head output into the output tensor for this head/sequence;
+            // output_head is already laid out as [S_v, n_tokens]
+            struct ggml_tensor * output_slice = ggml_view_2d(ctx, output, S_v, n_tokens,
+                                                             output->nb[2],
+                                                             head_idx * output->nb[1] + seq_idx * output->nb[3]);
+            output_slice = ggml_cpy(ctx, output_head, output_slice);
+
+            // Copy the final per-head state into new_state for this head/sequence
+            struct ggml_tensor * state_slice = ggml_view_2d(ctx, new_state, S_v, S_v,
+                                                            new_state->nb[1],
+                                                            head_idx * new_state->nb[2] + seq_idx * new_state->nb[3]);
+            state_slice = ggml_cpy(ctx, current_state, state_slice);
+        }
+    }
+
+    // Concatenate output and new_state into final result
+    const int64_t ne[4] = { S_v * H_v, n_tokens + S_v * n_seqs, 1, 1 };
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
-    // Set operation parameters for the delta rule computation
-    int32_t params[8] = {
-        chunk_size,
-        use_qk_l2norm ? 1 : 0,
-        0, 0, // reserved
-        0, 0, 0 // scale and other params
-    };
-    memcpy(params + 4, &scale, sizeof(float));
-    ggml_set_op_params(result, params, sizeof(params));
+    // Copy output data using proper tensor operations
+    struct ggml_tensor * output_flat = ggml_reshape_2d(ctx, output, S_v * H_v, n_tokens);
+    struct ggml_tensor * output_result_slice = ggml_view_2d(ctx, result, S_v * H_v, n_tokens,
+                                                            result->nb[1], 0);
+    output_result_slice = ggml_cpy(ctx, output_flat, output_result_slice);
-    // Use custom operation for the gated delta rule computation
-    result->op = GGML_OP_DELTA_NET;
-    result->src[0] = q_broadcast;
-    result->src[1] = k_broadcast;
-    result->src[2] = v_conv;
-    result->src[3] = g;
-    result->src[4] = beta_sigmoid;
-    result->src[5] = state;
+    // Copy new_state data into the rows after the n_tokens output rows
+    struct ggml_tensor * new_state_flat = ggml_reshape_2d(ctx, new_state, S_v * H_v, S_v * n_seqs);
+    struct ggml_tensor * state_result_slice = ggml_view_2d(ctx, result, S_v * H_v, S_v * n_seqs,
+                                                           result->nb[1], n_tokens * result->nb[1]);
+    state_result_slice = ggml_cpy(ctx, new_state_flat, state_result_slice);
     return result;
 }
-
 
 // ggml_rwkv_wkv7
 
 struct ggml_tensor * ggml_rwkv_wkv7(
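For reference while reviewing the per-token loop above, here is a plain scalar sketch of the same recurrence for a single head and sequence. It is a hypothetical standalone helper (delta_net_ref is not part of the patch), the optional L2 normalization of q/k is omitted, and S_k == S_v is assumed for simplicity:

    #include <math.h>
    #include <stdint.h>

    // Illustrative reference only; names and layout are hypothetical.
    // state: S_v x S_v matrix, row-major; q, k, v: S_v floats per token;
    // g, beta: one float per token (beta already passed through sigmoid);
    // out: S_v floats per token.
    static void delta_net_ref(float * state,
                              const float * q, const float * k, const float * v,
                              const float * g, const float * beta,
                              float * out, int64_t S_v, int64_t n_tokens, float scale) {
        for (int64_t t = 0; t < n_tokens; ++t) {
            const float * q_t = q + t*S_v;
            const float * k_t = k + t*S_v;
            const float * v_t = v + t*S_v;
            const float  g_e  = expf(g[t]); // gate decay factor
            const float  b_t  = beta[t];    // write strength

            for (int64_t i = 0; i < S_v; ++i) {
                // decay row i of the state, then read its current prediction for k_t
                float kv_mem = 0.0f;
                for (int64_t j = 0; j < S_v; ++j) {
                    state[i*S_v + j] *= g_e;
                    kv_mem += state[i*S_v + j] * k_t[j];
                }
                // delta rule: write the gated prediction error back along k_t
                const float delta = (v_t[i] - kv_mem) * b_t;
                for (int64_t j = 0; j < S_v; ++j) {
                    state[i*S_v + j] += delta * k_t[j];
                }
                // read out with the scaled query
                float o = 0.0f;
                for (int64_t j = 0; j < S_v; ++j) {
                    o += state[i*S_v + j] * (scale * q_t[j]);
                }
                out[t*S_v + i] = o;
            }
        }
    }

Each row of the state evolves independently (decay, read, delta write, readout), which is the same ordering the graph code follows: the decay is applied before kv_mem is read, and the output is taken from the updated state.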