|
|
@@ -134,11 +134,11 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
|
|
|
const int64_t H = 123;
|
|
|
const int64_t n_tokens = 123;
|
|
|
const int64_t n_seqs = 123;
|
|
|
- ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, 1, H, n_tokens);
|
|
|
- ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
|
|
|
- ggml_tensor * r = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
|
|
|
+ ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
|
+ ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
|
+ ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
|
ggml_tensor * tf = w;
|
|
|
- ggml_tensor * td = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, S, H, n_tokens);
|
|
|
+ ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
|
|
|
ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
|
|
|
op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
|
|
|
} break;
|
|
|
@@ -2186,11 +2186,13 @@ static bool llm_load_tensors(
|
|
|
layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
|
|
|
|
|
|
layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
- layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
- layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
- layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
- layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
- layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
+ layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
|
|
|
|
|
|
layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
|
|
|
layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
|
|
|
@@ -2214,6 +2216,59 @@ static bool llm_load_tensors(
|
|
|
}
|
|
|
|
|
|
} break;
|
|
|
+ case LLM_ARCH_RWKV6QWEN2:
|
|
|
+ {
|
|
|
+ model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
|
+
|
|
|
+ model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
|
+ model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
|
+
|
|
|
+ const int time_mix_extra_dim = hparams.time_mix_extra_dim;
|
|
|
+ const int time_decay_extra_dim = hparams.time_decay_extra_dim;
|
|
|
+ const int head_size = hparams.wkv_head_size;
|
|
|
+ const int attn_hidden_size = n_embd;
|
|
|
+ const int n_head_kv = hparams.n_head_kv();
|
|
|
+ int attn_key_value_size;
|
|
|
+ if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
|
|
|
+ attn_key_value_size = attn_hidden_size;
|
|
|
+ } else {
|
|
|
+ attn_key_value_size = n_head_kv * head_size;
|
|
|
+ }
|
|
|
+
|
|
|
+ for (int i = 0; i < n_layer; ++i) {
|
|
|
+ auto & layer = model.layers[i];
|
|
|
+
|
|
|
+ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
|
+
|
|
|
+ layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
|
|
|
+ layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
|
|
|
+
|
|
|
+ layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
|
|
|
+ layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
|
|
|
+
|
|
|
+ layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
|
|
|
+ layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
|
|
|
+ layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
|
|
|
+ layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
|
|
|
+ layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
|
|
|
+ layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
|
|
|
+ layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
|
|
|
+ // optional bias tensors
|
|
|
+ layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+ layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
|
+
|
|
|
+ layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
|
|
|
+
|
|
|
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
|
+
|
|
|
+ layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
|
+ layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
|
+ layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
|
+ }
|
|
|
+ } break;
|
|
|
case LLM_ARCH_CHAMELEON:
|
|
|
{
|
|
|
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
|
@@ -3337,16 +3392,20 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
|
|
|
const struct llama_layer * layer,
|
|
|
struct ggml_tensor * cur,
|
|
|
struct ggml_tensor * x_prev,
|
|
|
- struct ggml_tensor ** wkv_state) {
|
|
|
+ struct ggml_tensor ** wkv_state,
|
|
|
+ size_t wkv_head_size,
|
|
|
+ size_t head_count_kv) {
|
|
|
size_t n_embd = cur->ne[0];
|
|
|
size_t n_seq_tokens = cur->ne[1];
|
|
|
size_t n_seqs = cur->ne[2];
|
|
|
|
|
|
- size_t head_size = layer->time_mix_first->ne[0];
|
|
|
- size_t head_count = layer->time_mix_first->ne[1];
|
|
|
+ size_t head_size = wkv_head_size;
|
|
|
+ size_t head_count = n_embd / head_size;
|
|
|
|
|
|
size_t n_tokens = n_seqs * n_seq_tokens;
|
|
|
|
|
|
+ bool is_qrwkv = layer->time_mix_first == nullptr;
|
|
|
+
|
|
|
struct ggml_tensor * sx = ggml_sub(ctx, x_prev, cur);
|
|
|
|
|
|
sx = ggml_reshape_2d(ctx, sx, n_embd, n_tokens);
|
|
|
@@ -3375,69 +3434,64 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
|
|
|
xxx
|
|
|
);
|
|
|
|
|
|
- struct ggml_tensor *mw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
|
|
|
- struct ggml_tensor *mk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
|
|
|
- struct ggml_tensor *mv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
|
|
|
- struct ggml_tensor *mr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
|
|
|
- struct ggml_tensor *mg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
|
|
|
-
|
|
|
- struct ggml_tensor * xw = ggml_add(
|
|
|
- ctx,
|
|
|
- ggml_mul(
|
|
|
- ctx,
|
|
|
- ggml_add(ctx, mw, layer->time_mix_lerp_w),
|
|
|
- sx
|
|
|
- ),
|
|
|
- cur
|
|
|
- );
|
|
|
+ struct ggml_tensor *xw, *xk, *xv, *xr, *xg;
|
|
|
+ if (layer->time_mix_lerp_fused) {
|
|
|
+ // fusing the lerp weights into a single tensor improves performance
|
|
|
+ sx = ggml_reshape_3d(ctx, sx, n_embd, 1, n_tokens);
|
|
|
+ cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens);
|
|
|
+ xxx = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xxx, layer->time_mix_lerp_fused), sx), cur);
|
|
|
+ xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
|
|
|
+ xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
|
|
|
+ xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
|
|
|
+ xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
|
|
|
+ xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
|
|
|
+ } else {
|
|
|
+ // for backward compatibility
|
|
|
+ xw = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], 0);
|
|
|
+ xk = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * sizeof(float));
|
|
|
+ xv = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 2 * sizeof(float));
|
|
|
+ xr = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 3 * sizeof(float));
|
|
|
+ xg = ggml_view_2d(ctx, xxx, n_embd, n_tokens, xxx->nb[1], n_embd * n_tokens * 4 * sizeof(float));
|
|
|
|
|
|
- struct ggml_tensor * xk = ggml_add(
|
|
|
- ctx,
|
|
|
- ggml_mul(
|
|
|
- ctx,
|
|
|
- ggml_add(ctx, mk, layer->time_mix_lerp_k),
|
|
|
- sx
|
|
|
- ),
|
|
|
- cur
|
|
|
- );
|
|
|
+ xw = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xw, layer->time_mix_lerp_w), sx), cur);
|
|
|
+ xk = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xk, layer->time_mix_lerp_k), sx), cur);
|
|
|
+ xv = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xv, layer->time_mix_lerp_v), sx), cur);
|
|
|
+ xr = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xr, layer->time_mix_lerp_r), sx), cur);
|
|
|
+ xg = ggml_add(ctx, ggml_mul(ctx, ggml_add(ctx, xg, layer->time_mix_lerp_g), sx), cur);
|
|
|
+ }
|
|
|
|
|
|
- struct ggml_tensor * xv = ggml_add(
|
|
|
- ctx,
|
|
|
- ggml_mul(
|
|
|
- ctx,
|
|
|
- ggml_add(ctx, mv, layer->time_mix_lerp_v),
|
|
|
- sx
|
|
|
- ),
|
|
|
- cur
|
|
|
- );
|
|
|
+ struct ggml_tensor * r = llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr);
|
|
|
+ struct ggml_tensor * k = llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk);
|
|
|
+ struct ggml_tensor * v = llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv);
|
|
|
+ if (layer->time_mix_receptance_b) {
|
|
|
+ r = ggml_add(ctx, r, layer->time_mix_receptance_b);
|
|
|
+ }
|
|
|
+ if (layer->time_mix_key_b) {
|
|
|
+ k = ggml_add(ctx, k, layer->time_mix_key_b);
|
|
|
+ }
|
|
|
+ if (layer->time_mix_value_b) {
|
|
|
+ v = ggml_add(ctx, v, layer->time_mix_value_b);
|
|
|
+ }
|
|
|
|
|
|
- struct ggml_tensor * xr = ggml_add(
|
|
|
- ctx,
|
|
|
- ggml_mul(
|
|
|
- ctx,
|
|
|
- ggml_add(ctx, mr, layer->time_mix_lerp_r),
|
|
|
- sx
|
|
|
- ),
|
|
|
- cur
|
|
|
- );
|
|
|
+ struct ggml_tensor * g = llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg);
|
|
|
+ if (is_qrwkv) {
|
|
|
+ g = ggml_sigmoid(ctx, g);
|
|
|
+ } else {
|
|
|
+ g = ggml_silu(ctx, g);
|
|
|
+ }
|
|
|
|
|
|
- struct ggml_tensor * xg = ggml_add(
|
|
|
- ctx,
|
|
|
- ggml_mul(
|
|
|
- ctx,
|
|
|
- ggml_add(ctx, mg, layer->time_mix_lerp_g),
|
|
|
- sx
|
|
|
- ),
|
|
|
- cur
|
|
|
- );
|
|
|
+ if (head_count_kv != head_count) {
|
|
|
+ GGML_ASSERT(head_count % head_count_kv == 0);
|
|
|
+ k = ggml_reshape_4d(ctx, k, head_size, 1, head_count_kv, n_tokens);
|
|
|
+ v = ggml_reshape_4d(ctx, v, head_size, 1, head_count_kv, n_tokens);
|
|
|
+ struct ggml_tensor * tmp = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, head_size, head_count / head_count_kv, head_count_kv, n_tokens);
|
|
|
+ k = ggml_repeat(ctx, k, tmp);
|
|
|
+ v = ggml_repeat(ctx, v, tmp);
|
|
|
+ }
|
|
|
|
|
|
- struct ggml_tensor * r = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_receptance, xr), head_size, 1, head_count, n_tokens);
|
|
|
- struct ggml_tensor * k = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_key, xk), 1, head_size, head_count, n_tokens);
|
|
|
- struct ggml_tensor * v = ggml_reshape_4d(ctx, llm_build_lora_mm(lctx, ctx, layer->time_mix_value, xv), head_size, 1, head_count, n_tokens);
|
|
|
- struct ggml_tensor * g = ggml_silu(
|
|
|
- ctx,
|
|
|
- llm_build_lora_mm(lctx, ctx, layer->time_mix_gate, xg)
|
|
|
- );
|
|
|
+ k = ggml_reshape_3d(ctx, k, head_size, head_count, n_tokens);
|
|
|
+ v = ggml_reshape_3d(ctx, v, head_size, head_count, n_tokens);
|
|
|
+ r = ggml_reshape_3d(ctx, r, head_size, head_count, n_tokens);
|
|
|
|
|
|
struct ggml_tensor * w = ggml_mul_mat(
|
|
|
ctx,
|
|
|
@@ -3448,25 +3502,35 @@ static struct ggml_tensor * llm_build_rwkv6_time_mix(
|
|
|
)
|
|
|
);
|
|
|
|
|
|
- w = ggml_add(ctx, w, ggml_reshape_1d(ctx, layer->time_mix_decay, n_embd));
|
|
|
+ w = ggml_add(ctx, w, layer->time_mix_decay);
|
|
|
w = ggml_exp(ctx, ggml_neg(ctx, ggml_exp(ctx, w)));
|
|
|
- w = ggml_reshape_4d(ctx, w, 1, head_size, head_count, n_tokens);
|
|
|
+ w = ggml_reshape_3d(ctx, w, head_size, head_count, n_tokens);
|
|
|
|
|
|
- k = ggml_transpose(ctx, k);
|
|
|
- v = ggml_transpose(ctx, v);
|
|
|
- r = ggml_transpose(ctx, r);
|
|
|
+ if (is_qrwkv) {
|
|
|
+ // k = k * (1 - w)
|
|
|
+ k = ggml_sub(ctx, k, ggml_mul(ctx, k, w));
|
|
|
+ }
|
|
|
|
|
|
- struct ggml_tensor * wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
|
|
|
+ struct ggml_tensor * wkv_output;
|
|
|
+ if (!layer->time_mix_first) {
|
|
|
+ wkv_output = ggml_gated_linear_attn(ctx, k, v, r, w, *wkv_state, pow(head_size, -0.5f));
|
|
|
+ } else {
|
|
|
+ wkv_output = ggml_rwkv_wkv6(ctx, k, v, r, layer->time_mix_first, w, *wkv_state);
|
|
|
+ }
|
|
|
cur = ggml_view_1d(ctx, wkv_output, n_embd * n_tokens, 0);
|
|
|
*wkv_state = ggml_view_1d(ctx, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float));
|
|
|
|
|
|
- // group norm with head_count groups
|
|
|
- cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
|
|
|
- cur = ggml_norm(ctx, cur, 64e-5f);
|
|
|
+ if (!is_qrwkv) {
|
|
|
+ // group norm with head_count groups
|
|
|
+ cur = ggml_reshape_3d(ctx, cur, n_embd / head_count, head_count, n_tokens);
|
|
|
+ cur = ggml_norm(ctx, cur, 64e-5f);
|
|
|
|
|
|
- // Convert back to regular vectors.
|
|
|
- cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
|
|
|
- cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
|
|
|
+ // Convert back to regular vectors.
|
|
|
+ cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
|
|
|
+ cur = ggml_add(ctx, ggml_mul(ctx, cur, layer->time_mix_ln), layer->time_mix_ln_b);
|
|
|
+ } else {
|
|
|
+ cur = ggml_reshape_2d(ctx, cur, n_embd, n_tokens);
|
|
|
+ }
|
|
|
|
|
|
cur = ggml_mul(ctx, cur, g);
|
|
|
cur = llm_build_lora_mm(lctx, ctx, layer->time_mix_output, cur);
|
|
|
@@ -10048,7 +10112,7 @@ struct llm_build_context {
|
|
|
1
|
|
|
);
|
|
|
|
|
|
- cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states));
|
|
|
+ cur = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, n_embd / hparams.wkv_head_size));
|
|
|
ggml_build_forward_expand(gf, cur);
|
|
|
ggml_build_forward_expand(
|
|
|
gf,
|
|
|
@@ -10115,6 +10179,118 @@ struct llm_build_context {
|
|
|
return gf;
|
|
|
}
|
|
|
|
|
|
+ // Builds and returns the compute graph used for LLM_ARCH_RWKV6QWEN2. ref: https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1/blob/main/modeling_rwkv6qwen2.py
|
|
|
+ ggml_cgraph * build_rwkv6qwen2() {
|
|
|
+ ggml_cgraph *gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
|
|
+ // token_shift is loaded with n_embd_k_s() elements per sequence and reshaped to n_embd below, so the two sizes must match
|
|
|
+ GGML_ASSERT(n_embd == hparams.n_embd_k_s());
|
|
|
+ // ubatch layout: n_seqs sequences of n_seq_tokens tokens each (checked by the asserts below)
|
|
|
+ const int64_t n_seqs = ubatch.n_seqs;
|
|
|
+ const int64_t n_seq_tokens = ubatch.n_seq_tokens;
|
|
|
+ const int64_t n_tokens = ubatch.n_tokens;
|
|
|
+ GGML_ASSERT(n_seqs != 0);
|
|
|
+ GGML_ASSERT(ubatch.equal_seqs);
|
|
|
+ GGML_ASSERT(n_tokens == n_seq_tokens * n_seqs);
|
|
|
+
|
|
|
+ struct ggml_tensor * cur;
|
|
|
+ struct ggml_tensor * inpL;
|
|
|
+ struct ggml_tensor * state_copy = build_inp_s_copy();
|
|
|
+ struct ggml_tensor * state_mask = build_inp_s_mask();
|
|
|
+ // input embeddings for the ubatch, built from model.tok_embd
|
|
|
+ inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
|
|
+
|
|
|
+ for (int il = 0; il < n_layer; ++il) {
|
|
|
+ const llama_layer * layer = &model.layers[il];
|
|
|
+
|
|
|
+ // (ab)using the KV cache to store the states: k_l[il] holds the token-shift state, v_l[il] holds the wkv state
|
|
|
+ struct ggml_tensor * token_shift = llm_build_copy_mask_state(ctx0,
|
|
|
+ gf, kv_self.k_l[il], state_copy, state_mask,
|
|
|
+ hparams.n_embd_k_s(), kv_self.size, kv_head, n_kv, n_seqs);
|
|
|
+ struct ggml_tensor * wkv_states = llm_build_copy_mask_state(ctx0,
|
|
|
+ gf, kv_self.v_l[il], state_copy, state_mask,
|
|
|
+ hparams.n_embd_v_s(), kv_self.size, kv_head, n_kv, n_seqs);
|
|
|
+ // view the layer input as [n_embd, n_seq_tokens, n_seqs] and the shift state as one n_embd vector per sequence
|
|
|
+ cur = ggml_reshape_3d(ctx0, inpL, n_embd, n_seq_tokens, n_seqs);
|
|
|
+ token_shift = ggml_reshape_3d(ctx0, token_shift, n_embd, 1, n_seqs);
|
|
|
+ // x_prev = token_shift concatenated (along dim 1) with the first n_seq_tokens - 1 normalized tokens, i.e. x_norm_att shifted by one token
|
|
|
+ struct ggml_tensor * x_norm_att = llm_build_norm(ctx0, cur, hparams, layer->attn_norm, layer->attn_norm_b, LLM_NORM_RMS, cb, il);
|
|
|
+ struct ggml_tensor * x_prev = ggml_concat(
|
|
|
+ ctx0,
|
|
|
+ token_shift,
|
|
|
+ ggml_view_3d(ctx0, x_norm_att, n_embd, n_seq_tokens - 1, n_seqs, x_norm_att->nb[1], x_norm_att->nb[2], 0),
|
|
|
+ 1
|
|
|
+ );
|
|
|
+ // NOTE(review): this copy writes wkv_states back to v_l before the time mix updates it, and nothing in this function writes the token-shift state back to k_l - confirm this is intended
|
|
|
+ ggml_build_forward_expand(
|
|
|
+ gf,
|
|
|
+ ggml_cpy(
|
|
|
+ ctx0,
|
|
|
+ wkv_states,
|
|
|
+ ggml_view_1d(
|
|
|
+ ctx0,
|
|
|
+ kv_self.v_l[il],
|
|
|
+ hparams.n_embd_v_s() * n_seqs,
|
|
|
+ hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
|
|
|
+ )
|
|
|
+ )
|
|
|
+ );
|
|
|
+ // time mix added to the layer input (residual); the helper replaces wkv_states with the updated state, which is then copied back into v_l
|
|
|
+ struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, llm_build_rwkv6_time_mix(lctx, ctx0, layer, x_norm_att, x_prev, &wkv_states, hparams.wkv_head_size, hparams.n_head_kv()));
|
|
|
+ ggml_build_forward_expand(gf, ffn_inp);
|
|
|
+ ggml_build_forward_expand(
|
|
|
+ gf,
|
|
|
+ ggml_cpy(
|
|
|
+ ctx0,
|
|
|
+ wkv_states,
|
|
|
+ ggml_view_1d(
|
|
|
+ ctx0,
|
|
|
+ kv_self.v_l[il],
|
|
|
+ hparams.n_embd_v_s() * n_seqs,
|
|
|
+ hparams.n_embd_v_s() * kv_head * ggml_element_size(kv_self.v_l[il])
|
|
|
+ )
|
|
|
+ )
|
|
|
+ );
|
|
|
+
|
|
|
+ cb(ffn_inp, "ffn_inp", il);
|
|
|
+
|
|
|
+ // feed-forward network: RMS norm (ffn_norm, no bias), then llm_build_ffn with ffn_up / ffn_gate / ffn_down using LLM_FFN_SILU and LLM_FFN_PAR
|
|
|
+ cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
|
+ model.layers[il].ffn_norm, NULL,
|
|
|
+ LLM_NORM_RMS, cb, il);
|
|
|
+ cb(cur, "ffn_norm", il);
|
|
|
+
|
|
|
+ cur = llm_build_ffn(ctx0, lctx, cur,
|
|
|
+ model.layers[il].ffn_up, NULL, NULL,
|
|
|
+ model.layers[il].ffn_gate, NULL, NULL,
|
|
|
+ model.layers[il].ffn_down, NULL, NULL,
|
|
|
+ NULL,
|
|
|
+ LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
|
+ cb(cur, "ffn_out", il);
|
|
|
+ // residual connection around the FFN, then lctx.cvec.apply_to for this layer
|
|
|
+ cur = ggml_add(ctx0, cur, ffn_inp);
|
|
|
+ cur = lctx.cvec.apply_to(ctx0, cur, il);
|
|
|
+ cb(cur, "l_out", il);
|
|
|
+
|
|
|
+ // input for next layer
|
|
|
+ inpL = cur;
|
|
|
+ }
|
|
|
+ // flatten to [n_embd, n_tokens] and keep only the rows selected by inp_out_ids before the final norm and output projection
|
|
|
+ cur = inpL;
|
|
|
+ struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
|
+ cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
|
|
|
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
|
+
|
|
|
+ cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, model.output_norm_b, LLM_NORM_RMS, cb, -1);
|
|
|
+ cb(cur, "result_norm", -1);
|
|
|
+
|
|
|
+ cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
|
|
+ cb(cur, "result_output", -1);
|
|
|
+
|
|
|
+ ggml_build_forward_expand(gf, cur);
|
|
|
+
|
|
|
+ return gf;
|
|
|
+ }
|
|
|
+
|
|
|
// ref: https://github.com/facebookresearch/chameleon
|
|
|
// based on the original build_llama() function, changes:
|
|
|
// * qk-norm
|
|
|
@@ -10724,6 +10900,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
{
|
|
|
result = llm.build_rwkv6();
|
|
|
} break;
|
|
|
+ case LLM_ARCH_RWKV6QWEN2:
|
|
|
+ {
|
|
|
+ result = llm.build_rwkv6qwen2();
|
|
|
+ } break;
|
|
|
case LLM_ARCH_CHAMELEON:
|
|
|
{
|
|
|
result = llm.build_chameleon();
|