|
@@ -124,14 +124,14 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|
|
|
|
|
|
|
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
|
|
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
|
|
|
// note: rope must go first for in-place context shifting in build_rope_shift()
|
|
// note: rope must go first for in-place context shifting in build_rope_shift()
|
|
|
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
|
|
|
|
|
|
|
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_nope_absorbed, q_pe, 0);
|
|
|
cb(Qcur, "Qcur", il);
|
|
cb(Qcur, "Qcur", il);
|
|
|
|
|
|
|
|
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
|
|
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
|
|
|
cb(kv_cmpr, "kv_cmpr_reshape", il);
|
|
cb(kv_cmpr, "kv_cmpr_reshape", il);
|
|
|
|
|
|
|
|
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
|
|
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
|
|
|
- ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
|
|
|
|
|
|
|
+ ggml_tensor * Kcur = ggml_concat(ctx0, kv_cmpr, k_pe, 0);
|
|
|
cb(Kcur, "Kcur", il);
|
|
cb(Kcur, "Kcur", il);
|
|
|
|
|
|
|
|
// {kv_lora_rank, 1, n_tokens}
|
|
// {kv_lora_rank, 1, n_tokens}
|
|
@@ -169,11 +169,10 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
|
|
|
Vcur = ggml_cont(ctx0, Vcur);
|
|
Vcur = ggml_cont(ctx0, Vcur);
|
|
|
cb(Vcur, "Vcur_cont", il);
|
|
cb(Vcur, "Vcur_cont", il);
|
|
|
|
|
|
|
|
- // note: rope must go first for in-place context shifting in build_rope_shift()
|
|
|
|
|
- ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
|
|
|
|
|
|
|
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0);
|
|
|
cb(Qcur, "Qcur", il);
|
|
cb(Qcur, "Qcur", il);
|
|
|
|
|
|
|
|
- ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
|
|
|
|
|
|
|
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
|
|
cb(Kcur, "Kcur", il);
|
|
cb(Kcur, "Kcur", il);
|
|
|
|
|
|
|
|
if (inp_attn_scale) {
|
|
if (inp_attn_scale) {
|