@@ -4315,6 +4315,7 @@ static struct ggml_tensor * llm_build_kqv(
           const llama_model & model,
         const llama_hparams & hparams,
        const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
          struct ggml_tensor * wo,
          struct ggml_tensor * wo_b,
          struct ggml_tensor * q_cur,
@@ -4393,6 +4394,8 @@ static struct ggml_tensor * llm_build_kqv(
     struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
     cb(cur, "kqv_merged_cont", il);
 
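+    // the scheduler may assign this node to a different backend than the output
+    // projection below ("kqv_merged_cont" is pinned to the CPU backend in
+    // llama_build_graph when offload_kqv is disabled)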
+    ggml_build_forward_expand(graph, cur);
+
     cur = ggml_mul_mat(ctx, wo, cur);
     if (wo_b) {
         cb(cur, "kqv_wo", il);
@@ -4405,6 +4408,44 @@ static struct ggml_tensor * llm_build_kqv(
     return cur;
 }
 
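+// store k_cur/v_cur in the KV cache (llm_build_kv_store), then compute the
+// attention output over the cached keys/values (llm_build_kqv)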
+static struct ggml_tensor * llm_build_kv(
+        struct ggml_context * ctx,
+          const llama_model & model,
+        const llama_hparams & hparams,
+       const llama_kv_cache & kv,
+         struct ggml_cgraph * graph,
+         struct ggml_tensor * wo,
+         struct ggml_tensor * wo_b,
+         struct ggml_tensor * k_cur,
+         struct ggml_tensor * v_cur,
+         struct ggml_tensor * q_cur,
+         struct ggml_tensor * kq_mask,
+                    int64_t   n_ctx,
+                    int32_t   n_tokens,
+                    int32_t   kv_head,
+                    int32_t   n_kv,
+                    float     max_alibi_bias,
+                    float     kq_scale,
+         const llm_build_cb & cb,
+                    int       il) {
+
+    // these nodes are added to the graph together so that they are not reordered
+    // by doing so, the number of splits in the graph is reduced
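+    // (each split adds a synchronization point and possibly tensor copies between backends)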
+    ggml_build_forward_expand(graph, k_cur);
+    ggml_build_forward_expand(graph, v_cur);
+    ggml_build_forward_expand(graph, q_cur);
+
+    llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
+
+    struct ggml_tensor * cur;
+    cur = llm_build_kqv(ctx, model, hparams, kv, graph,
+            wo, wo_b,
+            q_cur, kq_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, kq_scale, cb, il);
+    cb(cur, "kqv_out", il);
+
+    return cur;
+}
+
 struct llm_build_context {
     const llama_model    & model;
     const llama_hparams  & hparams;
@@ -4562,12 +4603,6 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            // these nodes are added to the graph together so that they are not reordered
-            // by doing so, the number of splits in the graph is reduced
-            ggml_build_forward_expand(gf, Qcur);
-            ggml_build_forward_expand(gf, Kcur);
-            ggml_build_forward_expand(gf, Vcur);
-
             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                 hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -4582,11 +4617,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4763,14 +4796,13 @@ struct llm_build_context {
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
             // apply ALiBi for 13B model
             const float max_alibi_bias = model.type == MODEL_13B ? 8.0f : -1.0f;
 
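+            // (llm_build_kqv applies the ALiBi bias only when max_alibi_bias is positive)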
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4892,11 +4924,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -4993,11 +5023,9 @@ struct llm_build_context {
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5200,12 +5228,9 @@ struct llm_build_context {
             );
             cb(Vcur, "Vcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            // TODO: not tested, could be broken
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Q, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Q, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5292,11 +5317,9 @@ struct llm_build_context {
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             cb(Qcur, "Qcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5390,11 +5413,9 @@ struct llm_build_context {
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, 8.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5485,11 +5506,9 @@ struct llm_build_context {
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, hparams.f_max_alibi_bias, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5597,11 +5616,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5714,11 +5731,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5837,11 +5852,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -5966,11 +5979,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f, cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f, cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -6071,11 +6082,9 @@ struct llm_build_context {
                     ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, NULL,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
         struct ggml_tensor * sa_out = cur;
@@ -6172,11 +6181,9 @@ struct llm_build_context {
 
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -6283,11 +6290,9 @@ struct llm_build_context {
             );
             cb(Kcur, "Kcur", il);
 
-            llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
-
-            cur = llm_build_kqv(ctx0, model, hparams, kv_self,
+            cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+                    Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
             cb(cur, "kqv_out", il);
         }
 
@@ -6355,6 +6360,14 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, name);
         }
 
+        if (!lctx.cparams.offload_kqv) {
+            if (strcmp(name, "kqv_merged_cont") == 0) {
+                // all nodes between the KV store and the attention output are run on the CPU
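+                // ("kqv_merged_cont" is the node created in llm_build_kqv right before the output projection)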
+                ggml_backend_sched_set_node_backend(lctx.sched, cur, lctx.backend_cpu);
+            }
+        }
+
         //
         // allocate input tensors and set input data
         //