|
|
@@ -487,9 +487,9 @@ struct llm_graph_context {
|
|
|
|
|
|
ggml_tensor * build_attn_mha(
|
|
|
ggml_cgraph * gf,
|
|
|
- ggml_tensor * q,
|
|
|
- ggml_tensor * k,
|
|
|
- ggml_tensor * v,
|
|
|
+ ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
|
|
|
+ ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
|
|
|
+ ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
|
|
|
ggml_tensor * kq_b,
|
|
|
ggml_tensor * kq_mask,
|
|
|
bool v_trans,
|
|
|
@@ -502,9 +502,9 @@ struct llm_graph_context {
|
|
|
ggml_cgraph * gf,
|
|
|
ggml_tensor * wo,
|
|
|
ggml_tensor * wo_b,
|
|
|
- ggml_tensor * q_cur,
|
|
|
- ggml_tensor * k_cur,
|
|
|
- ggml_tensor * v_cur,
|
|
|
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
|
|
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
|
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
|
ggml_tensor * kq_b,
|
|
|
float kq_scale,
|
|
|
int il) const;
|
|
|
@@ -516,9 +516,9 @@ struct llm_graph_context {
|
|
|
ggml_cgraph * gf,
|
|
|
ggml_tensor * wo,
|
|
|
ggml_tensor * wo_b,
|
|
|
- ggml_tensor * q_cur,
|
|
|
- ggml_tensor * k_cur,
|
|
|
- ggml_tensor * v_cur,
|
|
|
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
|
|
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
|
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
|
ggml_tensor * kq_b,
|
|
|
float kq_scale,
|
|
|
int il) const;
|
|
|
@@ -530,9 +530,9 @@ struct llm_graph_context {
|
|
|
ggml_cgraph * gf,
|
|
|
ggml_tensor * wo,
|
|
|
ggml_tensor * wo_b,
|
|
|
- ggml_tensor * q_cur,
|
|
|
- ggml_tensor * k_cur,
|
|
|
- ggml_tensor * v_cur,
|
|
|
+ ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
|
|
+ ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
|
+ ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
|
ggml_tensor * kq_b,
|
|
|
float kq_scale,
|
|
|
int il) const;
|