@@ -6,8 +6,8 @@
#include "llama-cparams.h"
#include "llama-model-loader.h"

-#include "llama-kv-cache-unified.h"
-#include "llama-kv-cache-unified-iswa.h"
+#include "llama-kv-cache.h"
+#include "llama-kv-cache-iswa.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-recurrent.h"

@@ -5986,7 +5986,7 @@ struct llm_build_llama : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -6146,7 +6146,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
ggml_tensor * inp_attn_scale = nullptr;
inp_attn_scale = build_inp_attn_scale();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -6325,7 +6325,7 @@ struct llm_build_deci : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -6481,7 +6481,7 @@ struct llm_build_baichuan : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = model.type == LLM_TYPE_7B ? build_inp_pos() : nullptr;

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6603,7 +6603,7 @@ struct llm_build_xverse : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6717,7 +6717,7 @@ struct llm_build_falcon : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -6841,7 +6841,7 @@ struct llm_build_grok : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -7001,7 +7001,7 @@ struct llm_build_dbrx : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -7125,7 +7125,7 @@ struct llm_build_starcoder : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1);

@@ -7230,7 +7230,7 @@ struct llm_build_refact : public llm_graph_context {

inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -7632,7 +7632,7 @@ struct llm_build_bloom : public llm_graph_context {

inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

inpL = build_norm(inpL,
model.tok_norm,
@@ -7739,7 +7739,7 @@ struct llm_build_mpt : public llm_graph_context {

inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

if (model.pos_embd) {
// inp_pos - contains the positions
@@ -7889,7 +7889,7 @@ struct llm_build_stablelm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8041,7 +8041,7 @@ struct llm_build_qwen : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8156,7 +8156,7 @@ struct llm_build_qwen2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8481,7 +8481,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

int sections[4];
std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@@ -8602,7 +8602,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8761,7 +8761,7 @@ struct llm_build_qwen3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -8882,7 +8882,7 @@ struct llm_build_qwen3moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9012,7 +9012,7 @@ struct llm_build_phi2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9141,13 +9141,13 @@ struct llm_build_phi3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;

if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_unified_iswa();
+ inp_attn = build_attn_inp_kv_iswa();
} else {
- inp_attn = build_attn_inp_kv_unified();
+ inp_attn = build_attn_inp_kv();
}

ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -9299,7 +9299,7 @@ struct llm_build_plamo : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9415,7 +9415,7 @@ struct llm_build_gpt2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
cb(pos, "pos_embd", -1);
@@ -9525,7 +9525,7 @@ struct llm_build_codeshell : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9638,7 +9638,7 @@ struct llm_build_orion : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9765,7 +9765,7 @@ struct llm_build_internlm2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -9901,7 +9901,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10096,7 +10096,7 @@ struct llm_build_gemma : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10212,7 +10212,7 @@ struct llm_build_gemma2_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10346,7 +10346,7 @@ struct llm_build_gemma3_iswa : public llm_graph_context {
ggml_tensor * inp_pos = build_inp_pos();

// TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -10497,7 +10497,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
ggml_tensor * inp_pos = build_inp_pos();

// TODO: is causal == true correct? might need some changes
- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

// inp_per_layer shape: [n_embd_altup, n_tokens, n_layer]
ggml_tensor * inp_per_layer = project_per_layer_inputs(inpL, get_per_layer_inputs());
@@ -10904,7 +10904,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11473,7 +11473,7 @@ struct llm_build_command_r : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11620,7 +11620,7 @@ struct llm_build_cohere2_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11755,7 +11755,7 @@ struct llm_build_olmo : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -11883,7 +11883,7 @@ struct llm_build_olmo2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12012,7 +12012,7 @@ struct llm_build_olmoe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12138,7 +12138,7 @@ struct llm_build_openelm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12269,7 +12269,7 @@ struct llm_build_gptneox : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12415,7 +12415,7 @@ struct llm_build_arctic : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12553,7 +12553,7 @@ struct llm_build_deepseek : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -12730,7 +12730,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -12977,7 +12977,7 @@ struct llm_build_bitnet : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13241,7 +13241,7 @@ struct llm_build_t5_dec : public llm_graph_context {

const int64_t n_outputs_enc = embd_enc->ne[1];

- auto * inp_attn_self = build_attn_inp_kv_unified();
+ auto * inp_attn_self = build_attn_inp_kv();
auto * inp_attn_cross = build_attn_inp_cross();

ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -13406,7 +13406,7 @@ struct llm_build_jais : public llm_graph_context {

inpL = build_inp_embd(model.tok_embd);

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13504,7 +13504,7 @@ struct llm_build_chatglm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13637,7 +13637,7 @@ struct llm_build_glm4 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13787,7 +13787,7 @@ struct llm_build_glm4_moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -13947,7 +13947,7 @@ struct llm_build_nemotron : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -14076,7 +14076,7 @@ struct llm_build_exaone : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -14208,13 +14208,13 @@ struct llm_build_exaone4 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;

if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_unified_iswa();
+ inp_attn = build_attn_inp_kv_iswa();
} else {
- inp_attn = build_attn_inp_kv_unified();
+ inp_attn = build_attn_inp_kv();
}

ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -15097,7 +15097,7 @@ struct llm_build_granite : public llm_graph_context {
inp_pos = build_inp_pos();
}

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15148,12 +15148,12 @@ struct llm_build_granite : public llm_graph_context {
}

ggml_tensor * build_attention_layer(
- ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv_unified * inp_attn,
- const llama_model & model,
- const int64_t n_embd_head,
- const int il) {
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {

// compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15367,12 +15367,12 @@ struct llm_build_granite_hybrid : public llm_graph_context_mamba {
}

ggml_tensor * build_attention_layer(
- ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv_unified * inp_attn,
- const llama_model & model,
- const int64_t n_embd_head,
- const int il) {
+ ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ const llama_model & model,
+ const int64_t n_embd_head,
+ const int il) {

// compute Q and K and (optionally) RoPE them
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -15529,7 +15529,7 @@ struct llm_build_chameleon : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -15860,7 +15860,7 @@ struct llm_build_plm : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16025,7 +16025,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16174,7 +16174,7 @@ struct llm_build_dots1 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16324,7 +16324,7 @@ struct llm_build_ernie4_5 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -16454,7 +16454,7 @@ struct llm_build_ernie4_5_moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

@@ -16828,7 +16828,7 @@ struct llm_build_plamo2 : public llm_graph_context_mamba {

private:
ggml_tensor * build_plamo2_attn_layer(
- llm_graph_input_attn_kv_unified * inp,
+ llm_graph_input_attn_kv * inp,
ggml_tensor * inp_pos,
ggml_tensor * cur,
const llama_model & model,
@@ -17061,7 +17061,7 @@ struct llm_build_arcee : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -17196,7 +17196,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

@@ -17357,7 +17357,7 @@ struct llm_build_hunyuan_dense : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

@@ -17495,7 +17495,7 @@ struct llm_build_smollm3 : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified();
+ auto * inp_attn = build_attn_inp_kv();

const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

@@ -17627,7 +17627,7 @@ struct llm_build_openai_moe_iswa : public llm_graph_context {
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- auto * inp_attn = build_attn_inp_kv_unified_iswa();
+ auto * inp_attn = build_attn_inp_kv_iswa();

for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -17809,10 +17809,10 @@ struct llm_build_lfm2 : public llm_graph_context {
return cur;
}

- ggml_tensor * build_attn_block(ggml_tensor * cur,
- ggml_tensor * inp_pos,
- llm_graph_input_attn_kv_unified * inp_attn,
- int il) const {
+ ggml_tensor * build_attn_block(ggml_tensor * cur,
+ ggml_tensor * inp_pos,
+ llm_graph_input_attn_kv * inp_attn,
+ int il) const {
GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
auto const n_embd_head = hparams.n_embd_head_v;
auto const n_head_kv = hparams.n_head_kv(il);
@@ -17940,13 +17940,13 @@ struct llm_build_smallthinker : public llm_graph_context{
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

- using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_unified_iswa, llm_graph_input_attn_kv_unified>;
+ using inp_attn_type = std::conditional_t<iswa, llm_graph_input_attn_kv_iswa, llm_graph_input_attn_kv>;
inp_attn_type * inp_attn = nullptr;

if constexpr (iswa) {
- inp_attn = build_attn_inp_kv_unified_iswa();
+ inp_attn = build_attn_inp_kv_iswa();
} else {
- inp_attn = build_attn_inp_kv_unified();
+ inp_attn = build_attn_inp_kv();
}

ggml_tensor * inp_out_ids = build_inp_out_ids();
@@ -18076,7 +18076,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
std::max((uint32_t) 1, cparams.n_seq_max),
cparams.n_seq_max);
} else if (llm_arch_is_hybrid(arch)) {
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
+ const auto padding = llama_kv_cache::get_padding(cparams);

cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);

@@ -18098,7 +18098,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
} else {
- const auto padding = llama_kv_cache_unified::get_padding(cparams);
+ const auto padding = llama_kv_cache::get_padding(cparams);

uint32_t n_ctx_per_stream = cparams.n_ctx;

@@ -18118,7 +18118,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
GGML_ASSERT(hparams.is_swa_any());

- res = new llama_kv_cache_unified_iswa(
+ res = new llama_kv_cache_iswa(
*this,
params.type_k,
params.type_v,
@@ -18133,7 +18133,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
} else {
GGML_ASSERT(!hparams.is_swa_any());

- res = new llama_kv_cache_unified(
+ res = new llama_kv_cache(
*this,
nullptr,
params.type_k,