@@ -512,7 +512,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
-    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out, false);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false);
     ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
@@ -1697,15 +1697,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_DEEPSEEK2:
             {
                 // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
-                bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
+
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
                 if (!is_lite) {
                     ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
                 }
                 ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
-                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
+                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl, false);
+                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
@@ -4909,14 +4910,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             } break;
         case LLM_ARCH_DEEPSEEK2:
             {
-                // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B
-                const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26);
-
-                const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+                const bool is_mla = hparams.is_mla();

                 // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
-                const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
-                const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+                const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
+                const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();

                 const int64_t n_embd_head_qk_rope = hparams.n_rot;
                 const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
@@ -4941,13 +4939,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     auto & layer = layers[i];

                     layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
-                    if (!is_lite) {
+                    if (q_lora_rank > 0) {
                         layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
                     }

                     layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);

-                    if (!is_lite) {
+                    if (q_lora_rank > 0) {
                         layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
                         layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
                     } else {
@@ -6597,7 +6595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 }

                 // for LFM2-ColBert-350M
-                dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.get_n_embd_out()}, TENSOR_NOT_REQUIRED);
+                dense_2_out_layers = create_tensor(tn(LLM_TENSOR_DENSE_2_OUT, "weight"), {n_embd, hparams.n_embd_out()}, TENSOR_NOT_REQUIRED);
             } break;
         case LLM_ARCH_SMALLTHINKER:
             {
@@ -7316,8 +7314,8 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
         LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
-        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
-        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
+        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla());
+        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla());
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -8162,7 +8160,7 @@ int32_t llama_model_n_embd_inp(const llama_model * model) {
 }

 int32_t llama_model_n_embd_out(const llama_model * model) {
-    return model->hparams.get_n_embd_out();
+    return model->hparams.n_embd_out();
 }

 int32_t llama_model_n_layer(const llama_model * model) {
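
Note: the hunks above call `llama_hparams` accessors (`is_mla()`, `n_embd_head_k_mla()`, `n_embd_head_v_mla()`, `n_embd_out()`) and load the optional keys into the renamed `*_impl` fields; those definitions live in `llama-hparams.h`/`llama-hparams.cpp` and are not part of this excerpt. Below is a minimal sketch of the behavior they appear to encapsulate, reconstructed from the inline logic the hunks remove. The standalone struct name and the `n_embd_out()` fallback to `n_embd` are assumptions made for illustration, not the actual definitions from this change.

```cpp
// Sketch only: illustrates the selection logic the hunks above rely on.
// The real accessors are defined in llama-hparams.h/.cpp, not shown here.
#include <cstdint>

struct llama_hparams_sketch {
    uint32_t n_embd                 = 0;
    uint32_t n_embd_out_impl        = 0; // optional LLM_KV_EMBEDDING_LENGTH_OUT
    uint32_t n_embd_head_k          = 0;
    uint32_t n_embd_head_v          = 0;
    uint32_t n_embd_head_k_mla_impl = 0; // optional LLM_KV_ATTENTION_KEY_LENGTH_MLA
    uint32_t n_embd_head_v_mla_impl = 0; // optional LLM_KV_ATTENTION_VALUE_LENGTH_MLA

    // MLA is active only when both MLA head sizes were present in the GGUF metadata
    bool is_mla() const {
        return n_embd_head_k_mla_impl != 0 && n_embd_head_v_mla_impl != 0;
    }

    // fall back to the regular head sizes for non-MLA models,
    // matching the ternaries removed from load_tensors()
    uint32_t n_embd_head_k_mla() const { return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k; }
    uint32_t n_embd_head_v_mla() const { return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v; }

    // output embedding width; falling back to n_embd when the optional key
    // is absent is an assumption made for this sketch
    uint32_t n_embd_out() const { return n_embd_out_impl != 0 ? n_embd_out_impl : n_embd; }
};
```

With the selection logic centralized like this, `load_tensors()`, `print_info()`, and `llama_model_n_embd_out()` resolve the MLA head sizes and the output embedding width consistently instead of repeating the ternaries at each call site.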