
model : clean up and fix EXAONE-MoE configuration (#18840)

* Fix mismatch of EXAONE-MoE configuration

* Ensure gating func is set, clean up

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
commit 8fb7175576
Author: Junwon Hwang

 convert_hf_to_gguf.py | 6 +-----
 src/llama-model.cpp   | 8 ++------
 2 files changed, 3 insertions(+), 11 deletions(-)

--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py

@@ -8772,11 +8772,7 @@ class ExaoneMoEModel(Exaone4Model):
         self.gguf_writer.add_expert_weights_norm(self.hparams["norm_topk_prob"])
         n_dense_layer = self.hparams.get("first_k_dense_replace", self.hparams.get("first_last_k_dense_replace", 0))
         self.gguf_writer.add_leading_dense_block_count(n_dense_layer)
-        # For here, we hard-code the number of NextN/MTP layers to 1 for K-EXAONE,
-        # so that we can convert MTP weights to GGUF format for speculative decoding.
-        # This is because HF config of K-EXAONE does not have `num_nextn_predict_layers` at now.
-        # Will be updated when HF config is updated.
-        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 1))
+        self.gguf_writer.add_nextn_predict_layers(self.hparams.get("num_nextn_predict_layers", 0))
 
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
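
With the hard-coded fallback of 1 removed (the deleted comment suggests the K-EXAONE HF config is now expected to carry the key itself), HF configs that omit num_nextn_predict_layers convert with zero NextN/MTP layers instead of silently assuming one; configs that do declare the key are unaffected. A minimal sketch of the changed dict.get default (hypothetical hparams dicts, not taken from a real config):

    # Hypothetical hparams dicts; only the dict.get fallback changed (1 -> 0).
    hparams_missing = {"norm_topk_prob": True}         # key absent from HF config
    hparams_present = {"num_nextn_predict_layers": 1}  # key declared by HF config

    assert hparams_missing.get("num_nextn_predict_layers", 0) == 0  # previously 1
    assert hparams_present.get("num_nextn_predict_layers", 0) == 1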
 

--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp

@@ -1942,16 +1942,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
 
                 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,                hparams.rope_freq_base_train_swa, false);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa, true);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
-                ml.get_key(LLM_KV_EXPERT_COUNT,                      hparams.n_expert);
-                ml.get_key(LLM_KV_EXPERT_USED_COUNT,                 hparams.n_expert_used);
                 ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared, false);
                 ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                 ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_COUNT,                hparams.n_expert_groups, false);
-                ml.get_key(LLM_KV_EXPERT_GROUP_USED_COUNT,           hparams.n_group_used, false);
-                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC,                hparams.expert_gating_func);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale, false);
                 ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,               hparams.expert_weights_norm, false);
                 ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
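
llama_model_loader::get_key takes a trailing "required" flag that defaults to true, so dropping the explicit false on LLM_KV_EXPERT_GATING_FUNC makes the gating-function key mandatory (the "ensure gating func is set" part of this change), while dropping the redundant true on LLM_KV_ATTENTION_SLIDING_WINDOW is behavior-neutral. The deleted expert-count and expert-group reads appear to be redundant with keys loaded elsewhere in load_hparams, since the MoE tensors could not be mapped otherwise. A loose Python sketch of the required-by-default pattern (hypothetical get_key helper and key strings; the real loader is C++ and writes through a reference instead of returning a value):

    # Hypothetical sketch mirroring the calling convention of a
    # required-by-default metadata reader; not llama.cpp's implementation.
    def get_key(metadata: dict, key: str, default, required: bool = True):
        if key not in metadata:
            if required:
                raise KeyError(f"model metadata key missing: {key}")
            return default  # optional key: fall back quietly
        return metadata[key]

    meta = {"exaonemoe.attention.sliding_window": 4096}  # assumed key string

    get_key(meta, "exaonemoe.attention.sliding_window", 0)       # required, present: ok
    get_key(meta, "exaonemoe.expert_weights_scale", 1.0, False)  # optional, absent: 1.0
    try:
        get_key(meta, "exaonemoe.expert_gating_func", 0)         # now required
    except KeyError as err:
        print(err)  # fails loudly instead of defaulting silently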