@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -582,6 +583,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -1848,7 +1850,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
-                    layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }
 
                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1858,9 +1862,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
-                    layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-                    layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
-                    layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    if (n_ff > 0) {
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
 
                     // optional MLP bias
                     layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -4705,6 +4711,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);
 
             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4780,6 +4787,11 @@ struct llm_build_deci : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_head == 0 && n_ff == 0) {
+                continue;
+            }
+
             // For Granite architecture
             if (hparams.f_residual_scale) {
                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
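
The hunks above hinge on the per-layer hyperparameters: hparams.n_head(il) == 0 already marked the attention-free layers of Llama-3_1-Nemotron-51B, and hparams.n_ff(il) == 0 now marks the FFN-free layers of Llama-3_1-Nemotron-Ultra-253B, so tensor loading and graph building simply skip the missing blocks. The standalone sketch below illustrates that skip pattern only; LayerConfig, build_attention and build_ffn are hypothetical stand-ins, not llama.cpp symbols.

// Standalone illustration of the per-layer skip pattern used in the patch.
// LayerConfig, build_attention and build_ffn are hypothetical names, not
// llama.cpp API; only the n_head / n_ff gating mirrors the change above.
#include <cstdint>
#include <cstdio>
#include <vector>

struct LayerConfig {
    int64_t n_head; // 0 => attention-free layer (Nemotron-51B style)
    int64_t n_ff;   // 0 => FFN-free layer (Nemotron-Ultra-253B style)
};

static void build_attention(int il) { std::printf("layer %d: attention\n", il); }
static void build_ffn(int il)       { std::printf("layer %d: ffn\n", il); }

int main() {
    // Toy schedule mixing regular, attention-free and FFN-free layers.
    const std::vector<LayerConfig> layers = {
        {32, 14336}, // regular transformer block
        { 0, 14336}, // attention-free: FFN only
        { 0,     0}, // neither attention nor FFN: pure pass-through
    };

    for (int il = 0; il < (int) layers.size(); ++il) {
        const LayerConfig & cfg = layers[il];

        if (cfg.n_head > 0) {
            build_attention(il);
        }

        // Mirrors the new check in llm_build_deci: a layer with neither
        // attention nor FFN contributes nothing, so move to the next one.
        if (cfg.n_head == 0 && cfg.n_ff == 0) {
            continue;
        }

        if (cfg.n_ff > 0) {
            build_ffn(il);
        }
    }
    return 0;
}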