
Modify sanity check to handle hybrid models

Piotr Wilkin 3 months ago
commit ee52fe36f3
2 changed files with 5 additions and 2 deletions
  1. src/llama-hparams.h  +1 -1
  2. src/llama-quant.cpp  +4 -1

+1 -1
src/llama-hparams.h

@@ -6,7 +6,7 @@
 
 
 // bump if necessary
 #define LLAMA_MAX_LAYERS  512
-#define LLAMA_MAX_EXPERTS 384  // Kimi-K2
+#define LLAMA_MAX_EXPERTS 512  // Qwen3-Next
 
 enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE           = 0,

+4 -1
src/llama-quant.cpp

@@ -667,6 +667,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::map<int, std::string> mapped;
     int blk_id = 0;
     int pruned_attention_w = 0;
+    int linear_layers = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -684,6 +685,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (remapped_name != it.first) {
             ggml_set_name(it.second.tensor, remapped_name.c_str());
             LLAMA_LOG_DEBUG("%s: tensor %s remapped to %s\n", __func__, it.first.c_str(), ggml_get_name(it.second.tensor));
+        } else if (it.first.find("ssm_conv") != std::string::npos) {
+            linear_layers++;
         }
         tensors.push_back(&it.second);
     }
@@ -729,7 +732,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             // for each decoder block, there are 2 attention layers
             n_attn_layer += 2 * model.hparams.dec_n_layer;
         }
-        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w - linear_layers) && "n_attention_wv is unexpected");
     }
 
     size_t total_size_org = 0;
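
For context on the sanity check: hybrid models such as Qwen3-Next interleave full-attention blocks with linear (recurrent/SSM) blocks, and the latter carry ssm_conv tensors but no attention V weights. The patch counts those blocks while walking the tensor list and subtracts them from the expected attention-layer total before asserting. Below is a minimal standalone sketch of that counting logic, using hypothetical tensor names; it is not the llama-quant.cpp implementation itself.

// Standalone sketch (assumed example, not llama-quant.cpp): count linear
// blocks via "ssm_conv" tensor names and exclude them from the expected
// number of attention V tensors, mirroring the modified GGML_ASSERT.
#include <cassert>
#include <string>
#include <vector>

int main() {
    // Hypothetical tensor names for a 4-block hybrid model:
    // blocks 0 and 2 are linear (ssm_conv), blocks 1 and 3 use attention.
    std::vector<std::string> names = {
        "blk.0.ssm_conv.weight",
        "blk.1.attn_v.weight",
        "blk.2.ssm_conv.weight",
        "blk.3.attn_v.weight",
    };

    int n_attn_layer       = 4; // total decoder blocks
    int pruned_attention_w = 0; // attention tensors pruned from the model
    int linear_layers      = 0; // blocks detected as linear via "ssm_conv"
    int n_attention_wv     = 0; // attention V tensors actually present

    for (const auto & name : names) {
        if (name.find("ssm_conv") != std::string::npos) {
            linear_layers++;
        } else if (name.find("attn_v") != std::string::npos) {
            n_attention_wv++;
        }
    }

    // Same shape as the patched check: attention V tensors must equal the
    // block count minus pruned attention tensors minus linear blocks.
    assert(n_attention_wv == n_attn_layer - pruned_attention_w - linear_layers);
    return 0;
}

With the extra subtraction in place, quantizing a hybrid model no longer trips the n_attention_wv assertion, while pure-attention models are unaffected since their linear_layers count stays zero.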