Browse source code

llama : remove quantization sanity check (#17788)

* llama : remove quantization sanity check

This commit removes the quantization sanity check for attention layers.

The motivation for this is that there are hybrid models that have recurrent
layers, expert layers, and attention layers. For these models the current
check fails because the expert layers are not taken into account. After
consideration, it was decided that this check is not strictly necessary and
can be removed to allow for more flexible model architectures.

* llama : remove unused pruned_attention_w and is_clip_model vars
Daniel Bevenius 1 month ago
parent
commit
444f00b0ec
1 changed file with 0 additions and 27 deletions
  1. src/llama-quant.cpp (+0, −27)

src/llama-quant.cpp (+0, −27)

@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                    pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,30 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        int32_t n_layer_all = model.hparams.n_layer;
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_all is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_all += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
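
Note (not part of the commit): the GGML_ASSERT removed above enforced the expectation that every non-recurrent, non-pruned layer contributes exactly one attention weight tensor. Below is a minimal standalone sketch of that accounting, using made-up layer counts for a hypothetical hybrid model, to show how the check can fail when some layers (for example, expert layers) do not contribute an attention weight tensor.

    // illustration only; the layer counts below are hypothetical
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int32_t n_layer_all        = 48; // total layers (no encoder)
        const int32_t n_layer_recr       = 24; // recurrent / linear-attention layers
        const int32_t pruned_attention_w = 0;  // attention tensors dropped by pruning
        const int32_t n_attention_wv     = 16; // attn_v / attn_qkv tensors actually observed

        // the removed check expected these two numbers to match
        const int32_t expected = n_layer_all - pruned_attention_w - n_layer_recr;
        std::printf("expected %d attention weights, observed %d -> check %s\n",
                    (int) expected, (int) n_attention_wv,
                    expected == n_attention_wv ? "passes" : "fails");
        return 0;
    }

With the check removed, quantization proceeds regardless of how the layer types add up.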