@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
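
For reference, the "mm." detection added above relies on a common C++ prefix-check idiom: std::string::rfind(prefix, 0) starts its backward search at index 0, so it can only report a match at the very beginning of the string, and comparing the result to 0 is therefore a starts-with test. A minimal standalone sketch of that idiom (the tensor names here are illustrative, not taken from the patch):

    #include <cassert>
    #include <string>

    // True when `name` begins with "mm." -- the same expression the patch uses
    // to flag a model as a clip/mmproj file while scanning its tensor names.
    static bool has_mm_prefix(const std::string & name) {
        return name.rfind("mm.", 0) == 0;
    }

    int main() {
        assert( has_mm_prefix("mm.input_projection.weight")); // projector-style name
        assert(!has_mm_prefix("blk.0.attn_v.weight"));        // regular LLM tensor
        assert(!has_mm_prefix("token_embd.mm.weight"));       // "mm." not at the start
        return 0;
    }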