
llama-quant: add support for mmproj (#16592)

* llama-quant: add support for mmproj

* Update src/llama.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* check prefix instead

* small fix

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Xuan-Son Nguyen, 3 months ago
Commit 3e3cb19f64
5 changed files with 19 additions and 2 deletions
  1. src/llama-arch.cpp   +5 -0
  2. src/llama-arch.h     +1 -0
  3. src/llama-model.cpp  +3 -1
  4. src/llama-quant.cpp  +7 -1
  5. src/llama.cpp        +3 -0

+ 5 - 0
src/llama-arch.cpp

@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"             }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,            "llama"            },
     { LLM_ARCH_LLAMA4,           "llama4"           },
     { LLM_ARCH_DECI,             "deci"             },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
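
A note on the empty map entry above: these tables are indexed by architecture (as in the LLM_TN(...) lookups used by llama-quant.cpp below), and std::map::at throws for an absent key, so LLM_ARCH_CLIP presumably has to be registered even with no tensor names of its own. A minimal sketch of that std::map behavior, using plain ints rather than the real enums (not llama.cpp code):

    #include <cstdio>
    #include <map>
    #include <stdexcept>

    int main() {
        // arch-keyed table with one registered key and an empty inner map,
        // mirroring the new { LLM_ARCH_CLIP, {} } entry
        std::map<int, std::map<int, const char *>> tensor_names = {
            { 0, {} },
        };
        tensor_names.at(0);      // fine: the key exists, the inner map is just empty
        try {
            tensor_names.at(1);  // unregistered key: std::map::at throws
        } catch (const std::out_of_range &) {
            std::printf("missing arch key throws std::out_of_range\n");
        }
        return 0;
    }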

+ 1 - 0
src/llama-arch.h

@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,

+ 3 - 1
src/llama-model.cpp

@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:

+ 7 - 1
src/llama-quant.cpp

@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
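
The "mm." detection above uses the standard C++ prefix-check idiom: std::string::rfind with the search position pinned to 0 can only match at the very start of the string (or return npos). A minimal standalone sketch, with hypothetical tensor names:

    #include <cassert>
    #include <string>

    // s.rfind(prefix, 0) searches at or before index 0, so it succeeds
    // only when prefix sits at the start of s
    static bool has_prefix(const std::string & s, const char * prefix) {
        return s.rfind(prefix, 0) == 0;
    }

    int main() {
        assert( has_prefix("mm.0.weight",         "mm.")); // mmproj-style tensor -> detected
        assert(!has_prefix("blk.0.attn_q.weight", "mm.")); // regular LLM tensor
        assert(!has_prefix("token_embd.mm",       "mm.")); // "mm" is not a prefix here
        return 0;
    }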

+ 3 - 0
src/llama.cpp

@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
         } catch(const std::exception & e) {
             throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
         }
+        if (model.arch == LLM_ARCH_CLIP) {
+            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+        }
         try {
             model.load_vocab(ml);
         } catch(const std::exception & e) {
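
Net effect: an mmproj GGUF can now be fed to llama-quantize directly (the attention-head sanity checks are skipped and position-embedding tensors are left unquantized), while loading it as the main model fails fast with the error above. A hedged usage sketch, assuming the standard llama-quantize positional arguments; the file names here are hypothetical:

    # quantize the multimodal projector file on its own
    ./llama-quantize mmproj-model-f16.gguf mmproj-model-q8_0.gguf Q8_0

    # then pass it alongside the text model, e.g. via llama-mtmd-cli
    ./llama-mtmd-cli -m model-q4_k_m.gguf --mmproj mmproj-model-q8_0.gguf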