
quant : manual overrides of tensor types take precedence (#18952)

Georgi Gerganov 6 days ago
commit 0e4ebeb057
1 changed file with 53 additions and 56 deletions:
  src/llama-quant.cpp  (+53, -56)
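
For context, the manual overrides referenced in the commit title reach llama-quant.cpp as a list of (regex pattern, ggml_type) entries behind the tensor_types pointer of the quantization params, as seen in the cast inside the second hunk below. The sketch that follows illustrates just the matching step; the entry type and member names are stand-ins, and only the std::regex_search call mirrors the code in the diff.

    #include <regex>
    #include <string>
    #include <vector>

    #include "ggml.h" // for ggml_type

    // hypothetical stand-in for the tensor_quantization entries used in llama-quant.cpp
    struct override_entry {
        std::string pattern; // regex matched against the tensor name, e.g. "attn_v"
        ggml_type   type;    // requested quantization type, e.g. GGML_TYPE_Q8_0
    };

    static bool matches(const override_entry & e, const std::string & tensor_name) {
        return std::regex_search(tensor_name, std::regex(e.pattern));
    }

With an entry such as { "attn_v", GGML_TYPE_Q8_0 }, a tensor named "blk.0.attn_v.weight" matches and now keeps Q8_0 regardless of what the built-in heuristic would have picked; before this change the override was applied only after llama_tensor_get_type() had run, and only when that call did not trigger a fallback.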

src/llama-quant.cpp  +53 -56

@@ -422,57 +422,6 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_up;
     }
 
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
-    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // This can be used to reduce the size of the Q5_K_S model.
-    // The associated PPL increase is fully in line with the size reduction
-    //else {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    //}
-    bool convert_incompatible_tensor = false;
-    {
-        const int64_t nx = tensor->ne[0];
-        const int64_t ny = tensor->ne[1];
-        const int64_t qk_k = ggml_blck_size(new_type);
-
-        if (nx % qk_k != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
-            convert_incompatible_tensor = true;
-        } else {
-            ++qs.n_k_quantized;
-        }
-    }
-
-    if (convert_incompatible_tensor) {
-        switch (new_type) {
-            case GGML_TYPE_TQ1_0:
-            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
-            case GGML_TYPE_IQ2_XXS:
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ3_XXS:
-            case GGML_TYPE_IQ3_S:
-            case GGML_TYPE_IQ1_S:
-            case GGML_TYPE_IQ1_M:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
-            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
-            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
-            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-        }
-        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
-            new_type = GGML_TYPE_F16;
-        }
-        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-        ++qs.n_fallback;
-    }
-
     return new_type;
 }
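
The block removed above is re-added essentially verbatim inside the caller in the next hunk; it exists because a row cannot be quantized with a type whose block size does not divide the row length. A small worked example of the check, assuming the usual ggml block sizes (k-quants such as Q4_K use 256-element blocks, Q5_0 uses 32-element blocks) and a hypothetical row length:

    #include <cstdint>
    #include <cstdio>

    #include "ggml.h"

    int main() {
        const int64_t nx = 4288;        // hypothetical row length, not a multiple of 256
        ggml_type t = GGML_TYPE_Q4_K;   // 4288 % 256 == 192 -> incompatible
        if (nx % ggml_blck_size(t) != 0) {
            t = GGML_TYPE_Q5_0;         // fallback chosen for Q4_K in the switch
        }
        printf("using %s\n", ggml_type_name(t)); // block size 32 divides 4288, so Q5_0 is kept
    }

The final F16 escape hatch in the code covers the case where even the fallback's block size does not divide the row length.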
 
@@ -875,21 +824,69 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (!params->pure && ggml_is_quantized(default_type)) {
-                int fallback = qs.n_fallback;
-                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-                // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
-                if (params->tensor_types && qs.n_fallback - fallback == 0) {
+                // if the user provided tensor types - use those
+                bool manual = false;
+                if (params->tensor_types) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                     const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
                         if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                             if  (qtype != new_type) {
-                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
                                 new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
+                                manual = true;
+                                break;
                             }
                         }
                     }
                 }
+
+                // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+                if (!manual) {
+                    new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                }
+
+                // incompatible tensor shapes are handled here - fallback to a compatible type
+                {
+                    bool convert_incompatible_tensor = false;
+
+                    const int64_t nx = tensor->ne[0];
+                    const int64_t ny = tensor->ne[1];
+                    const int64_t qk_k = ggml_blck_size(new_type);
+
+                    if (nx % qk_k != 0) {
+                        LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+                        convert_incompatible_tensor = true;
+                    } else {
+                        ++qs.n_k_quantized;
+                    }
+
+                    if (convert_incompatible_tensor) {
+                        switch (new_type) {
+                            case GGML_TYPE_TQ1_0:
+                            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
+                            case GGML_TYPE_IQ2_XXS:
+                            case GGML_TYPE_IQ2_XS:
+                            case GGML_TYPE_IQ2_S:
+                            case GGML_TYPE_IQ3_XXS:
+                            case GGML_TYPE_IQ3_S:
+                            case GGML_TYPE_IQ1_S:
+                            case GGML_TYPE_IQ1_M:
+                            case GGML_TYPE_Q2_K:
+                            case GGML_TYPE_Q3_K:
+                            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+                            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+                            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+                            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
+                            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+                        }
+                        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+                            new_type = GGML_TYPE_F16;
+                        }
+                        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+                        ++qs.n_fallback;
+                    }
+                }
             }
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
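
Putting the two hunks together, the selection order after this change is: a matching manual override first, the built-in heuristic only when nothing matched, and the block-size compatibility fallback applied to whichever type came out of the first two steps. Below is a condensed, self-contained sketch of that order; the heuristic is stubbed out, the fallback is simplified to F16, and the names are illustrative rather than the library API:

    #include <cstdint>
    #include <regex>
    #include <string>
    #include <vector>

    #include "ggml.h"

    struct override_entry { std::string pattern; ggml_type type; }; // illustrative

    // stand-in for llama_tensor_get_type(); the real heuristic looks at layer, shape and ftype
    static ggml_type heuristic_type(ggml_type def) { return def; }

    static ggml_type choose_type(const std::vector<override_entry> & overrides,
                                 const std::string & name, ggml_type def, int64_t nx) {
        ggml_type t = def;
        bool manual = false;
        for (const auto & [pattern, type] : overrides) {       // 1) manual override wins
            if (std::regex_search(name, std::regex(pattern))) {
                t      = type;
                manual = true;
                break;
            }
        }
        if (!manual) {
            t = heuristic_type(def);                            // 2) otherwise, the mixture heuristic
        }
        if (nx % ggml_blck_size(t) != 0) {
            t = GGML_TYPE_F16;                                  // 3) simplified shape fallback
        }
        return t;
    }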