
quant : manual overrides of tensor types take precedence (#18952)

Georgi Gerganov 6 days ago
commit 0e4ebeb057
1 changed file with 53 additions and 56 deletions:
  src/llama-quant.cpp  (+53, -56)
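
For context, the manual overrides referenced in the commit title reach llama-quant.cpp as a list of (regex pattern, ggml_type) entries behind the tensor_types pointer of the quantization params, as seen in the cast inside the second hunk below. The sketch that follows illustrates just the matching step; the entry type and member names are stand-ins, and only the std::regex_search call mirrors the code in the diff.

    #include <regex>
    #include <string>
    #include <vector>

    #include "ggml.h" // for ggml_type

    // hypothetical stand-in for the tensor_quantization entries used in llama-quant.cpp
    struct override_entry {
        std::string pattern; // regex matched against the tensor name, e.g. "attn_v"
        ggml_type   type;    // requested quantization type, e.g. GGML_TYPE_Q8_0
    };

    static bool matches(const override_entry & e, const std::string & tensor_name) {
        return std::regex_search(tensor_name, std::regex(e.pattern));
    }

With an entry such as { "attn_v", GGML_TYPE_Q8_0 }, a tensor named "blk.0.attn_v.weight" matches and now keeps Q8_0 regardless of what the built-in heuristic would have picked; before this change the override was applied only after llama_tensor_get_type() had run, and only when that call did not trigger a fallback.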

src/llama-quant.cpp  +53 -56

@@ -422,57 +422,6 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         ++qs.i_ffn_up;
     }
 
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
-    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    //}
-    // This can be used to reduce the size of the Q5_K_S model.
-    // The associated PPL increase is fully in line with the size reduction
-    //else {
-    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-    //}
-    bool convert_incompatible_tensor = false;
-    {
-        const int64_t nx = tensor->ne[0];
-        const int64_t ny = tensor->ne[1];
-        const int64_t qk_k = ggml_blck_size(new_type);
-
-        if (nx % qk_k != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
-            convert_incompatible_tensor = true;
-        } else {
-            ++qs.n_k_quantized;
-        }
-    }
-
-    if (convert_incompatible_tensor) {
-        switch (new_type) {
-            case GGML_TYPE_TQ1_0:
-            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
-            case GGML_TYPE_IQ2_XXS:
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_S:
-            case GGML_TYPE_IQ3_XXS:
-            case GGML_TYPE_IQ3_S:
-            case GGML_TYPE_IQ1_S:
-            case GGML_TYPE_IQ1_M:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
-            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
-            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
-            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
-            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
-        }
-        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
-            new_type = GGML_TYPE_F16;
-        }
-        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
-        ++qs.n_fallback;
-    }
-
     return new_type;
 }
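
The block removed above is re-added essentially verbatim inside the caller in the next hunk; it exists because a row cannot be quantized with a type whose block size does not divide the row length. A small worked example of the check, assuming the usual ggml block sizes (k-quants such as Q4_K use 256-element blocks, Q5_0 uses 32-element blocks) and a hypothetical row length:

    #include <cstdint>
    #include <cstdio>

    #include "ggml.h"

    int main() {
        const int64_t nx = 4288;        // hypothetical row length, not a multiple of 256
        ggml_type t = GGML_TYPE_Q4_K;   // 4288 % 256 == 192 -> incompatible
        if (nx % ggml_blck_size(t) != 0) {
            t = GGML_TYPE_Q5_0;         // fallback chosen for Q4_K in the switch
        }
        printf("using %s\n", ggml_type_name(t)); // block size 32 divides 4288, so Q5_0 is kept
    }

The final F16 escape hatch in the code covers the case where even the fallback's block size does not divide the row length.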
 
@@ -875,21 +824,69 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (!params->pure && ggml_is_quantized(default_type)) {
-                int fallback = qs.n_fallback;
-                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-                // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
-                if (params->tensor_types && qs.n_fallback - fallback == 0) {
+                // if the user provided tensor types - use those
+                bool manual = false;
+                if (params->tensor_types) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                     const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
                         if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                             if  (qtype != new_type) {
-                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype));
                                 new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
+                                manual = true;
+                                break;
                             }
                         }
                     }
                 }
+
+                // if not manual - use the standard logic for choosing the quantization type based on the selected mixture
+                if (!manual) {
+                    new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                }
+
+                // incompatible tensor shapes are handled here - fallback to a compatible type
+                {
+                    bool convert_incompatible_tensor = false;
+
+                    const int64_t nx = tensor->ne[0];
+                    const int64_t ny = tensor->ne[1];
+                    const int64_t qk_k = ggml_blck_size(new_type);
+
+                    if (nx % qk_k != 0) {
+                        LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
+                        convert_incompatible_tensor = true;
+                    } else {
+                        ++qs.n_k_quantized;
+                    }
+
+                    if (convert_incompatible_tensor) {
+                        switch (new_type) {
+                            case GGML_TYPE_TQ1_0:
+                            case GGML_TYPE_TQ2_0:  new_type = GGML_TYPE_Q4_0; break;  // TODO: use a symmetric type instead
+                            case GGML_TYPE_IQ2_XXS:
+                            case GGML_TYPE_IQ2_XS:
+                            case GGML_TYPE_IQ2_S:
+                            case GGML_TYPE_IQ3_XXS:
+                            case GGML_TYPE_IQ3_S:
+                            case GGML_TYPE_IQ1_S:
+                            case GGML_TYPE_IQ1_M:
+                            case GGML_TYPE_Q2_K:
+                            case GGML_TYPE_Q3_K:
+                            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+                            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+                            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+                            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
+                            default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
+                        }
+                        if (tensor->ne[0] % ggml_blck_size(new_type) != 0) {
+                            new_type = GGML_TYPE_F16;
+                        }
+                        LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
+                        ++qs.n_fallback;
+                    }
+                }
             }
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
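
Putting the two hunks together, the selection order after this change is: a matching manual override first, the built-in heuristic only when nothing matched, and the block-size compatibility fallback applied to whichever type came out of the first two steps. Below is a condensed, self-contained sketch of that order; the heuristic is stubbed out, the fallback is simplified to F16, and the names are illustrative rather than the library API:

    #include <cstdint>
    #include <regex>
    #include <string>
    #include <vector>

    #include "ggml.h"

    struct override_entry { std::string pattern; ggml_type type; }; // illustrative

    // stand-in for llama_tensor_get_type(); the real heuristic looks at layer, shape and ftype
    static ggml_type heuristic_type(ggml_type def) { return def; }

    static ggml_type choose_type(const std::vector<override_entry> & overrides,
                                 const std::string & name, ggml_type def, int64_t nx) {
        ggml_type t = def;
        bool manual = false;
        for (const auto & [pattern, type] : overrides) {       // 1) manual override wins
            if (std::regex_search(name, std::regex(pattern))) {
                t      = type;
                manual = true;
                break;
            }
        }
        if (!manual) {
            t = heuristic_type(def);                            // 2) otherwise, the mixture heuristic
        }
        if (nx % ggml_blck_size(t) != 0) {
            t = GGML_TYPE_F16;                                  // 3) simplified shape fallback
        }
        return t;
    }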