@@ -12141,27 +12141,34 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
     if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
-        int nx = tensor->ne[0];
-        if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-            new_type = GGML_TYPE_Q8_0;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
-                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_Q5_K;
-        }
-        else if (new_type != GGML_TYPE_Q8_0) {
-            new_type = GGML_TYPE_Q6_K;
+        if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->output_tensor_type;
+        } else {
+            int nx = tensor->ne[0];
+            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+                new_type = GGML_TYPE_Q8_0;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                     ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_Q5_K;
+            }
+            else if (new_type != GGML_TYPE_Q8_0) {
+                new_type = GGML_TYPE_Q6_K;
+            }
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
-            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
-            new_type = GGML_TYPE_Q2_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
-            new_type = GGML_TYPE_IQ3_S;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_S;
+        if (qs.params->token_embedding_type < GGML_TYPE_COUNT) {
+            new_type = qs.params->token_embedding_type;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+                new_type = GGML_TYPE_Q2_K;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
+            else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+                new_type = GGML_TYPE_IQ3_S;
+            }
         }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
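
The hunk above uses GGML_TYPE_COUNT as a sentinel: any value below it is taken as an explicit per-tensor type requested by the caller, and the existing ftype heuristics run only when the field is left at its default. As a minimal sketch of how a caller might resolve a user-supplied type name such as "q8_0" into one of these fields (the helper name is illustrative; only ggml_type_name() and GGML_TYPE_COUNT come from ggml.h):

    // Sketch: map a type name such as "q8_0" to the corresponding ggml_type.
    // Returns GGML_TYPE_COUNT ("no override") when the name is not recognized.
    #include <cstring>

    #include "ggml.h"

    static enum ggml_type parse_ggml_type(const char * arg) {
        for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
            const enum ggml_type type = (enum ggml_type) i;
            const char * name = ggml_type_name(type);
            if (name && std::strcmp(arg, name) == 0) {
                return type;
            }
        }
        return GGML_TYPE_COUNT; // unknown name -> keep the default heuristics
    }
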
@@ -13051,6 +13058,8 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     struct llama_model_quantize_params result = {
         /*.nthread                     =*/ 0,
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
+        /*.token_embedding_type        =*/ GGML_TYPE_COUNT,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
         /*.only_copy                   =*/ false,
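
Because both new fields default to GGML_TYPE_COUNT, existing callers of llama_model_quantize() keep the old behaviour; opting in just means lowering a field to a concrete type before quantizing. A minimal usage sketch (the file paths and the chosen Q8_0 override are placeholders, not values from this change):

    #include "llama.h"

    int main() {
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;

        // Keep the output and token-embedding tensors at Q8_0 instead of the
        // ftype heuristics; leaving these at GGML_TYPE_COUNT falls back to the
        // defaults computed in llama_tensor_get_type().
        params.output_tensor_type   = GGML_TYPE_Q8_0;
        params.token_embedding_type = GGML_TYPE_Q8_0;

        // Placeholder paths; llama_model_quantize() returns 0 on success.
        return llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) == 0 ? 0 : 1;
    }
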