@@ -7,14 +7,12 @@
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <cinttypes>
 #include <fstream>
 #include <mutex>
 #include <thread>
 #include <unordered_map>
 
-// TODO: replace with ggml API call
-#define QK_K 256
-
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -154,8 +152,10 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
             new_type = qs.params->output_tensor_type;
         } else {
-            int nx = tensor->ne[0];
-            if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
+            const int64_t nx = tensor->ne[0];
+            const int64_t qk_k = ggml_blck_size(new_type);
+
+            if (arch == LLM_ARCH_FALCON || nx % qk_k != 0) {
                 new_type = GGML_TYPE_Q8_0;
             }
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
@@ -367,20 +367,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
     //}
     bool convert_incompatible_tensor = false;
-    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
-        new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S ||
-        new_type == GGML_TYPE_IQ1_M) {
-        int nx = tensor->ne[0];
-        int ny = tensor->ne[1];
-        if (nx % QK_K != 0) {
-            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
+    {
+        const int64_t nx = tensor->ne[0];
+        const int64_t ny = tensor->ne[1];
+        const int64_t qk_k = ggml_blck_size(new_type);
+
+        if (nx % qk_k != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type));
             convert_incompatible_tensor = true;
         } else {
             ++qs.n_k_quantized;
         }
     }
+
     if (convert_incompatible_tensor) {
         switch (new_type) {
             case GGML_TYPE_TQ1_0: