@@ -886,6 +886,17 @@ struct llama_context_params llama_context_default_params() {
     return result;
 }

+struct llama_model_quantize_params llama_model_quantize_default_params() {
+    struct llama_model_quantize_params result = {
+        /*.nthread                 =*/ 0,
+        /*.ftype                   =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
+        /*.allow_requantize        =*/ false,
+        /*.quantize_output_tensor  =*/ true,
+    };
+
+    return result;
+}
+
 bool llama_mmap_supported() {
     return llama_mmap::SUPPORTED;
 }
@@ -2231,9 +2242,70 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 // quantization
 //

-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
+static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llama_buffer & output, const int nelements, const int nthread) {
+    if (output.size < nelements * sizeof(float)) {
+        output.resize(nelements * sizeof(float));
+    }
+    float * f32_output = (float *) output.addr;
+
+    quantize_fns_t qtype;
+    if (ggml_is_quantized(tensor.type)) {
+        qtype = ggml_internal_get_quantize_fn(tensor.type);
+        if (qtype.dequantize_row_q == NULL) {
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+        }
+    } else if (tensor.type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    }
+
+    if (nthread < 2) {
+        if (tensor.type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor.type)) {
+            qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+        } else {
+            LLAMA_ASSERT(false); // unreachable
+        }
+        return;
+    }
+
+    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
+    auto block_size_bytes = ggml_type_size(tensor.type);
+
+    LLAMA_ASSERT(nelements % block_size == 0);
+    auto nblocks = nelements / block_size;
+    auto blocks_per_thread = nblocks / nthread;
+    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    std::vector<std::thread> workers;
+    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
+        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
+        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+
+        auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
+            if (typ == GGML_TYPE_F16) {
+                ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
+            } else {
+                qtype.dequantize_row_q(inbuf, outbuf, nels);
+            }
+        };
+        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        in_buff_offs += thr_block_bytes;
+        out_buff_offs += thr_elems;
+    }
+    for (auto & worker : workers) {
+        worker.join();
+    }
+
+}
+
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
-    switch (ftype) {
+    llama_ftype ftype = params->ftype;
+    int nthread = params->nthread;
+
+    switch (params->ftype) {
        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
        case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
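
Aside on the hunk above: llama_convert_tensor_internal parallelizes dequantization by splitting the tensor into quantization blocks, handing each worker a contiguous run of blocks, and letting the last thread absorb the remainder when nblocks is not divisible by nthread. The standalone sketch below is not part of the patch; the block and thread counts are invented purely to illustrate the same partitioning arithmetic.

// Illustration only: mirrors the per-thread block split from the diff above.
// nblocks and nthread are hypothetical values, not taken from the patch.
#include <cstdio>

int main() {
    const int nblocks = 1030;                                        // e.g. quantization blocks in one tensor
    const int nthread = 4;                                           // worker threads
    const int blocks_per_thread = nblocks / nthread;                 // 257
    const int spare_blocks = nblocks - blocks_per_thread * nthread;  // 2 blocks left over

    int offs = 0;
    for (int tnum = 0; tnum < nthread; tnum++) {
        // the last thread picks up the spare blocks, exactly as in llama_convert_tensor_internal
        const int thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
        std::printf("thread %d: blocks [%d, %d)\n", tnum, offs, offs + thr_blocks);
        offs += thr_blocks;
    }
    return 0;
}

Splitting on block boundaries (rather than on raw elements) matters because quantized types store whole blocks, so dequantize_row_q can only be handed multiples of the block size; the LLAMA_ASSERT(nelements % block_size == 0) above reflects the same constraint.
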
@@ -2259,7 +2331,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), params->ftype);

     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
@@ -2301,9 +2373,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= (tensor.ne.size() == 2);

         // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
+        if (!params->quantize_output_tensor && tensor.name == "output.weight") {
+            quantize = false;
+        }
+        quantize = quantize && quantized_type != tensor.type;

         enum ggml_type new_type;
         void * new_data;
@@ -2346,17 +2419,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
+
             if (tensor.type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
-                f32_conv_buf.resize(nelements * sizeof(float));
-                f32_data = (float *) f32_conv_buf.addr;
-                const auto * f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
+            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
             } else {
-                throw std::runtime_error(format("type %s unsupported for integer quantization", ggml_type_name(tensor.type)));
+                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                f32_data = (float *) f32_conv_buf.addr;
             }

             printf("quantizing .. ");
@@ -2566,10 +2636,9 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        enum llama_ftype ftype,
-        int nthread) {
+        const llama_model_quantize_params *params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
+        llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.what());
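
Taken together, the hunks replace the old llama_model_quantize(fname_inp, fname_out, ftype, nthread) signature with a parameter struct plus a defaults helper. The sketch below shows one way a caller might use the new API; it is not part of the patch, it assumes the llama.h from this revision is on the include path, and the file names and thread count are placeholders.

// Hypothetical driver for the new parameter-struct API (illustration only).
#include "llama.h"
#include <cstdio>

int main() {
    // Start from the defaults added in the first hunk and override what we need.
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype                  = LLAMA_FTYPE_MOSTLY_Q4_0; // target quantization type
    params.nthread                = 8;                       // worker threads for convert/quantize
    params.allow_requantize       = false;                   // reject already-quantized inputs
    params.quantize_output_tensor = true;                    // also quantize output.weight

    // Placeholder paths.
    if (llama_model_quantize("model-f16.bin", "model-q4_0.bin", &params) != 0) {
        std::fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}

Starting from llama_model_quantize_default_params() and overriding individual fields keeps existing call sites working if more options are later added to llama_model_quantize_params, which is the usual motivation for moving from a positional argument list to a params struct.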