@@ -15,6 +15,8 @@
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include <thread>
+#include <mutex>

 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
@@ -27,7 +29,6 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };

-const int64_t SCRATCH_ELEMENTS = 32*32;
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
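The fixed `SCRATCH_ELEMENTS` scratch size is retired here: scratch buffers are now sized per layer inside `test_roundtrip_on_layer`, and the work is split into `32*512`-element chunks there instead (see the hunks below), so chunking is no longer tied to a compile-time constant.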
@@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
     stats.num_samples += nelements;
 }

+void combine_error_stats(error_stats & into, const error_stats & from) {
+    into.num_samples += from.num_samples;
+    into.total_error += from.total_error;
+    if (from.max_error > into.max_error) into.max_error = from.max_error;
+    for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
+}
+
 double find_quantile(const error_stats & stats, double quantile) {
     double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
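`combine_error_stats` exists so that each worker thread can gather statistics into a private `error_stats` and fold it into the shared totals exactly once: sample counts and error sums add, `max_error` merges by taking the maximum, and the histogram merges bucket by bucket. This keeps the per-sample hot path free of locking and makes the final numbers independent of how the threads interleave; a self-contained sketch of the full pattern follows the threading hunk below.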
@@ -130,6 +138,36 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        for (int i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
 // Run quantization function for a single layer and update error stats
 void test_roundtrip_on_layer(
         std::string & name,
@@ -137,40 +175,61 @@ void test_roundtrip_on_layer(
         const quantize_fns_t & qfns,
         bool use_reference,
         const ggml_tensor * layer,
-        float * input_scratch,
-        char *quantized_scratch,
-        float * output_scratch,
-        error_stats & total_error) {
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {

     assert(tensor_is_contiguous(layer));
     error_stats layer_error {};
-    int64_t nelements = ggml_nelements(layer);
-
-    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) {
-        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+    uint64_t nelements = ggml_nelements(layer);

-        if (layer->type == GGML_TYPE_F16) {
-            for (int i = 0; i < chunk_size; i++) {
-                input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), print_layer_stats ? layer_error : total_error);
+    } else {
+        auto & stats = print_layer_stats ? layer_error : total_error;
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+                &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
-        } else {
-            input_scratch = ggml_get_data_f32(layer) + offset;
-        }
-
-        if (use_reference) {
-            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
-        } else {
-            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
-        }
-        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
-
-        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
-        if (print_layer_stats) {
-            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error);
-        }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
     }
+
     if (print_layer_stats) {
         print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
     }
 }
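The parallel scheme above is a simple self-scheduling loop: a mutex-guarded counter hands out fixed-size chunks, each thread round-trips its chunk while accumulating into a thread-local `error_stats`, and the results merge once per thread on exit. A minimal self-contained sketch of the same pattern, with hypothetical stand-in names (`partial_sum`, `process_chunk`) in place of the patch's `error_stats` and `test_roundtrip_on_chunk`:

    #include <algorithm>
    #include <cstdint>
    #include <mutex>
    #include <thread>
    #include <vector>

    // Stand-in for error_stats: any accumulator with an associative merge works.
    struct partial_sum { double value = 0.0; };

    static void combine(partial_sum & into, const partial_sum & from) {
        into.value += from.value;
    }

    // Stand-in for test_roundtrip_on_chunk: the per-chunk work.
    static void process_chunk(const std::vector<double> & data, uint64_t offset,
                              uint64_t count, partial_sum & out) {
        for (uint64_t i = 0; i < count; ++i) out.value += data[offset + i];
    }

    int main() {
        const std::vector<double> data(1 << 20, 0.5);
        const uint64_t nelements  = data.size();
        const uint64_t chunk_size = 32*512;

        partial_sum total;
        std::mutex  mutex;
        uint64_t    counter = 0;

        auto compute = [&]() {
            partial_sum local;                     // thread-private, no locking on the hot path
            while (true) {
                std::unique_lock<std::mutex> lock(mutex);
                uint64_t offset = counter;         // claim the next chunk under the lock
                counter += chunk_size;
                if (offset >= nelements) {
                    combine(total, local);         // one locked merge per thread, then exit
                    break;
                }
                lock.unlock();                     // the actual work runs unlocked
                uint64_t chunk = std::min(chunk_size, nelements - offset);
                process_chunk(data, offset, chunk, local);
            }
        };

        unsigned hw = std::thread::hardware_concurrency();
        int nthread = hw > 1 ? (int) hw : 1;
        std::vector<std::thread> workers(nthread - 1);
        for (auto & w : workers) w = std::thread(compute);
        compute();                                 // the calling thread participates too
        for (auto & w : workers) w.join();
    }

Spawning `nthread-1` workers and running `compute()` on the calling thread as well, exactly as the patch does, avoids leaving one core idle on the join.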
@@ -181,6 +240,7 @@ int main(int argc, char ** argv) {

     // read command line

+    int max_thread = 0;
     bool invalid_param = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
@@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                 invalid_param = true;
             }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             quantize_stats_print_usage(argc, argv);
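The new `-n`/`--num-threads` flag caps the worker count; any value below 1 (including the default 0, and whatever `atoi` returns for non-numeric input) falls back to `std::thread::hardware_concurrency()`. An illustrative invocation, assuming the usual `-m` model flag from the surrounding argument parser:

    ./quantize-stats -m models/7B/ggml-model-f16.bin -n 8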
@@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
     }
     printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
     // allocate scratch space
-    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
-    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
-    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;

     // loop throught quantization types
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
                 qfns,
                 params.reference,
                 kv_tensor.second,
-                input_scratch.data(),
-                quantized_scratch.data(),
-                output_scratch.data(),
-                global_stats
+                input_scratch,
+                quantized_scratch,
+                output_scratch,
+                global_stats,
+                max_thread
             );
         }
