@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // a hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
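For context, below is a minimal POSIX-only sketch (not part of the patch) of the kind of read-only mapping that llama_mmap sets up; it assumes the real class also covers Windows via CreateFileMapping/MapViewOfFile and implements the prefetch parameter, both of which this sketch omits. The idea is that once the file is mapped, tensor bytes can be read straight from the mapping instead of being copied into the read_data buffer, and with /* prefetch */ 0 (as in the patch) pages are faulted in lazily, one tensor at a time.

// mmap_sketch.cpp -- illustrative only; names and structure are simplified
// assumptions, not the actual llama.cpp implementation.
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model-file>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }

    // Map the whole file read-only. Pages are loaded lazily on first access,
    // which matches passing /* prefetch */ 0 in the quantize path above.
    void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    // Tensor data can now be addressed directly as (char *) addr + offset,
    // with no intermediate read() into a heap buffer.
    printf("mapped %lld bytes at %p\n", (long long) st.st_size, addr);

    munmap(addr, (size_t) st.st_size);
    close(fd);
    return 0;
}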