2 years ago · 2777a84be4
--- a/llama.cpp
+++ b/llama.cpp
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
				         nthread = std::thread::hardware_concurrency();
			
 
				     }
			
 
				 
			
 
				-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
			
 
				+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
			
 
				+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
			
 
				+#if defined(__linux__) || defined(_WIN32)
			
 
				+    constexpr bool use_mmap = true;
			
 
				+#else
			
 
				+    constexpr bool use_mmap = false;
			
 
				+#endif
			
 
				+
			
 
				+    llama_model_loader ml(fname_inp, use_mmap);
			
 
				+    if (ml.use_mmap) {
			
 
				+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
			
 
				+    }
			
 
				 
			
 
				     llama_model model;
			
 
				     llm_load_arch(ml, model);
			
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
				 
			
 
				         const std::string name = ggml_get_name(tensor);
			
 
				 
			
 
				-        if (read_data.size() < ggml_nbytes(tensor)) {
			
 
				-            read_data.resize(ggml_nbytes(tensor));
			
 
				+        if (!ml.use_mmap) {
			
 
				+            if (read_data.size() < ggml_nbytes(tensor)) {
			
 
				+                read_data.resize(ggml_nbytes(tensor));
			
 
				+            }
			
 
				+            tensor->data = read_data.data();
			
 
				         }
			
 
				-        tensor->data = read_data.data();
			
 
				         ml.load_data_for(tensor);
			
 
				 
			
 
				         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",