2 vuotta sitten · f963b63afa
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,6 +140,7 @@ if (LLAMA_ALL_WARNINGS)
 
				             -Wpedantic
			
 
				             -Wcast-qual
			
 
				             -Wno-unused-function
			
 
				+            -Wno-multichar
			
 
				         )
			
 
				     else()
			
 
				         # todo : msvc
			
@@ -152,6 +153,10 @@ if (LLAMA_ALL_WARNINGS)
 
				 
			
 
				 endif()
			
 
				 
			
 
				+if (MSVC)
			
 
				+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
			
 
				+endif()
			
 
				+
			
 
				 if (LLAMA_LTO)
			
 
				     include(CheckIPOSupported)
			
 
				     check_ipo_supported(RESULT result OUTPUT output)
			
@@ -241,7 +246,9 @@ endif()
 
				 
			
 
				 add_library(llama
			
 
				             llama.cpp
			
 
				-            llama.h)
			
 
				+            llama.h
			
 
				+            llama_internal.h
			
 
				+            llama_util.h)
			
 
				 
			
 
				 target_include_directories(llama PUBLIC .)
			
 
				 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
			
--- a/Makefile
+++ b/Makefile
@@ -37,7 +37,7 @@ LDFLAGS  =
 
				 
			
 
				 # warnings
			
 
				 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
			
 
				-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
			
 
				+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
			
 
				 
			
 
				 # OS specific
			
 
				 # TODO: support Windows
			
@@ -142,7 +142,7 @@ default: main quantize perplexity embedding
 
				 ggml.o: ggml.c ggml.h
			
 
				 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
			
 
				 
			
 
				-llama.o: llama.cpp llama.h
			
 
				+llama.o: llama.cpp llama.h llama_util.h llama_internal.h
			
 
				 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
			
 
				 
			
 
				 common.o: examples/common.cpp examples/common.h
			
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,7 +1,5 @@
 
				 #include "common.h"
			
 
				 
			
 
				-#include "ggml.h"
			
 
				-
			
 
				 #include <cassert>
			
 
				 #include <cstring>
			
 
				 #include <fstream>
			
@@ -161,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 
				             params.use_color = true;
			
 
				         } else if (arg == "--mlock") {
			
 
				             params.use_mlock = true;
			
 
				+        } else if (arg == "--no-mmap") {
			
 
				+            params.use_mmap = false;
			
 
				         } else if (arg == "--mtest") {
			
 
				             params.mem_test = true;
			
 
				         } else if (arg == "--verbose-prompt") {
			
@@ -240,9 +240,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 
				     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
			
 
				     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
			
 
				     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
			
 
				-    if (ggml_mlock_supported()) {
			
 
				+    if (llama_mlock_supported()) {
			
 
				         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
			
 
				     }
			
 
				+    if (llama_mmap_supported()) {
			
 
				+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
			
 
				+    }
			
 
				     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
			
 
				     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
			
 
				     fprintf(stderr, "  -m FNAME, --model FNAME\n");
			
--- a/examples/common.h
+++ b/examples/common.h
@@ -47,6 +47,7 @@ struct gpt_params {
 
				     bool instruct          = false; // instruction mode (used for Alpaca models)
			
 
				     bool ignore_eos        = false; // do not stop generating after eos
			
 
				     bool perplexity        = false; // compute perplexity over the prompt
			
 
				+    bool use_mmap          = true;  // use mmap for faster loads
			
 
				     bool use_mlock         = false; // use mlock to keep model in memory
			
 
				     bool mem_test          = false; // compute maximum memory usage
			
 
				     bool verbose_prompt    = false; // print prompt tokens before generation
			
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
 
				         lparams.seed       = params.seed;
			
 
				         lparams.f16_kv     = params.memory_f16;
			
 
				         lparams.logits_all = params.perplexity;
			
 
				+        lparams.use_mmap   = params.use_mmap;
			
 
				         lparams.use_mlock  = params.use_mlock;
			
 
				         lparams.embedding  = params.embedding;
			
 
				 
			
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
 
				         lparams.n_parts    = params.n_parts;
			
 
				         lparams.seed       = params.seed;
			
 
				         lparams.f16_kv     = params.memory_f16;
			
 
				+        lparams.use_mmap   = params.use_mmap;
			
 
				         lparams.use_mlock  = params.use_mlock;
			
 
				 
			
 
				         ctx = llama_init_from_file(params.model.c_str(), lparams);
			
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
 
				         lparams.seed       = params.seed;
			
 
				         lparams.f16_kv     = params.memory_f16;
			
 
				         lparams.logits_all = params.perplexity;
			
 
				+        lparams.use_mmap   = params.use_mmap;
			
 
				         lparams.use_mlock  = params.use_mlock;
			
 
				         lparams.embedding  = params.embedding;
			
 
				 
			
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -1,5 +1,6 @@
 
				 #include "ggml.h"
			
 
				 #include "llama.h"
			
 
				+#include "llama_internal.h"
			
 
				 
			
 
				 #include <algorithm>
			
 
				 #include <cassert>
			
@@ -266,15 +267,13 @@ int main(int argc, char ** argv) {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    // Sort tensors for consistent output
			
 
				-    const auto tensors = llama_internal_get_tensor_map(ctx);
			
 
				-    std::map<std::string, struct ggml_tensor *> tensors_sorted { tensors.begin(), tensors.end() };
			
 
				+    const auto &tensors = llama_internal_get_tensor_map(ctx);
			
 
				 
			
 
				     // check layer tensors
			
 
				     int included_layers = 0;
			
 
				     int64_t max_nelements = 0;
			
 
				     bool is_f16 = false;
			
 
				-    for (const auto& kv_tensor : tensors_sorted) {
			
 
				+    for (const auto& kv_tensor : tensors) {
			
 
				         if (!layer_included(params, kv_tensor.first)) {
			
 
				             continue;
			
 
				         }
			
@@ -315,7 +314,7 @@ int main(int argc, char ** argv) {
 
				 
			
 
				             error_stats global_stats {};
			
 
				 
			
 
				-            for (const auto& kv_tensor : tensors_sorted) {
			
 
				+            for (const auto& kv_tensor : tensors) {
			
 
				                 if (!layer_included(params, kv_tensor.first)) {
			
 
				                     continue;
			
 
				                 }
			
--- a/ggml.c
+++ b/ggml.c
@@ -97,17 +97,6 @@ typedef void* thread_ret_t;
 
				 #define static_assert(cond, msg) _Static_assert(cond, msg)
			
 
				 #endif
			
 
				 
			
 
				-#define GGML_MLOCK_SUPPORT 0
			
 
				-
			
 
				-#ifdef __has_include
			
 
				-    #if __has_include(<sys/mman.h>)
			
 
				-        #undef GGML_MLOCK_SUPPORT
			
 
				-        #define GGML_MLOCK_SUPPORT 1
			
 
				-        #include <sys/mman.h>
			
 
				-    #endif
			
 
				-#endif
			
 
				-
			
 
				-
			
 
				 /*#define GGML_PERF*/
			
 
				 #define GGML_DEBUG 0
			
 
				 #define GGML_GELU_FP16
			
@@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
				 
			
 
				 static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
			
 
				 
			
 
				-//
			
 
				-// ggml object
			
 
				-//
			
 
				-
			
 
				-struct ggml_object {
			
 
				-    size_t offs;
			
 
				-    size_t size;
			
 
				-
			
 
				-    struct ggml_object * next;
			
 
				-
			
 
				-    char padding[8];
			
 
				-};
			
 
				-
			
 
				-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
			
 
				-
			
 
				 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
			
 
				 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
			
 
				 
			
@@ -2716,7 +2690,6 @@ struct ggml_context {
 
				     size_t mem_size;
			
 
				     void * mem_buffer;
			
 
				     bool   mem_buffer_owned;
			
 
				-    bool   mem_buffer_mlocked;
			
 
				     bool   no_alloc;
			
 
				 
			
 
				     int    n_objects;
			
@@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
				         /*.mem_size           =*/ params.mem_size,
			
 
				         /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
			
 
				         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
			
 
				-        /*.mem_buffer_mlocked =*/ false,
			
 
				         /*.no_alloc           =*/ params.no_alloc,
			
 
				         /*.n_objects          =*/ 0,
			
 
				         /*.objects_begin      =*/ NULL,
			
@@ -3036,14 +3008,6 @@ void ggml_free(struct ggml_context * ctx) {
 
				             GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
			
 
				                     __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
			
 
				 
			
 
				-#if GGML_MLOCK_SUPPORT
			
 
				-            if (ctx->mem_buffer_mlocked) {
			
 
				-                if (munlock(ctx->mem_buffer, ctx->mem_size)) {
			
 
				-                    fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
			
 
				-                }
			
 
				-            }
			
 
				-#endif
			
 
				-
			
 
				             if (ctx->mem_buffer_owned) {
			
 
				                 free(ctx->mem_buffer);
			
 
				             }
			
@@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
 
				     return result;
			
 
				 }
			
 
				 
			
 
				-#ifdef __APPLE__
			
 
				-#define MLOCK_SUGGESTION \
			
 
				-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
			
 
				-    "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
			
 
				-#else
			
 
				-#define MLOCK_SUGGESTION \
			
 
				-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
			
 
				-#endif
			
 
				-
			
 
				-bool ggml_mlock_supported(void) {
			
 
				-    return GGML_MLOCK_SUPPORT;
			
 
				-}
			
 
				-
			
 
				-bool ggml_mlock(
			
 
				-        struct ggml_context * ctx,
			
 
				-        const void *opt_extra_addr,
			
 
				-        size_t opt_extra_len,
			
 
				-        char **err_p) {
			
 
				-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
			
 
				-#if GGML_MLOCK_SUPPORT
			
 
				-    if (ctx->mem_buffer_mlocked) {
			
 
				-        return true;
			
 
				-    }
			
 
				-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
			
 
				-        (opt_extra_len &&
			
 
				-         mlock(opt_extra_addr, opt_extra_len))) {
			
 
				-        if ((*err_p = malloc(1024))) {
			
 
				-            snprintf(*err_p, 1024,
			
 
				-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
			
 
				-                     ctx->mem_size + opt_extra_len,
			
 
				-                     strerror(errno));
			
 
				-        }
			
 
				-        return false;
			
 
				-    }
			
 
				-    ctx->mem_buffer_mlocked = true;
			
 
				-    return true;
			
 
				-#else // GGML_MLOCK_SUPPORT
			
 
				-    *err_p = strdup("can't mlock because it's not supported on this system");
			
 
				-    return false;
			
 
				-#endif // GGML_MLOCK_SUPPORT
			
 
				-}
			
 
				-
			
 
				 ////////////////////////////////////////////////////////////////////////////////
			
 
				 
			
 
				 struct ggml_tensor * ggml_new_tensor_impl(
			
--- a/ggml.h
+++ b/ggml.h
@@ -253,6 +253,19 @@ enum ggml_op {
 
				     GGML_OP_COUNT,
			
 
				 };
			
 
				 
			
 
				+
			
 
				+// ggml object
			
 
				+struct ggml_object {
			
 
				+    size_t offs;
			
 
				+    size_t size;
			
 
				+
			
 
				+    struct ggml_object * next;
			
 
				+
			
 
				+    char padding[8];
			
 
				+};
			
 
				+
			
 
				+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
			
 
				+
			
 
				 // n-dimensional tensor
			
 
				 struct ggml_tensor {
			
 
				     enum ggml_type type;
			
@@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
				 
			
 
				 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
			
 
				 
			
 
				-bool ggml_mlock_supported(void);
			
 
				-bool ggml_mlock(
			
 
				-        struct ggml_context * ctx,
			
 
				-        const void *opt_extra_addr,
			
 
				-        size_t opt_extra_len,
			
 
				-        char **err_p);
			
 
				-
			
 
				 struct ggml_tensor * ggml_new_tensor(
			
 
				         struct ggml_context * ctx,
			
 
				         enum   ggml_type type,
			
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -55,6 +55,7 @@ extern "C" {
 
				         bool f16_kv;     // use fp16 for KV cache
			
 
				         bool logits_all; // the llama_eval() call computes all logits, not just the last one
			
 
				         bool vocab_only; // only load the vocabulary, no weights
			
 
				+        bool use_mmap;   // use mmap if possible
			
 
				         bool use_mlock;  // force system to keep model in RAM
			
 
				         bool embedding;  // embedding mode only
			
 
				 
			
@@ -66,6 +67,9 @@ extern "C" {
 
				 
			
 
				     LLAMA_API struct llama_context_params llama_context_default_params();
			
 
				 
			
 
				+    LLAMA_API bool llama_mmap_supported();
			
 
				+    LLAMA_API bool llama_mlock_supported();
			
 
				+
			
 
				     // Various functions for loading a ggml llama model.
			
 
				     // Allocate (almost) all memory needed for the model.
			
 
				     // Return NULL on failure
			
@@ -164,13 +168,6 @@ extern "C" {
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				-
			
 
				-#include <string>
			
 
				-#include <unordered_map>
			
 
				-//
			
 
				-// Internal function exposed for tests and benchmarks
			
 
				-//
			
 
				-std::unordered_map<std::string, struct ggml_tensor *>& llama_internal_get_tensor_map(struct llama_context * ctx);
			
 
				 #endif
			
 
				 
			
 
				-#endif
			
 
				+#endif // LLAMA_H
			
--- a/llama_internal.h
+++ b/llama_internal.h
@@ -0,0 +1,12 @@
 
				+// Internal header to be included by llama.cpp and tests/benchmarks only.
			
 
				+
			
 
				+#ifndef LLAMA_INTERNAL_H
			
 
				+#define LLAMA_INTERNAL_H
			
 
				+
			
 
				+#include <vector>
			
 
				+#include <string>
			
 
				+struct ggml_tensor;
			
 
				+
			
 
				+// Returns a non-const reference to a vector of (std::string, ggml_tensor pointer) pairs for ctx.
				+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
			
 
				+
			
 
				+#endif // LLAMA_INTERNAL_H
			
--- a/llama_util.h
+++ b/llama_util.h
@@ -0,0 +1,383 @@
 
				+// Internal header to be included only by llama.cpp.
			
 
				+// Contains wrappers around OS interfaces.
			
 
				+
			
 
				+#ifndef LLAMA_UTIL_H
			
 
				+#define LLAMA_UTIL_H
			
 
				+
			
 
				+#include <cstdio>
			
 
				+#include <cstdint>
			
 
				+#include <cerrno>
			
 
				+#include <cstring>
			
 
				+#include <cstdarg>
			
 
				+#include <cstdlib>
			
 
				+#include <climits>
			
 
				+
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				+
			
 
				+#ifdef __has_include
			
 
				+    #if __has_include(<unistd.h>)
			
 
				+        #include <unistd.h>
			
 
				+        #if defined(_POSIX_MAPPED_FILES)
			
 
				+            #include <sys/mman.h>
			
 
				+        #endif
			
 
				+    #endif
			
 
				+#endif
			
 
				+
			
 
				+#if defined(_WIN32)
			
 
				+    #define WIN32_LEAN_AND_MEAN
			
 
				+    #define NOMINMAX
			
 
				+    #include <windows.h>
			
 
				+    #include <io.h>
			
 
				+    #include <stdio.h> // for _fseeki64
			
 
				+#endif
			
 
				+
			
 
				+// Evaluates x once; if it is false, prints the failing expression together
				+// with its file and line to stderr and calls abort(). There is no build
				+// flag here that disables the check.
				+#define LLAMA_ASSERT(x) \
			
 
				+    do { \
			
 
				+        if (!(x)) { \
			
 
				+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
			
 
				+            abort(); \
			
 
				+        } \
			
 
				+    } while (0)
			
 
				+
			
 
				+// Builds a std::string from a printf-style format string and its arguments.
				+// The output length is measured with a first vsnprintf pass and the text is
				+// written by a second pass; LLAMA_ASSERT aborts if the first pass reports an
				+// error or if the two passes disagree about the length.
				+#ifdef __GNUC__
				+__attribute__((format(printf, 1, 2)))
				+#endif
				+static std::string format(const char * fmt, ...) {
				+    va_list ap, ap2;
				+    va_start(ap, fmt);
				+    va_copy(ap2, ap); // the measuring pass consumes ap, so keep a copy for the writing pass
				+    int size = vsnprintf(NULL, 0, fmt, ap);
				+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
				+    std::vector<char> buf(size + 1); // +1 for the terminating NUL written by vsnprintf
				+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
				+    LLAMA_ASSERT(size2 == size);
				+    va_end(ap2);
				+    va_end(ap);
				+    return std::string(buf.data(), size);
				+}
			
 
				+
			
 
				+// Thin wrapper around a C stdio stream. It records the file size when the
				+// file is opened, reports I/O failures by throwing std::string, aborts via
				+// LLAMA_ASSERT when seeking or telling fails, and closes the stream on
				+// destruction.
				+struct llama_file {
				+    // use FILE * so we don't have to re-open the file to mmap
				+    FILE * fp;
				+    size_t size;
				+
				+    // Opens fname with the given fopen mode and measures the file size by
				+    // seeking to the end and back. Throws std::string if fopen fails.
				+    llama_file(const char * fname, const char * mode) {
				+        fp = std::fopen(fname, mode);
				+        if (fp == NULL) {
				+            throw format("failed to open %s: %s", fname, std::strerror(errno));
				+        }
				+        seek(0, SEEK_END);
				+        size = tell();
				+        seek(0, SEEK_SET);
				+    }
				+
				+    // The destructor closes fp, so a copy would close the same stream twice.
				+    llama_file(const llama_file &) = delete;
				+    llama_file & operator=(const llama_file &) = delete;
				+
				+    // Returns the current position in the stream.
				+    size_t tell() const {
				+#ifdef _WIN32
				+        __int64 ret = _ftelli64(fp);
				+#else
				+        long ret = std::ftell(fp);
				+#endif
				+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
				+        return (size_t) ret;
				+    }
				+
				+    // Moves the stream position; whence is one of the SEEK_* constants.
				+    void seek(size_t offset, int whence) {
				+#ifdef _WIN32
				+        int ret = _fseeki64(fp, (__int64) offset, whence);
				+#else
				+        int ret = std::fseek(fp, (long) offset, whence);
				+#endif
				+        LLAMA_ASSERT(ret == 0); // same
				+    }
				+
				+    // Reads exactly size bytes into ptr. Throws std::string on a read error
				+    // or when the end of the file is reached first. A size of 0 is a no-op.
				+    void read_raw(void * ptr, size_t size) {
				+        if (size == 0) {
				+            return;
				+        }
				+        errno = 0;
				+        std::size_t ret = std::fread(ptr, size, 1, fp);
				+        if (ferror(fp)) {
				+            throw format("read error: %s", strerror(errno));
				+        }
				+        if (ret != 1) {
				+            throw std::string("unexpectedly reached end of file");
				+        }
				+    }
				+
				+    // Reads a 32-bit unsigned integer in the byte order it is stored in.
				+    std::uint32_t read_u32() {
				+        std::uint32_t ret;
				+        read_raw(&ret, sizeof(ret));
				+        return ret;
				+    }
				+
				+    // Reads len bytes and returns them as a std::string.
				+    std::string read_string(std::uint32_t len) {
				+        std::vector<char> chars(len);
				+        read_raw(chars.data(), len);
				+        return std::string(chars.data(), len);
				+    }
				+
				+    // Writes exactly size bytes from ptr. Throws std::string on failure.
				+    // A size of 0 is a no-op.
				+    void write_raw(const void * ptr, size_t size) {
				+        if (size == 0) {
				+            return;
				+        }
				+        errno = 0;
				+        size_t ret = std::fwrite(ptr, size, 1, fp);
				+        if (ret != 1) {
				+            throw format("write error: %s", strerror(errno));
				+        }
				+    }
				+
				+    // Writes a 32-bit unsigned integer in the host's byte order.
				+    void write_u32(std::uint32_t val) {
				+        write_raw(&val, sizeof(val));
				+    }
				+
				+    ~llama_file() {
				+        if (fp) {
				+            std::fclose(fp);
				+        }
				+    }
				+};
			
 
				+
			
 
				+#if defined(_WIN32)
				+// Converts a Windows error code into the message text supplied by the system.
				+static std::string llama_format_win_err(DWORD err) {
				+    LPSTR msg;
				+    const DWORD flags = FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
				+    const size_t len = FormatMessageA(flags, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&msg, 0, NULL);
				+    if (len == 0) {
				+        return "FormatMessageA failed";
				+    }
				+    // Copy the text out before releasing the buffer FormatMessageA allocated.
				+    std::string text(msg, len);
				+    LocalFree(msg);
				+    return text;
				+}
				+#endif
			
 
				+
			
 
				+// Read-only memory mapping of a whole llama_file. The mapping is released on
				+// destruction. The file's descriptor/handle is only borrowed here and stays
				+// owned by the llama_file that was passed in.
				+struct llama_mmap {
				+    void * addr;
				+    size_t size;
				+
				+    llama_mmap(const llama_mmap &) = delete;
				+
				+#ifdef _POSIX_MAPPED_FILES
				+    static constexpr bool SUPPORTED = true;
				+
				+    // Maps the entire file read-only. Throws std::string if mmap fails.
				+    llama_mmap(struct llama_file * file) {
				+        size = file->size;
				+        int fd = fileno(file->fp);
				+        int flags = MAP_SHARED;
				+#ifdef __linux__
				+        flags |= MAP_POPULATE;
				+#endif
				+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
				+        // Do not close(fd) here: the descriptor belongs to file->fp, which
				+        // llama_file closes itself, and close() could overwrite errno before
				+        // the mmap failure below is reported.
				+        if (addr == MAP_FAILED) {
				+            throw format("mmap failed: %s", strerror(errno));
				+        }
				+
				+        // Advise the kernel to preload the mapped memory
				+        if (madvise(addr, file->size, MADV_WILLNEED)) {
				+            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
				+                    strerror(errno));
				+        }
				+    }
				+
				+    ~llama_mmap() {
				+        munmap(addr, size);
				+    }
				+#elif defined(_WIN32)
				+    static constexpr bool SUPPORTED = true;
				+
				+    // Maps the entire file read-only. Throws std::string if the mapping
				+    // object or the view cannot be created.
				+    llama_mmap(struct llama_file * file) {
				+        size = file->size;
				+
				+        // This handle is owned by the C runtime stream file->fp; it must not
				+        // be passed to CloseHandle, because llama_file closes the stream.
				+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
				+
				+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
				+        DWORD error = GetLastError();
				+
				+        if (hMapping == NULL) {
				+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
				+        }
				+
				+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
				+        error = GetLastError(); // capture before CloseHandle can change it
				+        CloseHandle(hMapping);
				+
				+        if (addr == NULL) {
				+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
				+        }
				+
				+        // Advise the kernel to preload the mapped memory
				+        WIN32_MEMORY_RANGE_ENTRY range;
				+        range.VirtualAddress = addr;
				+        range.NumberOfBytes = (SIZE_T)size;
				+        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
				+            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
				+                    llama_format_win_err(GetLastError()).c_str());
				+        }
				+    }
				+
				+    ~llama_mmap() {
				+        if (!UnmapViewOfFile(addr)) {
				+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
				+                    llama_format_win_err(GetLastError()).c_str());
				+        }
				+    }
				+#else
				+    static constexpr bool SUPPORTED = false;
				+
				+    // Memory mapping is unavailable on this platform; always throws.
				+    llama_mmap(struct llama_file *) {
				+        throw std::string("mmap not supported");
				+    }
				+#endif
				+};
			
 
				+
			
 
				+// Represents some region of memory being locked using mlock or VirtualLock;
				+// will automatically unlock on destruction.
				+//
				+// Usage: call init() once with the base address, then grow_to() with the
				+// number of bytes (from that base) that should be locked. After the first
				+// failed lock attempt, further grow_to() calls do nothing.
				+struct llama_mlock {
				+    void * addr = NULL;
				+    size_t size = 0;             // bytes currently locked, starting at addr
				+    bool failed_already = false; // set after the first failed lock attempt
				+
				+    llama_mlock() {}
				+    llama_mlock(const llama_mlock &) = delete;
				+
				+    ~llama_mlock() {
				+        if (size) {
				+            raw_unlock(addr, size);
				+        }
				+    }
				+
				+    // Sets the base address of the region; may only be called once.
				+    void init(void * addr) {
				+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
				+        this->addr = addr;
				+    }
				+
				+    // Extends the locked range so it covers at least target_size bytes from
				+    // the base address, rounded up to the lock granularity. Only the bytes
				+    // not locked yet are passed to raw_lock.
				+    void grow_to(size_t target_size) {
				+        LLAMA_ASSERT(addr);
				+        if (failed_already) {
				+            return;
				+        }
				+        size_t granularity = lock_granularity();
				+        // round up; this form requires granularity to be a power of two
				+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
				+        if (target_size > size) {
				+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
				+                size = target_size;
				+            } else {
				+                failed_already = true;
				+            }
				+        }
				+    }
				+
				+#ifdef _POSIX_MEMLOCK_RANGE
				+    static constexpr bool SUPPORTED = true;
				+
				+    size_t lock_granularity() {
				+        return (size_t) sysconf(_SC_PAGESIZE);
				+    }
				+
				+    #ifdef __APPLE__
				+        #define MLOCK_SUGGESTION \
				+            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
				+            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
				+    #else
				+        #define MLOCK_SUGGESTION \
				+            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
				+    #endif
				+
				+    // Locks size bytes at addr; prints a warning and returns false on failure.
				+    bool raw_lock(const void * addr, size_t size) {
				+        if (!mlock(addr, size)) {
				+            return true;
				+        } else {
				+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
				+                    size, this->size, std::strerror(errno));
				+            return false;
				+        }
				+    }
				+
				+    #undef MLOCK_SUGGESTION
				+
				+    void raw_unlock(void * addr, size_t size) {
				+        if (munlock(addr, size)) {
				+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
				+        }
				+    }
				+#elif defined(_WIN32)
				+    static constexpr bool SUPPORTED = true;
				+
				+    size_t lock_granularity() {
				+        SYSTEM_INFO si;
				+        GetSystemInfo(&si);
				+        return (size_t) si.dwPageSize;
				+    }
				+
				+    // Locks size bytes at addr. If the first VirtualLock fails, the process
				+    // working set is enlarged and the lock is attempted one more time.
				+    // Prints a warning and returns false on failure.
				+    bool raw_lock(void * addr, size_t size) {
				+        for (int tries = 1; ; tries++) {
				+            if (VirtualLock(addr, size)) {
				+                return true;
				+            }
				+            if (tries == 2) {
				+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
				+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
				+                return false;
				+            }
				+
				+            // It failed but this was only the first try; increase the working
				+            // set size and try again.
				+            SIZE_T min_ws_size, max_ws_size;
				+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
				+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
				+                        llama_format_win_err(GetLastError()).c_str());
				+                return false;
				+            }
				+            // Per MSDN: "The maximum number of pages that a process can lock
				+            // is equal to the number of pages in its minimum working set minus
				+            // a small overhead."
				+            // Hopefully a megabyte is enough overhead:
				+            size_t increment = size + 1048576;
				+            // The minimum must be <= the maximum, so we need to increase both:
				+            min_ws_size += increment;
				+            max_ws_size += increment;
				+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
				+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
				+                        llama_format_win_err(GetLastError()).c_str());
				+                return false;
				+            }
				+        }
				+    }
				+
				+    void raw_unlock(void * addr, size_t size) {
				+        if (!VirtualUnlock(addr, size)) {
				+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
				+                    llama_format_win_err(GetLastError()).c_str());
				+        }
				+    }
				+#else
				+    static constexpr bool SUPPORTED = false;
				+
				+    // Nothing can be locked on this platform, so any power-of-two value
				+    // works as the rounding unit used by grow_to().
				+    size_t lock_granularity() {
				+        return (size_t) 65536;
				+    }
				+
				+    // Always fails: warns once per call and returns false, which makes
				+    // grow_to() stop trying.
				+    bool raw_lock(const void * /*addr*/, size_t /*size*/) {
				+        fprintf(stderr, "warning: mlock not supported on this system\n");
				+        return false;
				+    }
				+
				+    void raw_unlock(const void * /*addr*/, size_t /*size*/) {}
				+#endif
				+};
			
 
				+
			
 
				+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
				+// Owns a heap array that is released on destruction.
				+struct llama_buffer {
				+    uint8_t * addr = NULL;
				+    size_t size = 0;
				+
				+    llama_buffer() = default;
				+
				+    // The destructor frees addr, so a copy would free the same array twice.
				+    llama_buffer(const llama_buffer &) = delete;
				+    llama_buffer & operator=(const llama_buffer &) = delete;
				+
				+    // Discards the current contents and allocates size uninitialized bytes.
				+    // Existing data is not preserved.
				+    void resize(size_t size) {
				+        delete[] addr;
				+        // Clear the members first so that a throwing new[] cannot leave a
				+        // dangling pointer for the destructor to delete a second time.
				+        addr = NULL;
				+        this->size = 0;
				+        addr = new uint8_t[size];
				+        this->size = size;
				+    }
				+
				+    ~llama_buffer() {
				+        delete[] addr;
				+    }
				+};
			
 
				+#endif