
gguf : new file format with flexible meta data (beta) (#2398)

* gguf : first API pass

* gguf : read header + meta data

* gguf : read tensor info

* gguf : initial model loading - not tested

* gguf : add gguf_get_tensor_name()

* gguf : do not support passing existing ggml_context to gguf_init

* gguf : simplify gguf_get_val

* gguf : gguf.c is now part of ggml.c

* gguf : read / write sample models

* gguf : add comments

* refactor : reduce code duplication and better API (#2415)

* gguf : expose the gguf_type enum through the API for now

* gguf : add array support

* gguf.py : some code style changes

* convert.py : start a new simplified implementation by removing old stuff

* convert.py : remove GGML vocab + other obsolete stuff

* GGUF : write tensor (#2426)

* WIP: Write tensor

* GGUF : Support writing tensors in Python

* refactor : rm unused import and upd todos

* fix : fix errors, upd writing example

* rm example.gguf

* gitignore *.gguf

* undo formatting

* gguf : add gguf_find_key (#2438)

* gguf.cpp : find key example

* ggml.h : add gguf_find_key

* ggml.c : add gguf_find_key

* gguf : fix writing tensors

* gguf : do not hardcode tensor names to read

* gguf : write sample tensors to read

* gguf : add tokenization constants

* quick and dirty conversion example

* gguf : fix writing gguf arrays

* gguf : write tensors one by one and code reuse

* gguf : fix writing gguf arrays

* gguf : write tensors one by one

* gguf : write tensors one by one

* gguf : write tokenizer data

* gguf : upd gguf conversion script

* Update convert-llama-h5-to-gguf.py

* gguf : handle already encoded string

* ggml.h : get array str and f32

* ggml.c : get arr str and f32

* gguf.py : support any type

* Update convert-llama-h5-to-gguf.py

* gguf : fix set is not subscriptable

* gguf : update convert-llama-h5-to-gguf.py

* constants.py : add layer norm eps

* gguf.py : add layer norm eps and merges

* ggml.h : increase GGML_MAX_NAME to 64

* ggml.c : add gguf_get_arr_n

* Update convert-llama-h5-to-gguf.py

* add gptneox gguf example

* Makefile : add gptneox gguf example

* Update convert-llama-h5-to-gguf.py

* add gptneox gguf example

* Update convert-llama-h5-to-gguf.py

* Update convert-gptneox-h5-to-gguf.py

* Update convert-gptneox-h5-to-gguf.py

* Update convert-llama-h5-to-gguf.py

* gguf : support custom alignment value

* gguf : fix typo in function call

* gguf : mmap tensor data example

* fix : update convert-llama-h5-to-gguf.py

* Update convert-llama-h5-to-gguf.py

* convert-gptneox-h5-to-gguf.py : Special tokens

* gptneox-main.cpp : special tokens

* Update gptneox-main.cpp

* constants.py : special tokens

* gguf.py : accumulate kv and tensor info data + special tokens

* convert-gptneox-h5-to-gguf.py : accumulate kv and ti + special tokens

* gguf : gguf counterpart of llama-util.h

* gguf-util.h : update note

* convert-llama-h5-to-gguf.py : accumulate kv / ti + special tokens

* convert-llama-h5-to-gguf.py : special tokens

* Delete gptneox-common.cpp

* Delete gptneox-common.h

* convert-gptneox-h5-to-gguf.py : gpt2bpe tokenizer

* gptneox-main.cpp : gpt2 bpe tokenizer

* gpt2 bpe tokenizer (handles merges and unicode)

* Makefile : remove gptneox-common

* gguf.py : bytearray for gpt2bpe tokenizer

* cmpnct_gpt2bpe.hpp : comments

* gguf.py : use custom alignment if present

* gguf : minor stuff

* Update gptneox-main.cpp

* map tensor names

* convert-gptneox-h5-to-gguf.py : map tensor names

* convert-llama-h5-to-gguf.py : map tensor names

* gptneox-main.cpp : map tensor names

* gguf : start implementing libllama in GGUF (WIP)

* gguf : start implementing libllama in GGUF (WIP)

* rm binary committed by mistake

* upd .gitignore

* gguf : calculate n_mult

* gguf : inference with 7B model working (WIP)

* gguf : rm deprecated function

* gguf : start implementing gguf_file_saver (WIP)

* gguf : start implementing gguf_file_saver (WIP)

* gguf : start implementing gguf_file_saver (WIP)

* gguf : add gguf_get_kv_type

* gguf : add gguf_get_kv_type

* gguf : write metadata in gguf_file_saver (WIP)

* gguf : write metadata in gguf_file_saver (WIP)

* gguf : write metadata in gguf_file_saver

* gguf : rm references to old file formats

* gguf : shorter name for member variable

* gguf : rm redundant method

* gguf : get rid of n_mult, read n_ff from file

* Update gguf_tensor_map.py

* Update gptneox-main.cpp

* gguf : rm references to old file magics

* gguf : start implementing quantization (WIP)

* gguf : start implementing quantization (WIP)

* gguf : start implementing quantization (WIP)

* gguf : start implementing quantization (WIP)

* gguf : start implementing quantization (WIP)

* gguf : start implementing quantization (WIP)

* gguf : quantization is working

* gguf : proper closing of file

* gguf.py : no need to convert tensors twice

* convert-gptneox-h5-to-gguf.py : no need to convert tensors twice

* convert-llama-h5-to-gguf.py : no need to convert tensors twice

* convert-gptneox-h5-to-gguf.py : simplify nbytes

* convert-llama-h5-to-gguf.py : simplify nbytes

* gptneox-main.cpp : n_layer --> n_block

* constants.py : n_layer --> n_block

* gguf.py : n_layer --> n_block

* convert-gptneox-h5-to-gguf.py : n_layer --> n_block

* convert-llama-h5-to-gguf.py : n_layer --> n_block

* gptneox-main.cpp : n_layer --> n_block

* Update gguf_tensor_map.py

* convert-gptneox-h5-to-gguf.py : load model in parts to save memory

* convert-llama-h5-to-gguf.py : load model in parts to save memory

* convert : write more metadata for LLaMA

* convert : rm quantization version

* convert-gptneox-h5-to-gguf.py : add file_type key

* gptneox-main.cpp : add file_type key

* fix conflicts

* gguf : add todos and comments

* convert-gptneox-h5-to-gguf.py : tensor name map changes

* Create gguf_namemap.py : tensor name map changes

* Delete gguf_tensor_map.py

* gptneox-main.cpp : tensor name map changes

* convert-llama-h5-to-gguf.py : fixes

* gguf.py : don't add empty strings

* simple : minor style changes

* gguf : use UNIX line ending

* Create convert-llama-7b-pth-to-gguf.py

* llama : sync gguf-llama.cpp with latest llama.cpp (#2608)

* llama : sync gguf-llama.cpp with latest llama.cpp

* minor : indentation + assert

* llama : refactor gguf_buffer and gguf_ctx_buffer

* llama : minor

* gitignore : add gptneox-main

* llama : tokenizer fixes (#2549)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* convert : update convert-new.py with tokenizer fixes (#2614)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on windows)

* llama : sync gguf-llama with llama (#2613)

* llama : sync gguf-llama with llama

* tests : fix build + warnings (test-tokenizer-1 still fails)

* tests : fix wstring_convert

* convert : fix layer names

* llama : sync gguf-llama.cpp

* convert : update HF converter to new tokenizer voodoo magics

* llama : update tokenizer style

* convert-llama-h5-to-gguf.py : add token types

* constants.py : add token types

* gguf.py : add token types

* convert-llama-7b-pth-to-gguf.py : add token types

* gguf-llama.cpp : fix n_head_kv

* convert-llama-h5-to-gguf.py : add 70b gqa support

* gguf.py : add tensor data layout

* convert-llama-h5-to-gguf.py : add tensor data layout

* convert-llama-7b-pth-to-gguf.py : add tensor data layout

* gptneox-main.cpp : add tensor data layout

* convert-llama-h5-to-gguf.py : clarify the reverse permute

* llama : refactor model loading code (#2620)

* llama : style formatting + remove helper methods

* llama : fix quantization using gguf tool

* llama : simplify gguf_file_saver

* llama : fix method names

* llama : simplify write_header()

* llama : no need to pass full file loader to the file saver

just gguf_ctx

* llama : gguf_file_saver write I32

* llama : refactor tensor names (#2622)

* gguf: update tensor names searched in quantization

* gguf : define tensor names as constants

* gguf : initial write API (not tested yet)

* gguf : write to file API (not tested)

* gguf : initial write API ready + example

* gguf : fix header write

* gguf : fixes + simplify example + add ggml_nbytes_pad()

* gguf : minor

* llama : replace gguf_file_saver with new gguf write API

* gguf : streaming support when writing files

* gguf : remove obsolete write methods

* gguf : remove obsolete gguf_get_arr_xxx API

* llama : simplify gguf_file_loader

* llama : move hparams and vocab from gguf_file_loader to llama_model_loader

* llama : merge gguf-util.h in llama.cpp

* llama : reorder definitions in .cpp to match .h

* llama : minor simplifications

* llama : refactor llama_model_loader (WIP)

wip : remove ggml_ctx from llama_model_loader

wip : merge gguf_file_loader in llama_model_loader

* llama : fix shape prints

* llama : fix Windows build + fix norm_rms_eps key

* llama : throw error on missing KV pairs in model meta data

* llama : improve printing + log meta data

* llama : switch print order of meta data

---------

Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>

* gguf : deduplicate (#2629)

* gguf : better type names

* dedup : CPU + Metal is working

* ggml : fix warnings about unused results

* llama.cpp : fix line feed and compiler warning

* llama : fix strncpy warning + note token_to_str does not write null

* llama : restore the original load/save session implementation

Will migrate this to GGUF in the future

* convert-llama-h5-to-gguf.py : support alt ctx param name

* ggml : assert when using ggml_mul with non-F32 src1

* examples : dedup simple

---------

Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>

* gguf.py : merge all files in gguf.py

* convert-new.py : pick #2427 for HF 70B support

* examples/gguf : no need to keep q option for quantization any more

* llama.cpp : print actual model size

* llama.cpp : use ggml_elements()

* convert-new.py : output gguf (#2635)

* convert-new.py : output gguf (WIP)

* convert-new.py : add gguf key-value pairs

* llama : add hparams.ctx_train + no longer print ftype

* convert-new.py : minor fixes

* convert-new.py : vocab-only option should work now

* llama : fix tokenizer to use llama_char_to_byte

* tests : add new ggml-vocab-llama.gguf

* convert-new.py : tensor name mapping

* convert-new.py : add map for skipping tensor serialization

* convert-new.py : convert script now works

* gguf.py : pick some of the refactoring from #2644

* convert-new.py : minor fixes

* convert.py : update to support GGUF output

* Revert "ci : disable CI temporary to not waste energy"

This reverts commit 7e82d25f40386540c2c15226300ad998ecd871ea.

* convert.py : n_head_kv optional and .gguf file extension

* convert.py : better always have n_head_kv and default it to n_head

* llama : sync with recent PRs on master

* editorconfig : ignore models folder

ggml-ci

* ci : update ".bin" to ".gguf" extension

ggml-ci

* llama : fix llama_model_loader memory leak

* gptneox : move as a WIP example

* llama : fix lambda capture

ggml-ci

* ggml : fix bug in gguf_set_kv

ggml-ci

* common.h : .bin --> .gguf

* quantize-stats.cpp : .bin --> .gguf

* convert.py : fix HF tensor permuting / unpacking

ggml-ci

* llama.cpp : typo

* llama : throw error if gguf fails to init from file

ggml-ci

* llama : fix tensor name grepping during quantization

ggml-ci

* gguf.py : write tensors in a single pass (#2644)

* gguf : single pass for writing tensors + refactoring writer

* gguf : single pass for writing tensors + refactoring writer

* gguf : single pass for writing tensors + refactoring writer

* gguf : style fixes in simple conversion script

* gguf : refactor gptneox conversion script

* gguf : rename h5 to hf (for HuggingFace)

* gguf : refactor pth to gguf conversion script

* gguf : rm file_type key and method

* gguf.py : fix vertical alignment

* gguf.py : indentation

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* convert-gptneox-hf-to-gguf.py : fixes

* gguf.py : gptneox mapping

* convert-llama-hf-to-gguf.py : fixes

* convert-llama-7b-pth-to-gguf.py : fixes

* ggml.h : reverse GGUF_MAGIC

* gguf.py : reverse GGUF_MAGIC

* test-tokenizer-0.cpp : fix warning

* llama.cpp : print kv general.name

* llama.cpp : get special token kv and linefeed token id

* llama : print number of tensors per type + print arch + style

* tests : update vocab file with new magic

* editorconfig : fix whitespaces

* llama : re-order functions

* llama : remove C++ API + reorganize common source in /common dir

* llama : minor API updates

* llama : avoid hardcoded special tokens

* llama : fix MPI build

ggml-ci

* llama : introduce enum llama_vocab_type + remove hardcoded string constants

* convert-falcon-hf-to-gguf.py : falcon HF --> gguf conversion, not tested

* falcon-main.cpp : falcon inference example

* convert-falcon-hf-to-gguf.py : remove extra kv

* convert-gptneox-hf-to-gguf.py : remove extra kv

* convert-llama-7b-pth-to-gguf.py : remove extra kv

* convert-llama-hf-to-gguf.py : remove extra kv

* gguf.py : fix for falcon 40b

* falcon-main.cpp : fix for falcon 40b

* convert-falcon-hf-to-gguf.py : update ref

* convert-falcon-hf-to-gguf.py : add tensor data layout

* cmpnct_gpt2bpe.hpp : fixes

* falcon-main.cpp : fixes

* gptneox-main.cpp : fixes

* cmpnct_gpt2bpe.hpp : remove non-general stuff

* Update examples/server/README.md

Co-authored-by: slaren <slarengh@gmail.com>

* cmpnct_gpt2bpe.hpp : cleanup

* convert-llama-hf-to-gguf.py : special tokens

* convert-llama-7b-pth-to-gguf.py : special tokens

* convert-permute-debug.py : permute debug print

* convert-permute-debug-master.py : permute debug for master

* convert-permute-debug.py : change permute type of attn_q

* convert.py : 70b model working (change attn_q permute)

* Delete convert-permute-debug-master.py

* Delete convert-permute-debug.py

* convert-llama-hf-to-gguf.py : fix attn_q permute

* gguf.py : fix rope scale kv

* convert-llama-hf-to-gguf.py : rope scale and added tokens

* convert-llama-7b-pth-to-gguf.py : rope scale and added tokens

* llama.cpp : use rope scale kv

* convert-llama-7b-pth-to-gguf.py : rope scale fix

* convert-llama-hf-to-gguf.py : rope scale fix

* py : fix whitespace

* gguf : add Python script to convert GGMLv3 LLaMA models to GGUF (#2682)

* First pass at converting GGMLv3 LLaMA models to GGUF

* Cleanups, better output during conversion

* Fix vocab space conversion logic

* More vocab conversion fixes

* Add description to converted GGUF files

* Improve help text, expand warning

* Allow specifying name and description for output GGUF

* Allow overriding vocab and hyperparams from original model metadata

* Use correct params override var name

* Fix wrong type size for Q8_K

Better handling of original style metadata

* Set default value for gguf add_tensor raw_shape KW arg

* llama : improve token type support (#2668)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on windows)

* Improved tokenizer test

But does it work on MacOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment

* llama : add API for token type

ggml-ci

* tests : use new tokenizer type API (#2692)

* Merge tokenizer fixes into the gguf branch.

* Add test vocabularies

* Adapt convert-new.py (and fix a clang-cl compiler error on windows)

* Improved tokenizer test

But does it work on MacOS?

* Improve token type support

- Added @klosax code to convert.py
- Improved token type support in vocabulary

* Exclude platform dependent tests

* More sentencepiece compatibility by eliminating magic numbers

* Restored accidentally removed comment

* Improve commentary

* Use token type API in test-tokenizer-1.cpp

* py : cosmetics

* readme : add notice about new file format

ggml-ci

---------

Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Co-authored-by: klosax <131523366+klosax@users.noreply.github.com>
Co-authored-by: goerch <jhr.walter@t-online.de>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Georgi Gerganov, 2 years ago
commit 6381d4e110
54 changed files with 8926 additions and 2121 deletions
  1. .gitignore (+3, -1)
  2. CMakeLists.txt (+11, -2)
  3. Makefile (+13, -10)
  4. README.md (+21, -13)
  5. ci/run.sh (+22, -22)
  6. common/CMakeLists.txt (+20, -0)
  7. common/common.cpp (+77, -28)
  8. common/common.h (+25, -9)
  9. common/console.cpp (+0, -0)
 10. common/console.h (+0, -0)
 11. common/grammar-parser.cpp (+0, -0)
 12. common/grammar-parser.h (+0, -0)
 13. convert-falcon-hf-to-gguf.py (+282, -0)
 14. convert-gptneox-hf-to-gguf.py (+266, -0)
 15. convert-llama-7b-pth-to-gguf.py (+307, -0)
 16. convert-llama-ggmlv3-to-gguf.py (+334, -0)
 17. convert-llama-hf-to-gguf.py (+327, -0)
 18. convert.py (+327, -607)
 19. docs/token_generation_performance_tips.md (+3, -3)
 20. examples/CMakeLists.txt (+0, -21)
 21. examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp (+91, -89)
 22. examples/embd-input/embd-input-lib.cpp (+3, -3)
 23. examples/embedding/embedding.cpp (+1, -1)
 24. examples/gguf/gguf.cpp (+246, -0)
 25. examples/gptneox-wip/cmpnct_gpt2bpe.hpp (+1133, -0)
 26. examples/gptneox-wip/falcon-main.cpp (+1111, -0)
 27. examples/gptneox-wip/gptneox-main.cpp (+1082, -0)
 28. examples/llama-bench/llama-bench.cpp (+4, -2)
 29. examples/main/main.cpp (+15, -24)
 30. examples/metal/metal.cpp (+1, -1)
 31. examples/perplexity/perplexity.cpp (+1, -1)
 32. examples/quantize-stats/quantize-stats.cpp (+1, -1)
 33. examples/quantize/quantize.cpp (+4, -4)
 34. examples/save-load-state/save-load-state.cpp (+4, -6)
 35. examples/server/README.md (+3, -4)
 36. examples/server/server.cpp (+7, -26)
 37. examples/simple/simple.cpp (+50, -101)
 38. examples/train-text-from-scratch/train-text-from-scratch.cpp (+68, -70)
 39. ggml-metal.h (+3, -0)
 40. ggml-metal.m (+15, -0)
 41. ggml.c (+1009, -4)
 42. ggml.h (+119, -3)
 43. gguf.py (+718, -0)
 44. llama-util.h (+0, -553)
 45. llama.cpp (+848, -356)
 46. llama.h (+141, -124)
 47. models/.editorconfig (+1, -0)
 48. models/ggml-vocab-llama.gguf (binary)
 49. models/ggml-vocab.bin (binary)
 50. tests/CMakeLists.txt (+30, -11)
 51. tests/test-grammar-parser.cpp (+2, -1)
 52. tests/test-llama-grammar.cpp (+3, -3)
 53. tests/test-tokenizer-0.cpp (+43, -17)
 54. tests/test-tokenizer-1.cpp (+131, -0)
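
Among the new files above, `examples/gguf/gguf.cpp` exercises the C-level GGUF API added to `ggml.h` / `ggml.c`. For orientation, here is a minimal read-side sketch using the entry points named in the commit log (`gguf_init_from_file`, `gguf_find_key`, `gguf_get_tensor_name`, ...); the exact signatures are assumed from `ggml.h` and may differ slightly at this revision, so treat it as a sketch rather than the shipped example:

```cpp
// read-side sketch (assumed API; see examples/gguf/gguf.cpp for the version in this PR)
#include "ggml.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s file.gguf\n", argv[0]);
        return 1;
    }

    struct ggml_context * ctx_data = NULL;

    struct gguf_init_params params = {
        /*.no_alloc =*/ false,
        /*.ctx      =*/ &ctx_data,   // tensor data is loaded into this ggml context
    };

    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load %s\n", argv[1]);
        return 1;
    }

    printf("version: %d\n", gguf_get_version(ctx));

    // enumerate the key-value meta data
    const int n_kv = gguf_get_n_kv(ctx);
    for (int i = 0; i < n_kv; ++i) {
        printf("kv[%d]: %s\n", i, gguf_get_key(ctx, i));
    }

    // look up a key by name; returns -1 when the key is missing
    const int kid = gguf_find_key(ctx, "general.name");
    if (kid >= 0) {
        printf("general.name = %s\n", gguf_get_val_str(ctx, kid)); // assumes a string value
    }

    // enumerate the tensor infos
    const int n_tensors = gguf_get_n_tensors(ctx);
    for (int i = 0; i < n_tensors; ++i) {
        printf("tensor[%d]: %s\n", i, gguf_get_tensor_name(ctx, i));
    }

    gguf_free(ctx);
    if (ctx_data) {
        ggml_free(ctx_data);
    }

    return 0;
}
```

String-keyed lookup via `gguf_find_key` is the building block the new `llama.cpp` model loader uses to read hparams and vocab data, instead of the fixed header layout of the old `.bin` formats.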

+ 3 - 1
.gitignore

@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.gguf
 *.bin
 .DS_Store
 .build/
@@ -47,6 +48,8 @@ models-mnt
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
 /llama-bench
 build-info.h
@@ -65,7 +68,6 @@ perf-*.txt
 
 examples/jeopardy/results.txt
 
-
 pyproject.toml
 poetry.lock
 poetry.toml

+ 11 - 2
CMakeLists.txt

@@ -497,9 +497,11 @@ else()
 endif()
 
 #
-# Build libraries
+# libraries
 #
 
+# ggml
+
 add_library(ggml OBJECT
             ggml.c
             ggml.h
@@ -524,10 +526,11 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+# llama
+
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )
 
 target_include_directories(llama PUBLIC .)
@@ -546,6 +549,10 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llama LIBRARY)
 endif()
 
+#
+# install
+#
+
 include(GNUInstallDirs)
 install(
     FILES convert.py
@@ -584,6 +591,8 @@ endif()
 # programs, examples and tests
 #
 
+add_subdirectory(common)
+
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)

+ 13 - 10
Makefile

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -45,8 +45,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
 LDFLAGS  =
 
 ifdef LLAMA_DEBUG
@@ -329,23 +329,23 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 
 OBJS += ggml-alloc.o
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-console.o: examples/console.cpp examples/console.h
+console.o: common/console.cpp common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)
 
 #
 # Examples
@@ -385,7 +385,10 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp                                  build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp    build-info.h ggml.o llama.o $(OBJS)
@@ -418,7 +421,7 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)

+ 21 - 13
README.md

@@ -9,11 +9,17 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-### 🚧 Incoming breaking change + refactoring:
+### Hot topics
 
-See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
 
-To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+
+### Current `master` should be considered in Beta - expect some issues for a few days!
+
+### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+
+### Issues with non-GGUF models will be considered with low priority!
 
 ----
 
@@ -291,7 +297,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:
 
 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```
 
 ### MPI Build
@@ -330,7 +336,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:
 
 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 
 ### BLAS Build
@@ -513,10 +519,10 @@ python3 convert.py models/7B/
   python convert.py models/7B/ --vocabtype bpe
 
 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -572,7 +578,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh
 
 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -635,6 +641,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
 
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 
+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -710,7 +718,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run
 
 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -809,13 +817,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with a light image:
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 ### Docker With CUDA
@@ -846,8 +854,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
 
 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```
 
 ### Contributing

+ 22 - 22
ci/run.sh

@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
 
     python3 ../convert.py ${path_models}
 
-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 
     wiki_test_60="${path_wiki}/wiki.test-60.raw"
 
@@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {
 
     python3 ../convert.py ${path_models}
 
-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 
     wiki_test="${path_wiki}/wiki.test.raw"
 

+ 20 - 0
common/CMakeLists.txt

@@ -0,0 +1,20 @@
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    console.h
+    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)

+ 77 - 28
examples/common.cpp → common/common.cpp

@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -439,7 +427,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
             params.hellaswag_tasks = std::stoi(argv[i]);
         } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
             params.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
@@ -561,8 +549,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
     fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
     fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -650,24 +636,15 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
     return "The";
 }
 
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
+//
+// Model utils
+//
 
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx           = params.n_ctx;
     lparams.n_batch         = params.n_batch;
-    lparams.n_gqa           = params.n_gqa;
-    lparams.rms_norm_eps    = params.rms_norm_eps;
     lparams.n_gpu_layers    = params.n_gpu_layers;
     lparams.main_gpu        = params.main_gpu;
     lparams.tensor_split    = params.tensor_split;
@@ -685,7 +662,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     return lparams;
 }
 
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
     auto lparams = llama_context_params_from_gpt_params(params);
 
     llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
@@ -714,5 +691,77 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         }
     }
 
+    if (params.ignore_eos) {
+        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+    }
+
     return std::make_tuple(model, lctx);
 }
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+

+ 25 - 9
examples/common.h → common/common.h

@@ -22,19 +22,16 @@ struct gpt_params {
     int32_t n_predict                       = -1;   // new tokens to predict
     int32_t n_ctx                           = 512;  // context size
     int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
     int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
     int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
-    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
     float   rope_freq_base                  = 10000.0f; // RoPE base frequency
     float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor
 
     // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
     int32_t top_k             = 40;    // <= 0 to use vocab size
     float   top_p             = 0.95f; // 1.0 = disabled
     float   tfs_z             = 1.00f; // 1.0 = disabled
@@ -48,12 +45,14 @@ struct gpt_params {
     float   mirostat_tau      = 5.00f; // target entropy
     float   mirostat_eta      = 0.10f; // learning rate
 
+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
     // Classifier-Free Guidance
     // https://arxiv.org/abs/2306.17806
     std::string cfg_negative_prompt;       // string to help guidance
     float       cfg_scale         = 1.f;   // How strong is guidance
 
-    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_alias       = "unknown"; // model alias
     std::string prompt            = "";
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
@@ -83,6 +82,7 @@ struct gpt_params {
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
 
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool ignore_eos        = false; // ignore generated EOS tokens
     bool instruct          = false; // instruction mode (used for Alpaca models)
     bool penalize_nl       = true;  // consider newlines as a repeatable token
     bool perplexity        = false; // compute perplexity over the prompt
@@ -101,14 +101,30 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 std::string gpt_random_prompt(std::mt19937 & rng);
 
 //
-// Vocab utils
+// Model utils
 //
 
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
 //
-// Model utils
+// Vocab utils
 //
 
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::string llama_token_to_str(
+        const struct llama_context * ctx,
+                       llama_token   token);
+
+std::string llama_token_to_str_bpe(
+    const struct llama_context * ctx,
+                   llama_token   token);
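
For context, a hypothetical call site for the relocated helpers (not part of the diff): it assumes the declarations above plus `gpt_params_parse` / `llama_init_from_gpt_params` from `common.h`, and that a model path and prompt are supplied via the usual flags.

```cpp
// hypothetical usage sketch of the common helpers; not part of this commit
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <tuple>
#include <vector>

int main(int argc, char ** argv) {
    gpt_params params;
    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_model   * model;
    llama_context * ctx;
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == NULL || ctx == NULL) {
        return 1;
    }

    // the helper retries internally when its first guess at the token count is too small
    const std::vector<llama_token> tokens = llama_tokenize(ctx, params.prompt, /*add_bos=*/ true);

    for (const llama_token tok : tokens) {
        printf("%6d -> '%s'\n", tok, llama_token_to_str(ctx, tok).c_str());
    }

    llama_free(ctx);
    llama_free_model(model);

    return 0;
}
```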

+ 0 - 0
examples/console.cpp → common/console.cpp


+ 0 - 0
examples/console.h → common/console.h


+ 0 - 0
examples/grammar-parser.cpp → common/grammar-parser.cpp


+ 0 - 0
examples/grammar-parser.h → common/grammar-parser.h


+ 282 - 0
convert-falcon-hf-to-gguf.py

@@ -0,0 +1,282 @@
+# HF falcon--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "RWForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.FALCON
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data = torch.cat((q,k,v)).reshape_as(data)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
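
The converter above drives the Python `GGUFWriter` from `gguf.py`; the same kind of file can also be produced from C/C++ through the write API referenced in the commit log (`gguf_init_empty`, `gguf_set_val_*`, `gguf_add_tensor`, `gguf_write_to_file`). A rough sketch, not taken from the diff, with signatures assumed from `ggml.h` and purely illustrative key names:

```cpp
// write-side sketch (assumed API, not taken from the diff)
#include "ggml.h"

int main(void) {
    // a small ggml context to hold one tensor worth of data
    struct ggml_init_params iparams = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(iparams);

    struct ggml_tensor * t = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_name(t, "tensor_0");
    ggml_set_f32(t, 1.0f); // fill with ones

    struct gguf_context * gctx = gguf_init_empty();

    // arbitrary key-value meta data; the key names here are illustrative only
    gguf_set_val_str(gctx, "general.name", "example");
    gguf_set_val_u32(gctx, "example.some_count", 42);

    gguf_add_tensor(gctx, t);

    // only_meta = false -> write header, KV section and tensor data in one go
    gguf_write_to_file(gctx, "example.gguf", false);

    gguf_free(gctx);
    ggml_free(ctx);

    return 0;
}
```

The `only_meta` flag writes just the header and KV section, which is presumably what the "streaming support when writing files" item in the commit log builds on for large models.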

+ 266 - 0
convert-gptneox-hf-to-gguf.py

@@ -0,0 +1,266 @@
+# HF gptneox--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.GPTNEOX
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytearray] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")

+ 307 - 0
convert-llama-7b-pth-to-gguf.py

@@ -0,0 +1,307 @@
+# 7b pth llama --> gguf conversion
+# Only models with a single datafile are supported, like 7B
+# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("consolidated."):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+if num_parts > 1:
+    print("gguf: Only models with a single datafile are supported.")
+
+    sys.exit()
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab and scores")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # default to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name == "rope.freqs":
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")

+ 334 - 0
convert-llama-ggmlv3-to-gguf.py

@@ -0,0 +1,334 @@
+import sys, struct, math, argparse
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
+        self.n_ff = 0
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            self.ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+
+class Vocab:
+    def __init__(self):
+        self.items = []
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            vocab = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            score = struct.unpack('<f', data[offset:offset + 4])[0]
+            offset += 4
+            self.items.append((vocab, score))
+        return offset - orig_offset
+
+class Tensor:
+    def __init__(self):
+        self.name = None
+        self.dims = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = 0
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype= dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = (n_elems * tysize) // blksize
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLV3Model:
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
+            raise ValueError('Only GGJTv3 supported')
+        return 8
+
+    def load(self, data, offset):
+        offset += self.validate_header(data, offset)
+        hp = Hyperparameters()
+        offset += hp.load(data, offset)
+        vocab = Vocab()
+        offset += vocab.load(data, offset, hp.n_vocab)
+        tensors = []
+        tensor_map = {}
+        while offset < len(data):
+            tensor = Tensor()
+            offset += tensor.load(data, offset)
+            tensor_map[tensor.name] = len(tensors)
+            tensors.append(tensor)
+        self.hyperparameters = hp
+        self.vocab = vocab
+        self.tensors = tensors
+        self.tensor_map = tensor_map
+        hp.set_n_ff(self)
+        return offset
+
+class GGMLToGGUF:
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        print('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        self.add_tensors(gguf_writer)
+        print("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        print("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        print("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        print('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            print('* Adding vocab item(s)')
+            for (idx, vitem) in enumerate(vo.all_tokens()):
+                if len(vitem) == 3:
+                    tokens.append(vitem[0])
+                    scores.append(vitem[1])
+                    toktypes.append(vitem[2])
+                else:
+                    # Maybe try to guess the token type here?
+                    tokens.append(vitem[0])
+                    scores.append(vitem[1])
+            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            if len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                hv = hex(vbytes[0])[2:].upper()
+                vbytes = bytes(f'<0x{hv}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+
+    def add_tensors(self, gguf_writer):
+        nm = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            if name.endswith('.weight'):
+                name = name[:-7]
+                suffix = '.weight'
+            elif name.endswith('.bias'):
+                name = name[:-5]
+                suffix = '.bias'
+            mapped_name = nm.get(name)
+            assert mapped_name is not None, f'Bad name {name}'
+            mapped_name += suffix
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+    import convert
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path   = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    convert.check_vocab_size(params, vocab)
+    return (params, vocab)
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
+    parser.add_argument('--name', help = 'Set model name')
+    parser.add_argument('--desc', help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    return parser.parse_args()
+
+def main():
+    cfg = handle_args()
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLV3Model()
+    print('* Scanning GGML input file')
+    offset = model.load(data, 0)
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+    else:
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter.save()
+    print(f'* Successful completion. Output saved to: {cfg.output}')
+
+main()
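
The (block size, type size) pairs in GGML_QUANT_SIZES drive the size and alignment math in `Tensor.load()`. A worked example of the same arithmetic for a Q4_0 tensor (the shape is hypothetical):

```python
import numpy as np

# Q4_0 from the table above: blocks of 32 elements, 2 bytes (f16 scale) + 16 bytes (32 x 4 bit) each
blksize, tysize = 32, 2 + 16

dims    = (4096, 4096)                     # hypothetical tensor shape
n_elems = int(np.prod(dims))
n_bytes = (n_elems * tysize) // blksize    # same formula as Tensor.load()
print(n_bytes)                             # 9437184 -> 0.5625 bytes per element

# tensor data in a GGJTv3 file is padded to a 32-byte boundary before it starts
offset = 12345
pad    = ((offset + 31) & ~31) - offset
print(pad)                                 # 7
```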

+ 327 - 0
convert-llama-hf-to-gguf.py

@@ -0,0 +1,327 @@
+# HF llama --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List, Optional
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # default to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # reverse permute these
+        if name.endswith(".q_proj.weight"):
+            data = reverse_hf_permute(data, head_count)
+        if name.endswith(".k_proj.weight"):
+            data = reverse_hf_permute(data, head_count, head_count_kv)
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")

File diff suppressed because it is too large
+ 327 - 607
convert.py


+ 3 - 3
docs/token_generation_performance_tips.md

@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```
 
 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB
 
-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
 
-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
 
 Result:
 

+ 0 - 21
examples/CMakeLists.txt

@@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
 
 # ...
 
-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
-    common.h
-    common.cpp
-    console.h
-    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
 # examples
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})

+ 91 - 89
examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

@@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
+
 #include <unordered_map>
 #include <vector>
 #include <cassert>
@@ -138,14 +139,16 @@ void print_sample_weights(TransformerWeights *w){
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };
 
 struct my_llama_hparams {
@@ -502,7 +505,7 @@ bool is_ggml_file(const char *filename) {
         return false;
     }
     uint32_t magic = file.read_u32();
-    return magic == LLAMA_FILE_MAGIC;
+    return magic == GGUF_MAGIC;
 }
 
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
@@ -515,36 +518,30 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
         struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
         struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
 
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab->id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab->id_to_token[i].tok   = tok;
-            vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
         }
         llama_free(lctx);
         llama_free_model(lmodel);
     } else { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
-        uint32_t n_vocab = config->vocab_size;
+        const int  n_vocab = config->vocab_size;
         /* uint32_t max_token_length =  */ file.read_u32(); // unused
         vocab->id_to_token.resize(n_vocab);
-        for (uint32_t i=0; i<n_vocab; ++i) {
+        for (int i=0; i<n_vocab; ++i) {
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
-            std::string tok = file.read_string(len);
-            vocab->id_to_token[i].tok = tok;
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
-            vocab->token_to_id.emplace(tok, i);
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
         }
     }
 }
@@ -590,75 +587,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     if (file.fp == NULL) {
         return;
     }
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-
-    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
 
-    // stuff AK weights into GG weights one by one.
-    // w->token_embedding_table -> model->tok_embeddings
-    // float*                   -> struct ggml_tensor
-    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-
-    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    //print_row(model->norm, 0);
-
-    // for rms-att-weight
-    int row_length = model->hparams.n_embd;
-    const auto & hparams = model->hparams;
-    //int n_ff = model->hparams.n_embd;
-    int n_ff = get_n_ff(&hparams);
-
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-        auto & layer = model->layers[i];
-        // 1d
-        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-
-        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-
-        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output); // ?
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//
+//    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//
+//    // stuff AK weights into GG weights one by one.
+//    // w->token_embedding_table -> model->tok_embeddings
+//    // float*                   -> struct ggml_tensor
+//    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+//    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+//
+//    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+//    //print_row(model->norm, 0);
+//
+//    // for rms-att-weight
+//    int row_length = model->hparams.n_embd;
+//    const auto & hparams = model->hparams;
+//    //int n_ff = model->hparams.n_embd;
+//    int n_ff = get_n_ff(&hparams);
+//
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+//        auto & layer = model->layers[i];
+//        // 1d
+//        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+//
+//        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+//        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+//
+//        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+//        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output); // ?
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }
 
 struct train_params get_default_train_params() {

+ 3 - 3
examples/embd-input/embd-input-lib.cpp

@@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
         // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl()];
+        // float nl_logit = logits[llama_token_nl(ctx)];
         // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
         // llama_sample_repetition_penalty(ctx, &candidates_p,
         //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
         // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
         // last_n_repeat, alpha_frequency, alpha_presence);
         // if (!penalize_nl) {
-        //     logits[llama_token_nl()] = nl_logit;
+        //     logits[llama_token_nl(ctx)] = nl_logit;
         // }
 
         if (temp <= 0) {
@@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
     llama_context * ctx = mymodel->ctx;
     int id = sampling_id(mymodel);
     static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
         ret = "</s>";
     } else {
         ret = llama_token_to_str(ctx, id);

+ 1 - 1
examples/embedding/embedding.cpp

@@ -67,7 +67,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
         fprintf(stderr, "\n");
     }

+ 246 - 0
examples/gguf/gguf.cpp

@@ -0,0 +1,246 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+bool gguf_ex_write(const std::string & fname) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
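+    // arrays: typed arrays pass a pointer to the element data plus an element count,
+    // string arrays pass an array of C strings plus a count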
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str",                    std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 128ull*1024ull*1024ull,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx_data = ggml_init(params);
+
+    const int n_tensors = 10;
+
+    // tensor infos
+    for (int i = 0; i < n_tensors; ++i) {
+        const std::string name = "tensor_" + to_string(i);
+
+        int64_t ne[GGML_MAX_DIMS] = { 1 };
+        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+        for (int j = 0; j < n_dims; ++j) {
+            ne[j] = rand() % 10 + 1;
+        }
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+        ggml_set_name(cur, name.c_str());
+
+        {
+            float * data = (float *) cur->data;
+            for (int j = 0; j < ggml_nelements(cur); ++j) {
+                data[j] = 100 + i;
+            }
+        }
+
+        gguf_add_tensor(ctx, cur);
+    }
+
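+    // write the file: meta data (KV pairs + tensor infos) followed by the tensor data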
+    gguf_write_to_file(ctx, fname.c_str(), false);
+
+    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+// just read tensor info
+bool gguf_ex_read_0(const std::string & fname) {
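+    // no ggml context is requested (.ctx == NULL), so only the header and meta data are parsed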
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ NULL,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // find kv string
+    {
+        const char * findkey = "some.parameter.string";
+
+        const int keyidx = gguf_find_key(ctx, findkey);
+        if (keyidx == -1) {
+            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+        } else {
+            const char * key_value = gguf_get_val_str(ctx, keyidx);
+            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    gguf_free(ctx);
+
+    return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+bool gguf_ex_read_1(const std::string & fname) {
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
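+    // passing &ctx_data requests a ggml context holding the tensor data in addition to the meta data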
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    // data
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+
+            const char * name = gguf_get_tensor_name(ctx, i);
+
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+            fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+            // print first 10 elements
+            const float * data = (const float *) cur->data;
+
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
+
+            // check data
+            {
+                const float * data = (const float *) cur->data;
+                for (int j = 0; j < ggml_nelements(cur); ++j) {
+                    if (data[j] != 100 + i) {
+                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
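+// usage:
+//   <program> data.gguf w   - write an example file
+//   <program> data.gguf r   - read it back and verify the tensor data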
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        return -1;
+    }
+
+    const std::string fname(argv[1]);
+    const std::string mode (argv[2]);
+
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+    if (mode == "w") {
+        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+    } else if (mode == "r") {
+        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+    }
+
+    return 0;
+}

+ 1133 - 0
examples/gptneox-wip/cmpnct_gpt2bpe.hpp

@@ -0,0 +1,1133 @@
+#ifndef CMPNCT_GPT2BPE
+#define CMPNCT_GPT2BPE
+
+#include <vector>
+#include <string>
+#include <algorithm>
+#include <utility>
+#include <iostream>
+#include <map>
+#include <unordered_map>
+#include <queue>
+#include <cstring>
+
+
+// Unicode GPT2 Byte Pair Encoding Tokenizer
+// Adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
+// Removed loading of merges from HF json and parts made for a specific vocab
+
+
+//-----------------
+// Unicode library (from cmpnct_unicode.cpp)
+//-----------------
+
+// Minimal library for high performance handling and categorization of UTF8 strings and characters
+// Using std::string
+
+enum CNCTCharType {
+    DIGIT,          // a numerical char in any language
+    LETTER,         // a letter in any language
+    WHITESPACE,     // any form of whitespace
+    ACCENT_MARK,    // letter modifiers like ´ in é
+    PUNCTUATION,    // punctuation including brackets
+    SYMBOL,         // math, currency, other symbols
+    CONTROL,        // control characters
+    MIXED,          // a mix of the above
+    UNIDENTIFIED    // something more exotic like emoji or separators
+};
+
+struct CNCTUnicode;
+
+struct CNCTString {
+    std::string str;
+    size_t utf8_chars;
+
+    CNCTCharType char_type=UNIDENTIFIED;
+    bool is_sequential=false;
+
+    size_t seq_offset_bytes=0;
+    size_t seq_offset_utf8_chars=0;
+
+    bool operator==(const std::string &other) const;
+    bool operator==(const char other) const;
+    bool operator==(const CNCTString &other) const;
+    CNCTString &operator+=(const std::string &other);
+    CNCTString &operator+=(const char other);
+    friend CNCTString operator+(CNCTString lhs, const std::string &rhs);
+    friend CNCTString operator+(CNCTString lhs, const char rhs);
+    CNCTString& operator+=(const CNCTString& other);
+    friend CNCTString operator+(CNCTString lhs, const CNCTString& rhs);
+};
+
+struct CNCTUnicode {
+    static bool check_code_range(int c, const std::vector<std::pair<int, int>>& ranges);
+    static CNCTCharType get_code_type(int c);
+    static CNCTCharType get_code_type(const std::string &utf8_char);
+    static int utf8_len(const char c);
+    static int strlen_utf8(std::string src);
+    static std::vector<std::string> split_utf8(const std::string &src);
+    static std::vector<CNCTString> split_utf8_enhanced(const std::string &src);
+    static CNCTCharType string_identify(const std::string& str);
+    static bool string_test(const std::string& str, CNCTCharType chartype);
+};
+
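+// {first, last} codepoint ranges (sorted by codepoint) for each character class,
+// binary-searched by CNCTUnicode::check_code_range()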
+static const std::vector<std::pair<int, int>> digit_ranges = {
+{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
+{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
+{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
+{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
+{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
+{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
+{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
+{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
+};
+
+static const std::vector<std::pair<int, int>> letter_ranges = {
+{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
+{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
+{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
+{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
+{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
+{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
+{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
+{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
+{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
+{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
+{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
+{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
+{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
+{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
+{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
+{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
+{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
+{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
+{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
+{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
+{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
+{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
+{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
+{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
+{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
+{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
+{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
+{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
+{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
+{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
+{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
+{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
+{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
+{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
+{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
+{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
+{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
+{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
+{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
+{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
+{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
+{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
+{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
+{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
+{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
+{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
+{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
+{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
+{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
+{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
+{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
+{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
+{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
+{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
+{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
+{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
+{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
+{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
+};
+
+static const std::vector<std::pair<int, int>> whitespace_ranges = {
+{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
+};
+
+static const std::vector<std::pair<int, int>> accent_mark_ranges = {
+{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
+{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
+{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
+{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
+{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
+{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
+{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
+{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
+{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
+{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
+{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
+{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
+{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
+{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
+{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
+{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
+{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
+{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
+{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
+{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
+{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
+{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
+{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
+{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
+{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
+{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
+{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
+};
+
+static const std::vector<std::pair<int, int>> punctuation_ranges = {
+{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
+{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
+{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
+{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
+{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
+{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
+{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
+{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
+{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
+{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
+{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
+{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
+{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
+{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
+{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
+{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
+{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
+};
+
+static const std::vector<std::pair<int, int>> symbol_ranges = {
+{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
+{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
+{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
+{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
+{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
+{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
+{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
+{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
+{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
+{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
+{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
+{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
+{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
+{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
+{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
+{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
+{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
+{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
+{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
+{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
+{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
+};
+
+static const std::vector<std::pair<int, int>> control_ranges = {
+{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
+{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
+{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
+{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
+{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
+{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
+{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
+{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
+{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
+{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
+{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
+{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
+{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
+{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
+{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
+{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
+{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
+{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
+{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
+{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
+{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
+{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
+{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
+{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
+{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
+{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
+{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
+{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
+{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
+{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
+{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
+{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
+{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
+{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
+{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
+{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
+{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
+{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
+{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
+{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
+{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
+{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
+{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
+{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
+{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
+{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
+{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
+{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
+{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
+{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
+{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
+{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
+{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
+{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
+{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
+{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
+{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
+{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
+{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
+{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
+{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
+{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
+{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
+{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
+};
+
+//String
+bool CNCTString::operator==(const std::string& other) const {
+    return str.compare(other) == 0;
+}
+bool CNCTString::operator==(const char other) const {
+    return str.compare(std::string(1, other)) == 0;
+}
+bool CNCTString::operator==(const CNCTString& other) const {
+    return str.compare(other.str) == 0;
+}
+// + operators
+CNCTString& CNCTString::operator+=(const std::string& other) {
+    str += other;
+    int new_len = CNCTUnicode::strlen_utf8(other);
+    utf8_chars += new_len;
+    char_type = CNCTUnicode::string_identify(str);
+    seq_offset_bytes += other.size();
+    seq_offset_utf8_chars += new_len;
+    return *this;
+}
+
+CNCTString& CNCTString::operator+=(const char other) {
+    *this += std::string(1, other);
+    return *this;
+}
+
+CNCTString& CNCTString::operator+=(const CNCTString& other) {
+    str += other.str;
+    utf8_chars += other.utf8_chars;
+    char_type = CNCTUnicode::string_identify(str);
+    seq_offset_bytes += other.str.size();
+    seq_offset_utf8_chars += other.utf8_chars;
+    return *this;
+}
+
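+// comparator that lets std::upper_bound compare a codepoint against {first, last} range pairs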
+struct CRCompare {
+    bool operator()(const std::pair<int, int>& p, int i) {
+        return p.second < i;
+    }
+    bool operator()(int i, const std::pair<int, int>& p) {
+        return i < p.first;
+    }
+};
+
+// binary search for code range
+bool CNCTUnicode::check_code_range(int c, const std::vector<std::pair<int, int>> &ranges) {
+    auto it = std::upper_bound(ranges.begin(), ranges.end(), c, CRCompare());
+    if (it != ranges.begin()) {
+        --it;
+    }
+    return c >= it->first && c <= it->second;
+}
+
+// these are binary searches, it takes only a few operations
+CNCTCharType CNCTUnicode::get_code_type(int c) {
+    if (check_code_range(c, letter_ranges)) {
+        return LETTER;
+    }
+    if (check_code_range(c, digit_ranges)) {
+        return DIGIT;
+    }
+    if (check_code_range(c, whitespace_ranges)) {
+        return WHITESPACE;
+    }
+    if (check_code_range(c, punctuation_ranges)) {
+        return PUNCTUATION;
+    }
+    if (check_code_range(c, symbol_ranges)) {
+        return SYMBOL;
+    }
+    if (check_code_range(c, accent_mark_ranges)) {
+        return ACCENT_MARK;
+    }
+    if (check_code_range(c, control_ranges)) {
+        return CONTROL;
+    }
+    return UNIDENTIFIED;
+}
+
+static int utf8_to_unicode(const std::string& utf8_char) {
+    int c = 0;
+    int len = (int)utf8_char.size();
+    if (len == 1) {
+        c = utf8_char[0];
+    } else if (len == 2) {
+        c = ((utf8_char[0] & 0x1F) << 6) | (utf8_char[1] & 0x3F);
+    } else if (len == 3) {
+        c = ((utf8_char[0] & 0x0F) << 12) | ((utf8_char[1] & 0x3F) << 6) | (utf8_char[2] & 0x3F);
+    } else if (len == 4) {
+        c = ((utf8_char[0] & 0x07) << 18) | ((utf8_char[1] & 0x3F) << 12) | ((utf8_char[2] & 0x3F) << 6) | (utf8_char[3] & 0x3F);
+    }
+    return c;
+}
+
+CNCTCharType CNCTUnicode::get_code_type(const std::string &utf8_char) {
+    return get_code_type(utf8_to_unicode(utf8_char));
+}
+
+int CNCTUnicode::utf8_len(const char c)
+{
+    if ((c & 0x80) == 0) {
+        return 1; // ASCII character
+    }
+    if ((c & 0xE0) == 0xC0) {
+        return 2; // 2-byte character
+    }
+    if ((c & 0xF0) == 0xE0) {
+        return 3; // 3-byte character
+    }
+    if ((c & 0xF0) == 0xF0) {
+        return 4; // 4-byte character
+    }
+    return 1;     // not valid utf8
+    // static const uint8_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    // return lookup[static_cast<uint8_t>(c) >> 4];
+}
+
+int CNCTUnicode::strlen_utf8(const std::string src) {
+    int len = 0;
+    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) {
+        int char_len = utf8_len(*it);
+        if (char_len > 1) {
+            it += char_len - 1;
+        }
+        len += 1;
+    }
+    return len;
+}
+
+// split a string into unicode strings
+std::vector<std::string> CNCTUnicode::split_utf8(const std::string &src) {
+    std::vector<std::string> result;
+    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) {
+        int char_len = utf8_len(*it);
+        std::string str(it, it + char_len);
+        result.push_back(str);
+        if (char_len > 1) {
+            it += char_len - 1;
+        }
+    }
+    return result;
+}
+
+// split a string into unicode strings (CNCTString) with sequence information
+std::vector<CNCTString> CNCTUnicode::split_utf8_enhanced(const std::string &src) {
+    std::vector<CNCTString> result;
+    int seq_offset_bytes=0;
+    int seq_offset_utf8_chars=0;
+    for (std::string::const_iterator it = src.begin(); it != src.end(); ++it) {
+        int char_len = utf8_len(*it);
+        std::string str(it, it + char_len);
+        CNCTString cnct_str;
+        cnct_str.seq_offset_bytes = seq_offset_bytes;
+        cnct_str.seq_offset_utf8_chars = seq_offset_utf8_chars;
+        cnct_str.str = str;
+        cnct_str.utf8_chars = 1;
+        cnct_str.char_type = get_code_type(str);
+        #if 0
+        switch (cnct_str.char_type)
+        {
+        case DIGIT:
+            printf("%s = DIGIT\n", str.c_str());
+            break;
+        case LETTER:
+            printf("%s = LETTER\n", str.c_str());
+            break;
+        case WHITESPACE:
+            printf("%s = WHITESPACE\n", str.c_str());
+            break;
+        case PUNCTUATION:
+            printf("%s = PUNCTUATION\n", str.c_str());
+            break;
+        case UNIDENTIFIED:
+            printf("%s = UNIDENTIFIED\n", str.c_str());
+            break;
+        case SYMBOL:
+            printf("%s = SYMBOL\n", str.c_str());
+            break;
+        case CONTROL:
+            printf("%s = CONTROL\n", str.c_str());
+            break;
+        }
+        #endif
+
+        result.push_back(cnct_str);
+        seq_offset_bytes += char_len;
+        seq_offset_utf8_chars += 1;
+        if (char_len > 1) {
+            it += char_len - 1;
+        }
+
+    }
+    return result;
+}
+
+// return the type of the string
+CNCTCharType CNCTUnicode::string_identify(const std::string &str) {
+    CNCTCharType result = UNIDENTIFIED;
+    std::string::const_iterator it = str.begin();
+    while (it != str.end()) {
+        int len = utf8_len(*it);
+        int c = 0;
+        for (int i = 0; i < len && it != str.end(); ++i, ++it) {
+            c = (c << 8) | static_cast<unsigned char>(*it);
+        }
+        switch (get_code_type(c)) {
+        case DIGIT:
+            if (result == UNIDENTIFIED) {
+                result = DIGIT;
+            } else if (result != DIGIT) {
+                return MIXED;
+            }
+            break;
+        case LETTER:
+            if (result == UNIDENTIFIED) {
+                result = LETTER;
+            } else if (result != LETTER) {
+                return MIXED;
+            }
+            break;
+        case WHITESPACE:
+            if (result == UNIDENTIFIED) {
+                result = WHITESPACE;
+            } else if (result != WHITESPACE) {
+                return MIXED;
+            }
+            break;
+        case PUNCTUATION:
+            if (result == UNIDENTIFIED) {
+                result = PUNCTUATION;
+            } else if (result != PUNCTUATION) {
+                return MIXED;
+            }
+            break;
+        default:
+            return MIXED;
+            break;
+        }
+    }
+    return result;
+}
+
+// verify the content of a string
+bool CNCTUnicode::string_test(const std::string &str, CNCTCharType chartype)
+{
+    std::string::const_iterator it = str.begin();
+    while (it != str.end()) {
+        int len = utf8_len(*it);
+        int c = 0;
+        for (int i = 0; i < len && it != str.end(); ++i, ++it) {
+            c = (c << 8) | static_cast<unsigned char>(*it);
+        }
+        if (get_code_type(c) != chartype) {
+            return false;
+        }
+    }
+    return true;
+}
+
+//-----------------
+// llama.cpp GPT2 vocab (from libfalcon.cpp)
+//-----------------
+
+std::string replaceAll(std::string str, const std::string& from, const std::string& to) {
+    size_t start_pos = 0;
+    while((start_pos = str.find(from, start_pos)) != std::string::npos) {
+        str.replace(start_pos, from.length(), to);
+        start_pos += to.length(); // Handles case where 'to' is a substring of 'from'
+    }
+    return str;
+}
+
+struct TrieNode {
+    std::map<char, TrieNode*> map;
+    int32_t Id = -1;
+};
+
+struct Trie {
+    TrieNode *root;
+
+    Trie() : root(new TrieNode()) {}
+
+    ~Trie() {
+        if (root) {
+            deleteTrie(root);
+        }
+    }
+
+    // Move constructor
+    Trie(Trie&& other) noexcept : root(other.root) {
+        other.root = nullptr;
+    }
+
+    // Move assignment operator
+    Trie& operator=(Trie&& other) noexcept {
+        if (this != &other) {
+            if(root)
+                deleteTrie(root);
+            root = other.root;
+            other.root = nullptr;
+        }
+        return *this;
+    }
+
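+    // insert a token: walk/create one trie node per byte and store the vocab id at the final node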
+    void insert(const std::string &token, int32_t Id) {
+        TrieNode* current = root;
+        for(auto ch : token) {
+            if(current->map.find(ch) == current->map.end()) {
+                current->map[ch] = new TrieNode();
+            }
+            current = current->map[ch];
+        }
+        current->Id = Id;
+    }
+
+    void reset() {
+        deleteTrie(root);
+        root = new TrieNode();
+    }
+
+private:
+    void deleteTrie(TrieNode* node) {
+        for(auto &it: node->map) {
+            deleteTrie(it.second);
+        }
+        delete node;
+    }
+
+};
+
+struct gpt2bpe_vocab {
+    using id = int32_t;
+    using token = std::string;
+
+    std::map<std::string, uint32_t> max_token_length; // max token length for each 2-byte prefix
+    std::map<std::pair<std::string,std::string>, int> bpe_ranks;
+    std::vector<std::pair<std::string, std::string>> bpe_merges;
+
+    id special_bos_id = -1;
+    id special_eos_id = -1;
+    id special_unk_id = -1;
+    id special_sep_id = -1;
+    id special_pad_id = -1;
+
+    id linefeed_id = -1;
+
+    std::unordered_map<token, id> token_to_id;
+    std::unordered_map<id, token> id_to_token;
+
+    Trie trie; // highspeed access to tokens by prefix tree
+
+    // populate trie from map
+    void populate_trie_from_map() {
+        trie.reset();
+        for (const auto& pair : token_to_id) {
+            trie.insert(pair.first, pair.second);
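+            // remember the longest token length for this 2-byte prefix (used by get_max_token_length)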
+            if (pair.first.size() >= 2) {
+                std::string prefix = pair.first.substr(0, 2);
+                max_token_length[prefix] = std::max(max_token_length[prefix], (uint32_t)pair.first.size());
+            }
+        }
+    }
+    // populate token ranks map
+    int populate_bpe_ranks(std::vector<std::pair<std::string, std::string>> bpe_merges_) {
+        for (int i = 0; i < (int)bpe_merges_.size(); i++) {
+            bpe_ranks.emplace(bpe_merges_[i], i);
+        }
+        bpe_merges = bpe_merges_;
+        return bpe_merges_.size();
+    }
+
+    // Trim whitespace characters from the beginning and end of the string
+    void trim(std::string& str) {
+        // Remove whitespace characters from the beginning of the string
+        str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](int ch) {
+            return !std::isspace(ch);
+        }));
+
+        // Remove whitespace characters from the end of the string
+        str.erase(std::find_if(str.rbegin(), str.rend(), [](int ch) {
+            return !std::isspace(ch);
+        }).base(), str.end());
+    }
+
+    // get max token length available for a prefix of 2 bytes (string at least 2 bytes long)
+    int get_max_token_length(const std::string& string) const {
+        if (string.size() < 2) {
+            return -1;
+        }
+        std::string prefix = string.substr(0, 2);
+        if (max_token_length.find(prefix) == max_token_length.end()) {
+            return 0;
+        }
+        return max_token_length.at(prefix);
+    }
+
+    // function to find if two tokens match in bpe_rank, return rank or -1
+    int find_bpe_rank(const std::string& token1, const std::string& token2) const {
+        std::string left_token = token1;
+        std::string right_token = token2;
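+        // map spaces and newlines to the GPT-2 byte-level placeholders (Ġ, Ċ) used in the merges data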
+        left_token = replaceAll(left_token, " ", "Ġ");
+        left_token = replaceAll(left_token, "\n", "Ċ");
+        right_token = replaceAll(right_token, " ", "Ġ");
+        right_token = replaceAll(right_token, "\n", "Ċ");
+
+        auto it = bpe_ranks.find(std::make_pair(left_token, right_token));
+        if (it == bpe_ranks.end()) {
+            return -1;
+        }
+        return it->second;
+    }
+
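+    // greedily walk the trie to find the longest vocab token matching a prefix of the snippet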
+    std::pair<gpt2bpe_vocab::id, std::string> find_longest_match(const std::string& snippet) const {
+        TrieNode* current = trie.root;
+        gpt2bpe_vocab::id last_matched_id = -1;
+        std::string last_matched_token = "";
+        std::string current_token = "";
+        for (auto ch : snippet) {
+            if (current->map.find(ch) == current->map.end()) {
+                break;
+            }
+            current = current->map[ch];
+            current_token += ch;
+            if (current->Id != -1) {
+                last_matched_id = current->Id;
+                last_matched_token = current_token;
+            }
+        }
+        return {last_matched_id, last_matched_token};
+    }
+
+};
+
+
+//
+// tokenizer - bpe type, gpt2 tokenization compatible
+//
+
+struct ggllm_bpe_symbol {
+    using index = int;
+    index prev;
+    index next;
+    const char * text;
+    size_t n;
+};
+
+static_assert(std::is_trivially_copyable<ggllm_bpe_symbol>::value, "ggllm_bpe_symbol is not trivially copyable");
+
+struct ggllm_bpe_bigram {
+    struct comparator {
+        bool operator()(ggllm_bpe_bigram & l, ggllm_bpe_bigram & r) {
+            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
+        }
+    };
+
+    using queue_storage = std::vector<ggllm_bpe_bigram>;
+    using queue = std::priority_queue<ggllm_bpe_bigram, queue_storage, comparator>;
+    ggllm_bpe_symbol::index left;
+    ggllm_bpe_symbol::index right;
+    std::string text;
+    int rank;
+    size_t size;
+};
+
+struct gpt2bpe_tokenizer {
+    gpt2bpe_tokenizer(const gpt2bpe_vocab & vocab, bool g2ws_): vocab_(vocab) { flag_g2ws = g2ws_; }
+
+    void tokenize(const std::string & text, std::vector<gpt2bpe_vocab::id> & output) {
+        int final_prev_index = -1;
+        // auto start = ggml_time_us();
+        auto word_collection = bpe_gpt2_preprocess(text);
+        // auto end = ggml_time_us();
+        // fprintf(stderr, "%s: preprocessing took %0.3f ms\n", __func__, (end - start) / 1000.0);
+
+        symbols_final.clear();
+
+        for (auto & word : word_collection) {
+            work_queue_ = ggllm_bpe_bigram::queue();
+            symbols_.clear();
+
+            int index = 0;
+            size_t offset = 0;
+
+            while (offset < word.size()) {
+                ggllm_bpe_symbol sym;
+                size_t char_len = std::min(word.size() - offset, (size_t) CNCTUnicode::utf8_len(word[offset]));
+                sym.text = word.c_str() + offset;
+                sym.n = char_len;
+                offset += sym.n;
+                sym.prev = index - 1;
+                sym.next = offset == word.size() ? -1 : index + 1;
+                index++;
+                symbols_.emplace_back(sym);
+            }
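+            // seed the work queue with every adjacent symbol pair that has a known merge rank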
+            for (size_t i = 1; i < symbols_.size(); ++i) {
+                add_new_bigram(i - 1, i);
+            }
+
+            // build token(s)
+            while (!work_queue_.empty()) {
+                auto bigram = work_queue_.top();
+                work_queue_.pop();
+
+                auto & left_symbol = symbols_[bigram.left];
+                auto & right_symbol = symbols_[bigram.right];
+
+                if (left_symbol.n == 0 || right_symbol.n == 0) {
+                    continue;
+                }
+                std::string left_token = std::string(left_symbol.text, left_symbol.n);
+                std::string right_token = std::string(right_symbol.text, right_symbol.n);
+                if (left_token + right_token != bigram.text) {
+                    continue;  // Skip this bigram if it's outdated
+                }
+
+                // merge the right sym into the left one
+                left_symbol.n += right_symbol.n;
+                right_symbol.n = 0;
+
+                // remove the right sym from the chain
+                left_symbol.next = right_symbol.next;
+                if (right_symbol.next >= 0) {
+                    symbols_[right_symbol.next].prev = bigram.left;
+                }
+
+                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
+                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
+            }
+
+            // add the finished tokens to the final list, keeping correct order for next and prev
+            for (auto & sym : symbols_) {
+                if (sym.n > 0) {
+                    sym.prev = final_prev_index;
+                    sym.next = -1;
+                    if (final_prev_index != -1) {
+                        symbols_final[final_prev_index].next = symbols_final.size();
+                    }
+                    symbols_final.emplace_back(sym);
+                    final_prev_index = symbols_final.size() - 1;
+                }
+            }
+        }
+
+        symbols_ = symbols_final;
+        if (symbols_.empty()) {
+            return;
+        }
+        for (int i = 0; i != -1; i = symbols_[i].next) {
+            auto & symbol = symbols_[i];
+            if (symbol.n == 0) {
+                continue;
+            }
+            std::string str = std::string(symbol.text, symbol.n);
+            std::string str_decoded = decode_token(str);
+            auto token = vocab_.token_to_id.find(str_decoded);
+
+            if (token == vocab_.token_to_id.end()) {
+                for (auto j = str_decoded.begin(); j != str_decoded.end(); ++j) {
+                    std::string byte_str(1, *j);
+                    auto token_multibyte = vocab_.token_to_id.find(byte_str);
+                    if (token_multibyte == vocab_.token_to_id.end()) {
+                        fprintf(stderr, "ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        continue; // do not dereference the end iterator
+                    }
+                    output.push_back((*token_multibyte).second);
+                }
+            } else {
+                output.push_back((*token).second);
+            }
+        }
+    }
+
+private:
+    void add_new_bigram(int left, int right) {
+        if (left == -1 || right == -1)  return;
+
+        std::string left_token = std::string(symbols_[left].text, symbols_[left].n);
+        std::string right_token = std::string(symbols_[right].text, symbols_[right].n);
+
+        int rank_found = vocab_.find_bpe_rank(left_token, right_token);
+
+        if (rank_found < 0) {
+            return;
+        }
+
+        ggllm_bpe_bigram bigram;
+        bigram.left = left;
+        bigram.right = right;
+        bigram.rank = rank_found;
+        bigram.size = left_token.size() + right_token.size();
+        bigram.text = left_token + right_token;
+        work_queue_.push(bigram);
+    }
+
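+    // GPT-2 byte-to-unicode table: maps each byte value to a printable UTF-8 string (the GPT-2 byte-level encoding)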
+    std::unordered_map<unsigned char, std::string> bytes_to_unicode() {
+        static std::unordered_map<unsigned char, std::string> hex_map = {
+            { 0x21, "\x21" }, { 0x22, "\x22" }, { 0x23, "\x23" }, { 0x24, "\x24" }, { 0x25, "\x25" }, { 0x26, "\x26" }, { 0x27, "\x27" }, { 0x28, "\x28" }, { 0x29, "\x29" }, { 0x2A, "\x2A" },
+            { 0x2B, "\x2B" }, { 0x2C, "\x2C" }, { 0x2D, "\x2D" }, { 0x2E, "\x2E" }, { 0x2F, "\x2F" }, { 0x30, "\x30" }, { 0x31, "\x31" }, { 0x32, "\x32" }, { 0x33, "\x33" }, { 0x34, "\x34" },
+            { 0x35, "\x35" }, { 0x36, "\x36" }, { 0x37, "\x37" }, { 0x38, "\x38" }, { 0x39, "\x39" }, { 0x3A, "\x3A" }, { 0x3B, "\x3B" }, { 0x3C, "\x3C" }, { 0x3D, "\x3D" }, { 0x3E, "\x3E" },
+            { 0x3F, "\x3F" }, { 0x40, "\x40" }, { 0x41, "\x41" }, { 0x42, "\x42" }, { 0x43, "\x43" }, { 0x44, "\x44" }, { 0x45, "\x45" }, { 0x46, "\x46" }, { 0x47, "\x47" }, { 0x48, "\x48" },
+            { 0x49, "\x49" }, { 0x4A, "\x4A" }, { 0x4B, "\x4B" }, { 0x4C, "\x4C" }, { 0x4D, "\x4D" }, { 0x4E, "\x4E" }, { 0x4F, "\x4F" }, { 0x50, "\x50" }, { 0x51, "\x51" }, { 0x52, "\x52" },
+            { 0x53, "\x53" }, { 0x54, "\x54" }, { 0x55, "\x55" }, { 0x56, "\x56" }, { 0x57, "\x57" }, { 0x58, "\x58" }, { 0x59, "\x59" }, { 0x5A, "\x5A" }, { 0x5B, "\x5B" }, { 0x5C, "\x5C" },
+            { 0x5D, "\x5D" }, { 0x5E, "\x5E" }, { 0x5F, "\x5F" }, { 0x60, "\x60" }, { 0x61, "\x61" }, { 0x62, "\x62" }, { 0x63, "\x63" }, { 0x64, "\x64" }, { 0x65, "\x65" }, { 0x66, "\x66" },
+            { 0x67, "\x67" }, { 0x68, "\x68" }, { 0x69, "\x69" }, { 0x6A, "\x6A" }, { 0x6B, "\x6B" }, { 0x6C, "\x6C" }, { 0x6D, "\x6D" }, { 0x6E, "\x6E" }, { 0x6F, "\x6F" }, { 0x70, "\x70" },
+            { 0x71, "\x71" }, { 0x72, "\x72" }, { 0x73, "\x73" }, { 0x74, "\x74" }, { 0x75, "\x75" }, { 0x76, "\x76" }, { 0x77, "\x77" }, { 0x78, "\x78" }, { 0x79, "\x79" }, { 0x7A, "\x7A" },
+            { 0x7B, "\x7B" }, { 0x7C, "\x7C" }, { 0x7D, "\x7D" }, { 0x7E, "\x7E" }, { 0xA1, "\xC2\xA1" }, { 0xA2, "\xC2\xA2" }, { 0xA3, "\xC2\xA3" }, { 0xA4, "\xC2\xA4" }, { 0xA5, "\xC2\xA5" },
+            { 0xA6, "\xC2\xA6" }, { 0xA7, "\xC2\xA7" }, { 0xA8, "\xC2\xA8" }, { 0xA9, "\xC2\xA9" }, { 0xAA, "\xC2\xAA" }, { 0xAB, "\xC2\xAB" }, { 0xAC, "\xC2\xAC" }, { 0xAE, "\xC2\xAE" },
+            { 0xAF, "\xC2\xAF" }, { 0xB0, "\xC2\xB0" }, { 0xB1, "\xC2\xB1" }, { 0xB2, "\xC2\xB2" }, { 0xB3, "\xC2\xB3" }, { 0xB4, "\xC2\xB4" }, { 0xB5, "\xC2\xB5" }, { 0xB6, "\xC2\xB6" },
+            { 0xB7, "\xC2\xB7" }, { 0xB8, "\xC2\xB8" }, { 0xB9, "\xC2\xB9" }, { 0xBA, "\xC2\xBA" }, { 0xBB, "\xC2\xBB" }, { 0xBC, "\xC2\xBC" }, { 0xBD, "\xC2\xBD" }, { 0xBE, "\xC2\xBE" },
+            { 0xBF, "\xC2\xBF" }, { 0xC0, "\xC3\x80" }, { 0xC1, "\xC3\x81" }, { 0xC2, "\xC3\x82" }, { 0xC3, "\xC3\x83" }, { 0xC4, "\xC3\x84" }, { 0xC5, "\xC3\x85" }, { 0xC6, "\xC3\x86" },
+            { 0xC7, "\xC3\x87" }, { 0xC8, "\xC3\x88" }, { 0xC9, "\xC3\x89" }, { 0xCA, "\xC3\x8A" }, { 0xCB, "\xC3\x8B" }, { 0xCC, "\xC3\x8C" }, { 0xCD, "\xC3\x8D" }, { 0xCE, "\xC3\x8E" },
+            { 0xCF, "\xC3\x8F" }, { 0xD0, "\xC3\x90" }, { 0xD1, "\xC3\x91" }, { 0xD2, "\xC3\x92" }, { 0xD3, "\xC3\x93" }, { 0xD4, "\xC3\x94" }, { 0xD5, "\xC3\x95" }, { 0xD6, "\xC3\x96" },
+            { 0xD7, "\xC3\x97" }, { 0xD8, "\xC3\x98" }, { 0xD9, "\xC3\x99" }, { 0xDA, "\xC3\x9A" }, { 0xDB, "\xC3\x9B" }, { 0xDC, "\xC3\x9C" }, { 0xDD, "\xC3\x9D" }, { 0xDE, "\xC3\x9E" },
+            { 0xDF, "\xC3\x9F" }, { 0xE0, "\xC3\xA0" }, { 0xE1, "\xC3\xA1" }, { 0xE2, "\xC3\xA2" }, { 0xE3, "\xC3\xA3" }, { 0xE4, "\xC3\xA4" }, { 0xE5, "\xC3\xA5" }, { 0xE6, "\xC3\xA6" },
+            { 0xE7, "\xC3\xA7" }, { 0xE8, "\xC3\xA8" }, { 0xE9, "\xC3\xA9" }, { 0xEA, "\xC3\xAA" }, { 0xEB, "\xC3\xAB" }, { 0xEC, "\xC3\xAC" }, { 0xED, "\xC3\xAD" }, { 0xEE, "\xC3\xAE" },
+            { 0xEF, "\xC3\xAF" }, { 0xF0, "\xC3\xB0" }, { 0xF1, "\xC3\xB1" }, { 0xF2, "\xC3\xB2" }, { 0xF3, "\xC3\xB3" }, { 0xF4, "\xC3\xB4" }, { 0xF5, "\xC3\xB5" }, { 0xF6, "\xC3\xB6" },
+            { 0xF7, "\xC3\xB7" }, { 0xF8, "\xC3\xB8" }, { 0xF9, "\xC3\xB9" }, { 0xFA, "\xC3\xBA" }, { 0xFB, "\xC3\xBB" }, { 0xFC, "\xC3\xBC" }, { 0xFD, "\xC3\xBD" }, { 0xFE, "\xC3\xBE" },
+            { 0xFF, "\xC3\xBF" }, { 0x00, "\xC4\x80" }, { 0x01, "\xC4\x81" }, { 0x02, "\xC4\x82" }, { 0x03, "\xC4\x83" }, { 0x04, "\xC4\x84" }, { 0x05, "\xC4\x85" }, { 0x06, "\xC4\x86" },
+            { 0x07, "\xC4\x87" }, { 0x08, "\xC4\x88" }, { 0x09, "\xC4\x89" }, { 0x0A, "\xC4\x8A" }, { 0x0B, "\xC4\x8B" }, { 0x0C, "\xC4\x8C" }, { 0x0D, "\xC4\x8D" }, { 0x0E, "\xC4\x8E" },
+            { 0x0F, "\xC4\x8F" }, { 0x10, "\xC4\x90" }, { 0x11, "\xC4\x91" }, { 0x12, "\xC4\x92" }, { 0x13, "\xC4\x93" }, { 0x14, "\xC4\x94" }, { 0x15, "\xC4\x95" }, { 0x16, "\xC4\x96" },
+            { 0x17, "\xC4\x97" }, { 0x18, "\xC4\x98" }, { 0x19, "\xC4\x99" }, { 0x1A, "\xC4\x9A" }, { 0x1B, "\xC4\x9B" }, { 0x1C, "\xC4\x9C" }, { 0x1D, "\xC4\x9D" }, { 0x1E, "\xC4\x9E" },
+            { 0x1F, "\xC4\x9F" }, { 0x20, "\xC4\xA0" }, { 0x7F, "\xC4\xA1" }, { 0x80, "\xC4\xA2" }, { 0x81, "\xC4\xA3" }, { 0x82, "\xC4\xA4" }, { 0x83, "\xC4\xA5" }, { 0x84, "\xC4\xA6" },
+            { 0x85, "\xC4\xA7" }, { 0x86, "\xC4\xA8" }, { 0x87, "\xC4\xA9" }, { 0x88, "\xC4\xAA" }, { 0x89, "\xC4\xAB" }, { 0x8A, "\xC4\xAC" }, { 0x8B, "\xC4\xAD" }, { 0x8C, "\xC4\xAE" },
+            { 0x8D, "\xC4\xAF" }, { 0x8E, "\xC4\xB0" }, { 0x8F, "\xC4\xB1" }, { 0x90, "\xC4\xB2" }, { 0x91, "\xC4\xB3" }, { 0x92, "\xC4\xB4" }, { 0x93, "\xC4\xB5" }, { 0x94, "\xC4\xB6" },
+            { 0x95, "\xC4\xB7" }, { 0x96, "\xC4\xB8" }, { 0x97, "\xC4\xB9" }, { 0x98, "\xC4\xBA" }, { 0x99, "\xC4\xBB" }, { 0x9A, "\xC4\xBC" }, { 0x9B, "\xC4\xBD" }, { 0x9C, "\xC4\xBE" },
+            { 0x9D, "\xC4\xBF" }, { 0x9E, "\xC5\x80" }, { 0x9F, "\xC5\x81" }, { 0xA0, "\xC5\x82" }, { 0xAD, "\xC5\x83" }
+        };
+        return hex_map;
+    }
+
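+    // inverse of bytes_to_unicode(), used by decode_token() to recover the raw bytes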
+    std::unordered_map<std::string, unsigned char> unicode_to_bytes() {
+        static std::unordered_map<std::string, unsigned char> hex_map = {
+            { "\x21", 0x21 }, { "\x22", 0x22 }, { "\x23", 0x23 }, { "\x24", 0x24 }, { "\x25", 0x25 }, { "\x26", 0x26 }, { "\x27", 0x27 }, { "\x28", 0x28 }, { "\x29", 0x29 }, { "\x2A", 0x2A },
+            { "\x2B", 0x2B }, { "\x2C", 0x2C }, { "\x2D", 0x2D }, { "\x2E", 0x2E }, { "\x2F", 0x2F }, { "\x30", 0x30 }, { "\x31", 0x31 }, { "\x32", 0x32 }, { "\x33", 0x33 }, { "\x34", 0x34 },
+            { "\x35", 0x35 }, { "\x36", 0x36 }, { "\x37", 0x37 }, { "\x38", 0x38 }, { "\x39", 0x39 }, { "\x3A", 0x3A }, { "\x3B", 0x3B }, { "\x3C", 0x3C }, { "\x3D", 0x3D }, { "\x3E", 0x3E },
+            { "\x3F", 0x3F }, { "\x40", 0x40 }, { "\x41", 0x41 }, { "\x42", 0x42 }, { "\x43", 0x43 }, { "\x44", 0x44 }, { "\x45", 0x45 }, { "\x46", 0x46 }, { "\x47", 0x47 }, { "\x48", 0x48 },
+            { "\x49", 0x49 }, { "\x4A", 0x4A }, { "\x4B", 0x4B }, { "\x4C", 0x4C }, { "\x4D", 0x4D }, { "\x4E", 0x4E }, { "\x4F", 0x4F }, { "\x50", 0x50 }, { "\x51", 0x51 }, { "\x52", 0x52 },
+            { "\x53", 0x53 }, { "\x54", 0x54 }, { "\x55", 0x55 }, { "\x56", 0x56 }, { "\x57", 0x57 }, { "\x58", 0x58 }, { "\x59", 0x59 }, { "\x5A", 0x5A }, { "\x5B", 0x5B }, { "\x5C", 0x5C },
+            { "\x5D", 0x5D }, { "\x5E", 0x5E }, { "\x5F", 0x5F }, { "\x60", 0x60 }, { "\x61", 0x61 }, { "\x62", 0x62 }, { "\x63", 0x63 }, { "\x64", 0x64 }, { "\x65", 0x65 }, { "\x66", 0x66 },
+            { "\x67", 0x67 }, { "\x68", 0x68 }, { "\x69", 0x69 }, { "\x6A", 0x6A }, { "\x6B", 0x6B }, { "\x6C", 0x6C }, { "\x6D", 0x6D }, { "\x6E", 0x6E }, { "\x6F", 0x6F }, { "\x70", 0x70 },
+            { "\x71", 0x71 }, { "\x72", 0x72 }, { "\x73", 0x73 }, { "\x74", 0x74 }, { "\x75", 0x75 }, { "\x76", 0x76 }, { "\x77", 0x77 }, { "\x78", 0x78 }, { "\x79", 0x79 }, { "\x7A", 0x7A },
+            { "\x7B", 0x7B }, { "\x7C", 0x7C }, { "\x7D", 0x7D }, { "\x7E", 0x7E }, { "\xC2\xA1", 0xA1 }, { "\xC2\xA2", 0xA2 }, { "\xC2\xA3", 0xA3 }, { "\xC2\xA4", 0xA4 }, { "\xC2\xA5", 0xA5 },
+            { "\xC2\xA6", 0xA6 }, { "\xC2\xA7", 0xA7 }, { "\xC2\xA8", 0xA8 }, { "\xC2\xA9", 0xA9 }, { "\xC2\xAA", 0xAA }, { "\xC2\xAB", 0xAB }, { "\xC2\xAC", 0xAC }, { "\xC2\xAE", 0xAE },
+            { "\xC2\xAF", 0xAF }, { "\xC2\xB0", 0xB0 }, { "\xC2\xB1", 0xB1 }, { "\xC2\xB2", 0xB2 }, { "\xC2\xB3", 0xB3 }, { "\xC2\xB4", 0xB4 }, { "\xC2\xB5", 0xB5 }, { "\xC2\xB6", 0xB6 },
+            { "\xC2\xB7", 0xB7 }, { "\xC2\xB8", 0xB8 }, { "\xC2\xB9", 0xB9 }, { "\xC2\xBA", 0xBA }, { "\xC2\xBB", 0xBB }, { "\xC2\xBC", 0xBC }, { "\xC2\xBD", 0xBD }, { "\xC2\xBE", 0xBE },
+            { "\xC2\xBF", 0xBF }, { "\xC3\x80", 0xC0 }, { "\xC3\x81", 0xC1 }, { "\xC3\x82", 0xC2 }, { "\xC3\x83", 0xC3 }, { "\xC3\x84", 0xC4 }, { "\xC3\x85", 0xC5 }, { "\xC3\x86", 0xC6 },
+            { "\xC3\x87", 0xC7 }, { "\xC3\x88", 0xC8 }, { "\xC3\x89", 0xC9 }, { "\xC3\x8A", 0xCA }, { "\xC3\x8B", 0xCB }, { "\xC3\x8C", 0xCC }, { "\xC3\x8D", 0xCD }, { "\xC3\x8E", 0xCE },
+            { "\xC3\x8F", 0xCF }, { "\xC3\x90", 0xD0 }, { "\xC3\x91", 0xD1 }, { "\xC3\x92", 0xD2 }, { "\xC3\x93", 0xD3 }, { "\xC3\x94", 0xD4 }, { "\xC3\x95", 0xD5 }, { "\xC3\x96", 0xD6 },
+            { "\xC3\x97", 0xD7 }, { "\xC3\x98", 0xD8 }, { "\xC3\x99", 0xD9 }, { "\xC3\x9A", 0xDA }, { "\xC3\x9B", 0xDB }, { "\xC3\x9C", 0xDC }, { "\xC3\x9D", 0xDD }, { "\xC3\x9E", 0xDE },
+            { "\xC3\x9F", 0xDF }, { "\xC3\xA0", 0xE0 }, { "\xC3\xA1", 0xE1 }, { "\xC3\xA2", 0xE2 }, { "\xC3\xA3", 0xE3 }, { "\xC3\xA4", 0xE4 }, { "\xC3\xA5", 0xE5 }, { "\xC3\xA6", 0xE6 },
+            { "\xC3\xA7", 0xE7 }, { "\xC3\xA8", 0xE8 }, { "\xC3\xA9", 0xE9 }, { "\xC3\xAA", 0xEA }, { "\xC3\xAB", 0xEB }, { "\xC3\xAC", 0xEC }, { "\xC3\xAD", 0xED }, { "\xC3\xAE", 0xEE },
+            { "\xC3\xAF", 0xEF }, { "\xC3\xB0", 0xF0 }, { "\xC3\xB1", 0xF1 }, { "\xC3\xB2", 0xF2 }, { "\xC3\xB3", 0xF3 }, { "\xC3\xB4", 0xF4 }, { "\xC3\xB5", 0xF5 }, { "\xC3\xB6", 0xF6 },
+            { "\xC3\xB7", 0xF7 }, { "\xC3\xB8", 0xF8 }, { "\xC3\xB9", 0xF9 }, { "\xC3\xBA", 0xFA }, { "\xC3\xBB", 0xFB }, { "\xC3\xBC", 0xFC }, { "\xC3\xBD", 0xFD }, { "\xC3\xBE", 0xFE },
+            { "\xC3\xBF", 0xFF }, { "\xC4\x80", 0x00 }, { "\xC4\x81", 0x01 }, { "\xC4\x82", 0x02 }, { "\xC4\x83", 0x03 }, { "\xC4\x84", 0x04 }, { "\xC4\x85", 0x05 }, { "\xC4\x86", 0x06 },
+            { "\xC4\x87", 0x07 }, { "\xC4\x88", 0x08 }, { "\xC4\x89", 0x09 }, { "\xC4\x8A", 0x0A }, { "\xC4\x8B", 0x0B }, { "\xC4\x8C", 0x0C }, { "\xC4\x8D", 0x0D }, { "\xC4\x8E", 0x0E },
+            { "\xC4\x8F", 0x0F }, { "\xC4\x90", 0x10 }, { "\xC4\x91", 0x11 }, { "\xC4\x92", 0x12 }, { "\xC4\x93", 0x13 }, { "\xC4\x94", 0x14 }, { "\xC4\x95", 0x15 }, { "\xC4\x96", 0x16 },
+            { "\xC4\x97", 0x17 }, { "\xC4\x98", 0x18 }, { "\xC4\x99", 0x19 }, { "\xC4\x9A", 0x1A }, { "\xC4\x9B", 0x1B }, { "\xC4\x9C", 0x1C }, { "\xC4\x9D", 0x1D }, { "\xC4\x9E", 0x1E },
+            { "\xC4\x9F", 0x1F }, { "\xC4\xA0", 0x20 }, { "\xC4\xA1", 0x7F }, { "\xC4\xA2", 0x80 }, { "\xC4\xA3", 0x81 }, { "\xC4\xA4", 0x82 }, { "\xC4\xA5", 0x83 }, { "\xC4\xA6", 0x84 },
+            { "\xC4\xA7", 0x85 }, { "\xC4\xA8", 0x86 }, { "\xC4\xA9", 0x87 }, { "\xC4\xAA", 0x88 }, { "\xC4\xAB", 0x89 }, { "\xC4\xAC", 0x8A }, { "\xC4\xAD", 0x8B }, { "\xC4\xAE", 0x8C },
+            { "\xC4\xAF", 0x8D }, { "\xC4\xB0", 0x8E }, { "\xC4\xB1", 0x8F }, { "\xC4\xB2", 0x90 }, { "\xC4\xB3", 0x91 }, { "\xC4\xB4", 0x92 }, { "\xC4\xB5", 0x93 }, { "\xC4\xB6", 0x94 },
+            { "\xC4\xB7", 0x95 }, { "\xC4\xB8", 0x96 }, { "\xC4\xB9", 0x97 }, { "\xC4\xBA", 0x98 }, { "\xC4\xBB", 0x99 }, { "\xC4\xBC", 0x9A }, { "\xC4\xBD", 0x9B }, { "\xC4\xBE", 0x9C },
+            { "\xC4\xBF", 0x9D }, { "\xC5\x80", 0x9E }, { "\xC5\x81", 0x9F }, { "\xC5\x82", 0xA0 }, { "\xC5\x83", 0xAD }
+        };
+        return hex_map;
+    }
+
+    // len must be available
+    bool inline str_is_equal(const char* str1, const char* str2, size_t len) {
+        for (size_t i = 0; i < len; ++i) {
+            if (str1[i] != str2[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string& text) {
+        static std::unordered_map< unsigned char, std::string> byte_encoder = bytes_to_unicode();
+        std::vector<std::string> bpe_words;
+        std::vector<std::string> bpe_encoded_words;
+
+        std::string token="";
+        const char *raw_text_p = text.c_str();
+        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
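+        // the flags below emulate that regex by collecting runs of letters, digits, other characters, or whitespace (with lookahead)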
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting=false;
+
+        std::vector<CNCTString> text_utf;
+        text_utf.reserve(text.size());
+        bpe_words.reserve(text.size());
+        bpe_encoded_words.reserve(text.size());
+
+        text_utf = CNCTUnicode::split_utf8_enhanced(text);
+
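+        // walk the UTF-8 characters and decide at each position whether to close the current token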
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const CNCTString &utf_char = text_utf[i];
+            bool split_condition = false;
+            const char *text_pos = raw_text_p + utf_char.seq_offset_bytes;
+            int bytes_remain = strlen(text_pos);
+            // forward backward lookups
+            const CNCTString &utf_char_next = (i+1 < (int)text_utf.size()) ? text_utf[i+1] : CNCTString();
+            const CNCTString &utf_char_next_next = (i+2 < (int)text_utf.size()) ? text_utf[i+2] : CNCTString();
+            // const CNCTString &utf_char_prev = (i > 0) ? text_utf[i-1] : CNCTString();
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == '\'' && (utf_char_next == 's' || utf_char_next == 't' || utf_char_next == 'm' || utf_char_next == 'd')) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char.str + utf_char_next.str;
+                    bpe_words.emplace_back(token);
+                    token="";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == '\'' && (
+                                          (utf_char_next == 'r' && utf_char_next_next == 'e') ||
+                                          (utf_char_next == 'v' && utf_char_next_next == 'e') ||
+                                          (utf_char_next == 'l' && utf_char_next_next == 'l'))
+                                        ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char.str + utf_char_next.str + utf_char_next_next.str;
+                    bpe_words.emplace_back(token); // the contraction
+                    token="";
+                    i+=2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (utf_char.char_type == CNCTCharType::LETTER || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                } else if (utf_char.char_type == CNCTCharType::DIGIT || (!token.size() && utf_char==" " && utf_char_next.char_type == CNCTCharType::DIGIT)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                } else if (
+                           ((utf_char.char_type != CNCTCharType::LETTER && utf_char.char_type != CNCTCharType::DIGIT) && (utf_char.char_type != CNCTCharType::WHITESPACE)) ||
+                           (!token.size() && utf_char==" " && utf_char_next.char_type != CNCTCharType::LETTER && utf_char_next.char_type != CNCTCharType::DIGIT && utf_char_next.char_type != CNCTCharType::WHITESPACE)
+                          ) {
+                    collecting_special = true;
+                    collecting = true;
+                } else if (utf_char.char_type == CNCTCharType::WHITESPACE && utf_char_next.char_type == CNCTCharType::WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                } else if (utf_char.char_type == CNCTCharType::WHITESPACE) {
+                    split_condition = true;
+                }
+            } else if (!split_condition && collecting) {
+                if (collecting_letter && utf_char.char_type != CNCTCharType::LETTER) {
+                    split_condition = true;
+                } else if (collecting_numeric && utf_char.char_type != CNCTCharType::DIGIT) {
+                    split_condition = true;
+                } else if (collecting_special && (utf_char.char_type == CNCTCharType::LETTER || utf_char.char_type == CNCTCharType::DIGIT || utf_char.char_type == CNCTCharType::WHITESPACE)) {
+                    split_condition = true;
+                } else if (collecting_whitespace_lookahead && utf_char_next.char_type != CNCTCharType::WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+
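+            // last character of the input: append it and force a split so the final token gets flushed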
+            if(utf_char_next.str.size() == 0) {
+                split_condition = true; // final
+                token += utf_char.str;
+            }
+
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_words.emplace_back(token);
+                }
+                token = utf_char.str;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            } else {
+                token += utf_char.str;
+            }
+        }
+
+        for (std::string& word : bpe_words) {
+            std::string encoded_token="";
+            for (char& c : word) {
+                encoded_token += byte_encoder[c];
+            }
+            bpe_encoded_words.emplace_back(encoded_token);
+        }
+
+        return bpe_encoded_words;
+    }
+
+    // decoder (for one token)
+    std::string decode_token(const std::string& token) {
+        static std::unordered_map< std::string, unsigned char> byte_decoder = unicode_to_bytes();
+        std::string decoded_token="";
+        auto unicode_sequences = CNCTUnicode::split_utf8(token);
+        for (auto& unicode_sequence : unicode_sequences) {
+            decoded_token += byte_decoder[unicode_sequence];
+        }
+
+        return decoded_token;
+    }
+
+    const gpt2bpe_vocab & vocab_;
+    std::vector<ggllm_bpe_symbol> symbols_;
+    std::vector<ggllm_bpe_symbol> symbols_final;
+    ggllm_bpe_bigram::queue work_queue_;
+    bool flag_g2ws=false;
+};
+
+static std::vector<gpt2bpe_vocab::id> gpt2bpe_tokenize(const gpt2bpe_vocab & vocab, const std::string & text, bool bos, bool g2ws ) {
+    gpt2bpe_tokenizer tokenizer(vocab, g2ws);
+    std::vector<gpt2bpe_vocab::id> output;
+
+    if (text.empty()) {
+        return output;
+    }
+
+    if (bos && vocab.special_bos_id != -1) {
+        output.push_back(vocab.special_bos_id);
+    }
+
+    tokenizer.tokenize(text, output);
+    return output;
+}
+
+#endif // CMPNCT_GPT2BPE

+ 1111 - 0
examples/gptneox-wip/falcon-main.cpp

@@ -0,0 +1,1111 @@
+#include "ggml.h"
+#include "cmpnct_gpt2bpe.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <cinttypes>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <thread>
+#include <random>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// default hparams
+struct falcon_hparams {
+    size_t n_merges = 0;
+    size_t n_vocab  = 0;
+    uint32_t n_ctx    = 0;
+    uint32_t n_embd   = 0;
+    uint32_t n_head   = 0;
+    uint32_t n_head_kv = 1; // Needs to be 1 for 7B model
+    uint32_t n_ff = 0;
+    uint32_t n_block  = 0;
+    float norm_eps = 1e-5;
+};
+struct falcon_block {
+    // normalization
+    struct ggml_tensor* input_layernorm;
+    struct ggml_tensor* input_layernorm_b;
+    struct ggml_tensor* attention_norm;    // Falcon-40B only
+    struct ggml_tensor* attention_norm_b;  // Falcon-40B only
+
+    // attention
+    struct ggml_tensor* query_key_value;
+    struct ggml_tensor* wo;
+
+    // ff
+    struct ggml_tensor* ffn_up;
+    struct ggml_tensor* ffn_down;
+};
+
+struct falcon_model {
+    falcon_hparams hparams;
+
+    struct ggml_tensor* tok_embeddings;
+    struct ggml_tensor* output_norm;
+    struct ggml_tensor* output_norm_b;
+    struct ggml_tensor* lm_head;
+
+    std::vector<falcon_block> blocks;
+
+    // key + value memory
+    struct ggml_tensor* memory_k;
+    struct ggml_tensor* memory_v;
+
+    struct gguf_context * ggufctx;
+    struct ggml_context * ctx;
+    struct ggml_context * kvctx;
+
+    std::map<std::string, struct ggml_tensor*> tensors;
+};
+
+struct gpt_params {
+    int32_t seed      = -1;  // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    uint32_t n_predict = 200; // new tokens to predict
+    uint32_t n_batch   = 512;   // batch size for prompt processing
+
+    // sampling parameters
+    int32_t top_k          = 40;
+    float top_p            = 1.0f;
+    float temp             = 0.8f;
+    int32_t repeat_last_n  = 64;
+    float repeat_penalty   = 1.02f;
+
+    std::string model      = ""; // model path
+    std::string prompt     = "";
+
+    std::string token_test = "";
+    bool    interactive      = false;
+    int32_t interactive_port = -1;
+    int32_t n_gpu_layers     = 0;
+};
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
+    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
+    fprintf(stderr, "                        test tokenization\n");
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N             top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+// Fetch the next command-line argument for the given flag; print usage and exit if it is missing
+std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
+    if (i + 1 < argc && argv[i + 1][0] != '-') {
+        return argv[++i];
+    } else {
+        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(0);
+    }
+}
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
+            params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-p" || arg == "--prompt") {
+            params.prompt = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-n" || arg == "--n_predict") {
+            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_k") {
+            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_p") {
+            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--temp") {
+            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-last-n") {
+            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-penalty") {
+            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-b" || arg == "--batch_size") {
+            params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "-ip" || arg == "--interactive-port") {
+            params.interactive = true;
+            params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            get_next_arg(i, argc, argv, arg, params);
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else if (arg == "-tt" || arg == "--token_test") {
+            params.token_test = get_next_arg(i, argc, argv, arg, params);
+        }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+gpt2bpe_vocab::id sample_top_k_top_p_repeat(
+        const gpt2bpe_vocab & vocab,
+        const float * logits,
+        const int32_t * last_n_tokens_data,
+        size_t last_n_tokens_data_size,
+        int    top_k,
+        double top_p,
+        double temp,
+        int repeat_last_n,
+        float repeat_penalty,
+        std::mt19937 & rng) {
+
+    int n_logits = vocab.id_to_token.size();
+
+    const auto * plogits = logits;
+
+    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
+
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        gpt2bpe_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+
+    std::vector<std::pair<double, gpt2bpe_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const float scale = 1.0f/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
+            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
+            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
+                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
+                if (plogits[i] < 0.0f) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
+                }
+            } else {
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
+            }
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt2bpe_vocab::id> & a, const std::pair<double, gpt2bpe_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
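+    // top-p (nucleus) filtering: keep the smallest set of top tokens whose cumulative probability reaches top_p, then renormalize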
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+//    printf("\n");
+//    for (int i = 0; i < (int) probs.size(); i++) {
+//    for (int i = 0; i < 10; i++) {
+//        printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+//    }
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+
+}
+
+struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){
+
+    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
+    if( cur == NULL ) {
+        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+    } else {
+//        fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+    }
+
+    return cur;
+}
+
+// load the model's weights from a file
+bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_vocab & vocab) {
+    printf("%s: loading model from '%s'..\n", __func__, fname.c_str());
+
+    model.ctx = NULL;
+
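+    // with no_alloc == false and &model.ctx passed, gguf_init_from_file also creates a ggml context holding the tensor data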
+    struct gguf_init_params ggufparams = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &model.ctx,
+    };
+
+    auto & ggufctx = model.ggufctx;
+
+    ggufctx  = gguf_init_from_file(fname.c_str(), ggufparams);
+
+    if (!ggufctx) {
+        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
+        return false;
+    }
+
+    fprintf(stdout, "%s: gguf version     = %d\n", __func__, gguf_get_version(ggufctx));
+    fprintf(stdout, "%s: gguf alignment   = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+
+    // print all kv
+    #if 0
+    {
+        const int n_kv = gguf_get_n_kv(ggufctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ggufctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+    #endif
+
+    // print some standard metadata
+    {
+        int keyidx;
+
+        keyidx = gguf_find_key(ggufctx, "general.name");
+        if (keyidx != -1) { fprintf(stdout, "%s: model name           = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.description");
+        if (keyidx != -1) { fprintf(stdout, "%s: model description    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.author");
+        if (keyidx != -1) { fprintf(stdout, "%s: model author         = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.license");
+        if (keyidx != -1) { fprintf(stdout, "%s: model license        = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.architecture");
+        if (keyidx != -1) { fprintf(stdout, "%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.file_type");
+        if (keyidx != -1) { fprintf(stdout, "%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
+        if (keyidx != -1) { fprintf(stdout, "%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
+        if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    }
+
+    // check required metadata
+    {
+        int keyidx;
+
+        // check model architecture kv
+        keyidx = gguf_find_key(ggufctx, "general.architecture");
+        if (keyidx != -1) {
+            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
+                fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+                return false;
+            }
+        } else {
+            fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+            return false;
+        }
+
+        // check model tensor data layout kv
+        keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
+        if (keyidx != -1) {
+            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
+                fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
+                return false;
+            }
+        } else {
+            fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
+            return false;
+        }
+
+    }
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        bool ok = true;
+        int keyidx;
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.context_length");
+                  if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.embedding_length");
+                  if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count");
+                  if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.feed_forward_length");
+                  if (keyidx != -1) { hparams.n_ff = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.block_count");
+                  if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "falcon.attention.layer_norm_epsilon");
+                  if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (!ok) {
+            fprintf(stderr, "%s: required hparam missing!\n", __func__);
+            return false;
+        }
+
+        keyidx = gguf_find_key(ggufctx, "falcon.attention.head_count_kv");
+        if (keyidx != -1) { hparams.n_head_kv = gguf_get_val_u32(ggufctx, keyidx); }
+
+
+        printf("%s: n_ctx      = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd     = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head     = %d\n", __func__, hparams.n_head);
+        printf("%s: n_head_kv  = %d\n", __func__, hparams.n_head_kv);
+        printf("%s: n_block    = %d\n", __func__, hparams.n_block);
+        printf("%s: norm_eps   = %g\n", __func__, hparams.norm_eps);
+
+    }
+
+    // load vocab
+    {
+        auto & hparams = model.hparams;
+
+        int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model");
+
+        if (keyidx != -1) {
+            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
+                fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+                return false;
+            }
+        } else {
+            fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+            return false;
+        }
+
+
+        int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
+
+        if (tokens_keyidx == -1) {
+            fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+            return false;
+        }
+
+        int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
+
+        if (merges_keyidx == -1) {
+            fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+            return false;
+        }
+
+        hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
+        hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
+
+        fprintf(stdout, "%s: gpt2 tokenizer vocab  = %zu\n", __func__, hparams.n_vocab);
+        fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+
+        for (size_t i = 0; i < hparams.n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
+
+//            printf("token %d = '%s'\n",i,word.c_str() );
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+
+            if( vocab.id_to_token[i] == "\n" ) {
+                vocab.linefeed_id = i;
+            }
+        }
+
+        std::vector<std::pair<std::string, std::string>> bpe_merges;
+
+        for (size_t i = 0; i < hparams.n_merges; i++) {
+
+            std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i);
+
+            // Split the merges
+            std::string first, second;
+            size_t pos = word.find(' ', 1); // Start the search from the second character
+            if (pos != std::string::npos) {
+                first = word.substr(0, pos);
+                second = word.substr(pos + 1);
+            }
+
+            bpe_merges.push_back(std::make_pair(first, second));
+        }
+
+        vocab.populate_bpe_ranks(bpe_merges);
+
+
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) {       vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) {       vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) {   vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) {   vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+
+        if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+        if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+        if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+        if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+        if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+        if( vocab.linefeed_id    != -1 ) { fprintf(stdout, "%s: LF token  = %d\n",      __func__, vocab.linefeed_id ); }
+
+    }
+
+
+    auto & ctx = model.ctx;
+    size_t ctx_size = ggml_get_mem_size(ctx);
+
+    printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+
+    // print tensor info
+    #if 0
+    {
+        const int n_tensors = gguf_get_n_tensors(ggufctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ggufctx, i);
+            const size_t offset = gguf_get_tensor_offset(ggufctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+    #endif
+
+    // prepare memory for the weights
+    {
+
+        auto & hparams = model.hparams;
+
+        const int n_block = hparams.n_block;
+
+        model.blocks.resize(n_block);
+
+        model.tok_embeddings = ggml_get_tensor(ctx, "token_embd.weight");
+
+        model.output_norm = ggml_get_tensor(ctx, "output_norm.weight");
+        model.output_norm_b = ggml_get_tensor(ctx, "output_norm.bias");
+        model.lm_head = ggml_get_tensor(ctx, "output.weight");
+
+        // map by name
+        model.tensors["token_embd.weight"] = model.tok_embeddings;
+        model.tensors["output_norm.weight"] = model.output_norm;
+        model.tensors["output_norm.bias"] = model.output_norm_b;
+        model.tensors["output.weight"] = model.lm_head;
+
+        for (int i = 0; i < n_block; ++i) {
+
+            auto& block = model.blocks[i];
+            std::string blocknamestart = "blk." + std::to_string(i) + ".";
+
+            block.input_layernorm   =  get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" );
+            block.input_layernorm_b =  get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" );
+
+            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+                block.attention_norm   =  get_tensor_ex(ctx, blocknamestart + "attn_norm_2.weight" );
+                block.attention_norm_b =  get_tensor_ex(ctx, blocknamestart + "attn_norm_2.bias" );
+            }
+
+            // query_key_value shape for config.multi_query == True:
+            block.query_key_value = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" );
+            block.wo = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" );
+
+            block.ffn_up = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" );
+            block.ffn_down = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" );
+
+            // map by name
+            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+                // Falcon-40B:
+                model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm;
+                model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b;
+                model.tensors[blocknamestart + "attn_norm_2.weight"] = block.attention_norm;
+                model.tensors[blocknamestart + "attn_norm_2.bias"] = block.attention_norm_b;
+            } else {
+                // Falcon-7B:
+                model.tensors[blocknamestart + "attn_norm.weight"] = block.input_layernorm;
+                model.tensors[blocknamestart + "attn_norm.bias"] = block.input_layernorm_b;
+            }
+
+            model.tensors[blocknamestart + "attn_qkv.weight"] = block.query_key_value;
+            model.tensors[blocknamestart + "attn_output.weight"] = block.wo;
+
+            model.tensors[blocknamestart + "ffn_up.weight"] = block.ffn_up;
+            model.tensors[blocknamestart + "ffn_down.weight"] = block.ffn_down;
+        }
+    }
+
+    // key + value memory
+    {
+        const auto & kvctx = model.kvctx;
+        const auto & hparams = model.hparams;
+
+        const int n_block = hparams.n_block;
+        const int n_ctx   = hparams.n_ctx;
+        const int n_embd = hparams.n_embd;
+
+        const int64_t n_mem      = n_block*n_ctx;
+        const int64_t n_elements = n_embd*n_mem;
+
+        // create the ggml context
+        {
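+            // each of memory_k and memory_v holds n_elements F16 values (2 bytes each),
+            // hence the n_elements*4 byte budget plus overhead for the two tensor headers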
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ size_t(n_elements*4+ggml_tensor_overhead()*2),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ false,
+            };
+
+            model.kvctx = ggml_init(params);
+            if (!model.kvctx) {
+                fprintf(stderr, "%s: kv ggml_init() failed\n", __func__);
+                return false;
+            }
+
+        }
+
+
+        model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements);
+        model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements);
+
+        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
+
+        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
+    }
+
+    return true;
+}
+
+
+// evaluate the transformer
+//
+//   - model:     the model
+//   - n_threads: number of threads to use
+//   - n_past:    the context size so far
+//   - embd_inp:  the embeddings of the tokens in the context
+//   - embd_w:    the predicted logits for the next token
+//
+bool falcon_eval(
+        const falcon_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt2bpe_vocab::id> & embd_inp,
+              std::vector<float>         & embd_w,
+              size_t                     & mem_per_token) {
+
+
+    const int N = embd_inp.size();
+
+    const auto & hparams = model.hparams;
+
+    const int n_embd  = hparams.n_embd;
+    const int n_block = hparams.n_block;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_head  = hparams.n_head;
+    const int n_head_kv = hparams.n_head_kv;
+    const int n_vocab = hparams.n_vocab;
+    const size_t head_dim = n_embd / n_head;
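+    // for reference: Falcon-7B uses n_embd = 4544, n_head = 71, n_head_kv = 1 and
+    // Falcon-40B uses n_embd = 8192, n_head = 128, n_head_kv = 8 - both give head_dim = 64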
+
+    static size_t buf_size = 256u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    // use 2 scratch buffers
+    // TODO: very hacky solution - reimplement in a more elegant way
+    static size_t scr0_size = 256u*1024*1024;
+    static void * scr0 = malloc(scr0_size);
+
+    static size_t scr1_size = 256u*1024*1024;
+    static void * scr1 = malloc(scr1_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph gf = {};
+//    gf.n_threads = n_threads;
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+    // wte
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+//    struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head);
+
+    ggml_type wtype = GGML_TYPE_F32;
+    const int sizeof_wtype = ggml_type_sizef(wtype);
+
+    for (int il = 0; il < n_block; ++il) {
+        struct ggml_tensor * cur;
+        struct ggml_tensor * layernorm_output;
+
+        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
+        // self-attention
+        {
+            layernorm_output = ggml_norm(ctx0, inpL);
+
+            layernorm_output = ggml_add(ctx0,
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model.blocks[il].input_layernorm, layernorm_output),
+                        layernorm_output),
+                    ggml_repeat(ctx0, model.blocks[il].input_layernorm_b, layernorm_output));
+
+            if ( hparams.n_head_kv == 8 ) { // Falcon-40B
+                cur = ggml_norm(ctx0, inpL);
+
+                cur = ggml_add(ctx0,
+                        ggml_mul(ctx0,
+                            ggml_repeat(ctx0, model.blocks[il].attention_norm, cur),
+                            cur),
+                        ggml_repeat(ctx0, model.blocks[il].attention_norm_b, cur));
+            }
+            else { // Falcon 7B
+                cur = layernorm_output;
+            }
+
+            // compute QKV
+
+            cur = ggml_mul_mat(ctx0, model.blocks[il].query_key_value, cur);
+
+            // Note that the strides for Kcur and Vcur are set up so that the resulting
+            // views are misaligned with the tensor's storage: applying the K/V byte offset
+            // shifts the view so that its nominal extent sticks out past the end of the
+            // memory allocated for the QKV tensor. This is fine because that out-of-range
+            // memory is never actually accessed, but it can require some trickery when
+            // trying to accurately dump these views for debugging.
+
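+            // for each token the fused QKV output is laid out as
+            //   [ n_head query heads | n_head_kv key heads | n_head_kv value heads ],
+            // each head being head_dim values wide - the three views below slice this
+            // buffer apart using byte offsets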
+            struct ggml_tensor * Qcur = ggml_view_3d(
+                ctx0, cur, head_dim, n_head, N,
+                head_dim * sizeof_wtype,
+                head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype,
+                0);
+
+            struct ggml_tensor * Kcur = ggml_view_3d(
+                ctx0, cur, head_dim, n_head_kv, N,
+                head_dim * sizeof_wtype,
+                head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype,
+                head_dim * n_head * sizeof_wtype);
+
+            struct ggml_tensor * Vcur = ggml_view_3d(
+                ctx0, cur, head_dim, n_head_kv, N,
+                head_dim * sizeof_wtype,
+                head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype,
+                head_dim * (n_head + n_head_kv) * sizeof_wtype);
+
+            // RoPE with mode = 2 (GPT-NeoX style)
+            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, 0);
+            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, 0);
+
+            // store key and value to memory
+            {
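+                // each block owns a slice of n_ctx entries of the cache starting at
+                // il*n_ctx; the N new entries are written at position n_past, with
+                // n_head_kv*head_dim values per entry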
+                struct ggml_tensor* k = ggml_view_1d(
+                    ctx0, model.memory_k, N * n_head_kv * head_dim,
+                    (ggml_element_size(model.memory_k) * n_head_kv * head_dim) *
+                        (il * n_ctx + n_past));
+                struct ggml_tensor* v = ggml_view_1d(
+                    ctx0, model.memory_v, N * n_head_kv * head_dim,
+                    (ggml_element_size(model.memory_v) * n_head_kv * head_dim) *
+                        (il * n_ctx + n_past));
+
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * K = ggml_permute(
+                ctx0,
+                ggml_reshape_3d(
+                    ctx0,
+                    ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim,
+                                 il * n_ctx *
+                                     ggml_element_size(model.memory_k) *
+                                     n_head_kv *
+                                     head_dim),
+                    head_dim, n_head_kv, n_past + N),
+                0, 2, 1, 3);
+
+            // K * Q
+
+//            K = ggml_cont(ctx0, ggml_repeat2(ctx0, K, repeat_dummy));
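+            // (the commented-out ggml_repeat2() above comes from Falcon ports that explicitly
+            //  repeated the n_head_kv heads up to n_head; here K is passed to ggml_mul_mat()
+            //  directly, which is assumed to broadcast over the head dimension)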
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale_inplace(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim)))
+                        );
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+
+            // V = Vmem.view(head_dim, n_head_kv, n_past + N).permute(0, 2, 1, 3),
+            // then transposed and made contiguous below
+            struct ggml_tensor* V = ggml_permute(
+                ctx0,
+                ggml_reshape_3d(
+                    ctx0,
+                    ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim,
+                                 il * n_ctx *
+                                     ggml_element_size(model.memory_v) *
+                                     n_head_kv *
+                                     head_dim),
+                    head_dim, n_head_kv, n_past + N),
+                0, 2, 1, 3);
+
+//            V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat2(ctx0, V, repeat_dummy)));
+            V = ggml_cont(ctx0, ggml_transpose(ctx0, V));
+
+            // KQV = transpose(V) * KQ_soft_max
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+
+            // projection
+            {
+                cur = ggml_mul_mat(ctx0,
+                        model.blocks[il].wo,
+                        cur);
+            }
+        }
+
+        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
+
+        struct ggml_tensor* inpFF = layernorm_output;
+        struct ggml_tensor* attn_out = ggml_cpy(
+            ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
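+        // Falcon uses parallel attention/MLP: the feed-forward network below runs on the
+        // layernorm output (inpFF) rather than on the attention output, and both results
+        // are added back to the layer input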
+
+        {
+            cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_up, inpFF);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.blocks[il].ffn_down, cur);
+        }
+
+        cur = ggml_add(ctx0, cur, attn_out);
+        cur = ggml_add(ctx0, cur, inpL);
+        // input for next layer
+        inpL = cur;
+    }
+
+    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, inpL);
+
+        // inpL = ln_f_g*inpL + ln_f_b
+        inpL = ggml_add(ctx0,
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.output_norm, inpL),
+                    inpL),
+                ggml_repeat(ctx0, model.output_norm_b, inpL));
+    }
+
+    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
+    // lm_head
+    {
+        inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+
+        //inpL = ggml_add(ctx0,
+        //        ggml_repeat(ctx0, model.lmh_b, inpL),
+        //        inpL);
+    }
+
+    // logits -> probs
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);
+
+    // run the computation
+    ggml_build_forward_expand(&gf, inpL);
+//    ggml_graph_compute       (ctx0, &gf);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+    //if (n_past%100 == 0) {
+    //    ggml_graph_print   (&gf);
+    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //}
+
+    // return result for just the last token
+    embd_w.resize(n_vocab);
+    memcpy(embd_w.data(), (float *)ggml_get_data(inpL) + (n_vocab * (N - 1)), sizeof(float) * n_vocab);
+
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+
+    ggml_free(ctx0);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    gpt_params params;
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    int64_t t_load_us = 0;
+
+    gpt2bpe_vocab vocab;
+    falcon_model model;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!falcon_model_load(params.model, model, vocab)) {
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        t_load_us = ggml_time_us() - t_start_us;
+
+    }
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+
+    if (params.top_k == 0) {
+        params.top_k = model.hparams.n_vocab;
+    }
+
+    printf("%s: seed           = %d\n",   __func__, params.seed);
+    printf("%s: temp           = %.3f\n", __func__, params.temp);
+    printf("%s: top_k          = %d\n",   __func__, params.top_k);
+    printf("%s: top_p          = %.3f\n", __func__, params.top_p);
+    printf("%s: repeat_last_n  = %d\n",   __func__, params.repeat_last_n);
+    printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty);
+
+    std::mt19937 rng(params.seed);
+
+    if (params.prompt.empty()) {
+        params.prompt = "Once upon";
+    }
+
+    std::vector<int32_t> last_n_tokens(model.hparams.n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
+    int n_past = 0;
+
+    int64_t t_sample_us  = 0;
+    int64_t t_predict_us = 0;
+
+    std::vector<float> logits;
+
+    // tokenize the prompt
+    std::vector<gpt2bpe_vocab::id> embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false);
+
+    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+
+    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+//    for (size_t i = 0; i < embd_inp.size(); i++) {
+//        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str());
+//    }
+
+    if( model.hparams.n_ctx < params.n_predict+embd_inp.size() ) {
+        params.n_predict = model.hparams.n_ctx-embd_inp.size();
+    }
+
+    printf("%s: n_predict = %d\n", __func__, params.n_predict);
+    printf("\n");
+
+    std::vector<gpt2bpe_vocab::id> embd;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+    falcon_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+        // predict
+        if (embd.size() > 0) {
+            const int64_t t_start_us = ggml_time_us();
+
+            if (!falcon_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+                printf("Failed to predict\n");
+                return 1;
+            }
+
+            t_predict_us += ggml_time_us() - t_start_us;
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if (i >= embd_inp.size()) {
+            // sample next token
+            const int   top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp  = params.temp;
+            const int repeat_last_n = params.repeat_last_n;
+            const float repeat_penalty = params.repeat_penalty;
+
+            const int n_vocab = model.hparams.n_vocab;
+
+            gpt2bpe_vocab::id id = 0;
+
+            {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+
+                t_sample_us += ggml_time_us() - t_start_sample_us;
+            }
+
+            // add it to the context
+            embd.push_back(id);
+        } else {
+            // if here, it means we are still processing the input prompt
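+            // feed it to the model in chunks of roughly n_batch tokens; i is advanced below
+            // so that the next iteration continues at the first token not yet queued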
+            for (size_t k = i; k < embd_inp.size(); k++) {
+                embd.push_back(embd_inp[k]);
+                if (embd.size() > params.n_batch) {
+                    break;
+                }
+            }
+            i += embd.size() - 1;
+        }
+
+        // display text
+        for (auto id : embd) {
+            printf("%s", vocab.id_to_token[id].c_str()  );
+        }
+        fflush(stdout);
+
+        // end of text token
+        if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) {
+            break;
+        }
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n\n");
+        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    ggml_free(model.ctx);
+
+    return 0;
+}

+ 1082 - 0
examples/gptneox-wip/gptneox-main.cpp

@@ -0,0 +1,1082 @@
+#include "ggml.h"
+#include "cmpnct_gpt2bpe.hpp"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <cinttypes>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <thread>
+#include <random>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// default hparams
+struct gpt_neox_hparams {
+    size_t n_merges = 0;
+    size_t n_vocab  = 0;
+    uint32_t n_ctx    = 0;
+    uint32_t n_embd   = 0;
+    uint32_t n_head   = 0;
+    uint32_t n_block  = 0;
+    uint32_t n_rot    = 0; // rotary_pct * (n_embd / n_head)
+    bool par_res = true;
+    float norm_eps = 1e-5;
+};
+
+struct gpt_neox_block {
+    // pre normalization
+    struct ggml_tensor * ln_1_g;
+    struct ggml_tensor * ln_1_b;
+
+    // attention
+    struct ggml_tensor * c_attn_attn_w;
+    struct ggml_tensor * c_attn_attn_b;
+
+    struct ggml_tensor * c_attn_proj_w;
+    struct ggml_tensor * c_attn_proj_b;
+
+    // post normalization
+    struct ggml_tensor * ln_2_g;
+    struct ggml_tensor * ln_2_b;
+
+    // ff
+    struct ggml_tensor * c_mlp_fc_w;
+    struct ggml_tensor * c_mlp_fc_b;
+
+    struct ggml_tensor * c_mlp_proj_w;
+    struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct gpt_neox_model {
+    gpt_neox_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g;
+    struct ggml_tensor * ln_f_b;
+
+    struct ggml_tensor * wte; // token embedding
+
+    struct ggml_tensor * lmh_g; // language model head
+
+    std::vector<gpt_neox_block> blocks;
+
+    // key + value memory
+    struct ggml_tensor * memory_k;
+    struct ggml_tensor * memory_v;
+
+    //
+    struct gguf_context * ggufctx;
+    struct ggml_context * ctx;
+    struct ggml_context * kvctx;
+
+    std::map<std::string, struct ggml_tensor *> tensors;
+};
+
+struct gpt_params {
+    int32_t seed      = -1;  // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    uint32_t n_predict = 200; // new tokens to predict
+    uint32_t n_batch   = 512;   // batch size for prompt processing
+
+    // sampling parameters
+    int32_t top_k          = 40;
+    float top_p            = 1.0f;
+    float temp             = 0.8f;
+    int32_t repeat_last_n  = 64;
+    float repeat_penalty   = 1.02f;
+
+    std::string model      = ""; // model path
+    std::string prompt     = "";
+
+    std::string token_test = "";
+    bool    interactive      = false;
+    int32_t interactive_port = -1;
+    int32_t n_gpu_layers     = 0;
+};
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
+    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
+    fprintf(stderr, "                        test tokenization\n");
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N             top-k sampling, 0 = n_vocab (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+// Fetch the value that follows a command-line flag; prints usage and exits if it is missing
+std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
+    if (i + 1 < argc && argv[i + 1][0] != '-') {
+        return argv[++i];
+    } else {
+        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(0);
+    }
+}
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
+            params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-p" || arg == "--prompt") {
+            params.prompt = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-n" || arg == "--n_predict") {
+            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_k") {
+            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_p") {
+            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--temp") {
+            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-last-n") {
+            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-penalty") {
+            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-b" || arg == "--batch_size") {
+            params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "-ip" || arg == "--interactive-port") {
+            params.interactive = true;
+            params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            get_next_arg(i, argc, argv, arg, params);
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else if (arg == "-tt" || arg == "--token_test") {
+            params.token_test = get_next_arg(i, argc, argv, arg, params);
+        }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+gpt2bpe_vocab::id sample_top_k_top_p_repeat(
+        const gpt2bpe_vocab & vocab,
+        const float * logits,
+        const int32_t * last_n_tokens_data,
+        size_t last_n_tokens_data_size,
+        int    top_k,
+        double top_p,
+        double temp,
+        int repeat_last_n,
+        float repeat_penalty,
+        std::mt19937 & rng) {
+
+    int n_logits = vocab.id_to_token.size();
+
+    const auto * plogits = logits;
+
+    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
+
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        gpt2bpe_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+
+    std::vector<std::pair<double, gpt2bpe_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const float scale = 1.0f/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
+            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
+            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
+                // if score < 0 then the repetition penalty has to be multiplied to reduce the previous token probability
+                if (plogits[i] < 0.0f) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
+                }
+            } else {
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
+            }
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt2bpe_vocab::id> & a, const std::pair<double, gpt2bpe_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+//    printf("\n");
+//    for (int i = 0; i < (int) probs.size(); i++) {
+//    for (int i = 0; i < 10; i++) {
+//        printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+//    }
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+
+}
+
+struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name){
+
+    struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
+    if( cur == NULL ) {
+        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+    } else {
+//        fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+    }
+
+    return cur;
+}
+
+// load the model's weights from a file
+bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2bpe_vocab & vocab) {
+    printf("%s: loading model from '%s'..\n", __func__, fname.c_str());
+
+    model.ctx = NULL;
+
+    struct gguf_init_params ggufparams = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &model.ctx,
+    };
+
+    auto & ggufctx = model.ggufctx;
+
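+    // with no_alloc == false and a ctx pointer supplied, gguf_init_from_file() also creates
+    // a ggml context that holds every tensor (including its data), so model.ctx can later be
+    // queried with ggml_get_tensor()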
+    ggufctx  = gguf_init_from_file(fname.c_str(), ggufparams);
+
+    if (!ggufctx) {
+        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
+        return false;
+    }
+
+    fprintf(stdout, "%s: gguf version     = %d\n", __func__, gguf_get_version(ggufctx));
+    fprintf(stdout, "%s: gguf alignment   = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+
+    // print all kv
+    #if 0
+    {
+        const int n_kv = gguf_get_n_kv(ggufctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ggufctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+    #endif
+
+    // print some standard metadata
+    {
+        int keyidx;
+
+        keyidx = gguf_find_key(ggufctx, "general.name");
+        if (keyidx != -1) { fprintf(stdout, "%s: model name           = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.description");
+        if (keyidx != -1) { fprintf(stdout, "%s: model description    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.author");
+        if (keyidx != -1) { fprintf(stdout, "%s: model author         = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.license");
+        if (keyidx != -1) { fprintf(stdout, "%s: model license        = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.architecture");
+        if (keyidx != -1) { fprintf(stdout, "%s: model architecture   = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.file_type");
+        if (keyidx != -1) { fprintf(stdout, "%s: model file type      = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
+        if (keyidx != -1) { fprintf(stdout, "%s: model data layout    = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
+        if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    }
+
+    // check required metadata
+    {
+        int keyidx;
+
+        // check model architecture kv
+        keyidx = gguf_find_key(ggufctx, "general.architecture");
+        if (keyidx != -1) {
+            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
+                fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+                return false;
+            }
+        } else {
+            fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+            return false;
+        }
+
+    }
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        bool ok = true;
+        int keyidx;
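+        // hyperparameters are stored as architecture-prefixed KV pairs ("gptneox.*");
+        // every key read below is required, so a single missing key aborts the load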
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.context_length");
+                  if (keyidx != -1) { hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.embedding_length");
+                  if (keyidx != -1) { hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.head_count");
+                  if (keyidx != -1) { hparams.n_head = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.block_count");
+                  if (keyidx != -1) { hparams.n_block = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.rope.dimension_count");
+                  if (keyidx != -1) { hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.use_parallel_residual");
+                  if (keyidx != -1) { hparams.par_res = gguf_get_val_bool(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (ok) { keyidx = gguf_find_key(ggufctx, "gptneox.attention.layer_norm_epsilon");
+                  if (keyidx != -1) { hparams.norm_eps= gguf_get_val_f32(ggufctx, keyidx); } else { ok = false; }  }
+
+        if (!ok) {
+            fprintf(stderr, "%s: required hparam missing!\n", __func__);
+            return false;
+        }
+
+        printf("%s: n_ctx    = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd   = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head   = %d\n", __func__, hparams.n_head);
+        printf("%s: n_block  = %d\n", __func__, hparams.n_block);
+        printf("%s: n_rot    = %d\n", __func__, hparams.n_rot);
+        printf("%s: par_res  = %d\n", __func__, hparams.par_res);
+        printf("%s: norm_eps = %g\n", __func__, hparams.norm_eps);
+
+    }
+
+    // load vocab
+    {
+        auto & hparams = model.hparams;
+
+        int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model");
+
+        if (keyidx != -1) {
+            if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
+                fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+                return false;
+            }
+        } else {
+            fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+            return false;
+        }
+
+
+        int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
+
+        if (tokens_keyidx == -1) {
+            fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+            return false;
+        }
+
+        int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
+
+        if (merges_keyidx == -1) {
+            fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+            return false;
+        }
+
+        hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
+        hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
+
+        fprintf(stdout, "%s: gpt2 tokenizer vocab  = %zu\n", __func__, hparams.n_vocab);
+        fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+
+        for (size_t i = 0; i < hparams.n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
+
+//            printf("token %d = '%s'\n",i,word.c_str() );
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+
+            if( vocab.id_to_token[i] == "\n" ) {
+                vocab.linefeed_id = i;
+            }
+        }
+
+        std::vector<std::pair<std::string, std::string>> bpe_merges;
+
+        for (size_t i = 0; i < hparams.n_merges; i++) {
+
+            std::string word = gguf_get_arr_str(ggufctx, merges_keyidx, i);
+
+            // Split the merges
+            std::string first, second;
+            size_t pos = word.find(' ', 1); // Start the search from the second character
+            if (pos != std::string::npos) {
+                first = word.substr(0, pos);
+                second = word.substr(pos + 1);
+            }
+
+            bpe_merges.push_back(std::make_pair(first, second));
+        }
+
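+        // the merges are listed in priority order; populate_bpe_ranks() builds the rank
+        // table that the gpt2 BPE tokenizer consults when merging byte pairs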
+        vocab.populate_bpe_ranks(bpe_merges);
+
+
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.bos_token_id"); if( keyidx != -1 ) {       vocab.special_bos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.eos_token_id"); if( keyidx != -1 ) {       vocab.special_eos_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.unknown_token_id"); if( keyidx != -1 ) {   vocab.special_unk_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+        keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) {   vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
+
+        if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+        if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+        if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+        if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+        if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+        if( vocab.linefeed_id    != -1 ) { fprintf(stdout, "%s: LF token  = %d\n",      __func__, vocab.linefeed_id ); }
+    }
+
+
+    auto & ctx = model.ctx;
+    size_t ctx_size = ggml_get_mem_size(ctx);
+
+    printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+
+    // print tensor info
+    #if 0
+    {
+        const int n_tensors = gguf_get_n_tensors(ggufctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ggufctx, i);
+            const size_t offset = gguf_get_tensor_offset(ggufctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+    #endif
+
+    // prepare memory for the weights
+    {
+        const int n_block = model.hparams.n_block;
+
+        model.blocks.resize(n_block);
+
+        model.wte    = ggml_get_tensor(ctx, "token_embd.weight");
+        model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight");
+        model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias");
+        model.lmh_g  = ggml_get_tensor(ctx, "output.weight");
+
+        // map by name
+        model.tensors["token_embd.weight"] = model.wte;
+        model.tensors["output_norm.weight"] = model.ln_f_g;
+        model.tensors["output_norm.bias"]   = model.ln_f_b;
+        model.tensors["output.weight"] = model.lmh_g;
+
+        for (int i = 0; i < n_block; ++i) {
+            auto & block = model.blocks[i];
+
+            std::string blocknamestart = "blk." + std::to_string(i) + ".";
+
+            block.ln_1_g          = get_tensor_ex(ctx, blocknamestart + "attn_norm.weight" );
+            block.ln_1_b          = get_tensor_ex(ctx, blocknamestart + "attn_norm.bias" );
+
+            block.c_attn_attn_w   = get_tensor_ex(ctx, blocknamestart + "attn_qkv.weight" );
+            block.c_attn_attn_b   = get_tensor_ex(ctx ,blocknamestart + "attn_qkv.bias" );
+
+            block.c_attn_proj_w   = get_tensor_ex(ctx, blocknamestart + "attn_output.weight" );
+            block.c_attn_proj_b   = get_tensor_ex(ctx, blocknamestart + "attn_output.bias" );
+
+            block.ln_2_g          = get_tensor_ex(ctx, blocknamestart + "ffn_norm.weight" );
+            block.ln_2_b          = get_tensor_ex(ctx, blocknamestart + "ffn_norm.bias");
+
+            block.c_mlp_fc_w      = get_tensor_ex(ctx, blocknamestart + "ffn_up.weight" );
+            block.c_mlp_fc_b      = get_tensor_ex(ctx, blocknamestart + "ffn_up.bias" );
+
+            block.c_mlp_proj_w    = get_tensor_ex(ctx, blocknamestart + "ffn_down.weight" );
+            block.c_mlp_proj_b    = get_tensor_ex(ctx, blocknamestart + "ffn_down.bias" );
+
+            // map by name
+            model.tensors[blocknamestart + "attn_norm.weight"] = block.ln_1_g;
+            model.tensors[blocknamestart + "attn_norm.bias"]   = block.ln_1_b;
+
+            model.tensors[blocknamestart + "attn_qkv.weight"] = block.c_attn_attn_w;
+            model.tensors[blocknamestart + "attn_qkv.bias"]   = block.c_attn_attn_b;
+
+            model.tensors[blocknamestart + "attn_output.weight"] = block.c_attn_proj_w;
+            model.tensors[blocknamestart + "attn_output.bias"]   = block.c_attn_proj_b;
+
+            model.tensors[blocknamestart + "ffn_norm.weight"] = block.ln_2_g;
+            model.tensors[blocknamestart + "ffn_norm.bias"]   = block.ln_2_b;
+
+            model.tensors[blocknamestart + "ffn_up.weight"] = block.c_mlp_fc_w;
+            model.tensors[blocknamestart + "ffn_up.bias"]   = block.c_mlp_fc_b;
+
+            model.tensors[blocknamestart + "ffn_down.weight"] = block.c_mlp_proj_w;
+            model.tensors[blocknamestart + "ffn_down.bias"]   = block.c_mlp_proj_b;
+        }
+    }
+
+    // key + value memory
+    {
+        const auto & kvctx = model.kvctx;
+        const auto & hparams = model.hparams;
+
+        const int n_embd  = hparams.n_embd;
+        const int n_block = hparams.n_block;
+        const int n_ctx   = hparams.n_ctx;
+
+        const int64_t n_mem      = n_block*n_ctx;
+        const int64_t n_elements = n_embd*n_mem;
+
+        // create the ggml context
+        {
+            struct ggml_init_params params = {
+                /*.mem_size   =*/ size_t(n_elements*4+ggml_tensor_overhead()*2),
+                /*.mem_buffer =*/ NULL,
+                /*.no_alloc   =*/ false,
+            };
+
+            model.kvctx = ggml_init(params);
+            if (!model.kvctx) {
+                fprintf(stderr, "%s: kv ggml_init() failed\n", __func__);
+                return false;
+            }
+
+        }
+
+
+        model.memory_k = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements);
+        model.memory_v = ggml_new_tensor_1d(kvctx, GGML_TYPE_F16, n_elements);
+
+        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
+
+        printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
+    }
+
+    return true;
+}
+
+
+// feed-forward network
+ggml_tensor * gpt_neox_ff(
+        const gpt_neox_block &block,
+        ggml_context * ctx0,
+        ggml_tensor * inp) {
+
+    ggml_tensor * cur = ggml_norm(ctx0, inp);
+
+    cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
+    cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
+    cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_fc_b, cur), cur);
+
+    // GELU activation
+    cur = ggml_gelu(ctx0, cur);
+
+    // projection
+    // cur = proj_w*cur + proj_b
+    cur = ggml_mul_mat(ctx0, block.c_mlp_proj_w, cur);
+
+    cur = ggml_add(ctx0, ggml_repeat(ctx0, block.c_mlp_proj_b, cur), cur);
+    return cur;
+}
+
+// evaluate the transformer
+//
+//   - model:     the model
+//   - n_threads: number of threads to use
+//   - n_past:    the context size so far
+//   - embd_inp:  the embeddings of the tokens in the context
+//   - embd_w:    the predicted logits for the next token
+//
+bool gpt_neox_eval(
+        const gpt_neox_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt2bpe_vocab::id> & embd_inp,
+              std::vector<float>         & embd_w,
+              size_t                     & mem_per_token) {
+    const int N = embd_inp.size();
+
+    const auto & hparams = model.hparams;
+
+    const int n_embd  = hparams.n_embd;
+    const int n_block = hparams.n_block;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_head  = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+    const int n_rot   = hparams.n_rot;
+
+    static size_t buf_size = 256u*1024*1024;
+    static void * buf = malloc(buf_size);
+
+    // use 2 scratch buffers
+    // TODO: very hacky solution - reimplement in a more elegant way
+    static size_t scr0_size = 256u*1024*1024;
+    static void * scr0 = malloc(scr0_size);
+
+    static size_t scr1_size = 256u*1024*1024;
+    static void * scr1 = malloc(scr1_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+
+        // reallocate
+        buf_size = buf_size_new;
+        buf = realloc(buf, buf_size);
+        if (buf == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_cgraph gf = {};
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+
+    // wte
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
+
+    for (int il = 0; il < n_block; ++il) {
+        struct ggml_tensor * cur;
+
+        ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
+        // self-attention
+        {
+            {
+                cur = ggml_norm(ctx0, inpL);
+
+                cur = ggml_add(ctx0,
+                        ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
+                        ggml_repeat(ctx0, model.blocks[il].ln_1_b, cur));
+            }
+
+            // compute QKV
+            {
+
+                cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_attn_w, cur);
+                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_attn_b, cur), cur);
+            }
+
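+            // GPT-NeoX packs QKV per head: each head owns a contiguous chunk of
+            // 3*(n_embd/n_head) values laid out as [q, k, v], so the three views below
+            // share the same strides and differ only in their byte offsets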
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
+
+            // using mode = 2 for GPT-NeoX mode
+            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2, 0);
+            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2, 0);
+
+            // store key and value to memory
+            {
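+                // V is stored transposed (one cache row per embedding dimension) so it can
+                // later be read back as a (n_past + N, head_size, n_head) view without
+                // another transpose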
+                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(model.memory_v),
+                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
+
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+
+            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
+            struct ggml_tensor * K =
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        0, 2, 1, 3);
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale_inplace(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+                        );
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+
+            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, model.memory_v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(model.memory_v),
+                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
+
+            // KQV = transpose(V) * KQ_soft_max
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0, KQV_merged, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+
+            // projection
+            {
+                cur = ggml_mul_mat(ctx0, model.blocks[il].c_attn_proj_w, cur);
+                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.blocks[il].c_attn_proj_b, cur), cur);
+            }
+        }
+
+        ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
+
+        if (hparams.par_res == 0) {
+            struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
+
+            cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
+
+            // input for next layer
+            inpL = ggml_add(ctx0, cur, inpFF);
+        } else {
+            struct ggml_tensor * inpFF = cur;
+
+            // this is independent of the self-attention result, so it could be done in parallel to the self-attention
+            // note here we pass inpL instead of cur
+            cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
+
+            // layer input + FF
+            cur  = ggml_add(ctx0, cur, inpFF);
+
+            // input for next layer
+            inpL = ggml_add(ctx0, cur, inpL);
+        }
+    }
+
+    ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, inpL);
+
+        // inpL = ln_f_g*inpL + ln_f_b
+        inpL = ggml_add(ctx0,
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.ln_f_g, inpL),
+                    inpL),
+                ggml_repeat(ctx0, model.ln_f_b, inpL));
+    }
+
+    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
+    // lm_head
+    {
+        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
+
+        //inpL = ggml_add(ctx0,
+        //        ggml_repeat(ctx0, model.lmh_b, inpL),
+        //        inpL);
+    }
+
+    // logits -> probs
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);
+
+    // run the computation
+    ggml_build_forward_expand(&gf, inpL);
+    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+
+    //if (n_past%100 == 0) {
+    //    ggml_graph_print   (&gf);
+    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+    //}
+
+    //embd_w.resize(n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+
+    // return result for just the last token
+    embd_w.resize(n_vocab);
+    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+
+    ggml_free(ctx0);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    gpt_params params;
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    int64_t t_load_us = 0;
+
+    gpt2bpe_vocab vocab;
+    gpt_neox_model model;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (!gpt_neox_model_load(params.model, model, vocab)) {
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+
+        t_load_us = ggml_time_us() - t_start_us;
+
+    }
+
+    if (params.seed < 0) {
+        params.seed = time(NULL);
+    }
+
+    if (params.top_k == 0) {
+        params.top_k = model.hparams.n_vocab;
+    }
+
+    printf("%s: seed           = %d\n",   __func__, params.seed);
+    printf("%s: temp           = %.3f\n", __func__, params.temp);
+    printf("%s: top_k          = %d\n",   __func__, params.top_k);
+    printf("%s: top_p          = %.3f\n", __func__, params.top_p);
+    printf("%s: repeat_last_n  = %d\n",   __func__, params.repeat_last_n);
+    printf("%s: repeat_penalty = %.3f\n", __func__, params.repeat_penalty);
+
+    std::mt19937 rng(params.seed);
+
+    if (params.prompt.empty()) {
+        params.prompt = "Once upon";
+    }
+
+    std::vector<int32_t> last_n_tokens(model.hparams.n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+
+    int n_past = 0;
+
+    int64_t t_sample_us  = 0;
+    int64_t t_predict_us = 0;
+
+    std::vector<float> logits;
+
+    // tokenize the prompt
+    std::vector<gpt2bpe_vocab::id> embd_inp = gpt2bpe_tokenize(vocab, params.prompt,false, false);
+
+    params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+
+    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+//    for (size_t i = 0; i < embd_inp.size(); i++) {
+//        printf("%s: token[%zu] = %6d, %s\n", __func__, i, embd_inp[i], vocab.id_to_token[embd_inp[i]].c_str());
+//    }
+
+    if (model.hparams.n_ctx < params.n_predict + (int) embd_inp.size()) {
+        params.n_predict = model.hparams.n_ctx - (int) embd_inp.size();
+    }
+
+    printf("%s: n_predict = %d\n", __func__, params.n_predict);
+    printf("\n");
+
+    std::vector<gpt2bpe_vocab::id> embd;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+    gpt_neox_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+    for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+        // predict
+        if (embd.size() > 0) {
+            const int64_t t_start_us = ggml_time_us();
+
+            if (!gpt_neox_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
+                printf("Failed to predict\n");
+                return 1;
+            }
+
+            t_predict_us += ggml_time_us() - t_start_us;
+        }
+
+        n_past += embd.size();
+        embd.clear();
+
+        if (i >= embd_inp.size()) {
+            // sample next token
+            const int   top_k = params.top_k;
+            const float top_p = params.top_p;
+            const float temp  = params.temp;
+            const int repeat_last_n = params.repeat_last_n;
+            const float repeat_penalty = params.repeat_penalty;
+
+            const int n_vocab = model.hparams.n_vocab;
+
+            gpt2bpe_vocab::id id = 0;
+
+            {
+                const int64_t t_start_sample_us = ggml_time_us();
+
+                id = sample_top_k_top_p_repeat(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_last_n, repeat_penalty, rng);
+
+                last_n_tokens.erase(last_n_tokens.begin());
+                last_n_tokens.push_back(id);
+
+                t_sample_us += ggml_time_us() - t_start_sample_us;
+            }
+
+            // add it to the context
+            embd.push_back(id);
+        } else {
+            // if here, it means we are still processing the input prompt
+            for (size_t k = i; k < embd_inp.size(); k++) {
+                embd.push_back(embd_inp[k]);
+                if (embd.size() > params.n_batch) {
+                    break;
+                }
+            }
+            i += embd.size() - 1;
+        }
+
+        // display text
+        for (auto id : embd) {
+            printf("%s", vocab.id_to_token[id].c_str());
+        }
+        fflush(stdout);
+
+        // end of text token
+        if (vocab.special_eos_id != -1 && embd.back() == vocab.special_eos_id) {
+            break;
+        }
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n\n");
+        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        printf("%s:     load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        printf("%s:   sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        printf("%s:  predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+    }
+
+    ggml_free(model.ctx);
+
+    return 0;
+}
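
The sampling loop above keeps a fixed-size window of recent token ids (`last_n_tokens`, sized to `n_ctx`) for the repetition penalty: each iteration drops the oldest id from the front and appends the newly sampled one. A self-contained sketch of just that sliding-window bookkeeping (names are illustrative):

```cpp
#include <cstdint>
#include <vector>

// maintain a fixed-size FIFO of the most recent token ids, as used for the
// repeat penalty in sample_top_k_top_p_repeat() above
static void push_recent_token(std::vector<int32_t> & last_n_tokens, int32_t id) {
    last_n_tokens.erase(last_n_tokens.begin()); // drop the oldest id
    last_n_tokens.push_back(id);                // append the newest id
}

int main() {
    std::vector<int32_t> last_n_tokens(64, 0); // window of 64 ids, zero-initialized
    const int32_t sampled[] = { 11, 42, 7 };   // pretend these were just sampled
    for (int32_t id : sampled) {
        push_recent_token(last_n_tokens, id);
    }
    return 0;
}
```
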

+ 4 - 2
examples/llama-bench/llama-bench.cpp

@@ -606,6 +606,8 @@ const std::string test::cpu_info     = get_cpu_info();
 const std::string test::gpu_info     = get_gpu_info();
 
 struct printer {
+    virtual ~printer() {}
+
     FILE * fout;
     virtual void print_header(const cmd_params & params) { (void) params; };
     virtual void print_test(const test & t) = 0;
@@ -849,7 +851,7 @@ struct sql_printer : public printer {
 };
 
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
-    std::vector<llama_token> tokens(n_batch, llama_token_bos());
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
     int n_processed = 0;
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
@@ -859,7 +861,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 }
 
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
-    llama_token token = llama_token_bos();
+    llama_token token = llama_token_bos(ctx);
     for (int i = 0; i < n_gen; i++) {
         llama_eval(ctx, &token, 1, n_past + i, n_threads);
     }
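
A change that recurs throughout this PR: the special-token helpers now take a `llama_context`, presumably because the BOS/EOS/newline ids are per-model values carried in the GGUF vocab metadata rather than fixed constants. A minimal sketch of the updated call sites, assuming a `ctx` already created from a `.gguf` model:

```cpp
#include <vector>
#include "llama.h"

static void seed_with_bos(llama_context * ctx, std::vector<llama_token> & tokens, int n_batch) {
    const llama_token tok_bos = llama_token_bos(ctx); // beginning-of-sequence id from the model
    const llama_token tok_eos = llama_token_eos(ctx); // end-of-sequence id from the model
    (void) tok_eos;

    // fill a prompt batch with BOS tokens, mirroring test_prompt() above
    tokens.assign(n_batch, tok_bos);
}
```
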

+ 15 - 24
examples/main/main.cpp

@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
         {
             fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
 
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
             llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
         }
 
@@ -191,10 +191,6 @@ int main(int argc, char ** argv) {
 
     // tokenize the prompt
     std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, true);
     } else {
@@ -270,15 +266,12 @@ int main(int argc, char ** argv) {
         params.interactive = true;
     }
 
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
     if (params.verbose_prompt) {
         fprintf(stderr, "\n");
         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
@@ -286,14 +279,14 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
             fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
         fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
             fprintf(stderr, "'\n");
         }
@@ -311,7 +304,7 @@ int main(int argc, char ** argv) {
         auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
             return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
         };
-        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
         fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -352,10 +345,9 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "\n");
 
         {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
             if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                fprintf(stderr,
-                    "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+                fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }
 
@@ -405,7 +397,7 @@ int main(int argc, char ** argv) {
 
     // do one empty run to warm up the model
     {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
         llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
         llama_reset_timings(ctx);
     }
@@ -589,7 +581,7 @@ int main(int argc, char ** argv) {
                 }
 
                 // Apply penalties
-                float nl_logit = logits[llama_token_nl()];
+                float nl_logit = logits[llama_token_nl(ctx)];
                 auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                 llama_sample_repetition_penalty(ctx, &candidates_p,
                     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -598,7 +590,7 @@ int main(int argc, char ** argv) {
                     last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                     last_n_repeat, alpha_frequency, alpha_presence);
                 if (!penalize_nl) {
-                    logits[llama_token_nl()] = nl_logit;
+                    logits[llama_token_nl(ctx)] = nl_logit;
                 }
 
                 if (grammar != NULL) {
@@ -662,7 +654,7 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo) {
             for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
             }
             fflush(stdout);
         }
@@ -704,7 +696,7 @@ int main(int argc, char ** argv) {
             }
 
             // deal with end of text token in interactive mode
-            if (last_n_tokens.back() == llama_token_eos()) {
+            if (last_n_tokens.back() == llama_token_eos(ctx)) {
                 if (params.interactive) {
                     if (params.antiprompt.size() != 0) {
                         // tokenize and inject first reverse prompt
@@ -728,7 +720,7 @@ int main(int argc, char ** argv) {
                 }
 
                 if (params.input_prefix_bos) {
-                    embd_inp.push_back(llama_token_bos());
+                    embd_inp.push_back(llama_token_bos(ctx));
                 }
 
                 std::string buffer;
@@ -782,8 +774,7 @@ int main(int argc, char ** argv) {
                     if (grammar != NULL) {
                         llama_grammar_free(grammar);
 
-                        std::vector<const llama_grammar_element *> grammar_rules(
-                            parsed_grammar.c_rules());
+                        std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                         grammar = llama_grammar_init(
                             grammar_rules.data(), grammar_rules.size(),
                             parsed_grammar.symbol_ids.at("root"));
@@ -794,7 +785,7 @@ int main(int argc, char ** argv) {
         }
 
         // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
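
Related to the above: `llama_token_to_str` now returns a `std::string` instead of a `const char *`, which is why `.c_str()` appears at every print site. A short sketch of dumping a tokenized prompt with the new return type (header placement of the helper follows this PR; `ctx` and `embd_inp` are assumptions):

```cpp
#include <cstdio>
#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

// print each token id together with its detokenized text
static void dump_tokens(llama_context * ctx, const std::vector<llama_token> & embd_inp) {
    for (size_t i = 0; i < embd_inp.size(); ++i) {
        const std::string piece = llama_token_to_str(ctx, embd_inp[i]); // std::string now
        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], piece.c_str());
    }
}
```
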

+ 1 - 1
examples/metal/metal.cpp

@@ -2,7 +2,7 @@
 //
 // - First, export a LLaMA graph:
 //
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
 //
 // - Run this tool to evaluate the exported graph:
 //

+ 1 - 1
examples/perplexity/perplexity.cpp

@@ -64,7 +64,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
 
             // add BOS token for the first batch of each chunk
             if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+                tokens[batch_start] = llama_token_bos(ctx);
             }
 
             if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {

+ 1 - 1
examples/quantize-stats/quantize-stats.cpp

@@ -24,7 +24,7 @@
 #endif
 
 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.bin";
+    std::string model = "models/7B/ggml-model-f16.gguf";
     bool verbose = false;
     bool per_layer_stats = false;
     bool print_histogram = false;

+ 4 - 4
examples/quantize/quantize.cpp

@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }
 
 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
     fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
     fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
     fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
         if (pos != std::string::npos) {
             fpath = fname_inp.substr(0, pos + 1);
         }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
         arg_idx++;
     }
     else {

+ 4 - 6
examples/save-load-state/save-load-state.cpp

@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx     = params.n_ctx;
-    lparams.n_gqa     = params.n_gqa;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
     lparams.use_mmap  = params.use_mmap;
@@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
         llama_free_model(model);
         return 1;
     }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
         llama_free(ctx);
@@ -92,7 +90,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx);
@@ -152,7 +150,7 @@ int main(int argc, char ** argv) {
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
 
-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
             llama_free(ctx2);
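
save-load-state switches to the `llama_tokenize` overload that returns a `std::vector<llama_token>` directly, so the caller no longer pre-sizes a buffer and passes `data()`/`size()` to the C API. A small sketch of the new style, hedged on the exact helper signature in `common.h`:

```cpp
#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    // vector-returning helper: no manual buffer sizing needed
    std::vector<llama_token> tokens = ::llama_tokenize(ctx, prompt, /*add_bos=*/true);
    return tokens;
}
```
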

+ 3 - 4
examples/server/README.md

@@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:
 
 -   `--threads N`, `-t N`: Set the number of threads to use during computation.
--   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were built with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -48,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):
 
 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```
 
 ### Windows:
 
 ```powershell
-server.exe -m models\7B\ggml-model.bin -c 2048
+server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
-
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or with the NodeJS axios library, and you can visit the web front end at the same URL.
 

+ 7 - 26
examples/server/server.cpp

@@ -279,7 +279,7 @@ struct llama_server_context
             grammar_parser::print_grammar(stderr, parsed_grammar);
 
             {
-                auto it = params.logit_bias.find(llama_token_eos());
+                auto it = params.logit_bias.find(llama_token_eos(ctx));
                 if (it != params.logit_bias.end() && it->second == -INFINITY) {
                     LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
                 }
@@ -402,7 +402,7 @@ struct llama_server_context
         if (params.n_predict == 0)
         {
             has_next_token = false;
-            result.tok = llama_token_eos();
+            result.tok = llama_token_eos(ctx);
             return result;
         }
 
@@ -442,7 +442,7 @@ struct llama_server_context
             llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
 
             // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
             auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
             llama_sample_repetition_penalty(ctx, &candidates_p,
                                             last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -452,7 +452,7 @@ struct llama_server_context
                                                           last_n_repeat, alpha_frequency, alpha_presence);
             if (!penalize_nl)
             {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
             }
 
             if (grammar != nullptr) {
@@ -515,7 +515,7 @@ struct llama_server_context
         // decrement remaining sampling budget
         --n_remain;
 
-        if (!embd.empty() && embd.back() == llama_token_eos())
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
         {
             // stopping_word = llama_token_to_str(ctx, embd.back());
             has_next_token = false;
@@ -652,8 +652,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
     fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
     fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
     fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -774,23 +772,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
             params.n_ctx = std::stoi(argv[i]);
         }
-        else if (arg == "-gqa" || arg == "--gqa")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        }
-        else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
-        }
         else if (arg == "--rope-freq-base")
         {
             if (++i >= argc)
@@ -968,7 +949,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 
 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
     const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
                             eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 
@@ -1103,7 +1084,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     llama.params.logit_bias.clear();
     if (body.value("ignore_eos", false))
     {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
     }
 
     const auto &logit_bias = body.find("logit_bias");
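
`parse_options_completion` implements `ignore_eos` by assigning a `-INFINITY` logit bias to the EOS id, now looked up through `llama.ctx`. For readers unfamiliar with the mechanism, here is a minimal, self-contained sketch of applying such a bias map to a logits vector; it mirrors the intent of the server code but is not its exact implementation:

```cpp
#include <unordered_map>
#include <vector>

// apply per-token logit biases; a bias of -INFINITY effectively bans the token
static void apply_logit_bias(std::vector<float> & logits,
                             const std::unordered_map<int, float> & logit_bias) {
    for (const auto & kv : logit_bias) {
        const int token_id = kv.first;
        if (token_id >= 0 && token_id < (int) logits.size()) {
            logits[token_id] += kv.second; // adding -INFINITY drives the probability to zero
        }
    }
}
```
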

+ 50 - 101
examples/simple/simple.cpp

@@ -2,180 +2,129 @@
 #define _GNU_SOURCE
 #endif
 
+#include "build-info.h"
+
 #include "common.h"
 #include "llama.h"
-#include "build-info.h"
 
-#include <cassert>
-#include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>
 
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;
 
-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n", argv[0]);
         return 1 ;
     }
 
-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }
 
-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }
 
-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }
 
-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM
 
     llama_backend_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();
 
-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
 
-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
 
-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt
 
     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
 
-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;
 
-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }
 
-    fprintf( stderr, "\n\n" );
-
-    // Print the tokens from the prompt :
+    fprintf(stderr, "\n\n");
 
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
     }
 
-    fflush(stdout);
-
+    fflush(stderr);
 
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop
 
     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.
 
-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    const int n_gen = std::min(32, max_context_size);
 
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }
 
         tokens_list.clear();
 
-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token
 
         llama_token new_token_id = 0;
 
-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
 
         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);
 
-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }
 
         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
 
-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos(ctx)) {
             fprintf(stderr, " [end of text]\n");
             break;
         }
 
-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        fflush(stdout);
 
-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
-
-    } // wend of main loop
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }
 
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);
 
     llama_backend_free();
 
+    fprintf(stderr, "\n\n");
+
     return 0;
 }
-
-// EOF
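
The rewritten simple.cpp deliberately sticks to greedy decoding via `llama_sample_token_greedy`. The same `candidates_p` array can instead be run through the stochastic samplers; a hedged sketch of a drop-in replacement for the greedy call in the loop above, assuming the top-k/top-p/temperature sampling functions of the current llama.h (check the exact signatures in your checkout):

```cpp
// alternative to llama_sample_token_greedy(): filter, scale, then draw
llama_sample_top_k      (ctx, &candidates_p, /*k=*/40,    /*min_keep=*/1);
llama_sample_top_p      (ctx, &candidates_p, /*p=*/0.95f, /*min_keep=*/1);
llama_sample_temperature(ctx, &candidates_p, /*temp=*/0.8f);
llama_token new_token_id = llama_sample_token(ctx, &candidates_p);
```
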

+ 68 - 70
examples/train-text-from-scratch/train-text-from-scratch.cpp

@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -16,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;
 
 struct random_normal_distribution {
     std::mt19937 gen;
@@ -169,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
     using id    = int32_t;
     using token = std::string;
+    using ttype = llama_token_type;
 
-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
         float score;
+        ttype type;
     };
 
     std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };
 
 struct my_llama_hparams {
@@ -1961,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
 
 
 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }
 
 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -1995,7 +1998,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }
 
-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = target_logits->ne[0];
 
@@ -2004,7 +2007,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
 
     ggml_set_f32(target_logits, -1.0f/n_vocab);
     ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
     for (int i=1; i<n_tokens+1; ++i) {
         int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
         set_f32_2d(target_logits, token, i-1, +1.0f);
@@ -2015,7 +2018,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
     }
 }
 
-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
     GGML_ASSERT(tokens_input->n_dims  == 2);
     GGML_ASSERT(target_logits->n_dims == 3);
     GGML_ASSERT(target_probs->n_dims  == 3);
@@ -2035,7 +2038,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
         size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
         GGML_ASSERT(sample+n_tokens-1 < n_train_data);
 
-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
         for (int i=1; i<n_tokens+1; ++i) {
             int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
             // print_token(lctx, token);
@@ -2188,11 +2191,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     f.read_raw(buf.data(), f.size);
     buf[f.size] = '\0';
 
-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
     }
 
     bool verify = false;
@@ -2200,17 +2202,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
         const char * in  = buf.data();
         const char * end = buf.data() + buf.size();
         for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
             if (in >= end) {
                 printf("%s: unexpected end of original text.\n", __func__);
                 break;
             }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
             if (matches) {
                 in += len;
             } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
             }
         }
     }
@@ -2294,7 +2296,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
     const auto params = sampler->params;
 
     // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    const float nl_logit = logits[llama_token_nl(ctx)];
 
     const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
 
@@ -2313,7 +2315,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
         params.alpha_presence);
 
     if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
+        logits[llama_token_nl(ctx)] = nl_logit;
     }
 
     llama_token token = 0;
@@ -2612,42 +2614,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
         return;
     }
 
-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//    // write_vocab
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.text.size());
+//        file.write_raw(token_data.text.data(), token_data.text.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output);
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }
 
 float cosine_decay(const int decay_steps, const float alpha, int step) {
@@ -3052,20 +3057,13 @@ int main(int argc, char ** argv) {
 
     struct llama_vocab vocab;
     {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
         vocab.id_to_token.resize(n_vocab);
         for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
         }
     }
 
@@ -3178,7 +3176,7 @@ int main(int argc, char ** argv) {
     std::vector<int> train_samples;
     train_samples.push_back(0);
     for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
             train_samples.push_back(i);
         }
     }
@@ -3338,7 +3336,7 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);
         struct ggml_tensor * target_probs  = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);
 
-        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+        get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
         for (int i=sample_ctx; i<n_tokens; ++i) {
             ggml_set_i32_1d(tokens_input, i, n_vocab/2);
         }
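
The updated `tokenize_file` relies on the size-negotiation convention of the C tokenizer API: when the output buffer is too small, `llama_tokenize` returns the negative of the required token count. A small sketch of that two-pass pattern in isolation, assuming the C-style `llama_tokenize(ctx, text, tokens, n_max, add_bos)` signature used above:

```cpp
#include <vector>
#include "llama.h"

static std::vector<llama_token> tokenize_all(llama_context * lctx, const char * text) {
    std::vector<llama_token> out;

    // first pass: out is empty, so the call reports the required size as a negative value
    int n_tokens = llama_tokenize(lctx, text, out.data(), (int) out.size(), /*add_bos=*/false);
    if (n_tokens < 0) {
        out.resize(-n_tokens);
        // second pass: the buffer is now large enough
        n_tokens = llama_tokenize(lctx, text, out.data(), (int) out.size(), /*add_bos=*/false);
    }
    out.resize(n_tokens > 0 ? n_tokens : 0); // shrink to the actual count
    return out;
}
```
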

+ 3 - 0
ggml-metal.h

@@ -38,6 +38,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
 

+ 15 - 0
ggml-metal.m

@@ -237,6 +237,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }
 
+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    if (result != 0) {
+        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
     ctx->n_cb = n_cb;
 }
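
The new `ggml_metal_host_malloc` / `ggml_metal_host_free` pair gives callers a page-aligned host buffer (via `posix_memalign`), which Metal needs in order to wrap host memory without copying. A minimal usage sketch:

```cpp
#include <cstddef>
#include <cstdio>
#include "ggml-metal.h"

int main() {
    const size_t n = 64u * 1024 * 1024; // 64 MB, illustrative size

    // page-aligned allocation suitable for handing to the Metal backend
    void * buf = ggml_metal_host_malloc(n);
    if (buf == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }

    // ... fill `buf` with tensor data and register it with the Metal context ...

    ggml_metal_host_free(buf);
    return 0;
}
```
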

+ 1009 - 4
ggml.c

@@ -213,10 +213,10 @@ inline static void * ggml_aligned_malloc(size_t size) {
                 error_desc = "insufficient memory";
                 break;
         }
-        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n",
-            __func__, error_desc, size/(1024.0*1024.0));
+        GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
         return NULL;
     }
+
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size)  ggml_aligned_malloc(size)
@@ -4091,7 +4091,11 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     //
     // is enough, but just in case, adding the second part
 
-    return GGML_PAD(MAX(tensor->ne[3]*tensor->nb[3], ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type), GGML_MEM_ALIGN);
+    return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*ggml_type_size(tensor->type))/ggml_blck_size(tensor->type));
+}
+
+size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
+    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
 size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
@@ -9118,6 +9122,8 @@ static void ggml_compute_forward_mul(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 && "only f32 src1 supported for now");
+
     switch (src0->type) {
         case GGML_TYPE_F32:
             {
@@ -16881,7 +16887,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     // compute size of intermediate results
     // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
-        size_eval += ggml_nbytes(cgraph->nodes[i]);
+        size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
     }
 
     // print
@@ -18542,6 +18548,1005 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
 
 ////////////////////////////////////////////////////////////////////////////////
 
+struct gguf_str {
+    uint32_t n;
+    char * data;
+};
+
+static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = sizeof(uint8_t),
+    [GGUF_TYPE_INT8]    = sizeof(int8_t),
+    [GGUF_TYPE_UINT16]  = sizeof(uint16_t),
+    [GGUF_TYPE_INT16]   = sizeof(int16_t),
+    [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
+    [GGUF_TYPE_INT32]   = sizeof(int32_t),
+    [GGUF_TYPE_FLOAT32] = sizeof(float),
+    [GGUF_TYPE_BOOL]    = sizeof(bool),
+    [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
+    [GGUF_TYPE_ARRAY]   = 0, // undefined
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = "u8",
+    [GGUF_TYPE_INT8]    = "i8",
+    [GGUF_TYPE_UINT16]  = "u16",
+    [GGUF_TYPE_INT16]   = "i16",
+    [GGUF_TYPE_UINT32]  = "u32",
+    [GGUF_TYPE_INT32]   = "i32",
+    [GGUF_TYPE_FLOAT32] = "f32",
+    [GGUF_TYPE_BOOL]    = "bool",
+    [GGUF_TYPE_STRING]  = "str",
+    [GGUF_TYPE_ARRAY]   = "arr",
+};
+static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
+
+union gguf_value {
+    uint8_t  uint8;
+    int8_t   int8;
+    uint16_t uint16;
+    int16_t  int16;
+    uint32_t uint32;
+    int32_t  int32;
+    float    float32;
+    bool     bool_;
+
+    struct gguf_str str;
+
+    struct {
+        enum gguf_type type;
+
+        uint32_t n;
+        void * data;
+    } arr;
+};
+
+struct gguf_kv {
+    struct gguf_str key;
+
+    uint32_t n_bytes; // TODO: is this actually needed?
+
+    enum  gguf_type  type;
+    union gguf_value value;
+};
+
+struct gguf_header {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t n_tensors;
+    uint32_t n_kv;
+};
+
+struct gguf_tensor_info {
+    struct gguf_str name;
+
+    uint32_t n_dims;
+    uint32_t ne[GGML_MAX_DIMS];
+
+    enum ggml_type type;
+
+    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
+
+    // for writing API
+    const void * data;
+    size_t size;
+};
+
+struct gguf_context {
+    struct gguf_header header;
+
+    struct gguf_kv          * kv;
+    struct gguf_tensor_info * infos;
+
+    size_t alignment;
+    size_t offset;    // offset of `data` from beginning of file
+    size_t size;      // size of `data` in bytes
+
+    //uint8_t * padding;
+    void * data;
+};
+
+static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
+    const size_t n = fread(dst, 1, size, file);
+    *offset += n;
+    return n == size;
+}
+
+static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
+    p->n    = 0;
+    p->data = NULL;
+
+    bool ok = true;
+
+    // TODO: how to avoid mallocs for strings?
+    ok = ok && gguf_fread_el(file, &p->n,    sizeof(p->n), offset); p->data = calloc(p->n + 1, 1);
+    ok = ok && gguf_fread_el(file,  p->data, p->n,         offset);
+
+    return ok;
+}
+
+struct gguf_context * gguf_init_empty(void) {
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    ctx->header.magic     = GGUF_MAGIC;
+    ctx->header.version   = GGUF_VERSION;
+    ctx->header.n_tensors = 0;
+    ctx->header.n_kv      = 0;
+
+    ctx->kv    = NULL;
+    ctx->infos = NULL;
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+    ctx->offset    = 0;
+    ctx->size      = 0;
+
+    ctx->data = NULL;
+
+    return ctx;
+}
+
+struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    FILE * file = fopen(fname, "rb");
+    if (!file) {
+        return NULL;
+    }
+
+    // offset from start of file
+    size_t offset = 0;
+
+    uint32_t magic = 0;
+
+    // check the magic before making allocations
+    {
+        gguf_fread_el(file, &magic, sizeof(magic), &offset);
+
+        if (magic != GGUF_MAGIC) {
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            fclose(file);
+            return NULL;
+        }
+    }
+
+    bool ok = true;
+
+    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+
+    // read the header
+    {
+        ctx->header.magic = magic;
+
+        ctx->kv    = NULL;
+        ctx->infos = NULL;
+        ctx->data  = NULL;
+
+        ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
+        ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read header\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the kv pairs
+    {
+        ctx->kv = GGML_ALIGNED_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+
+        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
+
+            ok = ok && gguf_fread_str(file, &kv->key,                          &offset);
+          //ok = ok && gguf_fread_el (file, &kv->n_bytes, sizeof(kv->n_bytes), &offset);
+            ok = ok && gguf_fread_el (file, &kv->type,    sizeof(kv->type),    &offset);
+
+            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
+
+            switch (kv->type) {
+                case GGUF_TYPE_UINT8:   ok = ok && gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
+                case GGUF_TYPE_INT8:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
+                case GGUF_TYPE_UINT16:  ok = ok && gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
+                case GGUF_TYPE_INT16:   ok = ok && gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
+                case GGUF_TYPE_UINT32:  ok = ok && gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
+                case GGUF_TYPE_INT32:   ok = ok && gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
+                case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
+                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
+                case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
+                case GGUF_TYPE_ARRAY:
+                    {
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
+
+                        switch (kv->value.arr.type) {
+                            case GGUF_TYPE_UINT8:
+                            case GGUF_TYPE_INT8:
+                            case GGUF_TYPE_UINT16:
+                            case GGUF_TYPE_INT16:
+                            case GGUF_TYPE_UINT32:
+                            case GGUF_TYPE_INT32:
+                            case GGUF_TYPE_FLOAT32:
+                            case GGUF_TYPE_BOOL:
+                                {
+                                    kv->value.arr.data = malloc(kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type], &offset);
+                                } break;
+                            case GGUF_TYPE_STRING:
+                                {
+                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
+                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
+                                    }
+                                } break;
+                            case GGUF_TYPE_ARRAY:
+                            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                        };
+                    } break;
+                case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+            };
+
+            if (!ok) {
+                break;
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
+        }
+    }
+
+    // read the tensor infos
+    {
+        ctx->infos = GGML_ALIGNED_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                info->ne[j] = 1;
+            }
+
+            ok = ok && gguf_fread_str(file, &info->name,                          &offset);
+            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
+            for (uint32_t j = 0; j < info->n_dims; ++j) {
+                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
+            }
+            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
+            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+        }
+    }
+
+    ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
+
+    int alignment_idx = gguf_find_key(ctx, "general.alignment");
+    if (alignment_idx != -1) {
+        ctx->alignment = gguf_get_val_u32(ctx, alignment_idx);
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset_pad = offset % ctx->alignment;
+
+        if (offset_pad != 0) {
+            offset += ctx->alignment - offset_pad;
+            fseek(file, offset, SEEK_SET);
+        }
+    }
+
+    // store the current file offset - this is where the data section starts
+    ctx->offset = offset;
+
+    // compute the total size of the data section, taking into account the alignment
+    {
+        ctx->size = 0;
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            const int64_t ne =
+                (int64_t) info->ne[0] *
+                (int64_t) info->ne[1] *
+                (int64_t) info->ne[2] *
+                (int64_t) info->ne[3];
+
+            if (ne % ggml_blck_size(info->type) != 0) {
+                fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
+                        __func__, info->name.data, ne, ggml_blck_size(info->type));
+                fclose(file);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            const size_t size_cur = (ne*ggml_type_size(info->type))/ggml_blck_size(info->type);
+
+            ctx->size += GGML_PAD(size_cur, ctx->alignment);
+        }
+    }
+
+    // load the tensor data only if requested
+    if (params.ctx != NULL) {
+        // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
+        // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
+        // the ggml_tensor structs to the appropriate locations in the binary blob
+
+        // compute the exact size needed for the new ggml_context
+        const size_t mem_size =
+            params.no_alloc ?
+            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
+            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+
+        struct ggml_init_params pdata = {
+            .mem_size   = mem_size,
+            .mem_buffer = NULL,
+            .no_alloc   = params.no_alloc,
+        };
+
+        *params.ctx = ggml_init(pdata);
+
+        struct ggml_context * ctx_data = *params.ctx;
+
+        struct ggml_tensor * data = NULL;
+
+        if (params.no_alloc == false) {
+            data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
+
+            ok = ok && data != NULL;
+
+            // read the binary blob with the tensor data
+            ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset);
+
+            if (!ok) {
+                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
+                fclose(file);
+                ggml_free(ctx_data);
+                gguf_free(ctx);
+                return NULL;
+            }
+
+            ctx->data = data->data;
+        }
+
+        ggml_set_no_alloc(ctx_data, true);
+
+        // create the tensors
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            const int64_t ne[GGML_MAX_DIMS] = {
+                ctx->infos[i].ne[0],
+                ctx->infos[i].ne[1],
+                ctx->infos[i].ne[2],
+                ctx->infos[i].ne[3],
+            };
+
+            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+
+            ok = ok && cur != NULL;
+
+            ggml_set_name(cur, ctx->infos[i].name.data);
+
+            if (!ok) {
+                break;
+            }
+
+            // point the data member to the appropriate location in the binary blob using the tensor infos
+            if (params.no_alloc == false) {
+              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
+                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
+            }
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
+            fclose(file);
+            ggml_free(ctx_data);
+            gguf_free(ctx);
+            return NULL;
+        }
+
+        ggml_set_no_alloc(ctx_data, params.no_alloc);
+    }
+
+    fclose(file);
+
+    return ctx;
+}
+
+void gguf_free(struct gguf_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    if (ctx->kv) {
+        // free string memory - not great..
+        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+            struct gguf_kv * kv = &ctx->kv[i];
+
+            if (kv->key.data) {
+                free(kv->key.data);
+            }
+
+            if (kv->type == GGUF_TYPE_STRING) {
+                if (kv->value.str.data) {
+                    free(kv->value.str.data);
+                }
+            }
+
+            if (kv->type == GGUF_TYPE_ARRAY) {
+                if (kv->value.arr.data) {
+                    if (kv->value.arr.type == GGUF_TYPE_STRING) {
+                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                            struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j];
+                            if (str->data) {
+                                free(str->data);
+                            }
+                        }
+                    }
+                    free(kv->value.arr.data);
+                }
+            }
+        }
+
+        GGML_ALIGNED_FREE(ctx->kv);
+    }
+
+    if (ctx->infos) {
+        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->infos[i];
+
+            if (info->name.data) {
+                free(info->name.data);
+            }
+        }
+
+        GGML_ALIGNED_FREE(ctx->infos);
+    }
+
+    GGML_ALIGNED_FREE(ctx);
+}
+
+const char * gguf_type_name(enum gguf_type type) {
+    return GGUF_TYPE_NAME[type];
+}
+
+int gguf_get_version(struct gguf_context * ctx) {
+    return ctx->header.version;
+}
+
+size_t gguf_get_alignment(struct gguf_context * ctx) {
+    return ctx->alignment;
+}
+
+size_t gguf_get_data_offset(struct gguf_context * ctx) {
+    return ctx->offset;
+}
+
+void * gguf_get_data(struct gguf_context * ctx) {
+    return ctx->data;
+}
+
+int gguf_get_n_kv(struct gguf_context * ctx) {
+    return ctx->header.n_kv;
+}
+
+int gguf_find_key(struct gguf_context * ctx, const char * key) {
+    // return -1 if key not found
+    int keyfound = -1;
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    for (int i = 0; i < n_kv; ++i) {
+        if (strcmp(key, gguf_get_key(ctx, i)) == 0) {
+            keyfound = i;
+            break;
+        }
+    }
+
+    return keyfound;
+}
+
+const char * gguf_get_key(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].key.data;
+}
+
+enum gguf_type gguf_get_kv_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].type;
+}
+
+enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.type;
+}
+
+const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.data;
+}
+
+const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i) {
+    struct gguf_kv * kv = &ctx->kv[key_id];
+    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
+    return str->data;
+}
+
+int gguf_get_arr_n(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.arr.n;
+}
+
+uint8_t gguf_get_val_u8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint8;
+}
+
+int8_t gguf_get_val_i8(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int8;
+}
+
+uint16_t gguf_get_val_u16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint16;
+}
+
+int16_t gguf_get_val_i16(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int16;
+}
+
+uint32_t gguf_get_val_u32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.uint32;
+}
+
+int32_t gguf_get_val_i32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.int32;
+}
+
+float gguf_get_val_f32(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.float32;
+}
+
+bool gguf_get_val_bool(struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.bool_;
+}
+
+const char * gguf_get_val_str (struct gguf_context * ctx, int i) {
+    return ctx->kv[i].value.str.data;
+}
+
+int gguf_get_n_tensors(struct gguf_context * ctx) {
+    return ctx->header.n_tensors;
+}
+
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
+size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].offset;
+}
+
+char * gguf_get_tensor_name(struct gguf_context * ctx, int i) {
+    return ctx->infos[i].name.data;
+}
+
+// returns the index
+static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
+    const int idx = gguf_find_key(ctx, key);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    const int n_kv = gguf_get_n_kv(ctx);
+
+    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    ctx->kv[n_kv].key.n    = strlen(key) + 1;
+    ctx->kv[n_kv].key.data = strdup(key);
+    ctx->header.n_kv++;
+
+    return n_kv;
+}
+
+void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_UINT8;
+    ctx->kv[idx].value.uint8 = val;
+}
+
+void gguf_set_val_i8(struct gguf_context * ctx, const char * key, int8_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type       = GGUF_TYPE_INT8;
+    ctx->kv[idx].value.int8 = val;
+}
+
+void gguf_set_val_u16(struct gguf_context * ctx, const char * key, uint16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT16;
+    ctx->kv[idx].value.uint16 = val;
+}
+
+void gguf_set_val_i16(struct gguf_context * ctx, const char * key, int16_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT16;
+    ctx->kv[idx].value.int16 = val;
+}
+
+void gguf_set_val_u32(struct gguf_context * ctx, const char * key, uint32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type         = GGUF_TYPE_UINT32;
+    ctx->kv[idx].value.uint32 = val;
+}
+
+void gguf_set_val_i32(struct gguf_context * ctx, const char * key, int32_t val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_INT32;
+    ctx->kv[idx].value.int32 = val;
+}
+
+void gguf_set_val_f32(struct gguf_context * ctx, const char * key, float val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type          = GGUF_TYPE_FLOAT32;
+    ctx->kv[idx].value.float32 = val;
+}
+
+void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
+    ctx->kv[idx].value.bool_ = val;
+}
+
+void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.str.n    = strlen(val) + 1;
+    ctx->kv[idx].value.str.data = strdup(val);
+}
+
+void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = type;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*GGUF_TYPE_SIZE[type]);
+    memcpy(ctx->kv[idx].value.arr.data, data, n*GGUF_TYPE_SIZE[type]);
+}
+
+void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
+    const int idx = gguf_get_or_add_key(ctx, key);
+
+    ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
+    ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
+    ctx->kv[idx].value.arr.n    = n;
+    ctx->kv[idx].value.arr.data = malloc(n*sizeof(struct gguf_str));
+    for (int i = 0; i < n; i++) {
+        struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
+        str->n    = strlen(data[i]) + 1;
+        str->data = strdup(data[i]);
+    }
+}
+
+// set or add KV pairs from another context
+void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
+    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+        switch (src->kv[i].type) {
+            case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);    break;
+            case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);     break;
+            case GGUF_TYPE_UINT16:  gguf_set_val_u16 (ctx, src->kv[i].key.data, src->kv[i].value.uint16);   break;
+            case GGUF_TYPE_INT16:   gguf_set_val_i16 (ctx, src->kv[i].key.data, src->kv[i].value.int16);    break;
+            case GGUF_TYPE_UINT32:  gguf_set_val_u32 (ctx, src->kv[i].key.data, src->kv[i].value.uint32);   break;
+            case GGUF_TYPE_INT32:   gguf_set_val_i32 (ctx, src->kv[i].key.data, src->kv[i].value.int32);    break;
+            case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (ctx, src->kv[i].key.data, src->kv[i].value.float32);  break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);    break;
+            case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
+                        const char ** data = malloc(src->kv[i].value.arr.n*sizeof(char *));
+                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
+                            data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
+                        }
+                        gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
+                        free(data);
+                    } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
+                        GGML_ASSERT(false && "nested arrays not supported");
+                    } else {
+                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
+                    }
+                } break;
+            case GGUF_TYPE_COUNT:  GGML_ASSERT(false && "invalid type"); break;
+        }
+    }
+}
+
+void gguf_add_tensor(
+             struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
+    const int idx = ctx->header.n_tensors;
+    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
+
+    ctx->infos[idx].name.n    = strlen(tensor->name) + 1;
+    ctx->infos[idx].name.data = strdup(tensor->name);
+
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        ctx->infos[idx].ne[i] = 1;
+    }
+
+    ctx->infos[idx].n_dims = tensor->n_dims;
+    for (int i = 0; i < tensor->n_dims; i++) {
+        ctx->infos[idx].ne[i] = tensor->ne[i];
+    }
+
+    ctx->infos[idx].type   = tensor->type;
+    ctx->infos[idx].offset = 0;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
+
+    if (ctx->header.n_tensors > 0) {
+        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
+    }
+
+    ctx->header.n_tensors++;
+}
+
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n,   sizeof(val->n),    1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    }
+    buf->offset += sizeof(val->n);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    }
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    if (buf->data) {
+        memcpy((char *) buf->data + buf->offset, val, el_size);
+    }
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
+    // write header
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+
+    // write key-value pairs
+    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        struct gguf_kv * kv = &ctx->kv[i];
+
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+
+        switch (kv->type) {
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
+            case GGUF_TYPE_ARRAY:
+                {
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+
+                    switch (kv->value.arr.type) {
+                        case GGUF_TYPE_UINT8:
+                        case GGUF_TYPE_INT8:
+                        case GGUF_TYPE_UINT16:
+                        case GGUF_TYPE_INT16:
+                        case GGUF_TYPE_UINT32:
+                        case GGUF_TYPE_INT32:
+                        case GGUF_TYPE_FLOAT32:
+                        case GGUF_TYPE_BOOL:
+                            {
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                            } break;
+                        case GGUF_TYPE_STRING:
+                            {
+                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                }
+                            } break;
+                        case GGUF_TYPE_ARRAY:
+                        case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type"); break;
+                    };
+                } break;
+            case GGUF_TYPE_COUNT: GGML_ASSERT(false && "invalid type");
+        };
+    }
+
+    // write tensor infos
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
+        for (uint32_t j = 0; j < info->n_dims; ++j) {
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+        }
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
+    }
+
+    // we require the data section to be aligned, so take into account any padding
+    {
+        const size_t offset     = buf->offset;
+        const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
+
+        if (offset_pad != offset) {
+            uint8_t pad = 0;
+            for (size_t i = 0; i < offset_pad - offset; ++i) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+    }
+
+    if (only_meta) {
+        return;
+    }
+
+    size_t offset = 0;
+
+    // write tensor data
+    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->infos[i];
+
+        const size_t size     = info->size;
+        const size_t size_pad = GGML_PAD(size, ctx->alignment);
+
+        gguf_bwrite_el(buf, info->data, size);
+
+        if (size_pad != size) {
+            uint8_t pad = 0;
+            for (size_t j = 0; j < size_pad - size; ++j) {
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
+            }
+        }
+
+        GGML_ASSERT(offset == info->offset);
+
+        offset += size_pad;
+    }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
+
+    fclose(file);
+}
+
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
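For illustration, a minimal sketch of how the reader API implemented above can be used from application code — the file name "model.gguf" is only an example and error handling is kept to a minimum:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_context * ctx_data = NULL;

        // load both the meta data and the tensor data into a fresh ggml_context
        struct gguf_init_params params = {
            /*.no_alloc =*/ false,
            /*.ctx      =*/ &ctx_data,
        };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model.gguf\n");
            return 1;
        }

        // enumerate the key-value pairs
        for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
            printf("kv[%d]: %s (%s)\n", i, gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
        }

        // enumerate the tensor infos
        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("tensor[%d]: %s, offset = %zu\n", i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
        }

        gguf_free(ctx);
        ggml_free(ctx_data);

        return 0;
    }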
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;

+ 119 - 3
ggml.h

@@ -207,14 +207,18 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4
 
-
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -562,6 +566,7 @@ extern "C" {
     GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
     GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
 
     GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -1494,7 +1499,6 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * tensor);
 
-
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
@@ -1703,6 +1707,118 @@ extern "C" {
 
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(..., f);                               // write the tensor data
+    //   void * data = malloc(gguf_get_meta_size(ctx));
+    //   gguf_get_meta_data(ctx, data);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
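As a rough counterpart to the reader example, a minimal sketch of single-pass writing with the API declared above — the key names, tensor name and values are made up purely for illustration:

    #include "ggml.h"

    int main(void) {
        // small ggml_context to hold one example tensor
        struct ggml_init_params iparams = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx_data = ggml_init(iparams);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx_data, GGML_TYPE_F32, 32);
        ggml_set_name(t, "tensor_example");

        // fill the tensor with some example values
        float * w = (float *) t->data;
        for (int i = 0; i < 32; ++i) {
            w[i] = (float) i;
        }

        struct gguf_context * ctx = gguf_init_empty();

        // illustrative keys - not part of any official schema
        gguf_set_val_str(ctx, "general.name", "example");
        gguf_set_val_u32(ctx, "example.block_count", 12);

        gguf_add_tensor(ctx, t);

        // write meta data and tensor data in a single pass
        gguf_write_to_file(ctx, "example.gguf", false);

        gguf_free(ctx);
        ggml_free(ctx_data);

        return 0;
    }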
     //
     // system info
     //

+ 718 - 0
gguf.py

@@ -0,0 +1,718 @@
+import shutil
+import sys
+import struct
+import tempfile
+import numpy as np
+
+from enum import IntEnum, auto
+from typing import Any, IO, List, Optional
+
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x46554747
+GGUF_VERSION           = 1
+GGUF_DEFAULT_ALIGNMENT = 32
+
+# general
+KEY_GENERAL_ARCHITECTURE         = "general.architecture"
+KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
+KEY_GENERAL_ALIGNMENT            = "general.alignment"
+KEY_GENERAL_NAME                 = "general.name"
+KEY_GENERAL_AUTHOR               = "general.author"
+KEY_GENERAL_URL                  = "general.url"
+KEY_GENERAL_DESCRIPTION          = "general.description"
+KEY_GENERAL_LICENSE              = "general.license"
+KEY_GENERAL_SOURCE_URL           = "general.source.url"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+
+# LLM
+KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
+KEY_LLM_EMBEDDING_LENGTH      = "{arch}.embedding_length"
+KEY_LLM_BLOCK_COUNT           = "{arch}.block_count"
+KEY_LLM_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_LLM_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
+
+# tokenization
+KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
+KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
+KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
+KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
+KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
+KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
+KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
+KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
+KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
+KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
+KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
+KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
+
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
+
+class MODEL_ARCH(IntEnum):
+    LLAMA   = auto()
+    FALCON  = auto()
+    GPT2    = auto()
+    GPTJ    = auto()
+    GPTNEOX = auto()
+    MPT     = auto()
+
+
+class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD    = auto()
+    POS_EMBD      = auto()
+    OUTPUT        = auto()
+    OUTPUT_NORM   = auto()
+    ROPE_FREQS    = auto()
+    ATTN_Q        = auto()
+    ATTN_K        = auto()
+    ATTN_V        = auto()
+    ATTN_QKV      = auto()
+    ATTN_OUT      = auto()
+    ATTN_NORM     = auto()
+    ATTN_NORM_2   = auto()
+    ATTN_ROT_EMBD = auto()
+    FFN_GATE      = auto()
+    FFN_DOWN      = auto()
+    FFN_UP        = auto()
+    FFN_NORM      = auto()
+
+
+MODEL_ARCH_NAMES = {
+    MODEL_ARCH.LLAMA:   "llama",
+    MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.GPT2:    "gpt2",
+    MODEL_ARCH.GPTJ:    "gptj",
+    MODEL_ARCH.GPTNEOX: "gptneox",
+    MODEL_ARCH.MPT:     "mpt",
+}
+
+MODEL_TENSOR_NAMES = {
+    MODEL_ARCH.LLAMA: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPTNEOX: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.FALCON: {
+        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPT2: {
+        # TODO
+    },
+    # TODO
+}
+
+# tensors that will not be serialized
+MODEL_TENSOR_SKIP = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+}
+
+
+# TODO: the following helper functions should be removed
+#       instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
+#       however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
+# REMOVE
+def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
+    for skip in MODEL_TENSOR_SKIP.get(arch, []):
+        for i in range(n_blocks):
+            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
+                return True
+
+    return False
+
+
+def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
+    tensor_map = {}
+
+    # Token embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
+
+    tensor_map["gpt_neox.embed_in"]           = mapped_to  # gptneox
+    tensor_map["transformer.wte"]             = mapped_to  # gpt2 mpt
+    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
+    tensor_map["model.embed_tokens"]          = mapped_to  # llama-hf
+    tensor_map["tok_embeddings"]              = mapped_to  # llama-pth
+
+    # Position embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
+
+    tensor_map["transformer.wpe"] = mapped_to  # gpt2
+
+    # Output
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
+
+    tensor_map["embed_out"] = mapped_to  # gptneox
+    tensor_map["lm_head"]   = mapped_to  # gpt2 mpt falcon llama-hf
+    tensor_map["output"]    = mapped_to  # llama-pth
+
+    # Output norm
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
+
+    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
+    tensor_map["transformer.ln_f"]          = mapped_to  # gpt2 falcon
+    tensor_map["transformer.norm_f"]        = mapped_to  # mpt
+    tensor_map["model.norm"]                = mapped_to  # llama-hf
+    tensor_map["norm"]                      = mapped_to  # llama-pth
+
+    # Rope frequencies
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
+
+    tensor_map["rope.freqs"] = mapped_to  # llama-pth
+
+    # Attention and feed-forward blocks
+    for i in range(0, n_blocks):
+        # Attention norm
+        # TODO: is there a simpler way to write these 2 lines in Python?
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_1"]              = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_1"]       = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".input_layernorm"]   = mapped_to  # falcon7b
+        tensor_map["transformer.h."+str(i)+".ln_mlp"]            = mapped_to  # falcon40b
+        tensor_map["model.layers."+str(i)+".input_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention_norm"]           = mapped_to  # llama-pth
+
+        # Attention norm 2
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b
+
+        # Attention query-key-value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_attn"]                    = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"]                 = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon
+
+        # Attention query
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wq"]           = mapped_to  # llama-pth
+
+        # Attention key
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wk"]           = mapped_to  # llama-pth
+
+        # Attention value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wv"]           = mapped_to  # llama-pth
+
+        # Attention output
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"]   = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".self_attn.o_proj"]      = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wo"]                = mapped_to  # llama-pth
+
+        # Rotary embeddings
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"]  = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth
+
+        # Feed-forward norm
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_2"]                       = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_2"]                = mapped_to  # mpt
+        tensor_map["model.layers."+str(i)+".post_attention_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".ffn_norm"]                          = mapped_to  # llama-pth
+
+        # Feed-forward up
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_fc"]            = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"]    = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.up_proj"]          = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w3"]            = mapped_to  # llama-pth
+
+        # Feed-forward gate
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w1"]     = mapped_to  # llama-pth
+
+        # Feed-forward down
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"]  = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.down_proj"]        = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w2"]            = mapped_to  # llama-pth
+
+    return tensor_map
+
+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
+#
+# implementation
+#
+
+
+class GGMLQuantizationType(IntEnum):
+    F32  = 0
+    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15
+
+
+class GGUFValueType(IntEnum):
+    UINT8   = 0
+    INT8    = 1
+    UINT16  = 2
+    INT16   = 3
+    UINT32  = 4
+    INT32   = 5
+    FLOAT32 = 6
+    BOOL    = 7
+    STRING  = 8
+    ARRAY   = 9
+
+    @staticmethod
+    def get_type(val):
+        if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
+            return GGUFValueType.STRING
+        elif isinstance(val, list):
+            return GGUFValueType.ARRAY
+        elif isinstance(val, float):
+            return GGUFValueType.FLOAT32
+        elif isinstance(val, bool):
+            return GGUFValueType.BOOL
+        elif isinstance(val, int):
+            return GGUFValueType.INT32
+        else:
+            print("Unknown type: "+str(type(val)))
+            sys.exit()
+
+
+class GGUFWriter:
+    def __init__(self, path: str, arch: str, use_temp_file = True):
+        self.fout = open(path, "wb")
+        self.arch = arch
+        self.offset_tensor = 0
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
+        self.kv_data = b""
+        self.kv_data_count = 0
+        self.ti_data = b""
+        self.ti_data_count = 0
+        self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []
+
+    def write_header_to_file(self):
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack("<I", self.ti_data_count))
+        self.fout.write(struct.pack("<I", self.kv_data_count))
+        self.flush()
+#        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
+
+    def write_kv_data_to_file(self):
+        self.fout.write(self.kv_data)
+        self.flush()
+
+    def write_ti_data_to_file(self):
+        self.fout.write(self.ti_data)
+        self.flush()
+
+    def add_key(self, key: str):
+        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+
+    def add_uint8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT8)
+
+    def add_int8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT8)
+
+    def add_uint16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT16)
+
+    def add_int16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT16)
+
+    def add_uint32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT32)
+
+    def add_int32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT32)
+
+    def add_float32(self, key: str, val: float):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.FLOAT32)
+
+    def add_bool(self, key: str, val: bool):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.BOOL)
+
+    def add_string(self, key: str, val: str):
+        if len(val) == 0:
+            return
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.STRING)
+
+    def add_array(self, key: str, val: list):
+        if not isinstance(val, list):
+            raise ValueError("Value must be a list for array type")
+
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.ARRAY)
+
+    def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
+        if vtype is None:
+            vtype = GGUFValueType.get_type(val)
+
+        if add_vtype:
+            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data_count += 1
+
+        if vtype == GGUFValueType.UINT8:
+            self.kv_data += struct.pack("<B", val)
+        elif vtype == GGUFValueType.INT8:
+            self.kv_data += struct.pack("<b", val)
+        elif vtype == GGUFValueType.UINT16:
+            self.kv_data += struct.pack("<H", val)
+        elif vtype == GGUFValueType.INT16:
+            self.kv_data += struct.pack("<h", val)
+        elif vtype == GGUFValueType.UINT32:
+            self.kv_data += struct.pack("<I", val)
+        elif vtype == GGUFValueType.INT32:
+            self.kv_data += struct.pack("<i", val)
+        elif vtype == GGUFValueType.FLOAT32:
+            self.kv_data += struct.pack("<f", val)
+        elif vtype == GGUFValueType.BOOL:
+            self.kv_data += struct.pack("?", val)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            self.kv_data += struct.pack("<I", len(encoded_val))
+            self.kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY:
+            ltype = set([GGUFValueType.get_type(item) for item in val])
+            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
+            self.kv_data += struct.pack("<I", list(ltype)[0])
+            self.kv_data += struct.pack("<I", len(val))
+            for item in val:
+                self.add_val(item, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type")
+
+    @staticmethod
+    def ggml_pad(x: int, n: int) -> int:
+        return ((x + n - 1) // n) * n
+
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+
+        encoded_name = name.encode("utf8")
+        self.ti_data += struct.pack("<I", len(encoded_name))
+        self.ti_data += encoded_name
+        n_dims = len(tensor_shape)
+        self.ti_data += struct.pack("<I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
+        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
+        self.ti_data_count += 1
+
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
+            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            self.temp_file.seek(0)
+
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return
+
+        tensor.tofile(self.temp_file)
+
+        if pad != 0:
+            self.temp_file.write(bytes([0] * pad))
+
+    def write_tensor_data(self, tensor: np.ndarray):
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        tensor.tofile(self.fout)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+    def write_tensors_to_file(self):
+        self.write_ti_data_to_file()
+
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
+        self.temp_file.seek(0)
+
+        shutil.copyfileobj(self.temp_file, self.fout)
+        self.flush()
+        self.temp_file.close()
+
+    def flush(self):
+        self.fout.flush()
+
+    def close(self):
+        self.fout.close()
+
+    def add_architecture(self):
+        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
+
+    def add_author(self, author: str):
+        self.add_string(KEY_GENERAL_AUTHOR, author)
+
+    def add_url(self, url: str):
+        self.add_string(KEY_GENERAL_URL, url)
+
+    def add_description(self, description: str):
+        self.add_string(KEY_GENERAL_DESCRIPTION, description)
+
+    def add_source_url(self, url: str):
+        self.add_string(KEY_GENERAL_SOURCE_URL, url)
+
+    def add_source_hf_repo(self, repo: str):
+        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
+
+    def add_name(self, name: str):
+        self.add_string(KEY_GENERAL_NAME, name)
+
+    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
+        self.add_uint32(
+            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+
+    def add_custom_alignment(self, alignment: int):
+        self.data_alignment = alignment
+        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
+
+    def add_context_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+
+    def add_embedding_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_block_count(self, length: int):
+        self.add_uint32(
+            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_parallel_residual(self, use: bool):
+        self.add_bool(
+            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(
+            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_head_count(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
+
+    def add_max_alibi_bias(self, bias: float):
+        self.add_float32(
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+
+    def add_clamp_kqv(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
+
+    def add_layer_norm_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
+
+    def add_layer_norm_rms_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+
+    def add_rope_dimension_count(self, count: int):
+        self.add_uint32(
+            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_scale_linear(self, value:  float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
+
+    def add_tokenizer_model(self, model: str):
+        self.add_string(KEY_TOKENIZER_MODEL, model)
+
+    def add_token_list(self, tokens: List):
+        self.add_array(KEY_TOKENIZER_LIST, tokens)
+
+    def add_token_merges(self, merges: List):
+        self.add_array(KEY_TOKENIZER_MERGES, merges)
+
+    def add_token_types(self, types: List[int]):
+        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
+
+    def add_token_scores(self, scores: List[float]):
+        self.add_array(KEY_TOKENIZER_SCORES, scores)
+
+    def add_bos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
+
+    def add_eos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
+
+    def add_unk_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
+
+    def add_sep_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
+
+    def add_pad_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
+
+
+# Example usage:
+if __name__ == "__main__":
+    # write a small example file with a few metadata keys and tensors
+    gguf_writer = GGUFWriter("example.gguf", "llama")
+
+    gguf_writer.add_architecture()
+    gguf_writer.add_block_count(12)
+    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
+    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
+    gguf_writer.add_custom_alignment(64)
+
+    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
+    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
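
Note on the example above: write_header_to_file(), write_kv_data_to_file() and write_tensors_to_file() emit the file in exactly that order. Below is a minimal sketch, not part of this diff, that reads back the fixed-size header of the resulting example.gguf; it assumes the v1 on-disk layout produced by this writer, i.e. four little-endian uint32 fields (magic 0x46554747 = "GGUF", version, tensor count, KV count), and a little-endian host.

// sketch: verify the GGUF v1 header written by the Python example above
// assumption: header = magic, version, n_tensors, n_kv as little-endian uint32
#include <cstdint>
#include <cstdio>

int main() {
    FILE * f = std::fopen("example.gguf", "rb");
    if (!f) {
        std::perror("example.gguf");
        return 1;
    }

    uint32_t header[4];
    if (std::fread(header, sizeof(uint32_t), 4, f) != 4) {
        std::fprintf(stderr, "failed to read header\n");
        std::fclose(f);
        return 1;
    }
    std::fclose(f);

    std::printf("magic    : 0x%08x (%s)\n", (unsigned) header[0], header[0] == 0x46554747u ? "GGUF" : "unexpected");
    std::printf("version  : %u\n", (unsigned) header[1]);
    std::printf("n_tensors: %u\n", (unsigned) header[2]);
    std::printf("n_kv     : %u\n", (unsigned) header[3]);
    return 0;
}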

+ 0 - 553
llama-util.h

@@ -1,553 +0,0 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
-#ifndef LLAMA_UTIL_H
-#define LLAMA_UTIL_H
-
-#include <cstdio>
-#include <cstdint>
-#include <cerrno>
-#include <cstring>
-#include <cstdarg>
-#include <cstdlib>
-#include <climits>
-
-#include <string>
-#include <vector>
-#include <stdexcept>
-
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/mman.h>
-        #endif
-        #if defined(_POSIX_MEMLOCK_RANGE)
-            #include <sys/resource.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <io.h>
-    #include <stdio.h> // for _fseeki64
-#endif
-
-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    LLAMA_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        LLAMA_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, len, 1, fp);
-        if (ferror(fp)) {
-            throw std::runtime_error(format("read error: %s", strerror(errno)));
-        }
-        if (ret != 1) {
-            throw std::runtime_error(std::string("unexpectedly reached end of file"));
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t len) const {
-        if (len == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, len, 1, fp);
-        if (ret != 1) {
-            throw std::runtime_error(format("write error: %s", strerror(errno)));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-// llama_context_data
-struct llama_data_context {
-    virtual void write(const void * src, size_t size) = 0;
-    virtual size_t get_size_written() = 0;
-    virtual ~llama_data_context() = default;
-};
-
-struct llama_data_buffer_context : llama_data_context {
-    uint8_t* ptr;
-    size_t size_written = 0;
-
-    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
-
-    void write(const void * src, size_t size) override {
-        memcpy(ptr, src, size);
-        ptr += size;
-        size_written += size;
-    }
-
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-struct llama_data_file_context : llama_data_context {
-    llama_file* file;
-    size_t size_written = 0;
-
-    llama_data_file_context(llama_file * f) : file(f) {}
-
-    void write(const void * src, size_t size) override {
-        file->write_raw(src, size);
-        size_written += size;
-    }
-
-    size_t get_size_written() override {
-        return size_written;
-    }
-};
-
-#if defined(_WIN32)
-static std::string llama_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct llama_mmap {
-    void * addr;
-    size_t size;
-
-    llama_mmap(const llama_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-        // prefetch/readahead impairs performance on NUMA systems
-        if (numa) { prefetch = 0; }
-#ifdef __linux__
-        if (prefetch >= file->size) { flags |= MAP_POPULATE; }
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
-        }
-
-        if (prefetch > 0) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-        if (numa) {
-            // advise the kernel not to use readahead
-            // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
-        (void) numa;
-
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
-        }
-
-        if (prefetch) {
-            // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
-            // will dynamically load it using GetProcAddress.
-            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
-            HMODULE hKernel32;
-
-            // This call is guaranteed to succeed.
-            hKernel32 = GetModuleHandleW(L"kernel32.dll");
-
-            // This call may fail if on a pre-Win8 system.
-            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
-
-            if (pPrefetchVirtualMemory) {
-                // Advise the kernel to preload the mapped memory.
-                WIN32_MEMORY_RANGE_ENTRY range;
-                range.VirtualAddress = addr;
-                range.NumberOfBytes = (SIZE_T)size;
-                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                            llama_format_win_err(GetLastError()).c_str());
-                }
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
-        (void) prefetch;
-        (void) numa;
-
-        throw std::runtime_error(std::string("mmap not supported"));
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-    bool failed_already = false;
-
-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
-
-    ~llama_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * ptr) {
-        LLAMA_ASSERT(addr == NULL && size == 0);
-        addr = ptr;
-    }
-
-    void grow_to(size_t target_size) {
-        LLAMA_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * ptr, size_t len) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(ptr, len)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    len, size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = len + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * ptr, size_t len) {
-        if (!VirtualUnlock(ptr, len)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    size_t lock_granularity() {
-        return (size_t) 65536;
-    }
-
-    bool raw_lock(const void * addr, size_t len) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-        return false;
-    }
-
-    void raw_unlock(const void * addr, size_t len) {}
-#endif
-};
-
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct llama_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    llama_buffer() = default;
-
-    void resize(size_t len) {
-#ifdef GGML_USE_METAL
-        free(addr);
-        int result = posix_memalign((void **) &addr, getpagesize(), len);
-        if (result == 0) {
-            memset(addr, 0, len);
-        }
-        else {
-            addr = NULL;
-        }
-#else
-        delete[] addr;
-        addr = new uint8_t[len];
-#endif
-        size = len;
-    }
-
-    ~llama_buffer() {
-#ifdef GGML_USE_METAL
-        free(addr);
-#else
-        delete[] addr;
-#endif
-        addr = NULL;
-    }
-
-    // disable copy and move
-    llama_buffer(const llama_buffer&) = delete;
-    llama_buffer(llama_buffer&&) = delete;
-    llama_buffer& operator=(const llama_buffer&) = delete;
-    llama_buffer& operator=(llama_buffer&&) = delete;
-};
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-struct llama_ctx_buffer {
-    uint8_t * addr = NULL;
-    bool is_cuda;
-    size_t size = 0;
-
-    llama_ctx_buffer() = default;
-
-    void resize(size_t size) {
-        free();
-
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
-        if (addr) {
-            is_cuda = true;
-        }
-        else {
-            // fall back to pageable memory
-            addr = new uint8_t[size];
-            is_cuda = false;
-        }
-        this->size = size;
-    }
-
-    void free() {
-        if (addr) {
-            if (is_cuda) {
-                ggml_cuda_host_free(addr);
-            }
-            else {
-                delete[] addr;
-            }
-        }
-        addr = NULL;
-    }
-
-    ~llama_ctx_buffer() {
-        free();
-    }
-
-    // disable copy and move
-    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
-    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
-    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
-    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
-};
-#else
-typedef llama_buffer llama_ctx_buffer;
-#endif
-
-#endif

File diff suppressed because it is too large
+ 848 - 356
llama.cpp


+ 141 - 124
llama.h

@@ -34,29 +34,18 @@
 #    define DEPRECATED(func, hint) func
 #endif
 
-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
+#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 
-#define LLAMA_DEFAULT_SEED           0xFFFFFFFF
+#define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
+#define LLAMA_SESSION_VERSION 1
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
 
-#ifndef LLAMA_DEFAULT_RMS_EPS
-#define LLAMA_DEFAULT_RMS_EPS 5e-6f
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -72,6 +61,50 @@ extern "C" {
 
     typedef int llama_token;
 
+    enum llama_log_level {
+        LLAMA_LOG_LEVEL_ERROR = 2,
+        LLAMA_LOG_LEVEL_WARN  = 3,
+        LLAMA_LOG_LEVEL_INFO  = 4
+    };
+
+    enum llama_vocab_type {
+        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+    };
+
+    enum llama_token_type {
+        LLAMA_TOKEN_TYPE_UNDEFINED    = 0,
+        LLAMA_TOKEN_TYPE_NORMAL       = 1,
+        LLAMA_TOKEN_TYPE_UNKNOWN      = 2,
+        LLAMA_TOKEN_TYPE_CONTROL      = 3,
+        LLAMA_TOKEN_TYPE_USER_DEFINED = 4,
+        LLAMA_TOKEN_TYPE_UNUSED       = 5,
+        LLAMA_TOKEN_TYPE_BYTE         = 6,
+    };
+
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
+    };
+
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -86,25 +119,10 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    enum llama_log_level {
-        LLAMA_LOG_LEVEL_ERROR = 2,
-        LLAMA_LOG_LEVEL_WARN  = 3,
-        LLAMA_LOG_LEVEL_INFO  = 4
-    };
-
-    // Signature for logging events
-    // Note that text includes the new line character at the end for most events.
-    // If your logging mechanism cannot handle that, check if the last character is '\n' and strip it
-    // if it exists.
-    // It might not exist for progress report where '.' is output repeatedly.
-    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
-
     struct llama_context_params {
         uint32_t seed;         // RNG seed, -1 for random
         int32_t  n_ctx;        // text context
         int32_t  n_batch;      // prompt processing batch size
-        int32_t  n_gqa;        // grouped-query attention (TEMP - will be moved to model hparams)
-        float    rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
         int32_t  n_gpu_layers; // number of layers to store in VRAM
         int32_t  main_gpu;     // the GPU that is used for scratch and small tensors
 
@@ -129,33 +147,18 @@ extern "C" {
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
     };
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
-    };
+
+    // Signature for logging events
+    // Note that text includes the newline character at the end for most events.
+    // If your logging mechanism cannot handle that, check whether the last character is '\n'
+    // and strip it if present.
+    // It might not exist for progress reports, where '.' is output repeatedly.
+    typedef void (*llama_log_callback)(enum llama_log_level level, const char * text, void * user_data);
 
     // model quantization parameters
     typedef struct llama_model_quantize_params {
         int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
     } llama_model_quantize_params;
@@ -208,27 +211,16 @@ extern "C" {
         int32_t n_eval;
     };
 
-    // Set callback for all future logging events.
-    // If this is not called, or NULL is supplied, everything is output on stderr.
-    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-
-    LLAMA_API int llama_max_devices();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
-    // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
 
-    LLAMA_API int64_t llama_time_us();
+    // Call once at the end of the program - currently only used for MPI
+    LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,
@@ -240,17 +232,26 @@ extern "C" {
                      struct llama_model * model,
             struct llama_context_params   params);
 
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params),
-            "please use llama_load_model_from_file combined with llama_new_context_with_model instead");
-
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
+
+    // Get a string describing the model type (the description is written into buf)
+    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
@@ -272,9 +273,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                      const char * path_lora,
-                      const char * path_base_model,
-                             int   n_threads);
+                          const char * path_lora,
+                          const char * path_base_model,
+                                 int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -324,11 +325,40 @@ extern "C" {
     // IMPORTANT: do not use for anything else other than debugging and testing!
     LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
 
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    //
+    // Vocab
+    //
+
+    LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
+
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx);  // next-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                       const char * text,
@@ -336,6 +366,13 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
+    LLAMA_API int llama_tokenize_bpe(
+            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
     LLAMA_API int llama_tokenize_with_model(
         const struct llama_model * model,
                       const char * text,
@@ -343,57 +380,30 @@ extern "C" {
                              int   n_max_tokens,
                             bool   add_bos);
 
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
+    // Token Id -> String. Uses the vocabulary in the provided context
+    // Does not write null terminator to the buffer
+    LLAMA_API int llama_token_to_str(
             const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    LLAMA_API int llama_get_vocab_from_model(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
 
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(
+    LLAMA_API int llama_token_to_str_bpe(
             const struct llama_context * ctx,
-                           llama_token   token);
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
 
-    LLAMA_API const char * llama_token_to_str_with_model(
+    LLAMA_API int llama_token_to_str_with_model(
               const struct llama_model * model,
-                           llama_token   token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos();  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();   // next-line
+                           llama_token   token,
+                                  char * buf,
+                                  int    length);
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
@@ -401,7 +411,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
@@ -470,6 +482,10 @@ extern "C" {
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
 #ifdef __cplusplus
 }
 #endif
@@ -479,10 +495,11 @@ extern "C" {
 
 #include <vector>
 #include <string>
+
 struct ggml_tensor;
 
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
 
-#endif
+#endif // LLAMA_API_INTERNAL
 
 #endif // LLAMA_H
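
The reworked header above replaces the const char * token-to-string calls with variants that write into a caller-provided buffer and return an int. A minimal sketch, not part of this diff, of a tokenize-and-print loop built only on declarations visible in this header; "model.gguf", the buffer sizes, and the assumption that the returned int is the number of bytes written (the header only states that no null terminator is added) are placeholders and assumptions, not documented behaviour.

// sketch: tokenize a prompt and print each token using the buffer-based API
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_backend_init(false);

    llama_context_params params = llama_context_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", params); // placeholder path
    if (model == NULL) {
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, params);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    const char * text = " Hello World!";
    std::vector<llama_token> tokens(64); // assumed large enough for this prompt
    const int n = llama_tokenize(ctx, text, tokens.data(), (int) tokens.size(), true);
    if (n < 0) {
        return 1; // negative return: -n tokens would have been needed
    }
    tokens.resize(n);

    for (llama_token t : tokens) {
        char buf[64];
        // assumption: the return value is the number of bytes written (no null terminator)
        const int len = llama_token_to_str(ctx, t, buf, (int) sizeof(buf));
        if (len < 0) {
            continue;
        }
        std::printf("%6d -> '%.*s'\n", t, len, buf);
    }

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}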

+ 1 - 0
models/.editorconfig

@@ -0,0 +1 @@
+root = true

BIN
models/ggml-vocab-llama.gguf


BIN
models/ggml-vocab.bin


+ 30 - 11
tests/CMakeLists.txt

@@ -1,17 +1,36 @@
-function(llama_add_test source)
+function(llama_build_executable source)
     get_filename_component(TEST_TARGET ${source} NAME_WE)
     add_executable(${TEST_TARGET} ${source})
     install(TARGETS ${TEST_TARGET} RUNTIME)
-    target_link_libraries(${TEST_TARGET} PRIVATE llama)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama common)
+endfunction()
+
+function(llama_test_executable name source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    # the executable is built separately by llama_build_executable(); here we only register the test
+    add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
+endfunction()
+
+function(llama_build_and_test_executable source)
+    get_filename_component(TEST_TARGET ${source} NAME_WE)
+    add_executable(${TEST_TARGET} ${source})
+    install(TARGETS ${TEST_TARGET} RUNTIME)
+    target_link_libraries(${TEST_TARGET} PRIVATE llama common)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
 
-# llama_add_test(test-double-float.cpp) # SLOW
-llama_add_test(test-quantize-fns.cpp)
-llama_add_test(test-quantize-perf.cpp)
-llama_add_test(test-sampling.cpp)
-llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
-llama_add_test(test-llama-grammar.cpp  ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/common.cpp)
-llama_add_test(test-grad0.cpp) # SLOW
-# llama_add_test(test-opt.cpp) # SLOW
+# llama_build_and_test_executable(test-double-float.cpp) # SLOW
+llama_build_and_test_executable(test-quantize-fns.cpp)
+llama_build_and_test_executable(test-quantize-perf.cpp)
+llama_build_and_test_executable(test-sampling.cpp)
+llama_build_executable(test-tokenizer-0.cpp)
+llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_build_executable(test-tokenizer-1.cpp)
+llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_build_and_test_executable(test-grammar-parser.cpp)
+llama_build_and_test_executable(test-llama-grammar.cpp)
+llama_build_and_test_executable(test-grad0.cpp) # SLOW
+# llama_build_and_test_executable(test-opt.cpp) # SLOW

+ 2 - 1
tests/test-grammar-parser.cpp

@@ -3,7 +3,8 @@
 #endif
 
 #include "llama.h"
-#include "examples/grammar-parser.cpp"
+#include "grammar-parser.h"
+
 #include <cassert>
 
 int main()

+ 3 - 3
tests/test-llama-grammar.cpp

@@ -2,9 +2,9 @@
 #undef NDEBUG
 #endif
 
-#include "llama.cpp"
-#include "examples/common.cpp"
-#include "examples/grammar-parser.cpp"
+#include "llama.cpp" // TODO: not great
+#include "grammar-parser.h"
+
 #include <cassert>
 
 int main()

+ 43 - 17
tests/test-tokenizer-0.cpp

@@ -1,22 +1,47 @@
 #include "llama.h"
+#include "common.h"
 
 #include <cstdio>
 #include <string>
 #include <map>
 #include <vector>
 
-static const std::map<std::string, std::vector<llama_token>> & k_tests()
-{
+static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string result;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+    }
+    return result;
+}
+
+static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "Hello World",        { 1,  10994,   2787, }, },
-        { " Hello World",       { 1,  15043,   2787, }, },
-        { " Hello World!",      { 1,  15043,   2787,  29991, }, },
-        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+        { " ",                      {1,    259, }, },
+        { "\t",                     { 1,    29871,   12, }, },
+        { "\n",                     { 1,    29871,   13, }, },
+        { "\t\n",                   { 1,    29871,   12,     13, }, },
+        { "Hello world",            { 1,  15043,   3186, }, },
+        { " Hello world",           { 1,  29871,  15043,   3186, }, },
+        { "Hello World",            { 1,  15043,   2787, }, },
+        { " Hello World",           { 1,  29871,  15043,   2787, }, },
+        { " Hello World!",          { 1,  29871,  15043,   2787,  29991, }, },
+        { " this is 🦙.cpp",        { 1,  29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
+        { "w048 7tuijk dsdfhu",     { 1,    281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
+        { "нещо на Български",      { 1,   1538,   4851,    665,   1386,  29713,   1305, }, },
+        { "កាន់តែពិសេសអាចខលចេញ",   { 1,  29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,
+                                     146,    228,    162,    133,    228,    161,    153,    228,    161,    186,
+                                     31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,
+                                     161,    136,    228,    161,    132,    228,    161,    158,    228,    161,
+                                     136,    228,    162,    132,    228,    161,    140, }, },
+        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+            { 1,  29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,
+                243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,
+                313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,
+                313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, },
     };
+
     return _k_tests;
-};
+}
 
 int main(int argc, char **argv) {
     if (argc < 2) {
@@ -64,10 +89,12 @@ int main(int argc, char **argv) {
         return 2;
     }
 
+    bool success = true;
+
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res(test_kv.first.size());
-        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
-        res.resize(n);
+        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
+        fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
+            __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
 
         bool correct = res.size() == test_kv.second.size();
 
@@ -78,7 +105,8 @@ int main(int argc, char **argv) {
         }
 
         if (!correct) {
-            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s'\n", __func__, unescape_whitespace(ctx, test_kv.second).c_str());
             fprintf(stderr, "%s : expected tokens: ", __func__);
             for (const auto & t : test_kv.second) {
                 fprintf(stderr, "%6d, ", t);
@@ -90,9 +118,7 @@ int main(int argc, char **argv) {
             }
             fprintf(stderr, "\n");
 
-            llama_free_model(model);
-            llama_free(ctx);
-            return 3;
+            success = false;
         }
     }
 
@@ -101,5 +127,5 @@ int main(int argc, char **argv) {
 
     llama_backend_free();
 
-    return 0;
+    return success ? 0 : 3;
 }
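
The updated test relies on an overload llama_tokenize(ctx, std::string, bool) that returns a std::vector<llama_token>; it comes from common.h (now linked through the common target in tests/CMakeLists.txt above) and is not shown in this diff. A rough sketch of what such a wrapper presumably looks like, stated as an assumption rather than the actual common.h code:

// sketch (assumption): a common.h-style convenience wrapper around the raw C API
#include "llama.h"

#include <string>
#include <vector>

static std::vector<llama_token> llama_tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
    // rough upper bound on the token count: one token per byte plus the optional BOS token
    std::vector<llama_token> result(text.size() + 1);
    int n = llama_tokenize(ctx, text.c_str(), result.data(), (int) result.size(), add_bos);
    if (n < 0) {
        // a negative return value means -n tokens would have been needed; retry with that size
        result.resize(-n);
        n = llama_tokenize(ctx, text.c_str(), result.data(), (int) result.size(), add_bos);
    }
    result.resize(n > 0 ? n : 0);
    return result;
}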

+ 131 - 0
tests/test-tokenizer-1.cpp

@@ -0,0 +1,131 @@
+#include "llama.h"
+#include "common.h"
+
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <codecvt>
+#include <map>
+#include <vector>
+#include <locale>
+
+static std::string escape_whitespace(const std::string& text) {
+    std::string result;
+    bool escaping = false;
+    result += "\xe2\x96\x81";
+    for (size_t offs = 0; offs < text.length(); ++offs) {
+        if (text[offs] == ' ') {
+            if (!escaping) {
+                result += "\xe2\x96\x81";
+                escaping = true;
+            }
+        }
+        else {
+            escaping = false;
+            result += text[offs];
+        }
+    }
+    return result;
+}
+
+static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
+    std::string result;
+    for (size_t i = 0; i < tokens.size(); ++i) {
+        result += llama_token_to_str(ctx, tokens[i]);
+    }
+    return result;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init(false);
+
+    // load the vocab
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), lparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        ctx = llama_new_context_with_model(model, lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+    const int n_vocab = llama_n_vocab(ctx);
+
+    for (int i = 0; i < n_vocab; ++i) {
+        std::string forward = llama_token_to_str_bpe(ctx, i);
+        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
+        if (tokens.size() == 1) {
+            if (i != tokens[0]) {
+                std::string backward = llama_token_to_str(ctx, tokens[0]);
+                fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
+                return 2;
+            }
+        } else {
+            llama_token_type type = llama_token_get_type(ctx, i);
+            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
+                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
+            } else {
+                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
+                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
+                return 2;
+            }
+        }
+    }
+
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> u16converter;
+    for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
+        std::u16string u16str(1, ch);
+        std::string str = u16converter.to_bytes(u16str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n",
+                __func__, str.c_str(), tokens[0]);
+        }
+    }
+
+    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u32converter;
+    for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
+        std::u32string u32str(1, ch);
+        std::string str = u32converter.to_bytes(u32str);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        if (tokens.size() == 1) {
+            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
+        }
+    }
+#endif
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return 0;
+}
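
For reference, escape_whitespace() above always prepends the SentencePiece whitespace marker (U+2581, bytes \xe2\x96\x81) and collapses each run of spaces into a single marker. A small self-contained check, not part of this diff, with the function copied from the test:

// sketch: worked examples of what escape_whitespace() produces
#include <cassert>
#include <string>

// copied from tests/test-tokenizer-1.cpp above
static std::string escape_whitespace(const std::string& text) {
    std::string result;
    bool escaping = false;
    result += "\xe2\x96\x81";
    for (size_t offs = 0; offs < text.length(); ++offs) {
        if (text[offs] == ' ') {
            if (!escaping) {
                result += "\xe2\x96\x81";
                escaping = true;
            }
        }
        else {
            escaping = false;
            result += text[offs];
        }
    }
    return result;
}

int main() {
    const std::string pre = "\xe2\x96\x81"; // UTF-8 for U+2581
    assert(escape_whitespace("Hello world") == pre + "Hello" + pre + "world");
    assert(escape_whitespace("a  b")        == pre + "a" + pre + "b"); // the double space collapses to one marker
    assert(escape_whitespace(" x")          == pre + pre + "x");       // a leading space still adds its own marker
    return 0;
}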

Some files were not shown because too many files changed in this diff