
build : on Mac OS enable Metal by default (#2901)

* build : on Mac OS enable Metal by default

* make : try to fix build on Linux

* make : move targets back to the top

* make : fix target clean

* llama : enable GPU inference by default with Metal

* llama : fix vocab_only logic when GPU is enabled

* common : better `n_gpu_layers` assignment

* readme : update Metal instructions

* make : fix merge conflict remnants

* gitignore : metal
Georgi Gerganov 2 years ago
parent
commit
e36ecdccc8
9 changed files with 143 additions and 133 deletions
  1. .gitignore (+15 -14)
  2. CMakeLists.txt (+32 -24)
  3. Makefile (+45 -31)
  4. README.md (+4 -22)
  5. common/common.cpp (+4 -2)
  6. common/common.h (+1 -1)
  7. examples/main/main.cpp (+7 -8)
  8. examples/perplexity/perplexity.cpp (+6 -6)
  9. llama.cpp (+29 -25)

+ 15 - 14
.gitignore

@@ -31,28 +31,29 @@ tmp/
 models/*
 models-mnt
 
-/main
-/quantize
-/quantize-stats
-/result
-/perplexity
-/embedding
-/train-text-from-scratch
-/convert-llama2c-to-ggml
-/simple
-/benchmark-matmult
-/vdot
-/server
 /Pipfile
+/baby-llama
+/beam-search
+/benchmark-matmult
+/convert-llama2c-to-ggml
 /embd-input-test
+/embedding
 /gguf
 /gguf-llama-simple
 /libllama.so
 /llama-bench
-/baby-llama
-/beam-search
+/main
+/metal
+/perplexity
+/quantize
+/quantize-stats
+/result
 /save-load-state
+/server
+/simple
 /speculative
+/train-text-from-scratch
+/vdot
 build-info.h
 arm_neon.h
 compile_commands.json

+ 32 - 24
CMakeLists.txt

@@ -36,6 +36,12 @@ endif()
 # Option list
 #
 
+if (APPLE)
+    set(LLAMA_METAL_DEFAULT ON)
+else()
+    set(LLAMA_METAL_DEFAULT OFF)
+endif()
+
 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
@@ -76,7 +82,7 @@ option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
-option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
+option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
@@ -158,6 +164,31 @@ if (APPLE AND LLAMA_ACCELERATE)
     endif()
 endif()
 
+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
+
+    message(STATUS "Metal framework found")
+
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    #add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        )
+endif()
+
 if (LLAMA_BLAS)
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
@@ -293,29 +324,6 @@ if (LLAMA_CUBLAS)
     endif()
 endif()
 
-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
-    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
-    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-
-    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
-
-    add_compile_definitions(GGML_USE_METAL)
-    #add_compile_definitions(GGML_METAL_NDEBUG)
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        )
-endif()
-
 if (LLAMA_MPI)
     cmake_minimum_required(VERSION 3.10)
     find_package(MPI)

+ 45 - 31
Makefile

@@ -7,6 +7,39 @@ TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-dou
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
 
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+	ifndef LLAMA_NO_METAL
+		LLAMA_METAL := 1
+	endif
+
+	ifneq ($(UNAME_P),arm)
+		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+		ifeq ($(SYSCTL_M),1)
+			# UNAME_P := arm
+			# UNAME_M := arm64
+			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+		endif
+	endif
+endif
+
+ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
+BUILD_TARGETS += metal
+endif
+
 default: $(BUILD_TARGETS)
 
 test:
@@ -38,18 +71,6 @@ gcovr-report: coverage ## Generate gcovr report
 	mkdir -p gcovr-report
 	gcovr --root . --html --html-details --output gcovr-report/coverage.html
 
-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
 ifdef RISCV_CROSS_COMPILE
 CC	:= riscv64-unknown-linux-gnu-gcc
 CXX	:= riscv64-unknown-linux-gnu-g++
@@ -58,19 +79,6 @@ endif
 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
 
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-	ifneq ($(UNAME_P),arm)
-		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
-		ifeq ($(SYSCTL_M),1)
-			# UNAME_P := arm
-			# UNAME_M := arm64
-			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-		endif
-	endif
-endif
-
 #
 # Compile flags
 #
@@ -231,14 +239,24 @@ endif
 endif
 
 ifndef LLAMA_NO_ACCELERATE
-	# Mac M1 - include Accelerate framework.
-	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
+	# Mac OS - include Accelerate framework.
+	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
 	ifeq ($(UNAME_S),Darwin)
 		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
 		MK_LDFLAGS  += -framework Accelerate
 	endif
 endif # LLAMA_NO_ACCELERATE
 
+ifdef LLAMA_METAL
+	# By default - use GPU acceleration on Mac OS
+	ifeq ($(UNAME_S),Darwin)
+		CFLAGS   += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
+		CXXFLAGS += -DGGML_USE_METAL
+		LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
+		OBJS     += ggml-metal.o
+	endif
+endif # LLAMA_METAL
+
 ifdef LLAMA_MPI
 	MK_CPPFLAGS += -DGGML_USE_MPI
 	MK_CFLAGS   += -Wno-cast-qual
@@ -480,10 +498,6 @@ beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o co
 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
-BUILD_TARGETS += metal
-endif
-
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

+ 4 - 22
README.md

@@ -280,29 +280,11 @@ In order to build llama.cpp you have three different options.
 
 ### Metal Build
 
-Using Metal allows the computation to be executed on the GPU for Apple devices:
+On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
+To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
 
-- Using `make`:
-
-  ```bash
-  LLAMA_METAL=1 make
-  ```
-
-- Using `CMake`:
-
-    ```bash
-    mkdir build-metal
-    cd build-metal
-    cmake -DLLAMA_METAL=ON ..
-    cmake --build . --config Release
-    ```
-
-When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
-Any value larger than 0 will offload the computation to the GPU. For example:
-
-```bash
-./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
-```
+When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
+argument.
 
 ### MPI Build
 

+ 4 - 2
common/common.cpp

@@ -717,7 +717,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     lparams.n_ctx           = params.n_ctx;
     lparams.n_batch         = params.n_batch;
-    lparams.n_gpu_layers    = params.n_gpu_layers;
+    if (params.n_gpu_layers != -1) {
+        lparams.n_gpu_layers = params.n_gpu_layers;
+    }
     lparams.main_gpu        = params.main_gpu;
     lparams.tensor_split    = params.tensor_split;
     lparams.low_vram        = params.low_vram;
@@ -1212,7 +1214,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);
+    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
     fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");

+ 1 - 1
common/common.h

@@ -34,7 +34,7 @@ struct gpt_params {
     int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
     int32_t n_draft                         = 16;   // number of tokens to draft during speculative decoding
     int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
-    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
+    int32_t n_gpu_layers                    = -1;   // number of layers to store in VRAM (-1 - use default)
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
     int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.

+ 7 - 8
examples/main/main.cpp

@@ -151,14 +151,6 @@ int main(int argc, char ** argv) {
         LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
     }
 
-    if (params.n_ctx > 2048) {
-        // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
-        LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
-    } else if (params.n_ctx < 8) {
-        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
-        params.n_ctx = 8;
-    }
-
     LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -194,6 +186,13 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_ctx > llama_n_ctx(ctx)) {
+        LOG_TEE("%s: warning: base model only supports context sizes no greater than %d tokens (%d specified)\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
+    }
+
     // print system information
     {
         LOG_TEE("\n");

+ 6 - 6
examples/perplexity/perplexity.cpp

@@ -368,7 +368,7 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
         // Example, we have a context window of 512, we will compute perplexity for each of the
         // last 256 tokens.  Then, we split the input up into context window size chunks to
         // process the entire prompt.
-        const int first = std::min(512, params.n_ctx/2);
+        const int first = params.n_ctx/2;
         process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
                        workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
         count += params.n_ctx - first - 1;
@@ -668,11 +668,6 @@ int main(int argc, char ** argv) {
         params.n_ctx += params.ppl_stride/2;
     }
 
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -698,6 +693,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_ctx > llama_n_ctx(ctx)) {
+        fprintf(stderr, "%s: warning: model might not support context sizes greater than %d tokens (%d specified);"
+                "expect poor results\n", __func__, llama_n_ctx(ctx), params.n_ctx);
+    }
+
     // print system information
     {
         fprintf(stderr, "\n");

+ 29 - 25
llama.cpp

@@ -5340,7 +5340,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
-        /*.gpu_layers                  =*/ 0,
+        /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.rope_freq_base              =*/ 10000.0f,
@@ -5357,6 +5357,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                   =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
@@ -5549,43 +5553,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
-        // this allocates all Metal resources and memory buffers
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers
 
-        void * data_ptr  = NULL;
-        size_t data_size = 0;
+            void * data_ptr  = NULL;
+            size_t data_size = 0;
 
-        if (params.use_mmap) {
-            data_ptr  = ctx->model.mapping->addr;
-            data_size = ctx->model.mapping->size;
-        } else {
-            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size = ggml_get_mem_size  (ctx->model.ctx);
-        }
+            if (params.use_mmap) {
+                data_ptr  = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size  (ctx->model.ctx);
+            }
 
-        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result)                            \
-    if (!(result)) {                                             \
-        LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-        llama_free(ctx);                                         \
-        return NULL;                                             \
-    }
+            if (!(result)) {                                             \
+                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+                llama_free(ctx);                                         \
+                return NULL;                                             \
+            }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-    }
+        }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
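
Taken together: a Metal build now defines `GGML_USE_METAL`, which bumps the default `n_gpu_layers` from 0 to 1 in `llama_context_default_params()`, so plain API users get GPU inference without setting anything and can opt out by forcing the value back to 0. A hedged usage sketch against the API as it stands at this revision (the model path is the placeholder from the old README example; error handling kept minimal):

```cpp
#include "llama.h"

int main() {
    llama_backend_init(false /* numa */);

    // In Metal builds (GGML_USE_METAL) this now returns n_gpu_layers = 1,
    // so the computation is offloaded to the GPU with no extra configuration.
    llama_context_params lparams = llama_context_default_params();
    // lparams.n_gpu_layers = 0; // uncomment to force CPU-only inference

    llama_model * model = llama_load_model_from_file("./models/7B/ggml-model-q4_0.gguf", lparams);
    if (model == NULL) {
        llama_backend_free();
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, lparams);
    if (ctx == NULL) {
        llama_free_model(model);
        llama_backend_free();
        return 1;
    }

    // ... evaluate tokens here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```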