
metal : add residency sets keep-alive heartbeat (#17766)

* examples : add idle

* metal : attach residency sets to queue

* idle : add link

* idle : adjust intervals

* metal : add residency sets keep-alive heartbeat

* cont : adjust default keep-alive time
Georgi Gerganov, 1 month ago
parent commit c41bde6fbd
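
In short: Metal buffers now register their MTLResidencySet with the device, and a background heartbeat thread re-requests residency for all registered sets every 500 ms. The heartbeat keeps firing for keep_alive_s seconds after the last graph compute (3 minutes by default, overridable via the GGML_METAL_RESIDENCY_KEEP_ALIVE_S environment variable), so model weights stay wired across short idle periods instead of being unwired by the OS. Below is a minimal sketch of the countdown logic, using the defaults from this commit; the real implementation in ggml-metal-device.m runs on a dispatch queue and uses relaxed atomics:

    #include <stdio.h>

    int main(void) {
        const int keep_alive_s = 180;   // default keep-alive window: 3 minutes
        int d_loop = 2 * keep_alive_s;  // reset to this value on every graph compute

        int ticks = 0;
        while (d_loop > 0) {            // the real thread sleeps 500 ms per iteration
            // here the heartbeat calls requestResidency on each registered set
            --d_loop;
            ++ticks;
        }

        // 360 ticks * 0.5 s = 180 s of keep-alive after the last compute
        printf("residency re-requested for %d ticks (%.0f s)\n", ticks, ticks * 0.5);
        return 0;
    }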

+ 1 - 0
examples/CMakeLists.txt

@@ -20,6 +20,7 @@ else()
 
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf)
+    add_subdirectory(idle)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
     add_subdirectory(parallel)

+ 5 - 0
examples/idle/CMakeLists.txt

@@ -0,0 +1,5 @@
+set(TARGET llama-idle)
+add_executable(${TARGET} idle.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
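
The new target builds like the other examples; with a typical llama.cpp CMake setup, something like cmake --build build --target llama-idle should produce the binary (the exact invocation depends on your build configuration).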

+ 3 - 0
examples/idle/README.md

@@ -0,0 +1,3 @@
+# llama.cpp/example/idle
+
+https://github.com/ggml-org/llama.cpp/pull/17766

+ 110 - 0
examples/idle/idle.cpp

@@ -0,0 +1,110 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <thread>
+#include <vector>
+
+static void print_usage(int /*argc*/, char ** argv) {
+    printf("\nexample usage:\n");
+    printf("\n    %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
+
+    if (model == NULL) {
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // we need just a dummy token to evaluate
+    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));
+
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx   = 512;
+    ctx_params.n_batch = 512;
+    ctx_params.no_perf = false;
+
+    llama_context * ctx = llama_init_from_model(model, ctx_params);
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
+
+    const int n_iters = 3;
+
+    // warm-up
+    llama_decode(ctx, batch);
+    llama_memory_clear(llama_get_memory(ctx), true);
+    llama_synchronize(ctx);
+
+    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
+        double t_sum_us  = 0.0;
+        double t_sum2_us = 0.0;
+
+        for (int i = 0; i < n_iters; i++) {
+            // this pause is important - it simulates "idle GPU"
+            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));
+
+            const int64_t t_start_us = llama_time_us();
+
+            // this should take constant time
+            llama_decode(ctx, batch);
+            llama_synchronize(ctx);
+
+            const int64_t t_end_us = llama_time_us();
+
+            const double t_cur_us = t_end_us - t_start_us;
+
+#if 1
+            // print individual decode times
+            printf("  - decode time: %8.2f ms\n", t_cur_us / 1000);
+#endif
+
+            t_sum_us  += t_cur_us;
+            t_sum2_us += t_cur_us * t_cur_us;
+
+            llama_memory_clear(llama_get_memory(ctx), true);
+            llama_synchronize(ctx); // just in case
+        }
+
+        const double t_avg_us = t_sum_us / n_iters;
+        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));
+
+        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
+        fflush(stdout);
+    }
+
+    llama_free(ctx);
+    llama_model_free(model);
+
+    return 0;
+}
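
The example decodes a single BOS token after idle pauses of 0, 800, 1600, 2400, 3200 and 4000 ms and reports, per pause length, the mean decode time and the sample standard deviation s = sqrt((sum(t^2) - n*avg^2) / (n - 1)) over n = 3 iterations. With the keep-alive heartbeat active, decode time should stay roughly constant regardless of the pause; without it, decodes after longer pauses tend to be slower because the OS can unwire the GPU memory in the meantime. Example invocation (model path and layer count are placeholders):

    llama-idle -m model.gguf -ngl 99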

+ 26 - 16
ggml/src/ggml-metal/ggml-metal-context.m

@@ -24,9 +24,6 @@ struct ggml_metal_command_buffer {
 };
 
 struct ggml_metal {
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
-
     ggml_metal_device_t  dev;
     ggml_metal_library_t lib;
 
@@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
     // init context
     ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
 
-    res->device = ggml_metal_device_get_obj(dev);
+    id<MTLDevice> device = ggml_metal_device_get_obj(dev);
 
-    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // TODO: would it be better to have one queue for the backend and one queue for the device?
     //       the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
     //res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
-    res->queue = ggml_metal_device_get_queue(dev);
-    if (res->queue == nil) {
+    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
+    if (queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
     }
@@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
 void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     @autoreleasepool {
         // wrap the source data into a Metal buffer
-        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                                          length:size
                                                         options:MTLResourceStorageModeShared];
 
@@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
 
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
         [encoder copyFromBuffer:buf_src
@@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
 
 void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     @autoreleasepool {
-        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                                                length:size
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
@@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
 
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
 
         [encoder copyFromBuffer:bid_src.metal
@@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // number of threads in addition to the main thread
     const int n_cb = ctx->n_cb;
 
+    // keep the memory wired
+    ggml_metal_device_rsets_keep_alive(ctx->dev);
+
     // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
     // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
     // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
 
             if (!ctx->capture_started) {
                 // create capture scope
-                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+                id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
 
                 MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
                 descriptor.captureObject = ctx->capture_scope;
@@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
             }
         }
 
+        // short-hand
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+
         // the main thread commits the first few commands immediately
         // cmd_buf[n_cb]
         {
-            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+            id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
             [cmd_buf retain];
 
             if (ctx->cmd_bufs[n_cb].obj) {
@@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
         // prepare the rest of the command buffers asynchronously (optional)
         // cmd_buf[0.. n_cb)
         for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+            id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
             [cmd_buf retain];
 
             if (ctx->cmd_bufs[cb_idx].obj) {
@@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
 }
 
 bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
-    GGML_ASSERT(ctx->device != nil);
+    GGML_ASSERT(ctx->dev != nil);
+
+    id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
 
-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+    return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 
 void ggml_metal_capture_next_compute(ggml_metal_t ctx) {
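
The context changes above are mostly mechanical: ggml_metal no longer caches id<MTLDevice> and id<MTLCommandQueue> pointers, resolving them through ggml_metal_device_get_obj() / ggml_metal_device_get_queue() at each use site instead. The one functional change is the new call at the top of ggml_metal_graph_compute(): ggml_metal_device_rsets_keep_alive() resets the heartbeat countdown, so every graph evaluation extends the residency window.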

+ 15 - 0
ggml/src/ggml-metal/ggml-metal-device.h

@@ -186,6 +186,16 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_att
         int32_t dv,
         int32_t nwg);
 
+// MTLResidencySet wrapper
+
+typedef void * ggml_metal_rset_t;
+
+// a collection of residency sets (non-owning)
+typedef struct ggml_metal_rsets * ggml_metal_rsets_t;
+
+ggml_metal_rsets_t ggml_metal_rsets_init(void);
+void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
+
 //
 // device
 //
@@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQue
 
 ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);
 
+void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset);
+void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset);
+
+void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
 bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
 

+ 160 - 27
ggml/src/ggml-metal/ggml-metal-device.m

@@ -1,7 +1,6 @@
 #import "ggml-metal-device.h"
 
 #import "ggml-impl.h"
-#import "ggml-threading.h"
 
 #include <Foundation/Foundation.h>
 
@@ -519,11 +518,101 @@ struct ggml_metal_device {
     // ref: https://github.com/ggml-org/llama.cpp/pull/15906
     id<MTLCommandQueue> mtl_queue;
 
+    ggml_metal_rsets_t rsets;
+
     ggml_metal_library_t library;
 
     struct ggml_metal_device_props props;
 };
 
+//
+// MTLResidencySet wrapper
+//
+
+struct ggml_metal_rsets {
+    NSLock * lock;
+
+    NSMutableArray * data;
+
+    // number of seconds since the last graph computation
+    // keep the residency sets wired for that amount of time to avoid being collected by the OS
+    int keep_alive_s;
+
+    // background heartbeat thread to keep the residency sets alive
+    atomic_bool d_stop;
+    atomic_int  d_loop;
+
+    dispatch_group_t d_group;
+};
+
+ggml_metal_rsets_t ggml_metal_rsets_init(void) {
+    ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
+
+    res->lock = [[NSLock alloc] init];
+    res->data = [[NSMutableArray alloc] init];
+
+    // by default keep the memory wired for 3 minutes
+    res->keep_alive_s = 3*60;
+
+    const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
+    if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
+        res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
+    }
+
+    if (res->keep_alive_s <= 0) {
+        res->keep_alive_s = 3*60;
+    }
+
+    GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
+
+    atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
+    atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
+
+    res->d_group = dispatch_group_create();
+
+    // start a background thread that periodically requests residency for all the currently active sets in the collection
+    // the requests stop after a certain amount of time (keep_alive_s) of inactivity
+    dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
+    dispatch_group_async(res->d_group, d_queue, ^{
+          while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
+              if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
+                  [res->lock lock];
+
+                  for (int i = 0; i < (int) res->data.count; ++i) {
+                      [res->data[i] requestResidency];
+                  }
+
+                  atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
+
+                  [res->lock unlock];
+              }
+
+              // half a second
+              usleep(500 * 1000);
+          }
+    });
+
+    return res;
+}
+
+void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
+    if (rsets == NULL) {
+        return;
+    }
+
+    GGML_ASSERT([rsets->data count] == 0);
+
+    atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
+
+    dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
+    dispatch_release(rsets->d_group);
+
+    [rsets->data release];
+    [rsets->lock release];
+
+    free(rsets);
+}
+
 ggml_metal_device_t ggml_metal_device_init(void) {
     ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
 
@@ -692,6 +781,13 @@ ggml_metal_device_t ggml_metal_device_init(void) {
                 GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
             }
 
+            if (dev->props.use_residency_sets) {
+                dev->rsets = ggml_metal_rsets_init();
+            } else {
+                dev->rsets = nil;
+            }
+
+
             // --------------------------------------------------
 
             // print MTL GPU family:
@@ -745,6 +841,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
 void ggml_metal_device_free(ggml_metal_device_t dev) {
     assert(dev != NULL);
 
+    ggml_metal_rsets_free(dev->rsets);
+
     ggml_metal_library_free(dev->library);
     dev->library = NULL;
 
@@ -773,6 +871,42 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
     return dev->library;
 }
 
+void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
+    if (rset == nil) {
+        return;
+    }
+
+    GGML_ASSERT(dev->rsets);
+
+    [dev->rsets->lock lock];
+
+    [dev->rsets->data addObject:rset];
+
+    [dev->rsets->lock unlock];
+}
+
+void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
+    if (rset == nil) {
+        return;
+    }
+
+    GGML_ASSERT(dev->rsets);
+
+    [dev->rsets->lock lock];
+
+    [dev->rsets->data removeObject:rset];
+
+    [dev->rsets->lock unlock];
+}
+
+void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
+    if (dev->rsets == NULL) {
+        return;
+    }
+
+    atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
+}
+
 void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         *total = dev->mtl_device.recommendedMaxWorkingSetSize;
@@ -1066,9 +1200,8 @@ struct ggml_metal_buffer {
     // note: cannot explicitly use "id<MTLResidencySet>" here because it is not available on certain OSes
     id rset;
 
-    // pointers to global device objects
-    id<MTLDevice> device;
-    id<MTLCommandQueue> queue;
+    // pointers to global device
+    ggml_metal_device_t dev;
 };
 
 static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -1111,7 +1244,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
         desc.initialCapacity = buf->n_buffers;
 
         NSError * error;
-        buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
+        buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
         if (error) {
             GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
             [desc release];
@@ -1172,6 +1305,8 @@ static void * ggml_metal_host_malloc(size_t n) {
 ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
     ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
 
+    res->dev = dev;
+
     const size_t size_page = sysconf(_SC_PAGESIZE);
 
     size_t size_aligned = size;
@@ -1196,9 +1331,6 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
 
     res->owned = true;
 
-    res->device = ggml_metal_device_get_obj(dev);
-    res->queue  = ggml_metal_device_get_queue(dev);
-
     res->n_buffers = 1;
 
     if (res->all_data != NULL) {
@@ -1207,12 +1339,12 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
 
         if (size_aligned > 0) {
             if (props_dev->use_shared_buffers && shared) {
-                res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
+                res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
                                                                   length:size_aligned
                                                                  options:MTLResourceStorageModeShared
                                                              deallocator:nil];
             } else {
-                res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
+                res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
             }
         }
 
@@ -1233,6 +1365,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
         return NULL;
     }
 
+    ggml_metal_device_rsets_add(dev, res->rset);
+
     //ggml_metal_log_allocated_size(device, size_aligned);
 
     return res;
@@ -1241,6 +1375,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
 ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
 
+    res->dev = dev;
+
     res->all_data = ptr;
     res->all_size = size;
 
@@ -1263,9 +1399,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
         size_aligned += (size_page - (size_aligned % size_page));
     }
 
-    res->device = ggml_metal_device_get_obj(dev);
-    res->queue  = ggml_metal_device_get_queue(dev);
-
     const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
 
     // the buffer fits into the max buffer size allowed by the device
@@ -1275,7 +1408,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
         res->buffers[res->n_buffers].metal = nil;
 
         if (size_aligned > 0) {
-            res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+            res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
             if (res->buffers[res->n_buffers].metal == nil) {
                 GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
@@ -1284,7 +1417,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
             }
         }
 
-        ggml_metal_log_allocated_size(res->device, size_aligned);
+        ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
 
         ++res->n_buffers;
     } else {
@@ -1302,7 +1435,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
             res->buffers[res->n_buffers].metal = nil;
 
             if (size_step_aligned > 0) {
-                res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+                res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
 
                 if (res->buffers[res->n_buffers].metal == nil) {
                     GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
@@ -1311,7 +1444,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
                 }
             }
 
-            ggml_metal_log_allocated_size(res->device, size_step_aligned);
+            ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
 
             if (i + size_step < size) {
                 GGML_LOG_INFO("\n");
@@ -1329,10 +1462,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
         return NULL;
     }
 
+    ggml_metal_device_rsets_add(dev, res->rset);
+
     return res;
 }
 
 void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
+    ggml_metal_device_rsets_rm(buf->dev, buf->rset);
+
     for (int i = 0; i < buf->n_buffers; i++) {
         [buf->buffers[i].metal release];
     }
@@ -1369,8 +1506,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
         struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
         bid_dst.offs += offset;
 
-        id<MTLCommandQueue>  queue   = buf->queue;
-        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1396,7 +1532,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
     @autoreleasepool {
         // src
         void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
-        id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
+        id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
                                                                length:size
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
@@ -1411,8 +1547,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
         //       this is alternative to waitUntilCompleted, which should be faster, but don't seem to make much difference
         dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
 
-        id<MTLCommandQueue>  queue   = buf->queue;
-        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1454,15 +1589,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
         bid_src.offs += offset;
 
         // dst
-        id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
+        id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
                                                                length:size
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
 
         GGML_ASSERT(buf_dst);
 
-        id<MTLCommandQueue>  queue   = buf->queue;
-        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1488,8 +1622,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
     }
 
     @autoreleasepool {
-        id<MTLCommandQueue>  queue   = buf->queue;
-        id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
+        id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
 
         {
             id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
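
Design note: the collection is deliberately non-owning, as the header comment says; each ggml_metal_buffer keeps ownership of its MTLResidencySet and only registers/unregisters it, which is why ggml_metal_rsets_free() asserts the array is empty. The NSLock serializes the heartbeat thread's requestResidency sweep against concurrent buffer allocation and free, while d_stop and d_loop use relaxed atomics since they appear to gate only timing, not shared data.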