@@ -239,8 +239,6 @@ struct ggml_backend_metal_context {
     struct ggml_cgraph * gf;

     // the callback given to the thread pool
-    // TODO: ideally, this should be created once, utilizing the command buffer state above
-    // for some reason, doing it like this leads to a crash
     void (^encode_async)(size_t ith);

     // n_cb command buffers + 1 used by the main thread
@@ -683,6 +681,8 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
         [ctx->kernels[i].pipeline release];
     }

+    Block_release(ctx->encode_async);
+
     [ctx->queue release];
     [ctx->device release];

@@ -3000,46 +3000,6 @@ static enum ggml_status ggml_metal_graph_compute(
         }
     }

-    // TODO: how to avoid this allocation? I tried initializing it in ggml_backend_metal_set_n_cb but it crashes.
-    ctx->encode_async = ^(size_t iter) {
-        const int cb_idx = iter;
-        const int n_cb_l = ctx->n_cb;
-
-        const int n_nodes_0 = ctx->n_nodes_0;
-        const int n_nodes_1 = ctx->n_nodes_1;
-
-        const int n_nodes_per_cb = ctx->n_nodes_per_cb;
-
-        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
-
-        int node_start = 0;
-        int node_end   = n_nodes_0;
-
-        if (cb_idx < n_cb_l) {
-            node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
-            node_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
-        }
-
-        for (int idx = node_start; idx < node_end; ++idx) {
-            if (should_capture) {
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(gf, idx)) encoding:NSUTF8StringEncoding]];
-            }
-
-            ggml_metal_encode_node(ctx, idx, encoder);
-
-            if (should_capture) {
-                [encoder popDebugGroup];
-            }
-        }
-
-        [encoder endEncoding];
-
-        if (cb_idx < 2 || ctx->abort_callback == NULL) {
-            [command_buffer commit];
-        }
-    };
-
     // the main thread commits the first few commands immediately
     // command_buffer[n_cb]
     {
@@ -3468,10 +3428,50 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
         }
     }

-    // TODO: setting encode_async here causes crash during the next ggml_metal_graph_compute call. why?
-    //ctx->encode_async = ^(size_t iter) {
-    //    ...
-    //};
+    if (ctx->encode_async) {
+        Block_release(ctx->encode_async);
+    }
+
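+    // copy the block to the heap so it can be reused across graph_compute calls;
+    // the previous copy (if any) is released above, the final one in ggml_metal_free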
+    ctx->encode_async = Block_copy(^(size_t iter) {
+        const int cb_idx = iter;
+        const int n_cb_l = ctx->n_cb;
+
+        const int n_nodes_0 = ctx->n_nodes_0;
+        const int n_nodes_1 = ctx->n_nodes_1;
+
+        const int n_nodes_per_cb = ctx->n_nodes_per_cb;
+
+        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoder];
+
+        int node_start = 0;
+        int node_end   = n_nodes_0;
+
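+        // command buffers 0..n_cb-1 each encode a contiguous slice of the remaining n_nodes_1
+        // nodes; the extra main-thread buffer (cb_idx == n_cb) keeps the range [0, n_nodes_0)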
+        if (cb_idx < n_cb_l) {
+            node_start = n_nodes_0 + ( (cb_idx + 0) * n_nodes_per_cb);
+            node_end   = n_nodes_0 + (MIN((cb_idx == n_cb_l - 1) ? n_nodes_1 : (cb_idx + 1) * n_nodes_per_cb, n_nodes_1));
+        }
+
+        const bool should_capture = ctx->capture_next_compute;
+
+        for (int idx = node_start; idx < node_end; ++idx) {
+            if (should_capture) {
+                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]];
+            }
+
+            ggml_metal_encode_node(ctx, idx, encoder);
+
+            if (should_capture) {
+                [encoder popDebugGroup];
+            }
+        }
+
+        [encoder endEncoding];
+
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
+    });
 }

 static struct ggml_backend_i ggml_backend_metal_i = {
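
For context, here is a minimal standalone sketch of the Block_copy/Block_release lifetime pattern this patch adopts: the block is copied to the heap once when it is (re)configured, the previous copy is released first, and the final copy is released at teardown. The names (worker_ctx, work_block_t, worker_set_n_cb, worker_free) are illustrative only and not part of ggml; the sketch assumes clang with blocks enabled and manual reference counting (no ARC), matching the explicit release calls above.

// illustrative sketch only - not part of the patch
#include <Block.h>
#include <stddef.h>
#include <stdio.h>

typedef void (^work_block_t)(size_t ith);

struct worker_ctx {
    int n_cb;
    work_block_t encode_async; // owned heap copy, NULL until first configuration
};

// (re)create the callback whenever the configuration changes
static void worker_set_n_cb(struct worker_ctx * ctx, int n_cb) {
    ctx->n_cb = n_cb;

    if (ctx->encode_async) {
        Block_release(ctx->encode_async); // drop the previous heap copy
    }

    // Block_copy moves the literal block from the stack to the heap,
    // so it stays valid after this function returns
    ctx->encode_async = Block_copy(^(size_t ith) {
        printf("encode slice %zu of %d\n", ith, ctx->n_cb);
    });
}

static void worker_free(struct worker_ctx * ctx) {
    Block_release(ctx->encode_async); // mirrors the new call in ggml_metal_free
}

int main(void) {
    struct worker_ctx ctx = { 0, NULL };

    worker_set_n_cb(&ctx, 2); // first copy
    worker_set_n_cb(&ctx, 4); // releases the old copy, installs a new one

    for (size_t i = 0; i < 4; ++i) {
        ctx.encode_async(i);
    }

    worker_free(&ctx);
    return 0;
}

Creating the copy once at configuration time and releasing it in the destructor avoids re-allocating the block on every ggml_metal_graph_compute call, which is what the removed TODO comments were asking for.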